eBPF Talk: 動態或靜態 tailcall
動態 tailcall?靜態 tailcall?
爲什麼 tailcall 會有動靜之分呢?
其實,就是看在使用 bpf_tail_call() 時,傳入的 index 參數是常量還是變量。
動態 tailcall
在使用 bpf_tail_call()
時,傳入的 index
參數是變量;而且,該變量的值是在運行時才能確定的。如果該值在編譯時就能推算出來,就會變成靜態 tailcall
。
比如,該 index
是從某個 bpf map 裏取出來的、或者從 ctx
裏取出來的。
/*
 * kprobe on inet_csk_complete_hashdance(): record the socket passed as the
 * probed function's second argument in the `socks` map (slot 0), then
 * tail-call into `progs` with an index that is only known at run time
 * (read from the socket's __sk_common.skc_daddr field).
 */
SEC("kprobe/inet_csk_complete_hashdance")
int k_icsk_complete_hashdance(struct pt_regs *ctx)
{
	struct sock *sk = (struct sock *)PT_REGS_PARM2(ctx);
	__u32 slot0 = 0;
	u32 idx;

	bpf_map_update_elem(&socks, &slot0, &sk, BPF_ANY);

	/* The index is loaded from kernel memory, so it is not a constant. */
	idx = BPF_CORE_READ(sk, __sk_common.skc_daddr);
	bpf_tail_call(ctx, &progs, idx); // dynamic tailcall

	/* Fall-through path when the tail call did not transfer control. */
	return 0;
}
靜態 tailcall
在使用 bpf_tail_call()
時,傳入的 index
參數是常量;或者,index
的值在編譯時就能推算出來。
/*
 * kprobe on tcp_connect(): record the socket passed as the probed
 * function's first argument in the `socks` map (slot 0), then tail-call
 * into slot 0 of `progs`. The slot is a literal constant.
 */
SEC("kprobe/tcp_connect")
int k_tcp_connect(struct pt_regs *ctx)
{
	struct sock *sk = (struct sock *)PT_REGS_PARM1(ctx);
	__u32 slot0 = 0;

	bpf_map_update_elem(&socks, &slot0, &sk, BPF_ANY);
	bpf_tail_call_static(ctx, &progs, 0); // static tailcall

	/* Fall-through path when the tail call did not transfer control. */
	return 0;
}
P.S. demo 源代碼:GitHub - Asphaltt/learn-by-example[1]。
其中 bpf_tail_call_static()
的定義如下:
#if __clang_major__ >= 8 && defined(__bpf__)
/*
 * bpf_tail_call_static() - issue a tail call whose slot is a compile-time
 * constant.
 *
 * @ctx:  context pointer; placed in r1.
 * @map:  map pointer; placed in r2.
 * @slot: map index; emitted as an immediate operand and placed in r3.
 *
 * Only available when compiling with clang >= 8 for the BPF target. The
 * register setup and the call are written as a single inline-asm sequence;
 * the rationale is given in the comment inside the body.
 */
static __always_inline void
bpf_tail_call_static(void *ctx, const void *map, const __u32 slot)
{
/* A slot the compiler cannot prove constant is routed to __bpf_unreachable(). */
if (!__builtin_constant_p(slot))
__bpf_unreachable();
/*
* Provide a hard guarantee that LLVM won't optimize setting r2 (map
* pointer) and r3 (constant map index) from _different paths_ ending
* up at the _same_ call insn as otherwise we won't be able to use the
* jmpq/nopl retpoline-free patching by the x86-64 JIT in the kernel
* given they mismatch. See also d2e4c1e6c294 ("bpf: Constant map key
* tracking for prog array pokes") for details on verifier tracking.
*
* Note on clobber list: we need to stay in-line with BPF calling
* convention, so even if we don't end up using r0, r4, r5, we need
* to mark them as clobber so that LLVM doesn't end up using them
* before / after the call.
*/
/*
* The "i" constraint on slot requires an immediate operand. "call 12"
* calls BPF helper number 12 -- presumably BPF_FUNC_tail_call; confirm
* against the kernel UAPI helper list.
*/
asm volatile("r1 = %[ctx]\n\t"
"r2 = %[map]\n\t"
"r3 = %[slot]\n\t"
"call 12"
:: [ctx]"r"(ctx), [map]"r"(map), [slot]"i"(slot)
: "r0", "r1", "r2", "r3", "r4", "r5");
}
#endif
所以,在使用靜態 tailcall
時,最好使用 bpf_tail_call_static()
封裝一下。
動靜之分
爲什麼 tailcall
會有動靜之分呢?
從運行時出發,靜態 tailcall
性能更好、更安全。而動態 tailcall
的性能稍差,因爲要查詢一次數組。
直接看 x86 JIT 對 tailcall
的處理:
// ${KERNEL}/arch/x86/net/bpf_jit_comp.c
/*
 * Excerpt of do_jit(): only the BPF_JMP | BPF_TAIL_CALL case is shown, the
 * rest of the function is elided ("// ...").
 */
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
int oldproglen, struct jit_context *ctx, bool jmp_padding)
{
// ...
case BPF_JMP | BPF_TAIL_CALL:
/*
 * A non-zero imm32 selects the direct emitter and is used (minus one)
 * as the index into aux->poke_tab; imm32 == 0 selects the indirect
 * emitter.
 */
if (imm32)
emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1],
&prog, image + addrs[i - 1],
callee_regs_used,
bpf_prog->aux->stack_depth,
ctx);
else
emit_bpf_tail_call_indirect(&prog,
callee_regs_used,
bpf_prog->aux->stack_depth,
image + addrs[i - 1],
ctx);
break;
// ...
}
/*
 * Generate the following code:
 *
 * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
 *   if (index >= array->map.max_entries)
 *       goto out;
 *   if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
 *       goto out;
 *   prog = array->ptrs[index];
 *   if (prog == NULL)
 *       goto out;
 *   goto *(prog->bpf_func + prologue_size);
 * out:
 *
 * Called from do_jit() for the BPF_TAIL_CALL case when imm32 is zero.
 * The body is elided in this excerpt.
 */
static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
u32 stack_depth, u8 *ip,
struct jit_context *ctx)
{
// ...
}
/*
 * Called from do_jit() for the BPF_TAIL_CALL case when imm32 is non-zero;
 * @poke is the entry aux->poke_tab[imm32 - 1]. The body is elided in this
 * excerpt.
 *
 * NOTE(review): judging by the disassembly shown alongside this excerpt,
 * the emitted sequence keeps the tail-call-count check but has no bounds
 * check and no array load -- confirm against the full kernel source.
 */
static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
u8 **pprog, u8 *ip,
bool *callee_regs_used, u32 stack_depth,
struct jit_context *ctx)
{
// ...
}
再來對比一下對應的 x86 彙編:
# static tailcall, aka. emit_bpf_tail_call_direct()
0xffffffffc004effa: mov -0x14(%rbp),%eax # %eax = tailcall count
0xffffffffc004f000: cmp $0x21,%eax # %eax >= MAX_TAIL_CALL_CNT
0xffffffffc004f003: jae 0xffffffffc004f023 # goto out
0xffffffffc004f005: add $0x1,%eax # %eax++ == tailcall count++
0xffffffffc004f008: mov %eax,-0x14(%rbp) # save tailcall count to stack
0xffffffffc004f00e: nopl 0x0(%rax,%rax,1) # do nothing at stub
0xffffffffc004f013: pop %r13 # restore callee saved registers
0xffffffffc004f015: pop %rbx # restore callee saved registers
0xffffffffc004f016: pop %rax # restore callee saved registers
0xffffffffc004f017: add $0x10,%rsp # adjust stack depth
0xffffffffc004f01e: jmp 0xffffffffc004fae3 # long jmp to target tailcall prog's after-prologue part
0xffffffffc004f023: xor %eax,%eax
# dynamic tailcall, aka. emit_bpf_tail_call_indirect()
/*
* rdi - pointer to ctx
* rsi - pointer to bpf_array
* rdx - index in bpf_array
*/
0xffffffffc004fddb: mov -0x10(%rbp),%edx # %edx = index
0xffffffffc004fdde: mov %rbx,%rdi # %rdi = ctx
0xffffffffc004fde1: movabs $0xffff9e67028fee00,%rsi # %rsi = PROG_ARRAY bpf map
0xffffffffc004fdeb: mov %edx,%edx # %edx = index
0xffffffffc004fded: cmp %edx,0x24(%rsi) # %edx >= array->map.max_entries
0xffffffffc004fdf0: jbe 0xffffffffc004fe29 # goto out
0xffffffffc004fdf2: mov -0x14(%rbp),%eax # %eax = tailcall count
0xffffffffc004fdf8: cmp $0x21,%eax # %eax >= MAX_TAIL_CALL_CNT
0xffffffffc004fdfb: jae 0xffffffffc004fe29 # goto out
0xffffffffc004fdfd: add $0x1,%eax # %eax++ == tailcall count++
0xffffffffc004fe00: mov %eax,-0x14(%rbp) # save tailcall count to stack
0xffffffffc004fe06: mov 0x110(%rsi,%rdx,8),%rcx # %rcx = array->ptrs[index]
0xffffffffc004fe0e: test %rcx,%rcx # %rcx == NULL
0xffffffffc004fe11: je 0xffffffffc004fe29 # goto out
0xffffffffc004fe13: pop %rbx # restore callee saved registers
0xffffffffc004fe14: pop %rax # restore callee saved registers
0xffffffffc004fe15: add $0x10,%rsp # adjust stack depth
0xffffffffc004fe1c: mov 0x30(%rcx),%rcx # %rcx = prog->bpf_func
0xffffffffc004fe20: add $0xb,%rcx # %rcx += X86_TAIL_CALL_OFFSET
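0xffffffffc004fe24: jmp 0xffffffffb58d8fc0 # jmp to an indirect-jump thunk (address is in kernel text, not the JIT region; presumably __x86_indirect_thunk_rcx), which continues at *%rcx, the target tailcall prog's after-prologue part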
0xffffffffc004fe29: xor %eax,%eax
靜態 tailcall 使用了 11 條指令,而動態 tailcall 使用了 20 條指令(均不計 out 處的 xor 指令)。
這是因爲靜態 tailcall
不需要查詢數組,而動態 tailcall
需要查詢數組。
而且,在 PROG_ARRAY 沒有更新的情況下,靜態 tailcall
不會生成 long jmp
指令,而會使用 jmp
指令跳過幾條指令。
# static tailcall, aka. emit_bpf_tail_call_direct(), without update
0xffffffffc004effe: mov -0x14(%rbp),%eax
0xffffffffc004f004: cmp $0x21,%eax
0xffffffffc004f007: jae 0xffffffffc004f027
0xffffffffc004f009: add $0x1,%eax
0xffffffffc004f00c: mov %eax,-0x14(%rbp)
0xffffffffc004f012: jmp 0xffffffffc004f027 # short jmp == goto out
0xffffffffc004f017: pop %r13
0xffffffffc004f019: pop %rbx
0xffffffffc004f01a: pop %rax
0xffffffffc004f01b: add $0x10,%rsp
0xffffffffc004f022: nopl 0x0(%rax,%rax,1)
0xffffffffc004f027: xor %eax,%eax
小結
- tailcall 有動靜之分;
- 靜態 tailcall 性能更好、更安全;
- 動態 tailcall 性能稍差,但更靈活。
更多 tailcall
細節,敬請期待 eBPF Talk: 更新 tailcall PROG_ARRAY bpf map。
參考資料
[1]
GitHub - Asphaltt/learn-by-example: https://github.com/Asphaltt/learn-by-example/tree/main/ebpf/tailcall
本文由 Readfog 進行 AMP 轉碼,版權歸原作者所有。
來源:https://mp.weixin.qq.com/s/B64C-CdRJDi9BPpqzyoeOA