eBPF Talk: 嘗試 trace tailcall 程序？！

在 eBPF Talk: trace tailcall 程序？NO！裏，我們知道 tailcall 程序是不能直接使用 fentry/fexit 進行 trace 的。

如果通過內核模塊，使用比較 hack 的方式，能否 trace tailcall 程序呢？

TL;DR 能對靜態 tailcall 進行 trace，還不能對動態 tailcall 進行 trace。

設計內核模塊

經過好幾次試驗，最終設計出了一個可以對靜態 tailcall 進行 trace 的內核模塊。

如上圖所示，該內核模塊的設計思路是：

劫持 bpf_tail_call_static() 所使用的 jmp 指令。
劫持後的 jmp 指令跳轉到內核模塊準備好了的 trampoline。
在 trampoline 裏，使用 call 指令調用用來 trace 的 bpf prog。
最後，使用 jmp 指令跳轉回 bpf_tail_call_static() 的目標 bpf prog。

最終效果如下：

# ./bpf-tailcall-tracer
2023/08/26 15:12:07 Attached kprobe(tcp_connect)
2023/08/26 15:12:07 Attached kprobe(inet_csk_complete_hashdance)
2023/08/26 15:12:07 Listening events...
2023/08/26 15:26:27 new tcp connection: 192.168.64.11:59106 -> 142.251.12.113:80 (fentry on index: 2)
2023/08/26 15:26:27 new tcp connection: 192.168.64.11:59106 -> 142.251.12.113:80 (kprobe)
2023/08/26 15:26:31 new tcp connection: 192.168.64.11:22 -> 192.168.64.1:62039 (fentry on index: 3)
2023/08/26 15:26:31 new tcp connection: 192.168.64.11:22 -> 192.168.64.1:62039 (kprobe)

實現靜態 tailcall 的 tracer

今年 2 月的時候，和一位大佬討論過如何 trace tailcall 程序。

而後，到 7 月的時候，確認無法直接使用 fentry/fexit trace tailcall 程序。

接着，歷時 1 個多月，使用內核模塊這樣 hack 的方式，從想法到實驗，終於能夠對靜態 tailcall 進行 trace 了。

實現細節 1：設計 trampoline

經歷了 3 個想法後，最終使用 trampoline 的方式實現了對靜態 tailcall 的 trace。

該 trampoline 比較簡單，如下：

/*
 * trampoline image:
 * 1: push %rax                     // tail_call_cnt
 * 2: push %rdi                     // first arg, aka ctx
 * 3: mov ${index}, %esi            // second arg, array index
 * 4: call ${fentry_tailcall}       // call fentry bpf prog
 * 5: pop %rdi                      // pop stack
 * 6: pop %rax                      // pop stack
 * 7: jmp ${tgt_prog}               // jump to target prog
 * 8: nop                           // extra space
 */

爲了生成這段彙編，直接從 ${KERNEL}/arch/x86/net/bpf_jit_comp.c 裏抄了不少代碼；也從 ${KERNEL}/kernel/bpf/trampoline.c 裏抄了管理 trampoline image 的代碼。

最終，能夠生成如下 trampoline image：

   0xffffffffc0ab3000:  push   %rax
   0xffffffffc0ab3001:  push   %rdi
   0xffffffffc0ab3002:  mov    $0x0,%esi
   0xffffffffc0ab3007:  call   0xffffffffc00fb598
   0xffffffffc0ab300c:  pop    %rdi
   0xffffffffc0ab300d:  pop    %rax
   0xffffffffc0ab300e:  jmp    0xffffffffc00fb1b3
   0xffffffffc0ab3013:  nop

實現細節 2：多個 trampoline

如上的 trampoline 存在一個問題：它使用的數組索引是固定的，而不是動態的。

所以，爲了支持多個 tailcall bpf prog，需要多個 trampoline；每個靜態 tailcall 對應一個 trampoline。

簡單起見，最多支持 PAGE_SIZE / 20 個 trampoline；這是因爲每個 trampoline 的大小是 20 字節。

#define TRAMP_IMAGE_SIZE 20
#define TRAMP_IMAGE_CAP (PAGE_SIZE / TRAMP_IMAGE_SIZE)
static bool __tramp_constructed[TRAMP_IMAGE_CAP] = {};
static void *bpf_tailcall_tramp_image = NULL;

static int
__construct_tramp_images(struct bpf_array *array, struct bpf_prog *fentry_prog)
{
    struct bpf_prog *bp;
    u8 *prog;
    int ret;
    u32 i;

    for (i = 0; i<array->map.max_entries && i<TRAMP_IMAGE_CAP; i++) {
        if (__has_contructed(i))
            continue;

        bp = (struct bpf_prog *) array->ptrs[i];
        if (!bp)
            continue;

        prog = __get_tramp_image(i);

        ret = __construct_tramp_image(prog, bp, fentry_prog, i);
        if (unlikely(ret)) {
            pr_err("[X] __construct_tramp_image failed: %d\n", ret);
            return ret;
        }

        __mark_constructed(i);
    }

    /* int3 */
    __fill_hole(prog, PAGE_SIZE - (TRAMP_IMAGE_SIZE * i));

    return 0;
}

實現細節 3：劫持 `jmp` 指令

參考 eBPF Talk: 更新 tailcall PROG_ARRAY bpf map 裏的 prog_array_map_poke_run()，劫持 jmp 指令的代碼如下：

/*
 * Re-patch the direct jump of every static tail call that goes through
 * the given PROG_ARRAY.
 *
 * @array:   PROG_ARRAY map whose registered poke entries are walked.
 * @is_hack: true  - switch each jump from the target prog to its trampoline;
 *           false - switch each jump from the trampoline back to the target.
 *
 * Returns 0 unconditionally in this listing: a failed poke is logged but
 * not propagated to the caller.
 *
 * NOTE: abridged listing - the "// ..." markers stand for omitted code,
 * which includes the declarations of the locals used below.
 */
static int
__bpf_poke_progs(struct bpf_array *array, bool is_hack)
{
    struct bpf_array_aux *aux = array->aux;
    struct prog_poke_elem *elem;

    /* Each element on poke_progs refers to a prog with its own poke table. */
    list_for_each_entry(elem, &aux->poke_progs, list) {
        // ...

        for (i = 0; i < elem->aux->size_poke_tab; i++) {
            poke = &elem->aux->poke_tab[i];

            /*
             * Skip descriptors that belong to another map.
             * NOTE(review): `map` is declared in the omitted code;
             * presumably it is &array->map - confirm.
             */
            if (poke->tail_call.map != map)
                continue;

            key = poke->tail_call.key;
            // ...

            /*
             * p:  address inside the target prog that the tail call jumps to
             *     (bpf_func plus X86_TAIL_CALL_OFFSET).
             * pp: trampoline image reserved for this array index.
             *
             * NOTE(review): array->ptrs[key] is dereferenced without a NULL
             * check in the visible lines; verify the omitted code covers
             * empty slots.
             */
            prog = (struct bpf_prog *) array->ptrs[key];
            p = (u8 *) prog->bpf_func + X86_TAIL_CALL_OFFSET;
            pp = __get_tramp_image(key);

            /* `from` is the current jump target, `to` the new one. */
            if (is_hack) {
                from = p;
                to = pp;
            } else {
                from = pp;
                to = p;
            }
            ret = bpf_arch_text_poke_fn(poke->tailcall_target,
                            BPF_MOD_JUMP,
                            from, to);
            /* Best effort: log the failure and keep going. */
            if (ret)
                pr_err("[X] bpf_arch_text_poke failed: %d\n", ret);
        }
    }

    return 0;
}

/*
 * Locked wrapper around __bpf_poke_progs().
 *
 * @map:     the bpf map embedded in the PROG_ARRAY to patch.
 * @is_hack: forwarded unchanged to __bpf_poke_progs().
 *
 * Holds the array's poke_mutex for the whole duration of the patching and
 * returns whatever __bpf_poke_progs() returned.
 */
static int
__bpf_poke_tailcall(struct bpf_map *map, bool is_hack)
{
    struct bpf_array *prog_array;
    struct bpf_array_aux *array_aux;
    int err;

    prog_array = container_of(map, struct bpf_array, map);
    array_aux = prog_array->aux;

    mutex_lock(&array_aux->poke_mutex);
    err = __bpf_poke_progs(prog_array, is_hack);
    mutex_unlock(&array_aux->poke_mutex);

    return err;
}

使用 poke 時，將 bpf prog 上 jmp 的地址改爲 trampoline 的地址；而在退出時，將 bpf prog 上 jmp 的地址還原爲 tailcall bpf prog 的地址。

實現細節 4：設計用來 trace 的 bpf prog

已知 tailcall bpf prog 的上下文是 kprobe，所以，需要構造一個能夠接收 struct pt_regs *ctx 和 u32 index 這兩個參數的 bpf prog。

/*
 * Tracing body that the kernel module actually calls.
 *
 * @regs:  context pointer handed over by the caller.
 * @index: PROG_ARRAY index of the tail call being traced.
 *
 * Reads the struct sock pointer stored at key 0 of the `socks` map and,
 * when that element exists, reports the connection as a
 * PROBE_TYPE_FENTRY event tagged with @index.
 */
static __noinline void
__fn(struct pt_regs *regs, u32 index)
{
    struct sock **slot;
    __u32 zero = 0;

    bpf_printk("tcpconn, __fn, regs: %p, index: %u\n", regs, index);

    slot = bpf_map_lookup_elem(&socks, &zero);
    if (slot)
        __handle_new_connection(regs, *slot, PROBE_TYPE_FENTRY, index);
}

/*
 * Entry-point prog that carries __fn() as a bpf2bpf subprogram.
 *
 * The kernel module calls __fn() directly rather than this entry point.
 * The body only has to reference __fn() in a way that keeps its `index`
 * parameter a real argument.
 *
 * Always returns 0.
 */
SEC("kprobe/tailcall")
int fentry_tailcall(struct pt_regs *regs)
{
    bpf_printk("tcpconn, fentry_tailcall, regs: %p\n", regs);

    __fn(regs, 2);

    /* Call __fn() a second time with a different constant on purpose.
     * With a single call site, clang would optimize the index inside
     * __fn() into the constant 2, and the index supplied by the kernel
     * module would be ignored.
     */
    __fn(regs, 3);

    return 0;
}

這便是 bpf2bpf 的用武之地了：使用 bpf2bpf 構造一個內核模塊能夠調用的、又滿足需求的 bpf prog。

此時，內核模塊使用的 trace bpf prog 是 __fn() 而不是 fentry_tailcall()。

但是，fentry_tailcall() 裏爲什麼要調用 2 次 __fn() 呢？

因爲，如果只有 1 次調用，那麼 __fn() 裏的 index 就會被優化成常量 2；導致內核模塊傳遞過來的 index 被忽略掉了。

實現細節 5：使用 Go 將它們串起來

因爲內核模塊裏需要的是 PROG_ARRAY bpf map 的 ID，和 fentry_tailcall() bpf prog 的 ID，所以需要在 Go 裏獲取它們，並通過內核模塊參數的方式傳遞給內核模塊。

    // ID of the PROG_ARRAY bpf map, passed to the module as bpf_map_id.
    // NOTE(review): err and ok are not checked in this excerpt.
    mapInfo, err := obj.Progs.Info()
    mapID, ok := mapInfo.ID()

    // ID of the fentry_tailcall bpf prog, passed to the module as bpf_prog_id.
    // NOTE(review): err and ok are not checked in this excerpt.
    progInfo, err := ffObj.FentryTailcall.Info()
    progID, ok := progInfo.ID()

    // Load the kernel module, handing both IDs over as module parameters.
    // On failure, log insmod's combined output and bail out.
    if out, err := exec.Command("insmod",
        "./kernel/bpf-tailcall-trace.ko",
        fmt.Sprintf("bpf_prog_id=%d", progID),
        fmt.Sprintf("bpf_map_id=%d", mapID),
    ).CombinedOutput(); err != nil {
        log.Printf("Failed to load bpf-tailcall-trace.ko: %v\n%s", err, out)
        return
    }
    // Unload the module when the enclosing function returns; a failed rmmod
    // is only logged.
    defer func() {
        if out, err := exec.Command("rmmod", "bpf-tailcall-trace").CombinedOutput(); err != nil {
            log.Printf("Failed to unload bpf-tailcall-trace.ko: %v\n%s", err, out)
        }
    }()

而在 insmod 之前，需要填充好 PROG_ARRAY bpf map。

更多代碼細節，請查看源代碼：GitHub - Asphaltt/bpf-tailcall-tracer[1]。

小結

已然能夠對靜態 tailcall 進行 trace，怎麼才能對動態 tailcall 進行 trace 呢？

使用內核模塊這樣 hack 的方式，理論上是可以的，只要能夠定位到動態 tailcall 所使用的 jmp 指令位置。

不過，花大力氣搞這麼複雜的內核模塊，不如花更大力氣去改進內核，讓它支持 trace tailcall。

參考資料

[1]

GitHub - Asphaltt/bpf-tailcall-tracer: https://github.com/Asphaltt/bpf-tailcall-tracer

本文由 Readfog 進行 AMP 轉碼，版權歸原作者所有。
來源：https://mp.weixin.qq.com/s/9dvPSwqVnI4q9A8uIWyylQ