eBPF Talk: introduce tracepoint

想弄清楚 tracepoint 的工作原理,實在太難了;網絡上的資料比較少,而且不夠深入,就連 kernel 文檔[1]也是如此。

本文嘗試從源代碼的角度來分析 tracepoint 的工作原理。

拋磚引玉,歡迎大家指正。

tracepoint demo

在 eBPF 裏使用 tracepoint,是比較簡單的。

// Context layout handed to a BPF program attached to the
// netlink:netlink_extack tracepoint.
struct netlink_extack_error_ctx {
    // Placeholder for the bytes that precede msg (the common_* fields in the
    // tracepoint format file end at offset 8); not to be read by the BPF
    // program. NOTE(review): assumes unsigned long is 8 bytes on the target.
    unsigned long unused;

    // 32-bit slot of the `__data_loc char[] msg` field (offset 8, size 4 in
    // the format file). The handler uses its low 16 bits as the byte offset
    // of the string from the start of ctx.
    __u32 msg; // __data_loc char[] msg;
};

// Handler for the netlink:netlink_extack tracepoint: locates the message
// string inside the event record and passes it to __output_msg().
SEC("tp/netlink/netlink_extack")
int tp__netlink_extack(struct netlink_extack_error_ctx *ctx)
{
    // Only the low 16 bits of the __data_loc word are used here; they are the
    // byte offset of the string, measured from the start of ctx.
    __u64 str_off = ctx->msg & 0xFFFF;
    char *msg = (char *) ctx + str_off;

    __output_msg(ctx, msg, PROBE_TYPE_DEFAULT, 0);

    return 0;
}

其中,需要自定義 ctx 結構體,這是 bpf 裏的做法。

ctx 結構體的第一個屬性必須是 unsigned long unused,而且不能在 bpf 裏使用,這是預留給 tracepoint 自身使用的字段。

P.S. demo 源代碼:GitHub Asphaltt/learn-by-example/ebpf/tracepoint[2]

如何確定 tracepoint ctx 結構體的其它字段信息呢?

# cat /sys/kernel/debug/tracing/events/netlink/netlink_extack/format
name: netlink_extack
ID: 1568
format:
    field:unsigned short common_type;   offset:0;   size:2; signed:0;
    field:unsigned char common_flags;   offset:2;   size:1; signed:0;
    field:unsigned char common_preempt_count;   offset:3;   size:1; signed:0;
    field:int common_pid;   offset:4;   size:4; signed:1;

    field:__data_loc char[] msg;    offset:8;   size:4; signed:0;

print fmt: "msg=%s", __get_str(msg)

# bpftrace -lv 'tracepoint:netlink:netlink_extack'
tracepoint:netlink:netlink_extack
    __data_loc char[] msg

通過上面兩種方式,可以得到 tracepoint ctx 結構體的其它字段信息:

  1. msg 字段的類型是 __data_loc char[],即 char *

不過,msg 字段的真實類型真的是 char * 嗎?答案是否定的,詳見下文。

不過,話說回來,cat /sys/kernel/debug/tracing/events/netlink/netlink_extack/format 裏已說明該字段的詳細信息:

    field:__data_loc char[] msg;    offset:8;   size:4; signed:0;

# offset:該字段在 `ctx` 結構體中的偏移量;
# size:該字段的大小;
# signed:該字段是否是有符號的。
# 最終在 ctx struct 裏,該字段的類型是 `__u32`,而不是 `char *`。

而在 Go 代碼裏,只需要:

    if tp, err := link.Tracepoint("netlink""netlink_extack", obj.TpNetlinkExtack, nil); err != nil {
        log.Printf("Failed to attach tracepoint(netlink_extack): %v", err)
        return
    } else {
        log.Printf("Attached to tracepoint(netlink_extack)")
        defer tp.Close()
    }

即可將 tracepoint 程序 attach 到 tracepoint 上。

tracepoint 定義

在內核裏,一個 tracepoint 是怎麼定義出來的呢?

// ${KERNEL}/include/trace/events/netlink.h

/*
 * Declares the netlink_extack trace event: it takes one `const char *msg`
 * argument, records it in a string field named msg (filled in with
 * __assign_str), and prints it as "msg=%s".
 */
TRACE_EVENT(netlink_extack,

    TP_PROTO(const char *msg),

    TP_ARGS(msg),

    TP_STRUCT__entry(
        __string(   msg,    msg )
    ),

    TP_fast_assign(
        __assign_str(msg, msg);
    ),

    TP_printk("msg=%s", __get_str(msg))
);

將這些宏一層一層打開來看:

TP_PROTO(const char *msg)
// Macro definition (${KERNEL}/include/linux/tracepoint.h):
// TP_PROTO() expands to its arguments unchanged; it only wraps the
// tracepoint's parameter list.
#define TP_PROTO(args...)   args

TP_ARGS(msg)
// Macro definition (${KERNEL}/include/linux/tracepoint.h):
// TP_ARGS() expands to its arguments unchanged; it only wraps the
// tracepoint's argument names.
#define TP_ARGS(args...)    args

TP_STRUCT__entry(               // 定義 entry struct
    __string(   msg,    msg )
)
// Macro definition (${KERNEL}/include/trace/stages/stage1_struct_define.h):
// TP_STRUCT__entry() expands to its arguments unchanged, i.e. to the field
// declarations listed inside it.
#define TP_STRUCT__entry(args...) args

__string(   msg,    msg )
// Macro definitions (${KERNEL}/include/trace/stages/stage1_struct_define.h):
// At this stage __string(item, src) becomes __dynamic_array(char, item, -1),
// which in turn declares a single `u32 __data_loc_<item>;` member. The type
// and len arguments are not used by this expansion.
#define __dynamic_array(type, item, len) u32 __data_loc_##item;
#define __string(item, src) __dynamic_array(char, item, -1)

TP_fast_assign(
    __assign_str(msg, msg);
)
// Macro definition (${KERNEL}/include/trace/stages/stage6_event_callback.h):
// TP_fast_assign() expands to its arguments unchanged, i.e. to the
// assignment statements listed inside it.
#define TP_fast_assign(args...) args

__assign_str(msg, msg)
// Macro definition (${KERNEL}/include/trace/stages/stage6_event_callback.h):
// Copies src into the storage returned by __get_str(dst); when src is NULL
// the literal "(null)" is copied instead. The expansion already ends with a
// semicolon.
#define __assign_str(dst, src)                      \
    strcpy(__get_str(dst), (src) ? (const char *)(src) : "(null)");

TP_printk("msg=%s", __get_str(msg))
// Macro definition (${KERNEL}/include/trace/stages/stage3_trace_output.h):
// TP_printk() appends "\n" to the format string (adjacent string literals
// are concatenated) and passes the remaining arguments through.
#define TP_printk(fmt, args...) fmt "\n", args

// *******************************

TRACE_EVENT(netlink_extack,

    TP_PROTO(const char *msg),

    TP_ARGS(msg),

    TP_STRUCT__entry(
        __string(   msg,    msg )
    ),

    TP_fast_assign(
        __assign_str(msg, msg);
    ),

    TP_printk("msg=%s", __get_str(msg))
);
// Macro definition (${KERNEL}/include/trace/trace_events.h):
// TRACE_EVENT() expands to a DECLARE_EVENT_CLASS() followed by a
// DEFINE_EVENT() that uses the same identifier for both the class (template)
// and the event.
#define TRACE_EVENT(name, proto, args, tstruct, assign, print)  \
    DECLARE_EVENT_CLASS(name,                                   \
                 PARAMS(proto),                                 \
                 PARAMS(args),                                  \
                 PARAMS(tstruct),                               \
                 PARAMS(assign),                                \
                 PARAMS(print));                                \
    DEFINE_EVENT(name, name, PARAMS(proto), PARAMS(args));

// DECLARE_EVENT_CLASS() macro definition (${KERNEL}/include/trace/perf.h):
// Generates the static function perf_trace_<call>(void *__data, proto).
// The generated function:
//   1. treats __data as the event's struct trace_event_call and computes the
//      dynamic data size with trace_event_get_offsets_<call>();
//   2. returns early when bpf_prog_array_valid() is false, __task is a
//      compile-time NULL, and this CPU's perf_events list is empty;
//   3. computes the entry size (aligned to u64 with a u32 added before the
//      alignment and subtracted afterwards) and allocates the record with
//      perf_trace_buf_alloc(), returning if that fails;
//   4. captures the caller's registers, expands tstruct and assign, and
//      hands the record to perf_trace_run_bpf_submit().
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)  \
static notrace void                                                     \
perf_trace_##call(void *__data, proto)                                  \
{                                                                       \
    struct trace_event_call *event_call = __data;                       \
    struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
    struct trace_event_raw_##call *entry;                               \
    struct pt_regs *__regs;                                             \
    u64 __count = 1;                                                    \
    struct task_struct *__task = NULL;                                  \
    struct hlist_head *head;                                            \
    int __entry_size;                                                   \
    int __data_size;                                                    \
    int rctx;                                                           \
                                                                        \
    __data_size = trace_event_get_offsets_##call(&__data_offsets, args); \
                                                                        \
    head = this_cpu_ptr(event_call->perf_events);                       \
    if (!bpf_prog_array_valid(event_call) &&                            \
        __builtin_constant_p(!__task) && !__task &&                     \
        hlist_empty(head))                                              \
        return;                                                         \
                                                                        \
    __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),    \
                 sizeof(u64));                                          \
    __entry_size -= sizeof(u32);                                        \
                                                                        \
    entry = perf_trace_buf_alloc(__entry_size, &__regs, &rctx);         \
    if (!entry)                                                         \
        return;                                                         \
                                                                        \
    perf_fetch_caller_regs(__regs);                                     \
                                                                        \
    tstruct                                                             \
                                                                        \
    { assign; }                                                         \
                                                                        \
    perf_trace_run_bpf_submit(entry, __entry_size, rctx,                \
                  event_call, __count, __regs,                          \
                  head, __task);                                        \
}

// DEFINE_EVENT() macro definition (${KERNEL}/include/linux/tracepoint.h):
// Expands to DECLARE_TRACE() for the event; the template argument is not
// used by this expansion.
#define DEFINE_EVENT(template, name, proto, args)       \
    DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
// Continued:
// DECLARE_TRACE() forwards to __DECLARE_TRACE(), supplying
// cpu_online(raw_smp_processor_id()) as the condition and
// `void *__data, proto` as the probe prototype.
#define DECLARE_TRACE(name, proto, args)                \
    __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),      \
            cpu_online(raw_smp_processor_id()),     \
            PARAMS(void *__data, proto))
// Continued:
// __DECLARE_TRACE() emits the per-tracepoint declarations and inline helpers:
//   - extern declarations of __traceiter_<name>, the static call
//     tp_func_<name>, and struct tracepoint __tracepoint_<name>;
//   - trace_<name>(proto): runs __DO_TRACE() when the tracepoint's static key
//     test passes, plus an rcu_is_watching() warning check under
//     CONFIG_LOCKDEP when cond holds;
//   - register_trace_<name>(), register_trace_prio_<name>() and
//     unregister_trace_<name>(): wrappers around the tracepoint_probe_*()
//     functions for __tracepoint_<name>;
//   - check_trace_callback_type_<name>(): an empty function taking a probe of
//     the expected prototype;
//   - trace_<name>_enabled(): returns the result of the static key test.
#define __DECLARE_TRACE(name, proto, args, cond, data_proto)            \
    extern int __traceiter_##name(data_proto);                          \
    DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name);            \
    extern struct tracepoint __tracepoint_##name;                       \
    static inline void trace_##name(proto)                              \
    {                                                                   \
        if (static_key_false(&__tracepoint_##name.key))                 \
            __DO_TRACE(name,                                            \
                TP_ARGS(args),                                          \
                TP_CONDITION(cond), 0);                                 \
        if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) {                     \
            WARN_ON_ONCE(!rcu_is_watching());                           \
        }                                                               \
    }                                                                   \
    __DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args),              \
                PARAMS(cond))                                           \
    static inline int                                                   \
    register_trace_##name(void (*probe)(data_proto), void *data)        \
    {                                                                   \
        return tracepoint_probe_register(&__tracepoint_##name,          \
                        (void *)probe, data);                           \
    }                                                                   \
    static inline int                                                   \
    register_trace_prio_##name(void (*probe)(data_proto), void *data,   \
                   int prio)                                            \
    {                                                                   \
        return tracepoint_probe_register_prio(&__tracepoint_##name,     \
                          (void *)probe, data, prio);                   \
    }                                                                   \
    static inline int                                                   \
    unregister_trace_##name(void (*probe)(data_proto), void *data)      \
    {                                                                   \
        return tracepoint_probe_unregister(&__tracepoint_##name,        \
                        (void *)probe, data);                           \
    }                                                                   \
    static inline void                                                  \
    check_trace_callback_type_##name(void (*cb)(data_proto))            \
    {                                                                   \
    }                                                                   \
    static inline bool                                                  \
    trace_##name##_enabled(void)                                        \
    {                                                                   \
        return static_key_false(&__tracepoint_##name.key);              \
    }
// __DO_TRACE() macro definition:
// Invokes the tracepoint's probes through __DO_TRACE_CALL() with preemption
// disabled. It returns from the enclosing function without tracing when cond
// is false or when WARN_ON_ONCE(RCUIDLE_COND(rcuidle)) fires. When rcuidle is
// set, the call is additionally bracketed by an SRCU read lock on
// tracepoint_srcu and ct_irq_enter_irqson()/ct_irq_exit_irqson().
// Note: the early exits are plain `return;` statements, so the macro only
// fits inside a function returning void.
#define __DO_TRACE(name, args, cond, rcuidle)                           \
    do {                                                                \
        int __maybe_unused __idx = 0;                                   \
                                                                        \
        if (!(cond))                                                    \
            return;                                                     \
                                                                        \
        if (WARN_ON_ONCE(RCUIDLE_COND(rcuidle)))                        \
            return;                                                     \
                                                                        \
        /* keep srcu and sched-rcu usage consistent */                  \
        preempt_disable_notrace();                                      \
                                                                        \
        /*                                                              \
         * For rcuidle callers, use srcu since sched-rcu                \
         * doesn't work from the idle path.                             \
         */                                                             \
        if (rcuidle) {                                                  \
            __idx = srcu_read_lock_notrace(&tracepoint_srcu);           \
            ct_irq_enter_irqson();                                      \
        }                                                               \
                                                                        \
        __DO_TRACE_CALL(name, TP_ARGS(args));                           \
                                                                        \
        if (rcuidle) {                                                  \
            ct_irq_exit_irqson();                                       \
            srcu_read_unlock_notrace(&tracepoint_srcu, __idx);          \
        }                                                               \
                                                                        \
        preempt_enable_notrace();                                       \
    } while (0)
// __DO_TRACE_CALL() macro definition:
// Reads the tracepoint's funcs pointer with rcu_dereference_raw(); if it is
// non-NULL, takes the data pointer of the entry it points to and invokes the
// static call tp_func_<name> with that data followed by the tracepoint
// arguments.
#define __DO_TRACE_CALL(name, args)                                     \
    do {                                                                \
        struct tracepoint_func *it_func_ptr;                            \
        void *__data;                                                   \
        it_func_ptr =                                                   \
            rcu_dereference_raw((&__tracepoint_##name)->funcs);         \
        if (it_func_ptr) {                                              \
            __data = (it_func_ptr)->data;                               \
            static_call(tp_func_##name)(__data, args);                  \
        }                                                               \
    } while (0)



// Another DECLARE_EVENT_CLASS() definition (source path not given in the
// article). It forward-declares perf_trace_<call>() through
// _TRACE_PERF_PROTO(), defines the print format string print_fmt_<call>, and
// defines the event class object event_class_<call>, whose .probe is
// trace_event_raw_event_<call> and whose .perf_probe is supplied by
// _TRACE_PERF_INIT().
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)  \
_TRACE_PERF_PROTO(call, PARAMS(proto));                                 \
static char print_fmt_##call[] = print;                                 \
static struct trace_event_class __used __refdata event_class_##call = { \
    .system         = TRACE_SYSTEM_STRING,                              \
    .fields_array   = trace_event_fields_##call,                        \
    .fields         = LIST_HEAD_INIT(event_class_##call.fields),        \
    .raw_init       = trace_event_raw_init,                             \
    .probe          = trace_event_raw_event_##call,                     \
    .reg            = trace_event_reg,                                  \
    _TRACE_PERF_INIT(call)                                              \
};

// Emits the forward declaration of the static function
// perf_trace_<call>(void *__data, proto).
#define _TRACE_PERF_PROTO(call, proto)                                  \
    static notrace void                                                 \
    perf_trace_##call(void *__data, proto);

// Emits the designated initializer that sets .perf_probe to
// perf_trace_<call>; the trailing comma is part of the expansion.
#define _TRACE_PERF_INIT(call)                                          \
    .perf_probe     = perf_trace_##call,

/* 爛尾了 */

至此,tracepoint 的定義基本明瞭。

// ${KERNEL}/net/netlink/af_netlink.c

// Wrapper that fires the netlink_extack tracepoint by calling
// trace_netlink_extack() with the given message.
void do_trace_netlink_extack(const char *msg)
{
    trace_netlink_extack(msg);
}

trace_netlink_extack()
|-->static_call(tp_func_netlink_extack)(__data, args); // DECLARE_TRACE() -> __DECLARE_TRACE() -> __DO_TRACE() -> __DO_TRACE_CALL()
     *
      * (未知調用鏈)
       *
        perf_trace_netlink_extack()
        |-->perf_trace_run_bpf_submit()         // ${KERNEL}/kernel/events/core.c
            |-->trace_call_bpf()                // ${KERNEL}/kernel/trace/bpf_trace.c
                |-->bpf_prog_run_array(rcu_dereference(call->prog_array), ctx, bpf_prog_run);   // ${KERNEL}/include/linux/bpf.h

bpf attach 到 tracepoint

從 Go 代碼出發,如何將 bpf attach 到 tracepoint 上呢?

Tracepoint()                                    // ${cilium/ebpf}/link/tracepoint.go
|-->attachPerfEvent()                           // ${cilium/ebpf}/link/perf_event.go
    |-->attachPerfEventLink()
        |-->LinkCreatePerfEvent()               // ${cilium/ebpf}/internal/types.go
            |--> BPF(BPF_LINK_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr))

接着看內核對應的源代碼:

__sys_bpf()                                     // ${KERNEL}/kernel/bpf/syscall.c
|-->link_create()
    |-->bpf_perf_link_attach()
        |-->perf_event_set_bpf_prog()           // ${KERNEL}/kernel/events/core.c
            |-->perf_event_attach_bpf_prog() {  // ${KERNEL}/kernel/trace/bpf_trace.c
                    old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
                    bpf_prog_array_copy(old_array, NULL, prog, bpf_cookie, &new_array);
                    rcu_assign_pointer(event->tp_event->prog_array, new_array);
                }

好吧,即使看到這裏,只是大概弄明白了 tracepoint bpf 程序是怎麼跑起來的,但還是不知道 tracepoint 的工作原理。

小結

本文嘗試從源代碼的角度來分析 tracepoint 的工作原理;不過嘗試並未成功。

參考資料

[1]

Using the Linux Kernel Tracepoints: https://docs.kernel.org/trace/tracepoints.html

[2]

GitHub Asphaltt/learn-by-example/ebpf/tracepoint: https://github.com/Asphaltt/learn-by-example

本文由 Readfog 進行 AMP 轉碼,版權歸原作者所有。
來源:https://mp.weixin.qq.com/s/dp5byc_OuNVS3ma9CglNCw