eBPF Talk: introduce tracepoint
想弄清楚 tracepoint
的工作原理,實在太難了;網絡上的資料比較少,而且不夠深入,就連 kernel 文檔也是如此。
- Using the Linux Kernel Tracepoints[1]
本文嘗試從源代碼的角度來分析 tracepoint
的工作原理。
拋磚引玉,歡迎大家指正。
tracepoint
demo
在 eBPF 裏使用 tracepoint
,是比較簡單的。
// Context layout used by the BPF program for the netlink:netlink_extack
// tracepoint. It mirrors the record described by the tracefs `format` file:
// 8 leading bytes of common_* fields, then the 4-byte `msg` field at offset 8.
struct netlink_extack_error_ctx {
unsigned long unused; // leading slot covering the common_* header; the program never reads it
__u32 msg; // __data_loc char[] msg; a 4-byte locator word, not a char pointer
};
// BPF program placed in the "tp/netlink/netlink_extack" section.
// Computes the address of the `msg` string inside the trace record and passes
// it to __output_msg(). Always returns 0.
SEC("tp/netlink/netlink_extack")
int tp__netlink_extack(struct netlink_extack_error_ctx *ctx)
{
// The low 16 bits of the __data_loc word are used as a byte offset from the
// start of ctx; adding it to ctx yields the address of the string.
// NOTE(review): the upper 16 bits presumably hold the string length — confirm
// against the kernel's __data_loc encoding.
char *msg = (void *)(__u64) ((void *) ctx + (__u64) ((ctx->msg) & 0xFFFF));
// __output_msg and PROBE_TYPE_DEFAULT are defined elsewhere in the demo;
// presumably this emits the message to user space — verify in the demo source.
__output_msg(ctx, msg, PROBE_TYPE_DEFAULT, 0);
return 0;
}
其中,需要自定義 ctx
結構體,這是 bpf 裏的做法。
該 ctx
結構體的第一個屬性必須是 unsigned long unused
,而且不能在 bpf 裏使用,這是預留給 tracepoint
自身使用的字段(在 64 位系統上佔 8 字節,對應下文 format 輸出中 offset 0~7 的 common_* 字段)。
P.S. demo 源代碼:GitHub Asphaltt/learn-by-example/ebpf/tracepoint[2]
如何確定 tracepoint
的 ctx
結構體的其它字段信息呢?
# cat /sys/kernel/debug/tracing/events/netlink/netlink_extack/format
name: netlink_extack
ID: 1568
format:
field:unsigned short common_type; offset:0; size:2; signed:0;
field:unsigned char common_flags; offset:2; size:1; signed:0;
field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
field:int common_pid; offset:4; size:4; signed:1;
field:__data_loc char[] msg; offset:8; size:4; signed:0;
print fmt: "msg=%s", __get_str(msg)
# bpftrace -lv 'tracepoint:netlink:netlink_extack'
tracepoint:netlink:netlink_extack
__data_loc char[] msg
通過上面兩種方式,可以得到 tracepoint
的 ctx
結構體的其它字段信息:
msg
字段的類型是__data_loc char[]
,即char *
。
不過,msg
字段的真實類型真的是 char *
嗎?答案請查看:
不過,話說回來,cat /sys/kernel/debug/tracing/events/netlink/netlink_extack/format
裏已說明該字段的詳細信息:
field:__data_loc char[] msg; offset:8; size:4; signed:0;
# offset:該字段在 `ctx` 結構體中的偏移量;
# size:該字段的大小;
# signed:該字段是否是有符號的。
# 最終在 ctx struct 裏,該字段的類型是 `__u32`,而不是 `char *`。
而在 Go 代碼裏,只需要:
// Attach the loaded program obj.TpNetlinkExtack to the tracepoint with
// group "netlink" and name "netlink_extack"; no extra options are passed (nil).
if tp, err := link.Tracepoint("netlink", "netlink_extack", obj.TpNetlinkExtack, nil); err != nil {
// Attaching failed: log the error and leave the enclosing function.
log.Printf("Failed to attach tracepoint(netlink_extack): %v", err)
return
} else {
log.Printf("Attached to tracepoint(netlink_extack)")
// defer is function-scoped, so the link is closed when the enclosing
// function returns, not at the end of this else branch.
defer tp.Close()
}
即可將 tracepoint
程序 attach 到 tracepoint
上。
tracepoint
定義
在內核裏,一個 tracepoint
是怎麼定義出來的呢?
// ${KERNEL}/include/trace/events/netlink.h
// Declares the netlink_extack trace event: it takes a single
// `const char *msg` argument, stores it in a string field named `msg`,
// and prints it with the format "msg=%s".
TRACE_EVENT(netlink_extack,
TP_PROTO(const char *msg), // prototype of the arguments
TP_ARGS(msg), // argument names matching the prototype
TP_STRUCT__entry( // fields of the recorded entry
__string( msg, msg )
),
TP_fast_assign( // how the entry is filled from the arguments
__assign_str(msg, msg);
),
TP_printk("msg=%s", __get_str(msg)) // print format and its arguments
);
將這些宏一層一層打開來看:
// Walkthrough: each macro used in the TRACE_EVENT() invocation is shown next
// to its definition. Most of them simply pass their arguments through.
TP_PROTO(const char *msg)
// Macro definition: // ${KERNEL}/include/linux/tracepoint.h
#define TP_PROTO(args...) args
TP_ARGS(msg)
// Macro definition: // ${KERNEL}/include/linux/tracepoint.h
#define TP_ARGS(args...) args
TP_STRUCT__entry( // defines the entry struct
__string( msg, msg )
)
// Macro definition: // ${KERNEL}/include/trace/stages/stage1_struct_define.h
#define TP_STRUCT__entry(args...) args
__string( msg, msg )
// Macro definition: // ${KERNEL}/include/trace/stages/stage1_struct_define.h
// With these two definitions, __string(msg, msg) expands to
// `u32 __data_loc_msg;` — a 32-bit field rather than a char pointer.
#define __dynamic_array(type, item, len) u32 __data_loc_##item;
#define __string(item, src) __dynamic_array(char, item, -1)
TP_fast_assign(
__assign_str(msg, msg);
)
// Macro definition: // ${KERNEL}/include/trace/stages/stage6_event_callback.h
#define TP_fast_assign(args...) args
__assign_str(msg, msg)
// Macro definition: // ${KERNEL}/include/trace/stages/stage6_event_callback.h
// Copies src into the buffer returned by __get_str(dst) with strcpy();
// a NULL src is replaced by the literal "(null)".
#define __assign_str(dst, src) \
strcpy(__get_str(dst), (src) ? (const char *)(src) : "(null)");
TP_printk("msg=%s", __get_str(msg))
// Macro definition: // ${KERNEL}/include/trace/stages/stage3_trace_output.h
// Appends "\n" to the format string and forwards the remaining arguments.
#define TP_printk(fmt, args...) fmt "\n", args
// *******************************
// The complete invocation again, followed by the definition of TRACE_EVENT()
// itself.
TRACE_EVENT(netlink_extack,
TP_PROTO(const char *msg),
TP_ARGS(msg),
TP_STRUCT__entry(
__string( msg, msg )
),
TP_fast_assign(
__assign_str(msg, msg);
),
TP_printk("msg=%s", __get_str(msg))
);
// Macro definition: // ${KERNEL}/include/trace/trace_events.h
// TRACE_EVENT() is DECLARE_EVENT_CLASS() plus DEFINE_EVENT(), with the event
// name used both as the class (template) name and as the event name.
#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
DECLARE_EVENT_CLASS(name, \
PARAMS(proto), \
PARAMS(args), \
PARAMS(tstruct), \
PARAMS(assign), \
PARAMS(print)); \
DEFINE_EVENT(name, name, PARAMS(proto), PARAMS(args));
// DECLARE_EVENT_CLASS() macro definition: // ${KERNEL}/include/trace/perf.h
// In this variant the macro generates perf_trace_<call>(), which:
//   1. computes the dynamic data size via trace_event_get_offsets_<call>();
//   2. fetches this CPU's perf_events list head from the event call;
//   3. returns early when there is no valid BPF prog array, __task is a
//      compile-time NULL, and the list is empty;
//   4. sizes the entry as ALIGN(data + sizeof(*entry) + sizeof(u32), 8)
//      minus sizeof(u32);
//   5. allocates the entry with perf_trace_buf_alloc(), returning if that
//      yields NULL;
//   6. captures caller registers, expands `tstruct` and `assign`;
//   7. hands the entry to perf_trace_run_bpf_submit().
// NOTE(review): at this stage `tstruct` is presumably redefined to fill in the
// __data_loc fields rather than declare struct members — confirm in
// include/trace/perf.h.
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
static notrace void \
perf_trace_##call(void *__data, proto) \
{ \
struct trace_event_call *event_call = __data; \
struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
struct trace_event_raw_##call *entry; \
struct pt_regs *__regs; \
u64 __count = 1; \
struct task_struct *__task = NULL; \
struct hlist_head *head; \
int __entry_size; \
int __data_size; \
int rctx; \
\
__data_size = trace_event_get_offsets_##call(&__data_offsets, args); \
\
head = this_cpu_ptr(event_call->perf_events); \
if (!bpf_prog_array_valid(event_call) && \
__builtin_constant_p(!__task) && !__task && \
hlist_empty(head)) \
return; \
\
__entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32), \
sizeof(u64)); \
__entry_size -= sizeof(u32); \
\
entry = perf_trace_buf_alloc(__entry_size, &__regs, &rctx); \
if (!entry) \
return; \
\
perf_fetch_caller_regs(__regs); \
\
tstruct \
\
{ assign; } \
\
perf_trace_run_bpf_submit(entry, __entry_size, rctx, \
event_call, __count, __regs, \
head, __task); \
}
// DEFINE_EVENT() macro definition: // ${KERNEL}/include/linux/tracepoint.h
// The template argument is dropped; the event is declared via DECLARE_TRACE().
#define DEFINE_EVENT(template, name, proto, args) \
DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
// Continuing:
// DECLARE_TRACE() forwards to __DECLARE_TRACE(), supplying the condition
// "the current CPU is online" and a probe prototype that prepends
// `void *__data` to the event's own arguments.
#define DECLARE_TRACE(name, proto, args) \
__DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \
cpu_online(raw_smp_processor_id()), \
PARAMS(void *__data, proto))
// Continuing:
// __DECLARE_TRACE() emits everything a caller needs for tracepoint <name>:
//   - extern declarations of __traceiter_<name>, the static call
//     tp_func_<name>, and the struct tracepoint __tracepoint_<name>;
//   - trace_<name>(): when the tracepoint's static key is on, runs
//     __DO_TRACE() with `cond`; under CONFIG_LOCKDEP it also warns once if
//     `cond` holds while RCU is not watching;
//   - register_trace_<name>(), register_trace_prio_<name>() and
//     unregister_trace_<name>(): thin wrappers over the tracepoint_probe_*()
//     functions;
//   - check_trace_callback_type_<name>(): an empty function whose parameter
//     type matches the probe prototype;
//   - trace_<name>_enabled(): returns the state of the static key.
#define __DECLARE_TRACE(name, proto, args, cond, data_proto) \
extern int __traceiter_##name(data_proto); \
DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name); \
extern struct tracepoint __tracepoint_##name; \
static inline void trace_##name(proto) \
{ \
if (static_key_false(&__tracepoint_##name.key)) \
__DO_TRACE(name, \
TP_ARGS(args), \
TP_CONDITION(cond), 0); \
if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \
WARN_ON_ONCE(!rcu_is_watching()); \
} \
} \
__DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args), \
PARAMS(cond)) \
static inline int \
register_trace_##name(void (*probe)(data_proto), void *data) \
{ \
return tracepoint_probe_register(&__tracepoint_##name, \
(void *)probe, data); \
} \
static inline int \
register_trace_prio_##name(void (*probe)(data_proto), void *data, \
int prio) \
{ \
return tracepoint_probe_register_prio(&__tracepoint_##name, \
(void *)probe, data, prio); \
} \
static inline int \
unregister_trace_##name(void (*probe)(data_proto), void *data) \
{ \
return tracepoint_probe_unregister(&__tracepoint_##name, \
(void *)probe, data); \
} \
static inline void \
check_trace_callback_type_##name(void (*cb)(data_proto)) \
{ \
} \
static inline bool \
trace_##name##_enabled(void) \
{ \
return static_key_false(&__tracepoint_##name.key); \
}
// __DO_TRACE() macro definition:
// Returns from the enclosing function when `cond` is false or when the
// RCUIDLE_COND(rcuidle) check trips (warning once). Otherwise it disables
// preemption, invokes __DO_TRACE_CALL(), and re-enables preemption. When
// `rcuidle` is non-zero the call is additionally bracketed by an SRCU read
// lock on tracepoint_srcu and ct_irq_enter_irqson()/ct_irq_exit_irqson().
#define __DO_TRACE(name, args, cond, rcuidle) \
do { \
int __maybe_unused __idx = 0; \
\
if (!(cond)) \
return; \
\
if (WARN_ON_ONCE(RCUIDLE_COND(rcuidle))) \
return; \
\
/* keep srcu and sched-rcu usage consistent */ \
preempt_disable_notrace(); \
\
/* \
* For rcuidle callers, use srcu since sched-rcu \
* doesn't work from the idle path. \
*/ \
if (rcuidle) { \
__idx = srcu_read_lock_notrace(&tracepoint_srcu); \
ct_irq_enter_irqson(); \
} \
\
__DO_TRACE_CALL(name, TP_ARGS(args)); \
\
if (rcuidle) { \
ct_irq_exit_irqson(); \
srcu_read_unlock_notrace(&tracepoint_srcu, __idx); \
} \
\
preempt_enable_notrace(); \
} while (0)
// __DO_TRACE_CALL() macro definition:
// Reads the tracepoint's `funcs` pointer with rcu_dereference_raw(). If it is
// non-NULL, takes the `data` member of the entry it points to and invokes the
// static call tp_func_<name> with that data followed by the event arguments.
#define __DO_TRACE_CALL(name, args) \
do { \
struct tracepoint_func *it_func_ptr; \
void *__data; \
it_func_ptr = \
rcu_dereference_raw((&__tracepoint_##name)->funcs); \
if (it_func_ptr) { \
__data = (it_func_ptr)->data; \
static_call(tp_func_##name)(__data, args); \
} \
} while (0)
// A second definition of DECLARE_EVENT_CLASS(): here it forward-declares
// perf_trace_<call>(), defines the print format string, and builds the
// trace_event_class instance event_class_<call>. That instance records
// trace_event_raw_event_<call> as .probe and, through _TRACE_PERF_INIT(),
// perf_trace_<call> as .perf_probe.
// NOTE(review): the header this definition comes from is not stated here —
// presumably include/trace/trace_events.h; confirm.
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
_TRACE_PERF_PROTO(call, PARAMS(proto)); \
static char print_fmt_##call[] = print; \
static struct trace_event_class __used __refdata event_class_##call = { \
.system = TRACE_SYSTEM_STRING, \
.fields_array = trace_event_fields_##call, \
.fields = LIST_HEAD_INIT(event_class_##call.fields), \
.raw_init = trace_event_raw_init, \
.probe = trace_event_raw_event_##call, \
.reg = trace_event_reg, \
_TRACE_PERF_INIT(call) \
};
// Forward declaration of the perf probe generated for this event class.
#define _TRACE_PERF_PROTO(call, proto) \
static notrace void \
perf_trace_##call(void *__data, proto);
// Designated initializer that stores perf_trace_<call> in .perf_probe.
#define _TRACE_PERF_INIT(call) \
.perf_probe = perf_trace_##call,
/* 爛尾了 */
至此,tracepoint
的定義基本明瞭。
// ${KERNEL}/net/netlink/af_netlink.c
// Out-of-line wrapper that fires the netlink_extack tracepoint with the given
// message by calling the generated trace_netlink_extack().
void do_trace_netlink_extack(const char *msg)
{
trace_netlink_extack(msg);
}
trace_netlink_extack()
|-->static_call(tp_func_netlink_extack)(__data, args); // DECLARE_TRACE() -> __DECLARE_TRACE() -> __DO_TRACE() -> __DO_TRACE_CALL()
*
* (未知調用鏈)
*
perf_trace_netlink_extack()
|-->perf_trace_run_bpf_submit() // ${KERNEL}/kernel/events/core.c
|-->trace_call_bpf() // ${KERNEL}/kernel/trace/bpf_trace.c
|-->bpf_prog_run_array(rcu_dereference(call->prog_array), ctx, bpf_prog_run); // ${KERNEL}/include/linux/bpf.h
bpf attach 到 tracepoint
上
從 Go 代碼出發,如何將 bpf attach 到 tracepoint
上呢?
Tracepoint() // ${cilium/ebpf}/link/tracepoint.go
|-->attachPerfEvent() // ${cilium/ebpf}/link/perf_event.go
|-->attachPerfEventLink()
|-->LinkCreatePerfEvent() // ${cilium/ebpf}/internal/types.go
|--> BPF(BPF_LINK_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
接着看內核對應的源代碼:
__sys_bpf() // ${KERNEL}/kernel/bpf/syscall.c
|-->link_create()
|-->bpf_perf_link_attach()
|-->perf_event_set_bpf_prog() // ${KERNEL}/kernel/events/core.c
|-->perf_event_attach_bpf_prog() { // ${KERNEL}/kernel/trace/bpf_trace.c
old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
bpf_prog_array_copy(old_array, NULL, prog, bpf_cookie, &new_array);
rcu_assign_pointer(event->tp_event->prog_array, new_array);
}
好吧,即使看到這裏,只是大概弄明白了 tracepoint
bpf 程序是怎麼跑起來的,但還是不知道 tracepoint
的工作原理。
小結
本文嘗試從源代碼的角度來分析 tracepoint
的工作原理;不過嘗試並未成功。
參考資料
[1]
Using the Linux Kernel Tracepoints: https://docs.kernel.org/trace/tracepoints.html
[2]
GitHub Asphaltt/learn-by-example/ebpf/tracepoint: https://github.com/Asphaltt/learn-by-example
本文由 Readfog 進行 AMP 轉碼,版權歸原作者所有。
來源:https://mp.weixin.qq.com/s/dp5byc_OuNVS3ma9CglNCw