从源码浅析:tracepoint、bpf和perf
前言
最近bpf一直都比较火,从公司部门层面对bpf的研究也有了一些苗条,前两年我算是公司较早接触bpf的工程师,但受项目影响学习总是一段一段的没有连续,直到现在,我对bpf也是一知半解,只知道这是一个虚拟机。
正好趁着项目收尾阶段,事情不太多,想静下心来好好研究研究技术,要想看懂一个新特性或新功能,最有效的还是得从源码看起,抛砖引玉步步深入就从这篇文章开始吧。
Trace子系统
Linux Trace子系统由3个部分组成,前端工具、Trace框架和Trace实现,从底向上看Trace实现是万物之源,在这定义了我们trace系统运行的方式,即是如何跟踪函数调用的,Trace框架定义了进一步从事件收集数据输送到用户空间的方式,前端工具是用户空间的程序与内核的交互,将trace收集的结果展现出来。
Tracepoint
include/trace/trace_events.h
#undef TRACE_EVENT
#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
DECLARE_EVENT_CLASS(name, \
PARAMS(proto), \
PARAMS(args), \
PARAMS(tstruct), \
PARAMS(assign), \
PARAMS(print)); \
DEFINE_EVENT(name, name, PARAMS(proto), PARAMS(args));
include/trace/perf.h
#undef DECLARE_EVENT_CLASS
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
static notrace void \
perf_trace_##call(void *__data, proto) \
{ \
struct trace_event_call *event_call = __data; \
struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
struct trace_event_raw_##call *entry; \
struct pt_regs *__regs; \
u64 __count = 1; \
struct task_struct *__task = NULL; \
struct hlist_head *head; \
int __entry_size; \
int __data_size; \
int rctx; \
\
__data_size = trace_event_get_offsets_##call(&__data_offsets, args); \
\
head = this_cpu_ptr(event_call->perf_events); \
if (!bpf_prog_array_valid(event_call) && \
__builtin_constant_p(!__task) && !__task && \
hlist_empty(head)) \
return; \
\
__entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
sizeof(u64)); \
__entry_size -= sizeof(u32); \
\
entry = perf_trace_buf_alloc(__entry_size, &__regs, &rctx); \
if (!entry) \
return; \
\
perf_fetch_caller_regs(__regs); \
\
tstruct \
\
{ assign; } \
\
perf_trace_run_bpf_submit(entry, __entry_size, rctx, \
event_call, __count, __regs, \
head, __task); \
}
kernel/events/core.c
void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
struct trace_event_call *call, u64 count,
struct pt_regs *regs, struct hlist_head *head,
struct task_struct *task)
{
if (bpf_prog_array_valid(call)) {
*(struct pt_regs **)raw_data = regs;
if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
}
}
perf_tp_event(call->event.type, count, raw_data, size, regs, head,
rctx, task);
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
kernel/trace/bpf_trace.c
/**
* trace_call_bpf - invoke BPF program
* @call: tracepoint event
* @ctx: opaque context pointer
*
* kprobe handlers execute BPF programs via this helper.
* Can be used from static tracepoints in the future.
*
* Return: BPF programs always return an integer which is interpreted by
* kprobe handler as:
* 0 - return from kprobe (event is filtered out)
* 1 - store kprobe event into ring buffer
* Other values are reserved and currently alias to 1
*/
unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
unsigned int ret;
cant_sleep();
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
/*
* since some bpf program is already running on this cpu,
* don't call into another bpf program (same or different)
* and don't send kprobe event into ring-buffer,
* so return zero here
*/
ret = 0;
goto out;
}
/*
* Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
* to all call sites, we did a bpf_prog_array_valid() there to check
* whether call->prog_array is empty or not, which is
* a heuristic to speed up execution.
*
* If bpf_prog_array_valid() fetched prog_array was
* non-NULL, we go into trace_call_bpf() and do the actual
* proper rcu_dereference() under RCU lock.
* If it turns out that prog_array is NULL then, we bail out.
* For the opposite, if the bpf_prog_array_valid() fetched pointer
* was NULL, you'll skip the prog_array with the risk of missing
* out of events when it was updated in between this and the
* rcu_dereference() which is accepted risk.
*/
ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN);
out:
__this_cpu_dec(bpf_prog_active);
return ret;
}
include/linux/trace_events.h
struct trace_event_call {
struct list_head list;
struct trace_event_class *class;
union {
char *name;
/* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */
struct tracepoint *tp;
};
struct trace_event event;
char *print_fmt;
struct event_filter *filter;
void *mod;
void *data;
/* See the TRACE_EVENT_FL_* flags above */
int flags; /* static flags of different events */
#ifdef CONFIG_PERF_EVENTS
int perf_refcount;
struct hlist_head __percpu *perf_events;
struct bpf_prog_array __rcu *prog_array;
int (*perf_perm)(struct trace_event_call *,
struct perf_event *);
#endif
};
perf
bpf
参考链接:
1、Exploring USDT Probes on Linux 2、Using the Linux Kernel Tracepoints