From 3eaea21b4d27cff0017c20549aeb53034c58fc23 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 18 Mar 2024 11:17:26 -0700 Subject: [PATCH 01/17] uprobes: encapsulate preparation of uprobe args buffer Move the logic of fetching temporary per-CPU uprobe buffer and storing uprobes args into it to a new helper function. Store data size as part of this buffer, simplifying interfaces a bit, as now we only pass single uprobe_cpu_buffer reference around, instead of pointer + dsize. This logic was duplicated across uprobe_dispatcher and uretprobe_dispatcher, and now will be centralized. All this is also in preparation to make this uprobe_cpu_buffer handling logic optional in the next patch. Link: https://lore.kernel.org/all/20240318181728.2795838-2-andrii@kernel.org/ [Masami: update for v6.9-rc3 kernel] Signed-off-by: Andrii Nakryiko Reviewed-by: Jiri Olsa Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_uprobe.c | 78 +++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9e461362450a..155cba8e6e7e 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -854,6 +854,7 @@ static const struct file_operations uprobe_profile_ops = { struct uprobe_cpu_buffer { struct mutex mutex; void *buf; + int dsize; }; static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer; static int uprobe_buffer_refcnt; @@ -943,9 +944,26 @@ static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) mutex_unlock(&ucb->mutex); } +static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu, + struct pt_regs *regs) +{ + struct uprobe_cpu_buffer *ucb; + int dsize, esize; + + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + dsize = __get_data_size(&tu->tp, regs, NULL); + + ucb = uprobe_buffer_get(); + ucb->dsize = tu->tp.size + dsize; + + store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); + + return ucb; +} + static void __uprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize, + struct uprobe_cpu_buffer *ucb, struct trace_event_file *trace_file) { struct uprobe_trace_entry_head *entry; @@ -956,14 +974,14 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, WARN_ON(call != trace_file->event_call); - if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE)) + if (WARN_ON_ONCE(ucb->dsize > PAGE_SIZE)) return; if (trace_trigger_soft_disabled(trace_file)) return; esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - size = esize + tu->tp.size + dsize; + size = esize + ucb->dsize; entry = trace_event_buffer_reserve(&fbuffer, trace_file, size); if (!entry) return; @@ -977,14 +995,14 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, data = DATAOF_TRACE_ENTRY(entry, false); } - memcpy(data, ucb->buf, tu->tp.size + dsize); + memcpy(data, ucb->buf, ucb->dsize); trace_event_buffer_commit(&fbuffer); } /* uprobe handler */ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer *ucb) { struct event_file_link *link; @@ -993,7 +1011,7 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, rcu_read_lock(); trace_probe_for_each_link_rcu(link, &tu->tp) - __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file); + __uprobe_trace_func(tu, 0, regs, ucb, link->file); rcu_read_unlock(); return 0; @@ -1001,13 +1019,13 @@ static int 
uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer *ucb) { struct event_file_link *link; rcu_read_lock(); trace_probe_for_each_link_rcu(link, &tu->tp) - __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file); + __uprobe_trace_func(tu, func, regs, ucb, link->file); rcu_read_unlock(); } @@ -1335,7 +1353,7 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, static void __uprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer *ucb) { struct trace_event_call *call = trace_probe_event_call(&tu->tp); struct uprobe_trace_entry_head *entry; @@ -1356,7 +1374,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - size = esize + tu->tp.size + dsize; + size = esize + ucb->dsize; size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) return; @@ -1379,13 +1397,10 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, data = DATAOF_TRACE_ENTRY(entry, false); } - memcpy(data, ucb->buf, tu->tp.size + dsize); + memcpy(data, ucb->buf, ucb->dsize); - if (size - esize > tu->tp.size + dsize) { - int len = tu->tp.size + dsize; - - memset(data + len, 0, size - esize - len); - } + if (size - esize > ucb->dsize) + memset(data + ucb->dsize, 0, size - esize - ucb->dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); @@ -1395,21 +1410,21 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, /* uprobe profile handler */ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer *ucb) { if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) return UPROBE_HANDLER_REMOVE; if (!is_ret_probe(tu)) - __uprobe_perf_func(tu, 0, regs, ucb, dsize); + __uprobe_perf_func(tu, 0, regs, ucb); return 0; } static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer *ucb) { - __uprobe_perf_func(tu, func, regs, ucb, dsize); + __uprobe_perf_func(tu, func, regs, ucb); } int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, @@ -1475,10 +1490,8 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) struct trace_uprobe *tu; struct uprobe_dispatch_data udd; struct uprobe_cpu_buffer *ucb; - int dsize, esize; int ret = 0; - tu = container_of(con, struct trace_uprobe, consumer); tu->nhit++; @@ -1490,18 +1503,14 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - dsize = __get_data_size(&tu->tp, regs, NULL); - esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - - ucb = uprobe_buffer_get(); - store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); + ucb = prepare_uprobe_buffer(tu, regs); if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) - ret |= uprobe_trace_func(tu, regs, ucb, dsize); + ret |= uprobe_trace_func(tu, regs, ucb); #ifdef CONFIG_PERF_EVENTS if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) - ret |= uprobe_perf_func(tu, regs, ucb, dsize); + ret |= uprobe_perf_func(tu, regs, ucb); #endif uprobe_buffer_put(ucb); return ret; @@ -1513,7 +1522,6 
@@ static int uretprobe_dispatcher(struct uprobe_consumer *con, struct trace_uprobe *tu; struct uprobe_dispatch_data udd; struct uprobe_cpu_buffer *ucb; - int dsize, esize; tu = container_of(con, struct trace_uprobe, consumer); @@ -1525,18 +1533,14 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - dsize = __get_data_size(&tu->tp, regs, NULL); - esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - - ucb = uprobe_buffer_get(); - store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); + ucb = prepare_uprobe_buffer(tu, regs); if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) - uretprobe_trace_func(tu, func, regs, ucb, dsize); + uretprobe_trace_func(tu, func, regs, ucb); #ifdef CONFIG_PERF_EVENTS if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) - uretprobe_perf_func(tu, func, regs, ucb, dsize); + uretprobe_perf_func(tu, func, regs, ucb); #endif uprobe_buffer_put(ucb); return 0; From 1b8f85defbc82e2eb8f27c5f6060ea507ad4d5a3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 18 Mar 2024 11:17:27 -0700 Subject: [PATCH 02/17] uprobes: prepare uprobe args buffer lazily MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit uprobe_cpu_buffer and corresponding logic to store uprobe args into it are used for uprobes/uretprobes that are created through tracefs or perf events. BPF is yet another user of uprobe/uretprobe infrastructure, but doesn't need uprobe_cpu_buffer and associated data. For BPF-only use cases this buffer handling and preparation is a pure overhead. At the same time, BPF-only uprobe/uretprobe usage is very common in practice. Also, in a lot of cases applications are very sensitive to performance overheads, as they might be tracing very high-frequency functions like malloc()/free(), so every bit of performance improvement matters. All that is to say that this uprobe_cpu_buffer preparation is an unnecessary overhead that each BPF user of uprobes/uretprobes has to pay. This patch changes this by making uprobe_cpu_buffer preparation optional. It will happen only if either a tracefs-based or perf event-based uprobe/uretprobe consumer is registered for the given uprobe/uretprobe. For BPF-only use cases this step will be skipped. We used the uprobe/uretprobe benchmark which is part of the BPF selftests (see [0]) to estimate the improvements. We have 3 uprobe and 3 uretprobe scenarios, which vary the instruction that is replaced by the uprobe: nop (fastest uprobe case), `push rbp` (typical case), and a non-simulated `ret` instruction (slowest case). The benchmark thread constantly calls a user space function in a tight loop. The user space function has an attached BPF uprobe or uretprobe program doing nothing but atomic counter increments to count the number of triggering calls. The benchmark emits throughput in millions of executions per second. BEFORE these changes ==================== uprobe-nop : 2.657 ± 0.024M/s uprobe-push : 2.499 ± 0.018M/s uprobe-ret : 1.100 ± 0.006M/s uretprobe-nop : 1.356 ± 0.004M/s uretprobe-push : 1.317 ± 0.019M/s uretprobe-ret : 0.785 ± 0.007M/s AFTER these changes =================== uprobe-nop : 2.732 ± 0.022M/s (+2.8%) uprobe-push : 2.621 ± 0.016M/s (+4.9%) uprobe-ret : 1.105 ± 0.007M/s (+0.5%) uretprobe-nop : 1.396 ± 0.007M/s (+2.9%) uretprobe-push : 1.347 ± 0.008M/s (+2.3%) uretprobe-ret : 0.800 ± 0.006M/s (+1.9%) So the improvements on this particular machine seem to be between 2% and 5%.
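For reference, a run of this kind can be driven with a small shell loop over the individual scenarios. This is a sketch only: it assumes the selftests bench binary is built under tools/testing/selftests/bpf and that the trig-uprobe-*/trig-uretprobe-* benchmark names match bench_trigger.c in [0] below (only trig-uprobe-nop is used verbatim elsewhere in this series).
  # sketch; benchmark names are assumed to follow bench_trigger.c in [0]
  for b in uprobe-nop uprobe-push uprobe-ret uretprobe-nop uretprobe-push uretprobe-ret; do sudo ./bench -a trig-$b | grep Summary; done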
[0] https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/benchs/bench_trigger.c Reviewed-by: Jiri Olsa Link: https://lore.kernel.org/all/20240318181728.2795838-3-andrii@kernel.org/ Signed-off-by: Andrii Nakryiko Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_uprobe.c | 49 +++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 155cba8e6e7e..cbab487e8522 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -941,15 +941,21 @@ static struct uprobe_cpu_buffer *uprobe_buffer_get(void) static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) { + if (!ucb) + return; mutex_unlock(&ucb->mutex); } static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu, - struct pt_regs *regs) + struct pt_regs *regs, + struct uprobe_cpu_buffer **ucbp) { struct uprobe_cpu_buffer *ucb; int dsize, esize; + if (*ucbp) + return *ucbp; + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); dsize = __get_data_size(&tu->tp, regs, NULL); @@ -958,22 +964,25 @@ static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu, store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); + *ucbp = ucb; return ucb; } static void __uprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, + struct uprobe_cpu_buffer **ucbp, struct trace_event_file *trace_file) { struct uprobe_trace_entry_head *entry; struct trace_event_buffer fbuffer; + struct uprobe_cpu_buffer *ucb; void *data; int size, esize; struct trace_event_call *call = trace_probe_event_call(&tu->tp); WARN_ON(call != trace_file->event_call); + ucb = prepare_uprobe_buffer(tu, regs, ucbp); if (WARN_ON_ONCE(ucb->dsize > PAGE_SIZE)) return; @@ -1002,7 +1011,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, /* uprobe handler */ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb) + struct uprobe_cpu_buffer **ucbp) { struct event_file_link *link; @@ -1011,7 +1020,7 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, rcu_read_lock(); trace_probe_for_each_link_rcu(link, &tu->tp) - __uprobe_trace_func(tu, 0, regs, ucb, link->file); + __uprobe_trace_func(tu, 0, regs, ucbp, link->file); rcu_read_unlock(); return 0; @@ -1019,13 +1028,13 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb) + struct uprobe_cpu_buffer **ucbp) { struct event_file_link *link; rcu_read_lock(); trace_probe_for_each_link_rcu(link, &tu->tp) - __uprobe_trace_func(tu, func, regs, ucb, link->file); + __uprobe_trace_func(tu, func, regs, ucbp, link->file); rcu_read_unlock(); } @@ -1353,10 +1362,11 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, static void __uprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb) + struct uprobe_cpu_buffer **ucbp) { struct trace_event_call *call = trace_probe_event_call(&tu->tp); struct uprobe_trace_entry_head *entry; + struct uprobe_cpu_buffer *ucb; struct hlist_head *head; void *data; int size, esize; @@ -1374,6 +1384,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + ucb = prepare_uprobe_buffer(tu, 
regs, ucbp); size = esize + ucb->dsize; size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) @@ -1410,21 +1421,21 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, /* uprobe profile handler */ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb) + struct uprobe_cpu_buffer **ucbp) { if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) return UPROBE_HANDLER_REMOVE; if (!is_ret_probe(tu)) - __uprobe_perf_func(tu, 0, regs, ucb); + __uprobe_perf_func(tu, 0, regs, ucbp); return 0; } static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb) + struct uprobe_cpu_buffer **ucbp) { - __uprobe_perf_func(tu, func, regs, ucb); + __uprobe_perf_func(tu, func, regs, ucbp); } int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, @@ -1489,7 +1500,7 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; - struct uprobe_cpu_buffer *ucb; + struct uprobe_cpu_buffer *ucb = NULL; int ret = 0; tu = container_of(con, struct trace_uprobe, consumer); @@ -1503,14 +1514,12 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - ucb = prepare_uprobe_buffer(tu, regs); - if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) - ret |= uprobe_trace_func(tu, regs, ucb); + ret |= uprobe_trace_func(tu, regs, &ucb); #ifdef CONFIG_PERF_EVENTS if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) - ret |= uprobe_perf_func(tu, regs, ucb); + ret |= uprobe_perf_func(tu, regs, &ucb); #endif uprobe_buffer_put(ucb); return ret; @@ -1521,7 +1530,7 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; - struct uprobe_cpu_buffer *ucb; + struct uprobe_cpu_buffer *ucb = NULL; tu = container_of(con, struct trace_uprobe, consumer); @@ -1533,14 +1542,12 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - ucb = prepare_uprobe_buffer(tu, regs); - if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) - uretprobe_trace_func(tu, func, regs, ucb); + uretprobe_trace_func(tu, func, regs, &ucb); #ifdef CONFIG_PERF_EVENTS if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) - uretprobe_perf_func(tu, func, regs, ucb); + uretprobe_perf_func(tu, func, regs, &ucb); #endif uprobe_buffer_put(ucb); return 0; From cdf355cc60e388d992bdd205b8ee70dc4d533461 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 18 Mar 2024 11:17:28 -0700 Subject: [PATCH 03/17] uprobes: add speculative lockless system-wide uprobe filter check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's very common with BPF-based uprobe/uretprobe use cases to have a system-wide (not PID specific) probes used. In this case uprobe's trace_uprobe_filter->nr_systemwide counter is bumped at registration time, and actual filtering is short circuited at the time when uprobe/uretprobe is triggered. This is a great optimization, and the only issue with it is that to even get to checking this counter uprobe subsystem is taking read-side trace_uprobe_filter->rwlock. This is actually noticeable in profiles and is just another point of contention when uprobe is triggered on multiple CPUs simultaneously. 
This patch moves this nr_systemwide check outside of the filter list's rwlock scope, as the rwlock is meant to protect list modification, while the nr_systemwide-based check is already speculative and racy, despite the lock (as discussed in [0]). trace_uprobe_filter_remove() and trace_uprobe_filter_add() already check for filter->nr_systemwide explicitly outside of __uprobe_perf_filter, so no modifications are required there. This is confirmed with BPF selftests-based benchmarks. BEFORE (based on changes in previous patch) =========================================== uprobe-nop : 2.732 ± 0.022M/s uprobe-push : 2.621 ± 0.016M/s uprobe-ret : 1.105 ± 0.007M/s uretprobe-nop : 1.396 ± 0.007M/s uretprobe-push : 1.347 ± 0.008M/s uretprobe-ret : 0.800 ± 0.006M/s AFTER ===== uprobe-nop : 2.878 ± 0.017M/s (+5.5%, total +8.3%) uprobe-push : 2.753 ± 0.013M/s (+5.3%, total +10.2%) uprobe-ret : 1.142 ± 0.010M/s (+3.8%, total +3.8%) uretprobe-nop : 1.444 ± 0.008M/s (+3.5%, total +6.5%) uretprobe-push : 1.410 ± 0.010M/s (+4.8%, total +7.1%) uretprobe-ret : 0.816 ± 0.002M/s (+2.0%, total +3.9%) In the above, the first percentage value is relative to the previous patch (lazy uprobe buffer optimization), while the "total" percentage is relative to the kernel without any of the changes in this patch set. As can be seen, we get about 4% - 10% speed up, in total, with both lazy uprobe buffer and speculative filter check optimizations. [0] https://lore.kernel.org/bpf/20240313131926.GA19986@redhat.com/ Reviewed-by: Jiri Olsa Link: https://lore.kernel.org/all/20240318181728.2795838-4-andrii@kernel.org/ Signed-off-by: Andrii Nakryiko Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_uprobe.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index cbab487e8522..8541fa1494ae 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1226,9 +1226,6 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) { struct perf_event *event; - if (filter->nr_systemwide) - return true; - list_for_each_entry(event, &filter->perf_events, hw.tp_list) { if (event->hw.target->mm == mm) return true; @@ -1353,6 +1350,13 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, tu = container_of(uc, struct trace_uprobe, consumer); filter = tu->tp.event->filter; + /* + * speculative short-circuiting check to avoid unnecessarily taking + * filter->rwlock below, if the uprobe has system-wide consumer + */ + if (READ_ONCE(filter->nr_systemwide)) + return true; + read_lock(&filter->rwlock); ret = __uprobe_perf_filter(filter, mm); read_unlock(&filter->rwlock); From d9b15224dd8ff83b2aef87e4cd5ad10c875ef7d6 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Fri, 22 Mar 2024 14:43:04 +0800 Subject: [PATCH 04/17] tracing/probes: support '%pd' type for print struct dentry's name During fault locating, the file name needs to be printed based on the dentry address, and the offset has to be calculated by hand each time, which is troublesome. Similar to printk, kprobe events now support the print type '%pd' for printing a dentry's name. For example, "name=$arg1:%pd" casts `$arg1` as (struct dentry *), dereferences the "d_name.name" field and stores it to the "name" argument as a kernel string.
Here is an example: [tracing]# echo 'p:testprobe dput name=$arg1:%pd' > kprobe_events [tracing]# echo 1 > events/kprobes/testprobe/enable [tracing]# grep -q "1" events/kprobes/testprobe/enable [tracing]# echo 0 > events/kprobes/testprobe/enable [tracing]# cat trace | grep "enable" bash-14844 [002] ..... 16912.889543: testprobe: (dput+0x4/0x30) name="enable" grep-15389 [003] ..... 16922.834182: testprobe: (dput+0x4/0x30) name="enable" grep-15389 [003] ..... 16922.836103: testprobe: (dput+0x4/0x30) name="enable" bash-14844 [001] ..... 16931.820909: testprobe: (dput+0x4/0x30) name="enable" Note that this expects the given argument (e.g. $arg1) is an address of struct dentry. User must ensure it. Link: https://lore.kernel.org/all/20240322064308.284457-2-yebin10@huawei.com/ Signed-off-by: Ye Bin Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace.c | 2 +- kernel/trace/trace_fprobe.c | 6 +++++ kernel/trace/trace_kprobe.c | 6 +++++ kernel/trace/trace_probe.c | 50 +++++++++++++++++++++++++++++++++++++ kernel/trace/trace_probe.h | 2 ++ 5 files changed, 65 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 233d1af39fff..869978d84672 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5540,7 +5540,7 @@ static const char readme_msg[] = "\t kernel return probes support: $retval, $arg, $comm\n" "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n" "\t b@/, ustring,\n" - "\t symstr, \\[\\]\n" + "\t symstr, %pd, \\[\\]\n" #ifdef CONFIG_HIST_TRIGGERS "\t field: ;\n" "\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n" diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 4f4280815522..62e6a8f4aae9 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -994,6 +994,7 @@ static int __trace_fprobe_create(int argc, const char *argv[]) char gbuf[MAX_EVENT_NAME_LEN]; char sbuf[KSYM_NAME_LEN]; char abuf[MAX_BTF_ARGS_LEN]; + char *dbuf = NULL; bool is_tracepoint = false; struct tracepoint *tpoint = NULL; struct traceprobe_parse_context ctx = { @@ -1104,6 +1105,10 @@ static int __trace_fprobe_create(int argc, const char *argv[]) argv = new_argv; } + ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); + if (ret) + goto out; + /* setup a probe */ tf = alloc_trace_fprobe(group, event, symbol, tpoint, maxactive, argc, is_return); @@ -1154,6 +1159,7 @@ out: trace_probe_log_clear(); kfree(new_argv); kfree(symbol); + kfree(dbuf); return ret; parse_error: diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 14099cc17fc9..c68d4e830fbe 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -782,6 +782,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) char buf[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; char abuf[MAX_BTF_ARGS_LEN]; + char *dbuf = NULL; struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL }; switch (argv[0][0]) { @@ -933,6 +934,10 @@ static int __trace_kprobe_create(int argc, const char *argv[]) argv = new_argv; } + ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); + if (ret) + goto out; + /* setup a probe */ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, argc, is_return); @@ -979,6 +984,7 @@ out: trace_probe_log_clear(); kfree(new_argv); kfree(symbol); + kfree(dbuf); return ret; parse_error: diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index dfe3ee6035ec..f7ee97e45d1f 100644 --- 
a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1739,6 +1739,56 @@ error: return ERR_PTR(ret); } +/* @buf: *buf must be equal to NULL. Caller must to free *buf */ +int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf) +{ + int i, used, ret; + const int bufsize = MAX_DENTRY_ARGS_LEN; + char *tmpbuf = NULL; + + if (*buf) + return -EINVAL; + + used = 0; + for (i = 0; i < argc; i++) { + if (glob_match("*:%pd", argv[i])) { + char *tmp; + char *equal; + + if (!tmpbuf) { + tmpbuf = kmalloc(bufsize, GFP_KERNEL); + if (!tmpbuf) + return -ENOMEM; + } + + tmp = kstrdup(argv[i], GFP_KERNEL); + if (!tmp) + goto nomem; + + equal = strchr(tmp, '='); + if (equal) + *equal = '\0'; + tmp[strlen(argv[i]) - 4] = '\0'; + ret = snprintf(tmpbuf + used, bufsize - used, + "%s%s+0x0(+0x%zx(%s)):string", + equal ? tmp : "", equal ? "=" : "", + offsetof(struct dentry, d_name.name), + equal ? equal + 1 : tmp); + kfree(tmp); + if (ret >= bufsize - used) + goto nomem; + argv[i] = tmpbuf + used; + used += ret + 1; + } + } + + *buf = tmpbuf; + return 0; +nomem: + kfree(tmpbuf); + return -ENOMEM; +} + void traceprobe_finish_parse(struct traceprobe_parse_context *ctx) { clear_btf_context(ctx); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index cef3a50628a3..5803e6a41570 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -34,6 +34,7 @@ #define MAX_ARRAY_LEN 64 #define MAX_ARG_NAME_LEN 32 #define MAX_BTF_ARGS_LEN 128 +#define MAX_DENTRY_ARGS_LEN 256 #define MAX_STRING_SIZE PATH_MAX #define MAX_ARG_BUF_LEN (MAX_TRACE_ARGS * MAX_ARG_NAME_LEN) @@ -428,6 +429,7 @@ extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char **traceprobe_expand_meta_args(int argc, const char *argv[], int *new_argc, char *buf, int bufsize, struct traceprobe_parse_context *ctx); +extern int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf); extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); From 20fe4d07bde67ec26716835b61be2c4eeca1c6bc Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Fri, 22 Mar 2024 14:43:05 +0800 Subject: [PATCH 05/17] tracing/probes: support '%pD' type for print struct file's name As like '%pd' type, this patch supports print type '%pD' for print file's name. For example "name=$arg1:%pD" casts the `$arg1` as (struct file*), dereferences the "file.f_path.dentry.d_name.name" field and stores it to "name" argument as a kernel string. Here is an example: [tracing]# echo 'p:testprobe vfs_read name=$arg1:%pD' > kprobe_event [tracing]# echo 1 > events/kprobes/testprobe/enable [tracing]# grep -q "1" events/kprobes/testprobe/enable [tracing]# echo 0 > events/kprobes/testprobe/enable [tracing]# grep "vfs_read" trace | grep "enable" grep-15108 [003] ..... 5228.328609: testprobe: (vfs_read+0x4/0xbb0) name="enable" Note that this expects the given argument (e.g. $arg1) is an address of struct file. User must ensure it. 
Link: https://lore.kernel.org/all/20240322064308.284457-3-yebin10@huawei.com/ [Masami: replaced "previous patch" with '%pd' type] Signed-off-by: Ye Bin Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace.c | 2 +- kernel/trace/trace_probe.c | 55 +++++++++++++++++++++++--------------- 2 files changed, 35 insertions(+), 22 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 869978d84672..7fe4b539a423 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5540,7 +5540,7 @@ static const char readme_msg[] = "\t kernel return probes support: $retval, $arg, $comm\n" "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n" "\t b@/, ustring,\n" - "\t symstr, %pd, \\[\\]\n" + "\t symstr, %pd/%pD, \\[\\]\n" #ifdef CONFIG_HIST_TRIGGERS "\t field: ;\n" "\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n" diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index f7ee97e45d1f..d4a887dac821 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) "trace_probe: " fmt #include +#include #include "trace_btf.h" #include "trace_probe.h" @@ -1751,35 +1752,47 @@ int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf) used = 0; for (i = 0; i < argc; i++) { - if (glob_match("*:%pd", argv[i])) { - char *tmp; - char *equal; + char *tmp; + char *equal; + size_t arg_len; - if (!tmpbuf) { - tmpbuf = kmalloc(bufsize, GFP_KERNEL); - if (!tmpbuf) - return -ENOMEM; - } + if (!glob_match("*:%p[dD]", argv[i])) + continue; - tmp = kstrdup(argv[i], GFP_KERNEL); - if (!tmp) - goto nomem; + if (!tmpbuf) { + tmpbuf = kmalloc(bufsize, GFP_KERNEL); + if (!tmpbuf) + return -ENOMEM; + } - equal = strchr(tmp, '='); - if (equal) - *equal = '\0'; - tmp[strlen(argv[i]) - 4] = '\0'; + tmp = kstrdup(argv[i], GFP_KERNEL); + if (!tmp) + goto nomem; + + equal = strchr(tmp, '='); + if (equal) + *equal = '\0'; + arg_len = strlen(argv[i]); + tmp[arg_len - 4] = '\0'; + if (argv[i][arg_len - 1] == 'd') ret = snprintf(tmpbuf + used, bufsize - used, "%s%s+0x0(+0x%zx(%s)):string", equal ? tmp : "", equal ? "=" : "", offsetof(struct dentry, d_name.name), equal ? equal + 1 : tmp); - kfree(tmp); - if (ret >= bufsize - used) - goto nomem; - argv[i] = tmpbuf + used; - used += ret + 1; - } + else + ret = snprintf(tmpbuf + used, bufsize - used, + "%s%s+0x0(+0x%zx(+0x%zx(%s))):string", + equal ? tmp : "", equal ? "=" : "", + offsetof(struct dentry, d_name.name), + offsetof(struct file, f_path.dentry), + equal ? equal + 1 : tmp); + + kfree(tmp); + if (ret >= bufsize - used) + goto nomem; + argv[i] = tmpbuf + used; + used += ret + 1; } *buf = tmpbuf; From 5e37460f5f9266bcd037137f1bd9eb24b9940faf Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Fri, 22 Mar 2024 14:43:06 +0800 Subject: [PATCH 06/17] Documentation: tracing: add new type '%pd' and '%pD' for kprobe Similar to printk() '%pd' is for fetch dentry's name from struct dentry's pointer, and '%pD' is for fetch file's name from struct file's pointer. 
Link: https://lore.kernel.org/all/20240322064308.284457-4-yebin10@huawei.com/ Signed-off-by: Ye Bin Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- Documentation/trace/kprobetrace.rst | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst index a49662ccd53c..195c942d9bec 100644 --- a/Documentation/trace/kprobetrace.rst +++ b/Documentation/trace/kprobetrace.rst @@ -58,8 +58,9 @@ Synopsis of kprobe_events NAME=FETCHARG : Set NAME as the argument name of FETCHARG. FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types - (x8/x16/x32/x64), "char", "string", "ustring", "symbol", "symstr" - and bitfield are supported. + (x8/x16/x32/x64), VFS layer common type(%pd/%pD), "char", + "string", "ustring", "symbol", "symstr" and bitfield are + supported. (\*1) only for the probe on function entry (offs == 0). Note, this argument access is best effort, because depending on the argument type, it may be passed on @@ -122,6 +123,9 @@ With 'symstr' type, you can filter the event with wildcard pattern of the symbols, and you don't need to solve symbol name by yourself. For $comm, the default type is "string"; any other type is invalid. +VFS layer common type(%pd/%pD) is a special type, which fetches dentry's or +file's name from struct dentry's address or struct file's address. + .. _user_mem_access: User Memory Access From c01768b05e306d8c8d5997356ff427f5d53c7d20 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Fri, 22 Mar 2024 14:43:07 +0800 Subject: [PATCH 07/17] selftests/ftrace: add kprobe test cases for VFS type "%pd" and "%pD" This patch adds test cases for the new print format type "%pd/%pD". The test cases test the following items: 1. Test that the README lists the "%pd/%pD" type; 2. Test "%pd" type for dput(); 3. Test "%pD" type for vfs_read(); These test cases require CONFIG_HAVE_FUNCTION_ARG_ACCESS_API to be enabled.
Link: https://lore.kernel.org/all/20240322064308.284457-5-yebin10@huawei.com/ Signed-off-by: Ye Bin Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- .../ftrace/test.d/kprobe/kprobe_args_vfs.tc | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc new file mode 100644 index 000000000000..21a54be6894c --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc @@ -0,0 +1,40 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Kprobe event VFS type argument +# requires: kprobe_events "%pd/%pD":README + +: "Test argument %pd with name" +echo 'p:testprobe dput name=$arg1:%pd' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace + +: "Test argument %pd without name" +echo 'p:testprobe dput $arg1:%pd' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace + +: "Test argument %pD with name" +echo 'p:testprobe vfs_read name=$arg1:%pD' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace + +: "Test argument %pD without name" +echo 'p:testprobe vfs_read $arg1:%pD' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace From ee97e5e135c6fc937c4c81e0341c6513e2e6d44c Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Fri, 22 Mar 2024 14:43:08 +0800 Subject: [PATCH 08/17] selftests/ftrace: add fprobe test cases for VFS type "%pd" and "%pD" This patch adds fprobe test cases for the new print format type "%pd/%pD". The test cases test the following items: 1. Test "%pd" type for dput(); 2. Test "%pD" type for vfs_read(); These test cases require CONFIG_HAVE_FUNCTION_ARG_ACCESS_API to be enabled.
Link: https://lore.kernel.org/all/20240322064308.284457-6-yebin10@huawei.com/ Signed-off-by: Ye Bin Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- .../ftrace/test.d/dynevent/fprobe_args_vfs.tc | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc new file mode 100644 index 000000000000..49a833bf334c --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc @@ -0,0 +1,40 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Fprobe event VFS type argument +# requires: kprobe_events "%pd/%pD":README + +: "Test argument %pd with name for fprobe" +echo 'f:testprobe dput name=$arg1:%pd' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace + +: "Test argument %pd without name for fprobe" +echo 'f:testprobe dput $arg1:%pd' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace + +: "Test argument %pD with name for fprobe" +echo 'f:testprobe vfs_read name=$arg1:%pD' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace + +: "Test argument %pD without name for fprobe" +echo 'f:testprobe vfs_read $arg1:%pD' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace From 73142cab3af1b99157837297f437b306d7a70bff Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 7 Feb 2024 16:35:47 +0100 Subject: [PATCH 09/17] fprobe: Add entry/exit callbacks types We are going to store callbacks in following change, so this will ease up the code. Link: https://lore.kernel.org/all/20240207153550.856536-2-jolsa@kernel.org/ Signed-off-by: Jiri Olsa Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- include/linux/fprobe.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index 3e03758151f4..f39869588117 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -7,6 +7,16 @@ #include #include +struct fprobe; + +typedef int (*fprobe_entry_cb)(struct fprobe *fp, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs, + void *entry_data); + +typedef void (*fprobe_exit_cb)(struct fprobe *fp, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs, + void *entry_data); + /** * struct fprobe - ftrace based probe. * @ops: The ftrace_ops. 
@@ -34,12 +44,8 @@ struct fprobe { size_t entry_data_size; int nr_maxactive; - int (*entry_handler)(struct fprobe *fp, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, - void *entry_data); - void (*exit_handler)(struct fprobe *fp, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, - void *entry_data); + fprobe_entry_cb entry_handler; + fprobe_exit_cb exit_handler; }; /* This fprobe is soft-disabled. */ From 5120d167e21c674afd0630c65e7f6a00fa0667f1 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Mon, 8 Apr 2024 10:51:40 -0700 Subject: [PATCH 10/17] rethook: Remove warning messages printed for finding return address of a frame. The function rethook_find_ret_addr() prints a warning message and returns 0 when the target task is running and is not the "current" task in order to prevent the incorrect return address, although it still may return an incorrect address. However, the warning message turns into noise when BPF profiling programs call bpf_get_task_stack() on running tasks in a firm with a large number of hosts. The callers should be aware and willing to take the risk of receiving an incorrect return address from a task that is currently running other than the "current" one. A warning is not needed here as the callers are intent on it. Link: https://lore.kernel.org/all/20240408175140.60223-1-thinker.li@gmail.com/ Acked-by: Andrii Nakryiko Acked-by: John Fastabend Signed-off-by: Kui-Feng Lee Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/rethook.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index fa03094e9e69..4297a132a7ae 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -248,7 +248,7 @@ unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame if (WARN_ON_ONCE(!cur)) return 0; - if (WARN_ON_ONCE(tsk != current && task_is_running(tsk))) + if (tsk != current && task_is_running(tsk)) return 0; do { From 0dc715295d4143d1659879f7f50ad4e9a6f6a99c Mon Sep 17 00:00:00 2001 From: Jonathan Haslam Date: Mon, 22 Apr 2024 03:23:05 -0700 Subject: [PATCH 11/17] uprobes: reduce contention on uprobes_tree access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Active uprobes are stored in an RB tree and accesses to this tree are dominated by read operations. Currently these accesses are serialized by a spinlock but this leads to enormous contention when large numbers of threads are executing active probes. This patch converts the spinlock used to serialize access to the uprobes_tree RB tree into a reader-writer spinlock. This lock type aligns naturally with the overwhelmingly read-only nature of the tree usage here. Although the addition of reader-writer spinlocks are discouraged [0], this fix is proposed as an interim solution while an RCU based approach is implemented (that work is in a nascent form). This fix also has the benefit of being trivial, self contained and therefore simple to backport. We have used a uprobe benchmark from the BPF selftests [1] to estimate the improvements. Each block of results below show 1 line per execution of the benchmark ("the "Summary" line) and each line is a run with one more thread added - a thread is a "producer". The lines are edited to remove extraneous output. 
The tests were executed with this driver script: for num_threads in {1..20} do sudo ./bench -a -p $num_threads trig-uprobe-nop | grep Summary done SPINLOCK (BEFORE) ================== Summary: hits 1.396 ± 0.007M/s ( 1.396M/prod) Summary: hits 1.656 ± 0.016M/s ( 0.828M/prod) Summary: hits 2.246 ± 0.008M/s ( 0.749M/prod) Summary: hits 2.114 ± 0.010M/s ( 0.529M/prod) Summary: hits 2.013 ± 0.009M/s ( 0.403M/prod) Summary: hits 1.753 ± 0.008M/s ( 0.292M/prod) Summary: hits 1.847 ± 0.001M/s ( 0.264M/prod) Summary: hits 1.889 ± 0.001M/s ( 0.236M/prod) Summary: hits 1.833 ± 0.006M/s ( 0.204M/prod) Summary: hits 1.900 ± 0.003M/s ( 0.190M/prod) Summary: hits 1.918 ± 0.006M/s ( 0.174M/prod) Summary: hits 1.925 ± 0.002M/s ( 0.160M/prod) Summary: hits 1.837 ± 0.001M/s ( 0.141M/prod) Summary: hits 1.898 ± 0.001M/s ( 0.136M/prod) Summary: hits 1.799 ± 0.016M/s ( 0.120M/prod) Summary: hits 1.850 ± 0.005M/s ( 0.109M/prod) Summary: hits 1.816 ± 0.002M/s ( 0.101M/prod) Summary: hits 1.787 ± 0.001M/s ( 0.094M/prod) Summary: hits 1.764 ± 0.002M/s ( 0.088M/prod) RW SPINLOCK (AFTER) =================== Summary: hits 1.444 ± 0.020M/s ( 1.444M/prod) Summary: hits 2.279 ± 0.011M/s ( 1.139M/prod) Summary: hits 3.422 ± 0.014M/s ( 1.141M/prod) Summary: hits 3.565 ± 0.017M/s ( 0.891M/prod) Summary: hits 2.671 ± 0.013M/s ( 0.534M/prod) Summary: hits 2.409 ± 0.005M/s ( 0.401M/prod) Summary: hits 2.485 ± 0.008M/s ( 0.355M/prod) Summary: hits 2.496 ± 0.003M/s ( 0.312M/prod) Summary: hits 2.585 ± 0.002M/s ( 0.287M/prod) Summary: hits 2.908 ± 0.011M/s ( 0.291M/prod) Summary: hits 2.346 ± 0.016M/s ( 0.213M/prod) Summary: hits 2.804 ± 0.004M/s ( 0.234M/prod) Summary: hits 2.556 ± 0.001M/s ( 0.197M/prod) Summary: hits 2.754 ± 0.004M/s ( 0.197M/prod) Summary: hits 2.482 ± 0.002M/s ( 0.165M/prod) Summary: hits 2.412 ± 0.005M/s ( 0.151M/prod) Summary: hits 2.710 ± 0.003M/s ( 0.159M/prod) Summary: hits 2.826 ± 0.005M/s ( 0.157M/prod) Summary: hits 2.718 ± 0.001M/s ( 0.143M/prod) Summary: hits 2.844 ± 0.006M/s ( 0.142M/prod) The numbers in parenthesis give averaged throughput per thread which is of greatest interest here as a measure of scalability. Improvements are in the order of 22 - 68% with this particular benchmark (mean = 43%). V2: - Updated commit message to include benchmark results. 
[0] https://docs.kernel.org/locking/spinlocks.html [1] https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/benchs/bench_trigger.c Link: https://lore.kernel.org/all/20240422102306.6026-1-jonathan.haslam@gmail.com/ Signed-off-by: Jonathan Haslam Acked-by: Jiri Olsa Signed-off-by: Masami Hiramatsu (Google) --- kernel/events/uprobes.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e4834d23e1d1..8ae0eefc3a34 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -39,7 +39,7 @@ static struct rb_root uprobes_tree = RB_ROOT; */ #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree) -static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ +static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ #define UPROBES_HASH_SZ 13 /* serialize uprobe->pending_list */ @@ -669,9 +669,9 @@ static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) { struct uprobe *uprobe; - spin_lock(&uprobes_treelock); + read_lock(&uprobes_treelock); uprobe = __find_uprobe(inode, offset); - spin_unlock(&uprobes_treelock); + read_unlock(&uprobes_treelock); return uprobe; } @@ -701,9 +701,9 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) { struct uprobe *u; - spin_lock(&uprobes_treelock); + write_lock(&uprobes_treelock); u = __insert_uprobe(uprobe); - spin_unlock(&uprobes_treelock); + write_unlock(&uprobes_treelock); return u; } @@ -935,9 +935,9 @@ static void delete_uprobe(struct uprobe *uprobe) if (WARN_ON(!uprobe_is_active(uprobe))) return; - spin_lock(&uprobes_treelock); + write_lock(&uprobes_treelock); rb_erase(&uprobe->rb_node, &uprobes_tree); - spin_unlock(&uprobes_treelock); + write_unlock(&uprobes_treelock); RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */ put_uprobe(uprobe); } @@ -1298,7 +1298,7 @@ static void build_probe_list(struct inode *inode, min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; - spin_lock(&uprobes_treelock); + read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); if (n) { for (t = n; t; t = rb_prev(t)) { @@ -1316,7 +1316,7 @@ static void build_probe_list(struct inode *inode, get_uprobe(u); } } - spin_unlock(&uprobes_treelock); + read_unlock(&uprobes_treelock); } /* @vma contains reference counter, not the probed instruction. */ @@ -1407,9 +1407,9 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; - spin_lock(&uprobes_treelock); + read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); - spin_unlock(&uprobes_treelock); + read_unlock(&uprobes_treelock); return !!n; } From b0e28a4b5becea84ae6fca5cbd8a6b80a134e223 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 18 Apr 2024 12:09:08 -0700 Subject: [PATCH 12/17] ftrace: make extra rcu_is_watching() validation check optional Introduce CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING config option to control whether ftrace low-level code performs additional rcu_is_watching()-based validation logic in an attempt to catch noinstr violations. This check is expected to never be true and is mostly useful for low-level validation of ftrace subsystem invariants. For most users it should probably be kept disabled to eliminate unnecessary runtime overhead. This improves BPF multi-kretprobe (relying on ftrace and rethook infrastructure) runtime throughput by 2%, according to BPF benchmarks ([0]). 
[0] https://lore.kernel.org/bpf/CAEf4BzauQ2WKMjZdc9s0rBWa01BYbgwHN6aNDXQSHYia47pQ-w@mail.gmail.com/ Link: https://lore.kernel.org/all/20240418190909.704286-1-andrii@kernel.org/ Cc: Steven Rostedt Cc: Masami Hiramatsu Cc: Paul E. McKenney Acked-by: Masami Hiramatsu (Google) Signed-off-by: Andrii Nakryiko Signed-off-by: Masami Hiramatsu (Google) --- include/linux/trace_recursion.h | 2 +- kernel/trace/Kconfig | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index d48cd92d2364..24ea8ac049b4 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -135,7 +135,7 @@ extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip); # define do_ftrace_record_recursion(ip, pip) do { } while (0) #endif -#ifdef CONFIG_ARCH_WANTS_NO_INSTR +#ifdef CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING # define trace_warn_on_no_rcu(ip) \ ({ \ bool __ret = !rcu_is_watching(); \ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 47345bf1d4a9..d093e16d01c1 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -974,6 +974,19 @@ config FTRACE_RECORD_RECURSION_SIZE This file can be reset, but the limit can not change in size at runtime. +config FTRACE_VALIDATE_RCU_IS_WATCHING + bool "Validate RCU is on during ftrace execution" + depends on FUNCTION_TRACER + depends on ARCH_WANTS_NO_INSTR + help + All callbacks that attach to the function tracing have some sort of + protection against recursion. This option is only to verify that + ftrace (and other users of ftrace_test_recursion_trylock()) are not + called outside of RCU, as if they are, it can cause a race. But it + also has a noticeable overhead when enabled. + + If unsure, say N + config RING_BUFFER_RECORD_RECURSION bool "Record functions that recurse in the ring buffer" depends on FTRACE_RECORD_RECURSION From e03c05ac9813410d15c9c39ccf02c84efe563533 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 18 Apr 2024 12:09:09 -0700 Subject: [PATCH 13/17] rethook: honor CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING in rethook_try_get() Take into account CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING when validating that RCU is watching when trying to set up a rethook on a function entry. One notable exception, where we still force the rcu_is_watching() check, is the CONFIG_KPROBE_EVENTS_ON_NOTRACE=y case, in which case kretprobes will use the old-style int3-based workflow instead of relying on ftrace, making the RCU watching check important to validate. This further (in addition to improvements in the previous patch) improves BPF multi-kretprobe (which relies on rethook) runtime throughput by 2.3%, according to BPF benchmarks ([0]). [0] https://lore.kernel.org/bpf/CAEf4BzauQ2WKMjZdc9s0rBWa01BYbgwHN6aNDXQSHYia47pQ-w@mail.gmail.com/ Link: https://lore.kernel.org/all/20240418190909.704286-2-andrii@kernel.org/ Signed-off-by: Andrii Nakryiko Acked-by: Paul E. McKenney Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/rethook.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index 4297a132a7ae..30d224946881 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -166,6 +166,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh) if (unlikely(!handler)) return NULL; +#if defined(CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING) || defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE) /* * This expects the caller will set up a rethook on a function entry.
* When the function returns, the rethook will eventually be reclaimed @@ -174,6 +175,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh) */ if (unlikely(!rcu_is_watching())) return NULL; +#endif return (struct rethook_node *)objpool_pop(&rh->pool); } From a3b00f10da808bd4a354f890b551cba471082d0e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 24 Apr 2024 14:52:13 -0700 Subject: [PATCH 14/17] objpool: enable inlining objpool_push() and objpool_pop() operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit objpool_push() and objpool_pop() are very performance-critical functions and can be called very frequently in kretprobe triggering path. As such, it makes sense to allow compiler to inline them completely to eliminate function calls overhead. Luckily, their logic is quite well isolated and doesn't have any sprawling dependencies. This patch moves both objpool_push() and objpool_pop() into include/linux/objpool.h and marks them as static inline functions, enabling inlining. To avoid anyone using internal helpers (objpool_try_get_slot, objpool_try_add_slot), rename them to use leading underscores. We used kretprobe microbenchmark from BPF selftests (bench trig-kprobe and trig-kprobe-multi benchmarks) running no-op BPF kretprobe/kretprobe.multi programs in a tight loop to evaluate the effect. BPF own overhead in this case is minimal and it mostly stresses the rest of in-kernel kretprobe infrastructure overhead. Results are in millions of calls per second. This is not super scientific, but shows the trend nevertheless. BEFORE ====== kretprobe : 9.794 ± 0.086M/s kretprobe-multi: 10.219 ± 0.032M/s AFTER ===== kretprobe : 9.937 ± 0.174M/s (+1.5%) kretprobe-multi: 10.440 ± 0.108M/s (+2.2%) Link: https://lore.kernel.org/all/20240424215214.3956041-2-andrii@kernel.org/ Cc: Matt (Qiang) Wu Signed-off-by: Andrii Nakryiko Signed-off-by: Masami Hiramatsu (Google) --- include/linux/objpool.h | 101 +++++++++++++++++++++++++++++++++++++++- lib/objpool.c | 100 --------------------------------------- 2 files changed, 99 insertions(+), 102 deletions(-) diff --git a/include/linux/objpool.h b/include/linux/objpool.h index 15aff4a17f0c..d8b1f7b91128 100644 --- a/include/linux/objpool.h +++ b/include/linux/objpool.h @@ -5,6 +5,10 @@ #include #include +#include +#include +#include +#include /* * objpool: ring-array based lockless MPMC queue @@ -118,13 +122,94 @@ int objpool_init(struct objpool_head *pool, int nr_objs, int object_size, gfp_t gfp, void *context, objpool_init_obj_cb objinit, objpool_fini_cb release); +/* try to retrieve object from slot */ +static inline void *__objpool_try_get_slot(struct objpool_head *pool, int cpu) +{ + struct objpool_slot *slot = pool->cpu_slots[cpu]; + /* load head snapshot, other cpus may change it */ + uint32_t head = smp_load_acquire(&slot->head); + + while (head != READ_ONCE(slot->last)) { + void *obj; + + /* + * data visibility of 'last' and 'head' could be out of + * order since memory updating of 'last' and 'head' are + * performed in push() and pop() independently + * + * before any retrieving attempts, pop() must guarantee + * 'last' is behind 'head', that is to say, there must + * be available objects in slot, which could be ensured + * by condition 'last != head && last - head <= nr_objs' + * that is equivalent to 'last - head - 1 < nr_objs' as + * 'last' and 'head' are both unsigned int32 + */ + if (READ_ONCE(slot->last) - head - 1 >= pool->nr_objs) { + head = READ_ONCE(slot->head); + continue; + } + + /* 
obj must be retrieved before moving forward head */ + obj = READ_ONCE(slot->entries[head & slot->mask]); + + /* move head forward to mark it's consumption */ + if (try_cmpxchg_release(&slot->head, &head, head + 1)) + return obj; + } + + return NULL; +} + /** * objpool_pop() - allocate an object from objpool * @pool: object pool * * return value: object ptr or NULL if failed */ -void *objpool_pop(struct objpool_head *pool); +static inline void *objpool_pop(struct objpool_head *pool) +{ + void *obj = NULL; + unsigned long flags; + int i, cpu; + + /* disable local irq to avoid preemption & interruption */ + raw_local_irq_save(flags); + + cpu = raw_smp_processor_id(); + for (i = 0; i < num_possible_cpus(); i++) { + obj = __objpool_try_get_slot(pool, cpu); + if (obj) + break; + cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1); + } + raw_local_irq_restore(flags); + + return obj; +} + +/* adding object to slot, abort if the slot was already full */ +static inline int +__objpool_try_add_slot(void *obj, struct objpool_head *pool, int cpu) +{ + struct objpool_slot *slot = pool->cpu_slots[cpu]; + uint32_t head, tail; + + /* loading tail and head as a local snapshot, tail first */ + tail = READ_ONCE(slot->tail); + + do { + head = READ_ONCE(slot->head); + /* fault caught: something must be wrong */ + WARN_ON_ONCE(tail - head > pool->nr_objs); + } while (!try_cmpxchg_acquire(&slot->tail, &tail, tail + 1)); + + /* now the tail position is reserved for the given obj */ + WRITE_ONCE(slot->entries[tail & slot->mask], obj); + /* update sequence to make this obj available for pop() */ + smp_store_release(&slot->last, tail + 1); + + return 0; +} /** * objpool_push() - reclaim the object and return back to objpool @@ -134,7 +219,19 @@ void *objpool_pop(struct objpool_head *pool); * return: 0 or error code (it fails only when user tries to push * the same object multiple times or wrong "objects" into objpool) */ -int objpool_push(void *obj, struct objpool_head *pool); +static inline int objpool_push(void *obj, struct objpool_head *pool) +{ + unsigned long flags; + int rc; + + /* disable local irq to avoid preemption & interruption */ + raw_local_irq_save(flags); + rc = __objpool_try_add_slot(obj, pool, raw_smp_processor_id()); + raw_local_irq_restore(flags); + + return rc; +} + /** * objpool_drop() - discard the object and deref objpool diff --git a/lib/objpool.c b/lib/objpool.c index cfdc02420884..f696308fc026 100644 --- a/lib/objpool.c +++ b/lib/objpool.c @@ -152,106 +152,6 @@ int objpool_init(struct objpool_head *pool, int nr_objs, int object_size, } EXPORT_SYMBOL_GPL(objpool_init); -/* adding object to slot, abort if the slot was already full */ -static inline int -objpool_try_add_slot(void *obj, struct objpool_head *pool, int cpu) -{ - struct objpool_slot *slot = pool->cpu_slots[cpu]; - uint32_t head, tail; - - /* loading tail and head as a local snapshot, tail first */ - tail = READ_ONCE(slot->tail); - - do { - head = READ_ONCE(slot->head); - /* fault caught: something must be wrong */ - WARN_ON_ONCE(tail - head > pool->nr_objs); - } while (!try_cmpxchg_acquire(&slot->tail, &tail, tail + 1)); - - /* now the tail position is reserved for the given obj */ - WRITE_ONCE(slot->entries[tail & slot->mask], obj); - /* update sequence to make this obj available for pop() */ - smp_store_release(&slot->last, tail + 1); - - return 0; -} - -/* reclaim an object to object pool */ -int objpool_push(void *obj, struct objpool_head *pool) -{ - unsigned long flags; - int rc; - - /* disable local irq to avoid preemption 
& interruption */ - raw_local_irq_save(flags); - rc = objpool_try_add_slot(obj, pool, raw_smp_processor_id()); - raw_local_irq_restore(flags); - - return rc; -} -EXPORT_SYMBOL_GPL(objpool_push); - -/* try to retrieve object from slot */ -static inline void *objpool_try_get_slot(struct objpool_head *pool, int cpu) -{ - struct objpool_slot *slot = pool->cpu_slots[cpu]; - /* load head snapshot, other cpus may change it */ - uint32_t head = smp_load_acquire(&slot->head); - - while (head != READ_ONCE(slot->last)) { - void *obj; - - /* - * data visibility of 'last' and 'head' could be out of - * order since memory updating of 'last' and 'head' are - * performed in push() and pop() independently - * - * before any retrieving attempts, pop() must guarantee - * 'last' is behind 'head', that is to say, there must - * be available objects in slot, which could be ensured - * by condition 'last != head && last - head <= nr_objs' - * that is equivalent to 'last - head - 1 < nr_objs' as - * 'last' and 'head' are both unsigned int32 - */ - if (READ_ONCE(slot->last) - head - 1 >= pool->nr_objs) { - head = READ_ONCE(slot->head); - continue; - } - - /* obj must be retrieved before moving forward head */ - obj = READ_ONCE(slot->entries[head & slot->mask]); - - /* move head forward to mark it's consumption */ - if (try_cmpxchg_release(&slot->head, &head, head + 1)) - return obj; - } - - return NULL; -} - -/* allocate an object from object pool */ -void *objpool_pop(struct objpool_head *pool) -{ - void *obj = NULL; - unsigned long flags; - int i, cpu; - - /* disable local irq to avoid preemption & interruption */ - raw_local_irq_save(flags); - - cpu = raw_smp_processor_id(); - for (i = 0; i < num_possible_cpus(); i++) { - obj = objpool_try_get_slot(pool, cpu); - if (obj) - break; - cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1); - } - raw_local_irq_restore(flags); - - return obj; -} -EXPORT_SYMBOL_GPL(objpool_pop); - /* release whole objpool forcely */ void objpool_free(struct objpool_head *pool) { From 78d0b16127daa26d016c215a089ae330878291f7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 24 Apr 2024 14:52:14 -0700 Subject: [PATCH 15/17] objpool: cache nr_possible_cpus() and avoid caching nr_cpu_ids MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Profiling shows that calling nr_possible_cpus() in objpool_pop() takes a noticeable amount of CPU (when profiled on 80-core machine), as we need to recalculate number of set bits in a CPU bit mask. This number can't change, so there is no point in paying the price for recalculating it. As such, cache this value in struct objpool_head and use it in objpool_pop(). On the other hand, cached pool->nr_cpus isn't necessary, as it's not used in hot path and is also a pretty trivial value to retrieve. So drop pool->nr_cpus in favor of using nr_cpu_ids everywhere. This way the size of struct objpool_head remains the same, which is a nice bonus. Same BPF selftests benchmarks were used to evaluate the effect. 
Using changes in previous patch (inlining of objpool_pop/objpool_push) as baseline, here are the differences: BASELINE ======== kretprobe : 9.937 ± 0.174M/s kretprobe-multi: 10.440 ± 0.108M/s AFTER ===== kretprobe : 10.106 ± 0.120M/s (+1.7%) kretprobe-multi: 10.515 ± 0.180M/s (+0.7%) Link: https://lore.kernel.org/all/20240424215214.3956041-3-andrii@kernel.org/ Cc: Matt (Qiang) Wu Signed-off-by: Andrii Nakryiko Signed-off-by: Masami Hiramatsu (Google) --- include/linux/objpool.h | 6 +++--- lib/objpool.c | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/objpool.h b/include/linux/objpool.h index d8b1f7b91128..cb1758eaa2d3 100644 --- a/include/linux/objpool.h +++ b/include/linux/objpool.h @@ -73,7 +73,7 @@ typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context); * struct objpool_head - object pooling metadata * @obj_size: object size, aligned to sizeof(void *) * @nr_objs: total objs (to be pre-allocated with objpool) - * @nr_cpus: local copy of nr_cpu_ids + * @nr_possible_cpus: cached value of num_possible_cpus() * @capacity: max objs can be managed by one objpool_slot * @gfp: gfp flags for kmalloc & vmalloc * @ref: refcount of objpool @@ -85,7 +85,7 @@ typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context); struct objpool_head { int obj_size; int nr_objs; - int nr_cpus; + int nr_possible_cpus; int capacity; gfp_t gfp; refcount_t ref; @@ -176,7 +176,7 @@ static inline void *objpool_pop(struct objpool_head *pool) raw_local_irq_save(flags); cpu = raw_smp_processor_id(); - for (i = 0; i < num_possible_cpus(); i++) { + for (i = 0; i < pool->nr_possible_cpus; i++) { obj = __objpool_try_get_slot(pool, cpu); if (obj) break; diff --git a/lib/objpool.c b/lib/objpool.c index f696308fc026..234f9d0bd081 100644 --- a/lib/objpool.c +++ b/lib/objpool.c @@ -50,7 +50,7 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs, { int i, cpu_count = 0; - for (i = 0; i < pool->nr_cpus; i++) { + for (i = 0; i < nr_cpu_ids; i++) { struct objpool_slot *slot; int nodes, size, rc; @@ -60,8 +60,8 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs, continue; /* compute how many objects to be allocated with this slot */ - nodes = nr_objs / num_possible_cpus(); - if (cpu_count < (nr_objs % num_possible_cpus())) + nodes = nr_objs / pool->nr_possible_cpus; + if (cpu_count < (nr_objs % pool->nr_possible_cpus)) nodes++; cpu_count++; @@ -103,7 +103,7 @@ static void objpool_fini_percpu_slots(struct objpool_head *pool) if (!pool->cpu_slots) return; - for (i = 0; i < pool->nr_cpus; i++) + for (i = 0; i < nr_cpu_ids; i++) kvfree(pool->cpu_slots[i]); kfree(pool->cpu_slots); } @@ -130,13 +130,13 @@ int objpool_init(struct objpool_head *pool, int nr_objs, int object_size, /* initialize objpool pool */ memset(pool, 0, sizeof(struct objpool_head)); - pool->nr_cpus = nr_cpu_ids; + pool->nr_possible_cpus = num_possible_cpus(); pool->obj_size = object_size; pool->capacity = capacity; pool->gfp = gfp & ~__GFP_ZERO; pool->context = context; pool->release = release; - slot_size = pool->nr_cpus * sizeof(struct objpool_slot); + slot_size = nr_cpu_ids * sizeof(struct objpool_slot); pool->cpu_slots = kzalloc(slot_size, pool->gfp); if (!pool->cpu_slots) return -ENOMEM; From b7bd96ec1b709f5079fd203b680261dabc0050aa Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sat, 4 May 2024 09:36:56 +0900 Subject: [PATCH 16/17] selftests/ftrace: Fix required features for VFS type test case Since the VFS type argument test case uses fprobe 
events, it must check the availability of the dynamic_events file and fprobe events syntax in README. Without this fix, the test fails if CONFIG_FPROBE_EVENTS=n. Link: https://lore.kernel.org/all/171478301645.110267.464634740467398506.stgit@devnote2/ Fixes: ee97e5e135c6 ("selftests/ftrace: add fprobe test cases for VFS type "%pd" and "%pD"") Signed-off-by: Masami Hiramatsu (Google) --- .../selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc index 49a833bf334c..c6a9d2466a71 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc @@ -1,7 +1,8 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 # description: Fprobe event VFS type argument -# requires: kprobe_events "%pd/%pD":README +# requires: dynamic_events "%pd/%pD":README "f[:[<group>/][<event>]] <func-name>[%return] [<args>]":README + : "Test argument %pd with name for fprobe" echo 'f:testprobe dput name=$arg1:%pd' > dynamic_events From 1a7d0890dd4a502a202aaec792a6c04e6e049547 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Wed, 1 May 2024 09:29:56 -0700 Subject: [PATCH 17/17] kprobe/ftrace: bail out if ftrace was killed If an error happens in ftrace, ftrace_kill() will prevent disarming kprobes. Eventually, the ftrace_ops associated with the kprobes will be freed, yet the kprobes will still be active, and when triggered, they will use the freed memory, likely resulting in a page fault and panic. This behavior can be reproduced quite easily, by creating a kprobe and then triggering a ftrace_kill(). For simplicity, we can simulate an ftrace error with a kernel module like [1]: [1]: https://github.com/brenns10/kernel_stuff/tree/master/ftrace_killer sudo perf probe --add commit_creds sudo perf trace -e probe:commit_creds # In another terminal make sudo insmod ftrace_killer.ko # calls ftrace_kill(), simulating bug # Back to perf terminal # ctrl-c sudo perf probe --del commit_creds After a short period, a page fault and panic would occur as the kprobe continues to execute and uses the freed ftrace_ops. While ftrace_kill() is supposed to be used only in extreme circumstances, it is invoked in FTRACE_WARN_ON() and so there are many places where an unexpected bug could be triggered, yet the system may continue operating, possibly without the administrator noticing. If ftrace_kill() does not panic the system, then we should do everything we can to continue operating, rather than leave a ticking time bomb.
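The shape of the fix, condensed from the per-architecture hunks below into a single illustrative sketch (not a literal copy of any one file): ftrace sets a global flag once, and every kprobes-on-ftrace handler bails out early when it is set.

    /* kernel/kprobes.c (sketch): flag set when ftrace has been killed */
    bool kprobe_ftrace_disabled;

    void kprobe_ftrace_kill(void)          /* called from ftrace_kill() */
    {
            kprobe_ftrace_disabled = true;
    }

    /* each arch kprobe_ftrace_handler() now opens with this guard */
    void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
                               struct ftrace_ops *ops, struct ftrace_regs *fregs)
    {
            if (unlikely(kprobe_ftrace_disabled))
                    return;                /* the ftrace_ops may already be freed */
            /* ... existing recursion check and kprobe dispatch ... */
    }

The flag is only ever set, never cleared, which is presumably why a plain bool with no extra synchronization suffices here.
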
Link: https://lore.kernel.org/all/20240501162956.229427-1-stephen.s.brennan@oracle.com/ Signed-off-by: Stephen Brennan Acked-by: Masami Hiramatsu (Google) Acked-by: Guo Ren Reviewed-by: Steven Rostedt (Google) Signed-off-by: Masami Hiramatsu (Google) --- arch/csky/kernel/probes/ftrace.c | 3 +++ arch/loongarch/kernel/ftrace_dyn.c | 3 +++ arch/parisc/kernel/ftrace.c | 3 +++ arch/powerpc/kernel/kprobes-ftrace.c | 3 +++ arch/riscv/kernel/probes/ftrace.c | 3 +++ arch/s390/kernel/ftrace.c | 3 +++ arch/x86/kernel/kprobes/ftrace.c | 3 +++ include/linux/kprobes.h | 7 +++++++ kernel/kprobes.c | 6 ++++++ kernel/trace/ftrace.c | 1 + 10 files changed, 35 insertions(+) diff --git a/arch/csky/kernel/probes/ftrace.c b/arch/csky/kernel/probes/ftrace.c index 834cffcfbce3..7ba4b98076de 100644 --- a/arch/csky/kernel/probes/ftrace.c +++ b/arch/csky/kernel/probes/ftrace.c @@ -12,6 +12,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe_ctlblk *kcb; struct pt_regs *regs; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/loongarch/kernel/ftrace_dyn.c b/arch/loongarch/kernel/ftrace_dyn.c index 73858c9029cc..bff058317062 100644 --- a/arch/loongarch/kernel/ftrace_dyn.c +++ b/arch/loongarch/kernel/ftrace_dyn.c @@ -287,6 +287,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; struct kprobe_ctlblk *kcb; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index 621a4b386ae4..c91f9c2e61ed 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -206,6 +206,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; int bit; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 072ebe7f290b..f8208c027148 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -21,6 +21,9 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, struct pt_regs *regs; int bit; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(nip, parent_nip); if (bit < 0) return; diff --git a/arch/riscv/kernel/probes/ftrace.c b/arch/riscv/kernel/probes/ftrace.c index 7142ec42e889..a69dfa610aa8 100644 --- a/arch/riscv/kernel/probes/ftrace.c +++ b/arch/riscv/kernel/probes/ftrace.c @@ -11,6 +11,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe_ctlblk *kcb; int bit; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index c46381ea04ec..7f6f8c438c26 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -296,6 +296,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; int bit; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index dd2ec14adb77..15af7e98e161 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -21,6 +21,9 
@@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe_ctlblk *kcb; int bit; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 0ff44d6633e3..5fcbc254d186 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -378,11 +378,15 @@ static inline void wait_for_kprobe_optimizer(void) { } extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs); extern int arch_prepare_kprobe_ftrace(struct kprobe *p); +/* Set when ftrace has been killed: kprobes on ftrace must be disabled for safety */ +extern bool kprobe_ftrace_disabled __read_mostly; +extern void kprobe_ftrace_kill(void); #else static inline int arch_prepare_kprobe_ftrace(struct kprobe *p) { return -EINVAL; } +static inline void kprobe_ftrace_kill(void) {} #endif /* CONFIG_KPROBES_ON_FTRACE */ /* Get the kprobe at this addr (if any) - called with preemption disabled */ @@ -495,6 +499,9 @@ static inline void kprobe_flush_task(struct task_struct *tk) static inline void kprobe_free_init_mem(void) { } +static inline void kprobe_ftrace_kill(void) +{ +} static inline int disable_kprobe(struct kprobe *kp) { return -EOPNOTSUPP; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 65adc815fc6e..166ebf81dc45 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1068,6 +1068,7 @@ static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = { static int kprobe_ipmodify_enabled; static int kprobe_ftrace_enabled; +bool kprobe_ftrace_disabled; static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, int *cnt) @@ -1136,6 +1137,11 @@ static int disarm_kprobe_ftrace(struct kprobe *p) ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops, ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); } + +void kprobe_ftrace_kill() +{ + kprobe_ftrace_disabled = true; +} #else /* !CONFIG_KPROBES_ON_FTRACE */ static inline int arm_kprobe_ftrace(struct kprobe *p) { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index da1710499698..96db99c347b3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7895,6 +7895,7 @@ void ftrace_kill(void) ftrace_disabled = 1; ftrace_enabled = 0; ftrace_trace_function = ftrace_stub; + kprobe_ftrace_kill(); } /**
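
A closing note on the header plumbing in this last patch: kprobe_ftrace_kill() also gains empty inline stubs for the CONFIG_KPROBES_ON_FTRACE=n and CONFIG_KPROBES=n configurations, so the new call in ftrace_kill() needs no #ifdef at the call site. Roughly, the resulting include/linux/kprobes.h shape is as follows (simplified sketch assembled from the hunks above, not the verbatim header):

    #ifdef CONFIG_KPROBES_ON_FTRACE
    /* Set when ftrace has been killed: kprobes on ftrace must be disabled for safety */
    extern bool kprobe_ftrace_disabled __read_mostly;
    extern void kprobe_ftrace_kill(void);
    #else
    static inline void kprobe_ftrace_kill(void) {}   /* kprobes-on-ftrace not built */
    #endif /* CONFIG_KPROBES_ON_FTRACE */

    /* ... and a second empty stub in the CONFIG_KPROBES=n section ... */
    static inline void kprobe_ftrace_kill(void)
    {
    }

With either stub in place, ftrace_kill() can call kprobe_ftrace_kill() unconditionally, which is exactly what the final kernel/trace/ftrace.c hunk does.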