From dfbc411e0a5ea72fdd563b2c7d627e9d993d865c Mon Sep 17 00:00:00 2001
From: Erick Archer
Date: Sun, 17 Mar 2024 17:44:42 +0100
Subject: [PATCH 01/21] perf/x86/rapl: Prefer struct_size() over open coded
 arithmetic

This is an effort to get rid of all multiplications from allocation
functions in order to prevent integer overflows:

https://www.kernel.org/doc/html/latest/process/deprecated.html#open-coded-arithmetic-in-allocator-arguments
https://github.com/KSPP/linux/issues/160

As the "rapl_pmus" variable is a pointer to "struct rapl_pmus" and this
structure ends in a flexible array:

struct rapl_pmus {
	[...]
	struct rapl_pmu *pmus[] __counted_by(maxdie);
};

the preferred way in the kernel is to use the struct_size() helper to
do the arithmetic instead of the calculation "size + count * size" in
the kzalloc() function. This way, the code is more readable and safer.

Signed-off-by: Erick Archer
Signed-off-by: Ingo Molnar
Reviewed-by: Gustavo A. R. Silva
Reviewed-by: Kees Cook
Link: https://lore.kernel.org/r/20240317164442.6729-1-erick.archer@gmx.com
---
 arch/x86/events/rapl.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index fb2b1961e5a3..8ef08b5d55a7 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -675,10 +675,8 @@ static const struct attribute_group *rapl_attr_update[] = {
 static int __init init_rapl_pmus(void)
 {
 	int maxdie = topology_max_packages() * topology_max_dies_per_package();
-	size_t size;
 
-	size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *);
-	rapl_pmus = kzalloc(size, GFP_KERNEL);
+	rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, maxdie), GFP_KERNEL);
 	if (!rapl_pmus)
 		return -ENOMEM;

From 0dbf66fa7e80024629f816c2ec7a9f3d39637822 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Mon, 1 Apr 2024 19:21:15 -0700
Subject: [PATCH 02/21] perf/x86/amd: Ensure amd_pmu_core_disable_all() is
 always inlined

In the following patches we will enable LBR capture on AMD CPUs at an
arbitrary point in time, which means that LBR recording won't be frozen
by hardware automatically as part of a hardware overflow event. So we
need to take care to minimize the number of branches and function
calls/returns on the path to freezing LBR, altering the LBR snapshot as
little as possible.

amd_pmu_core_disable_all() is one of the functions on this path, and is
already marked as __always_inline. But it calls amd_pmu_set_global_ctl(),
which is marked as just inline. So to guarantee that no function call
will be generated throughout, mark amd_pmu_set_global_ctl() as
__always_inline as well.
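As an illustrative sketch (simplified declarations, not part of the
diff below): plain "inline" is only a hint that the compiler is free
to ignore, while __always_inline forces inlining, so no call/return
branches from this helper can end up polluting the LBR snapshot:

	/* may still be emitted as an out-of-line call: */
	static inline void amd_pmu_set_global_ctl(u64 ctl);

	/* guaranteed to be inlined at every call site: */
	static __always_inline void amd_pmu_set_global_ctl(u64 ctl);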
Signed-off-by: Andrii Nakryiko
Signed-off-by: Ingo Molnar
Reviewed-by: Sandipan Das
Link: https://lore.kernel.org/r/20240402022118.1046049-2-andrii@kernel.org
---
 arch/x86/events/amd/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 985ef3b47919..9b15afda0326 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -647,7 +647,7 @@ static void amd_pmu_cpu_dead(int cpu)
 	}
 }
 
-static inline void amd_pmu_set_global_ctl(u64 ctl)
+static __always_inline void amd_pmu_set_global_ctl(u64 ctl)
 {
 	wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl);
 }

From 1eddf187e5d087de4560ec7c3baa2f8283920710 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Mon, 1 Apr 2024 19:21:16 -0700
Subject: [PATCH 03/21] perf/x86/amd: Avoid taking branches before disabling
 LBR

In the following patches we will enable LBR capture on AMD CPUs at an
arbitrary point in time, which means that LBR recording won't be frozen
by hardware automatically as part of a hardware overflow event. So we
need to take care to minimize the number of branches and function
calls/returns on the path to freezing LBR, altering the LBR snapshot as
little as possible.

As such, split out the LBR disabling logic from the sanity checking
logic inside amd_pmu_lbr_disable_all(). This will ensure that no
branches are taken before LBR is frozen in the functionality added in
the next patch. Use __always_inline to also eliminate any possible
function calls.

Signed-off-by: Andrii Nakryiko
Signed-off-by: Ingo Molnar
Reviewed-by: Sandipan Das
Link: https://lore.kernel.org/r/20240402022118.1046049-3-andrii@kernel.org
---
 arch/x86/events/amd/lbr.c    |  9 +--------
 arch/x86/events/perf_event.h | 13 +++++++++++++
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c
index 5149830c7c4f..33d0a45c0cd3 100644
--- a/arch/x86/events/amd/lbr.c
+++ b/arch/x86/events/amd/lbr.c
@@ -414,18 +414,11 @@ void amd_pmu_lbr_enable_all(void)
 void amd_pmu_lbr_disable_all(void)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-	u64 dbg_ctl, dbg_extn_cfg;
 
 	if (!cpuc->lbr_users || !x86_pmu.lbr_nr)
 		return;
 
-	rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg);
-	wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN);
-
-	if (cpu_feature_enabled(X86_FEATURE_AMD_LBR_PMC_FREEZE)) {
-		rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
-		wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
-	}
+	__amd_pmu_lbr_disable();
 }
 
 __init int amd_pmu_lbr_init(void)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index fb56518356ec..72b022a1e16c 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1329,6 +1329,19 @@ void amd_pmu_lbr_enable_all(void);
 void amd_pmu_lbr_disable_all(void);
 int amd_pmu_lbr_hw_config(struct perf_event *event);
 
+static __always_inline void __amd_pmu_lbr_disable(void)
+{
+	u64 dbg_ctl, dbg_extn_cfg;
+
+	rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg);
+	wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN);
+
+	if (cpu_feature_enabled(X86_FEATURE_AMD_LBR_PMC_FREEZE)) {
+		rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
+		wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+	}
+}
+
 #ifdef CONFIG_PERF_EVENTS_AMD_BRS
 
 #define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */

From a4d18112e5317c120bcadeb486fbe950f749bb5e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Mon, 1 Apr 2024 19:21:17 -0700
Subject: [PATCH 04/21]
perf/x86/amd: Support capturing LBR from software events

Upstream commit c22ac2a3d4bd ("perf: Enable branch record for software
events") added the ability to capture LBR (Last Branch Records) on
Intel CPUs from inside a BPF program at pretty much any arbitrary
point. This is an extremely useful capability that makes it possible
to figure out otherwise hard-to-debug problems, because LBR data is
now available based on application-defined conditions, not just
hardware-supported events.

'retsnoop' is one such tool that takes full advantage of this
functionality and has proven to be extremely useful in practice:

  https://github.com/anakryiko/retsnoop

Now AMD Zen4 CPUs have support for similar LBR functionality, but the
necessary wiring inside the kernel is not yet set up. This patch
rectifies that, following a similar approach to the original patch for
Intel CPUs.

We implement an AMD-specific callback, invoked through the
perf_snapshot_branch_stack static call. Previous preparatory patches
ensured that amd_pmu_core_disable_all() and __amd_pmu_lbr_disable()
will be completely inlined and will have no branches, so LBR snapshot
contamination will be minimized.

This was tested on an AMD Bergamo CPU and worked well when used from
the aforementioned retsnoop tool.

Signed-off-by: Andrii Nakryiko
Signed-off-by: Ingo Molnar
Reviewed-by: Sandipan Das
Link: https://lore.kernel.org/r/20240402022118.1046049-4-andrii@kernel.org
---
 arch/x86/events/amd/core.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 9b15afda0326..1fc4ce44e743 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -907,6 +907,37 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
 	return amd_pmu_adjust_nmi_window(handled);
 }
 
+/*
+ * AMD-specific callback invoked through perf_snapshot_branch_stack static
+ * call, defined in include/linux/perf_event.h. See its definition for API
+ * details. It's up to caller to provide enough space in *entries* to fit all
+ * LBR records, otherwise returned result will be truncated to *cnt* entries.
+ */
+static int amd_pmu_v2_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
+{
+	struct cpu_hw_events *cpuc;
+	unsigned long flags;
+
+	/*
+	 * The sequence of steps to freeze LBR should be completely inlined
+	 * and contain no branches to minimize contamination of LBR snapshot
+	 */
+	local_irq_save(flags);
+	amd_pmu_core_disable_all();
+	__amd_pmu_lbr_disable();
+
+	cpuc = this_cpu_ptr(&cpu_hw_events);
+
+	amd_pmu_lbr_read();
+	cnt = min(cnt, x86_pmu.lbr_nr);
+	memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
+
+	amd_pmu_v2_enable_all(0);
+	local_irq_restore(flags);
+
+	return cnt;
+}
+
 static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1443,6 +1474,10 @@ static int __init amd_core_pmu_init(void)
 		static_call_update(amd_pmu_branch_reset, amd_pmu_lbr_reset);
 		static_call_update(amd_pmu_branch_add, amd_pmu_lbr_add);
 		static_call_update(amd_pmu_branch_del, amd_pmu_lbr_del);
+
+		/* Only support branch_stack snapshot on perfmon v2 */
+		if (x86_pmu.handle_irq == amd_pmu_v2_handle_irq)
+			static_call_update(perf_snapshot_branch_stack, amd_pmu_v2_snapshot_branch_stack);
 	} else if (!amd_brs_init()) {
 		/*
 		 * BRS requires special event constraints and flushing on ctxsw.
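For context, a hedged sketch of how a caller such as the BPF
bpf_get_branch_snapshot() helper consumes the callback wired up by the
patch above, via the perf_snapshot_branch_stack static call declared
in include/linux/perf_event.h:

	/* sketch only: capture up to 16 branch records right here */
	struct perf_branch_entry entries[16];
	int cnt;

	cnt = static_call(perf_snapshot_branch_stack)(entries, 16);
	/* entries[0..cnt-1] now hold the most recently taken branches */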
From 9794563d4d053b1b46a0cc91901f0a11d8678c19 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Mon, 1 Apr 2024 19:21:18 -0700
Subject: [PATCH 05/21] perf/x86/amd: Don't reject non-sampling events with
 configured LBR

Now that it's possible to capture LBR on AMD CPUs from BPF at an
arbitrary point, there is no reason to artificially limit this feature
to just sampling events. So the corresponding check is removed.

AFAIU, there are no correctness implications of doing this (and it was
possible to bypass this check by just setting the perf_event's
sample_period to 1 anyway, so it didn't guard all that much).

Signed-off-by: Andrii Nakryiko
Signed-off-by: Ingo Molnar
Reviewed-by: Sandipan Das
Link: https://lore.kernel.org/r/20240402022118.1046049-5-andrii@kernel.org
---
 arch/x86/events/amd/lbr.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c
index 33d0a45c0cd3..19c7b76e21bc 100644
--- a/arch/x86/events/amd/lbr.c
+++ b/arch/x86/events/amd/lbr.c
@@ -310,10 +310,6 @@ int amd_pmu_lbr_hw_config(struct perf_event *event)
 {
 	int ret = 0;
 
-	/* LBR is not recommended in counting mode */
-	if (!is_sampling_event(event))
-		return -EINVAL;
-
 	ret = amd_pmu_lbr_setup_filter(event);
 	if (!ret)
 		event->attach_state |= PERF_ATTACH_SCHED_CB;

From 0259bf63f71e2accfeca4a4e346ede8edcc86aab Mon Sep 17 00:00:00 2001
From: Namhyung Kim
Date: Tue, 6 Feb 2024 21:05:44 -0800
Subject: [PATCH 06/21] perf/core: Optimize perf_adjust_freq_unthr_context()

perf_adjust_freq_unthr_context() was unnecessarily disabling and
enabling PMUs for each event. It should be done at the PMU level
instead. Add a pmu_ctx->nr_freq counter so this can be checked for
each PMU. As the PMU context has separate active lists for the pinned
and flexible groups, factor out a new function to do the job.

Another minor optimization is that it can skip PMUs with
PERF_PMU_CAP_NO_INTERRUPT even if it needs to unthrottle sampling
events.
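In sketch form (using the names introduced by this patch; the "before"
side is abbreviated), the tick-time path goes from one PMU
disable/enable pair per event to one pair per PMU context:

	/* before (sketch): toggled around every single event */
	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
		perf_pmu_disable(event->pmu);
		/* ... freq adjust / unthrottle ... */
		perf_pmu_enable(event->pmu);
	}

	/* after (sketch): toggled once per PMU, and PMUs with no freq
	 * events and nothing to unthrottle are skipped entirely */
	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
		if (!(pmu_ctx->nr_freq || unthrottle))
			continue;
		perf_pmu_disable(pmu_ctx->pmu);
		perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
		perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
		perf_pmu_enable(pmu_ctx->pmu);
	}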
Signed-off-by: Namhyung Kim Signed-off-by: Ingo Molnar Tested-by: Mingwei Zhang Reviewed-by: Ian Rogers Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20240207050545.2727923-1-namhyung@kernel.org --- include/linux/perf_event.h | 6 ++++ kernel/events/core.c | 70 ++++++++++++++++++++++++-------------- 2 files changed, 50 insertions(+), 26 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d2a15c0c6f8a..3e33b366347a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -883,6 +883,7 @@ struct perf_event_pmu_context { unsigned int nr_events; unsigned int nr_cgroups; + unsigned int nr_freq; atomic_t refcount; /* event <-> epc */ struct rcu_head rcu_head; @@ -897,6 +898,11 @@ struct perf_event_pmu_context { int rotate_necessary; }; +static inline bool perf_pmu_ctx_is_active(struct perf_event_pmu_context *epc) +{ + return !list_empty(&epc->flexible_active) || !list_empty(&epc->pinned_active); +} + struct perf_event_groups { struct rb_root tree; u64 index; diff --git a/kernel/events/core.c b/kernel/events/core.c index 724e6d7e128f..9566cfb27355 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2302,8 +2302,10 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx) if (!is_software_event(event)) cpc->active_oncpu--; - if (event->attr.freq && event->attr.sample_freq) + if (event->attr.freq && event->attr.sample_freq) { ctx->nr_freq--; + epc->nr_freq--; + } if (event->attr.exclusive || !cpc->active_oncpu) cpc->exclusive = 0; @@ -2558,9 +2560,10 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx) if (!is_software_event(event)) cpc->active_oncpu++; - if (event->attr.freq && event->attr.sample_freq) + if (event->attr.freq && event->attr.sample_freq) { ctx->nr_freq++; - + epc->nr_freq++; + } if (event->attr.exclusive) cpc->exclusive = 1; @@ -4123,30 +4126,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo } } -/* - * combine freq adjustment with unthrottling to avoid two passes over the - * events. At the same time, make sure, having freq events does not change - * the rate of unthrottling as that would introduce bias. - */ -static void -perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle) +static void perf_adjust_freq_unthr_events(struct list_head *event_list) { struct perf_event *event; struct hw_perf_event *hwc; u64 now, period = TICK_NSEC; s64 delta; - /* - * only need to iterate over all events iff: - * - context have events in frequency mode (needs freq adjust) - * - there are events to unthrottle on this cpu - */ - if (!(ctx->nr_freq || unthrottle)) - return; - - raw_spin_lock(&ctx->lock); - - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + list_for_each_entry(event, event_list, active_list) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; @@ -4154,8 +4141,6 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle) if (!event_filter_match(event)) continue; - perf_pmu_disable(event->pmu); - hwc = &event->hw; if (hwc->interrupts == MAX_INTERRUPTS) { @@ -4165,7 +4150,7 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle) } if (!event->attr.freq || !event->attr.sample_freq) - goto next; + continue; /* * stop the event and update event->count @@ -4187,8 +4172,41 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle) perf_adjust_period(event, period, delta, false); event->pmu->start(event, delta > 0 ? 
PERF_EF_RELOAD : 0);
-	next:
-		perf_pmu_enable(event->pmu);
+	}
+}
+
+/*
+ * combine freq adjustment with unthrottling to avoid two passes over the
+ * events. At the same time, make sure, having freq events does not change
+ * the rate of unthrottling as that would introduce bias.
+ */
+static void
+perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
+{
+	struct perf_event_pmu_context *pmu_ctx;
+
+	/*
+	 * only need to iterate over all events iff:
+	 * - context have events in frequency mode (needs freq adjust)
+	 * - there are events to unthrottle on this cpu
+	 */
+	if (!(ctx->nr_freq || unthrottle))
+		return;
+
+	raw_spin_lock(&ctx->lock);
+
+	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+		if (!(pmu_ctx->nr_freq || unthrottle))
+			continue;
+		if (!perf_pmu_ctx_is_active(pmu_ctx))
+			continue;
+		if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
+			continue;
+
+		perf_pmu_disable(pmu_ctx->pmu);
+		perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
+		perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
+		perf_pmu_enable(pmu_ctx->pmu);
 	}
 
 	raw_spin_unlock(&ctx->lock);

From f38628b06c36222367e26820879789ae59e49f60 Mon Sep 17 00:00:00 2001
From: Namhyung Kim
Date: Tue, 6 Feb 2024 21:05:45 -0800
Subject: [PATCH 07/21] perf/core: Reduce PMU access to adjust sample freq

In perf_adjust_freq_unthr_context(), it first starts the event and then
stops it unnecessarily to adjust the sampling frequency if the event is
throttled. For a throttled non-frequency event, it doesn't have a freq,
so there is no need to adjust. Just starting the event would be ok.

For a frequency event, whether it's throttled or not, it needs to stop
before adjusting the frequency. That means it should not start the
event if it was throttled. I tried to skip calling the stop callback,
but it didn't work well since the event count might not be up to date.
It should call the stop callback with PERF_EF_UPDATE anyway.

However, not calling start prevents unnecessary MSR accesses (which can
be costly) for already-stopped events, as the stopped state is saved in
the hw config.

Signed-off-by: Namhyung Kim
Signed-off-by: Ingo Molnar
Reviewed-by: Ian Rogers
Reviewed-by: Kan Liang
Link: https://lore.kernel.org/r/20240207050545.2727923-2-namhyung@kernel.org
---
 kernel/events/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9566cfb27355..fd94e45a9d86 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4146,7 +4146,8 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list)
 		if (hwc->interrupts == MAX_INTERRUPTS) {
 			hwc->interrupts = 0;
 			perf_log_throttle(event, 1);
-			event->pmu->start(event, 0);
+			if (!event->attr.freq || !event->attr.sample_freq)
+				event->pmu->start(event, 0);
 		}
 
 		if (!event->attr.freq || !event->attr.sample_freq)

From fb70fe74beaa809e13e7f469b116d54ef7cd19e9 Mon Sep 17 00:00:00 2001
From: Zhang Rui
Date: Wed, 10 Apr 2024 20:45:53 +0800
Subject: [PATCH 08/21] perf/x86/rapl: Add support for Intel Arrow Lake

Arrow Lake RAPL support is the same as previous Sky Lake. Add Arrow
Lake model for RAPL.
Signed-off-by: Zhang Rui Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240410124554.448987-1-rui.zhang@intel.com --- arch/x86/events/rapl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 8ef08b5d55a7..00bb6eacd1e9 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -806,6 +806,8 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ARROWLAKE_H, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ARROWLAKE, &model_skl), {}, }; MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); From acf68d98cae8a60dc4af2e9feaaa799bf0aa5c04 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 10 Apr 2024 20:45:54 +0800 Subject: [PATCH 09/21] perf/x86/rapl: Add support for Intel Lunar Lake Lunar Lake RAPL support is the same as previous Sky Lake. Add Lunar Lake model for RAPL. Signed-off-by: Zhang Rui Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240410124554.448987-2-rui.zhang@intel.com --- arch/x86/events/rapl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 00bb6eacd1e9..ca5f687fa420 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -808,6 +808,7 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(ARROWLAKE_H, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(ARROWLAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(LUNARLAKE_M, &model_skl), {}, }; MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); From 4c03fe11b96bda60610aca77002e83f37b4a2242 Mon Sep 17 00:00:00 2001 From: Kyle Huey Date: Thu, 11 Apr 2024 18:50:13 -0700 Subject: [PATCH 10/21] perf/bpf: Reorder bpf_overflow_handler() ahead of __perf_event_overflow() This will allow __perf_event_overflow() to call bpf_overflow_handler(). 
Signed-off-by: Kyle Huey Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240412015019.7060-2-khuey@kylehuey.com --- kernel/events/core.c | 183 ++++++++++++++++++++++--------------------- 1 file changed, 92 insertions(+), 91 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index fd94e45a9d86..ca0a90648fe6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9563,6 +9563,98 @@ static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *r return true; } +#ifdef CONFIG_BPF_SYSCALL +static void bpf_overflow_handler(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct bpf_perf_event_data_kern ctx = { + .data = data, + .event = event, + }; + struct bpf_prog *prog; + int ret = 0; + + ctx.regs = perf_arch_bpf_user_pt_regs(regs); + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) + goto out; + rcu_read_lock(); + prog = READ_ONCE(event->prog); + if (prog) { + perf_prepare_sample(data, event, regs); + ret = bpf_prog_run(prog, &ctx); + } + rcu_read_unlock(); +out: + __this_cpu_dec(bpf_prog_active); + if (!ret) + return; + + event->orig_overflow_handler(event, data, regs); +} + +static int perf_event_set_bpf_handler(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) +{ + if (event->overflow_handler_context) + /* hw breakpoint or kernel counter */ + return -EINVAL; + + if (event->prog) + return -EEXIST; + + if (prog->type != BPF_PROG_TYPE_PERF_EVENT) + return -EINVAL; + + if (event->attr.precise_ip && + prog->call_get_stack && + (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) || + event->attr.exclude_callchain_kernel || + event->attr.exclude_callchain_user)) { + /* + * On perf_event with precise_ip, calling bpf_get_stack() + * may trigger unwinder warnings and occasional crashes. + * bpf_get_[stack|stackid] works around this issue by using + * callchain attached to perf_sample_data. If the + * perf_event does not full (kernel and user) callchain + * attached to perf_sample_data, do not allow attaching BPF + * program that calls bpf_get_[stack|stackid]. + */ + return -EPROTO; + } + + event->prog = prog; + event->bpf_cookie = bpf_cookie; + event->orig_overflow_handler = READ_ONCE(event->overflow_handler); + WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); + return 0; +} + +static void perf_event_free_bpf_handler(struct perf_event *event) +{ + struct bpf_prog *prog = event->prog; + + if (!prog) + return; + + WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler); + event->prog = NULL; + bpf_prog_put(prog); +} +#else +static int perf_event_set_bpf_handler(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) +{ + return -EOPNOTSUPP; +} + +static void perf_event_free_bpf_handler(struct perf_event *event) +{ +} +#endif + /* * Generic event overflow handling, sampling. 
*/ @@ -10441,97 +10533,6 @@ static void perf_event_free_filter(struct perf_event *event) ftrace_profile_free_filter(event); } -#ifdef CONFIG_BPF_SYSCALL -static void bpf_overflow_handler(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct bpf_perf_event_data_kern ctx = { - .data = data, - .event = event, - }; - struct bpf_prog *prog; - int ret = 0; - - ctx.regs = perf_arch_bpf_user_pt_regs(regs); - if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) - goto out; - rcu_read_lock(); - prog = READ_ONCE(event->prog); - if (prog) { - perf_prepare_sample(data, event, regs); - ret = bpf_prog_run(prog, &ctx); - } - rcu_read_unlock(); -out: - __this_cpu_dec(bpf_prog_active); - if (!ret) - return; - - event->orig_overflow_handler(event, data, regs); -} - -static int perf_event_set_bpf_handler(struct perf_event *event, - struct bpf_prog *prog, - u64 bpf_cookie) -{ - if (event->overflow_handler_context) - /* hw breakpoint or kernel counter */ - return -EINVAL; - - if (event->prog) - return -EEXIST; - - if (prog->type != BPF_PROG_TYPE_PERF_EVENT) - return -EINVAL; - - if (event->attr.precise_ip && - prog->call_get_stack && - (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) || - event->attr.exclude_callchain_kernel || - event->attr.exclude_callchain_user)) { - /* - * On perf_event with precise_ip, calling bpf_get_stack() - * may trigger unwinder warnings and occasional crashes. - * bpf_get_[stack|stackid] works around this issue by using - * callchain attached to perf_sample_data. If the - * perf_event does not full (kernel and user) callchain - * attached to perf_sample_data, do not allow attaching BPF - * program that calls bpf_get_[stack|stackid]. - */ - return -EPROTO; - } - - event->prog = prog; - event->bpf_cookie = bpf_cookie; - event->orig_overflow_handler = READ_ONCE(event->overflow_handler); - WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); - return 0; -} - -static void perf_event_free_bpf_handler(struct perf_event *event) -{ - struct bpf_prog *prog = event->prog; - - if (!prog) - return; - - WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler); - event->prog = NULL; - bpf_prog_put(prog); -} -#else -static int perf_event_set_bpf_handler(struct perf_event *event, - struct bpf_prog *prog, - u64 bpf_cookie) -{ - return -EOPNOTSUPP; -} -static void perf_event_free_bpf_handler(struct perf_event *event) -{ -} -#endif - /* * returns true if the event is a tracepoint, or a kprobe/upprobe created * with perf_event_open() From 924d934393f98fa6a41d6ea27352faf79c2bbaf6 Mon Sep 17 00:00:00 2001 From: Kyle Huey Date: Thu, 11 Apr 2024 18:50:14 -0700 Subject: [PATCH 11/21] perf/bpf: Create bpf_overflow_handler() stub for !CONFIG_BPF_SYSCALL This will allow __perf_event_overflow() (which is independent of CONFIG_BPF_SYSCALL) to call bpf_overflow_handler(). 
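As a hedged illustration of the stub idiom (simplified, not the exact
kernel code): defining an empty !CONFIG_BPF_SYSCALL variant lets the
common caller stay free of #ifdefs, and the empty body compiles away
entirely:

	#ifdef CONFIG_BPF_SYSCALL
	static void bpf_overflow_handler(struct perf_event *event,
					 struct perf_sample_data *data,
					 struct pt_regs *regs)
	{
		/* real implementation: runs the attached BPF program */
	}
	#else
	static void bpf_overflow_handler(struct perf_event *event,
					 struct perf_sample_data *data,
					 struct pt_regs *regs)
	{
	}
	#endif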
Signed-off-by: Kyle Huey Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240412015019.7060-3-khuey@kylehuey.com --- kernel/events/core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index ca0a90648fe6..d3f3f552e193 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9643,6 +9643,12 @@ static void perf_event_free_bpf_handler(struct perf_event *event) bpf_prog_put(prog); } #else +static void bpf_overflow_handler(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ +} + static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) From 14e40a9578b70cc5323e55f61292a7e021f6037c Mon Sep 17 00:00:00 2001 From: Kyle Huey Date: Thu, 11 Apr 2024 18:50:15 -0700 Subject: [PATCH 12/21] perf/bpf: Remove #ifdef CONFIG_BPF_SYSCALL from struct perf_event members This will allow __perf_event_overflow() (which is independent of CONFIG_BPF_SYSCALL) to use struct perf_event's prog to decide whether to call bpf_overflow_handler(). Suggested-by: Ingo Molnar Signed-off-by: Kyle Huey Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240412015019.7060-4-khuey@kylehuey.com --- include/linux/perf_event.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3e33b366347a..50e01db083ee 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -809,11 +809,9 @@ struct perf_event { u64 (*clock)(void); perf_overflow_handler_t overflow_handler; void *overflow_handler_context; -#ifdef CONFIG_BPF_SYSCALL perf_overflow_handler_t orig_overflow_handler; struct bpf_prog *prog; u64 bpf_cookie; -#endif #ifdef CONFIG_EVENT_TRACING struct trace_event_call *tp_event; From f11f10bfa1ca23b32020b2073aa13131a27978fe Mon Sep 17 00:00:00 2001 From: Kyle Huey Date: Thu, 11 Apr 2024 18:50:16 -0700 Subject: [PATCH 13/21] perf/bpf: Call BPF handler directly, not through overflow machinery To ultimately allow BPF programs attached to perf events to completely suppress all of the effects of a perf event overflow (rather than just the sample output, as they do today), call bpf_overflow_handler() from __perf_event_overflow() directly rather than modifying struct perf_event's overflow_handler. Return the BPF program's return value from bpf_overflow_handler() so that __perf_event_overflow() knows how to proceed. Remove the now unnecessary orig_overflow_handler from struct perf_event. This patch is solely a refactoring and results in no behavior change. 
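For clarity, the condensed condition added to __perf_event_overflow()
by this patch is equivalent to the following unfolded form (bpf_ret is
an illustrative local, sketch only):

	int bpf_ret = 1;

	if (event->prog)
		bpf_ret = bpf_overflow_handler(event, data, regs);
	/* a BPF return value of 0 drops the sample output */
	if (bpf_ret)
		READ_ONCE(event->overflow_handler)(event, data, regs);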
Suggested-by: Namhyung Kim Signed-off-by: Kyle Huey Signed-off-by: Ingo Molnar Acked-by: Song Liu Acked-by: Jiri Olsa Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240412015019.7060-5-khuey@kylehuey.com --- include/linux/perf_event.h | 6 +----- kernel/events/core.c | 27 +++++++++++---------------- 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 50e01db083ee..2ce2fbc02ec6 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -809,7 +809,6 @@ struct perf_event { u64 (*clock)(void); perf_overflow_handler_t overflow_handler; void *overflow_handler_context; - perf_overflow_handler_t orig_overflow_handler; struct bpf_prog *prog; u64 bpf_cookie; @@ -1361,10 +1360,7 @@ __is_default_overflow_handler(perf_overflow_handler_t overflow_handler) #ifdef CONFIG_BPF_SYSCALL static inline bool uses_default_overflow_handler(struct perf_event *event) { - if (likely(is_default_overflow_handler(event))) - return true; - - return __is_default_overflow_handler(event->orig_overflow_handler); + return is_default_overflow_handler(event); } #else #define uses_default_overflow_handler(event) \ diff --git a/kernel/events/core.c b/kernel/events/core.c index d3f3f552e193..c6a6936183d5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9564,9 +9564,9 @@ static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *r } #ifdef CONFIG_BPF_SYSCALL -static void bpf_overflow_handler(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) +static int bpf_overflow_handler(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) { struct bpf_perf_event_data_kern ctx = { .data = data, @@ -9587,10 +9587,8 @@ static void bpf_overflow_handler(struct perf_event *event, rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); - if (!ret) - return; - event->orig_overflow_handler(event, data, regs); + return ret; } static int perf_event_set_bpf_handler(struct perf_event *event, @@ -9626,8 +9624,6 @@ static int perf_event_set_bpf_handler(struct perf_event *event, event->prog = prog; event->bpf_cookie = bpf_cookie; - event->orig_overflow_handler = READ_ONCE(event->overflow_handler); - WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); return 0; } @@ -9638,15 +9634,15 @@ static void perf_event_free_bpf_handler(struct perf_event *event) if (!prog) return; - WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler); event->prog = NULL; bpf_prog_put(prog); } #else -static void bpf_overflow_handler(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) +static int bpf_overflow_handler(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) { + return 1; } static int perf_event_set_bpf_handler(struct perf_event *event, @@ -9730,7 +9726,8 @@ static int __perf_event_overflow(struct perf_event *event, irq_work_queue(&event->pending_irq); } - READ_ONCE(event->overflow_handler)(event, data, regs); + if (!(event->prog && !bpf_overflow_handler(event, data, regs))) + READ_ONCE(event->overflow_handler)(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; @@ -11997,13 +11994,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) - if (overflow_handler == 
bpf_overflow_handler) { + if (parent_event->prog) { struct bpf_prog *prog = parent_event->prog; bpf_prog_inc(prog); event->prog = prog; - event->orig_overflow_handler = - parent_event->orig_overflow_handler; } #endif } From 76f6d58845829e5d6ef55532e67a323e7d30c26e Mon Sep 17 00:00:00 2001 From: Kyle Huey Date: Thu, 11 Apr 2024 18:50:17 -0700 Subject: [PATCH 14/21] perf/bpf: Remove unneeded uses_default_overflow_handler() Now that struct perf_event's orig_overflow_handler is gone, there's no need for the functions and macros to support looking past overflow_handler to orig_overflow_handler. This patch is solely a refactoring and results in no behavior change. Signed-off-by: Kyle Huey Signed-off-by: Ingo Molnar Acked-by: Will Deacon Acked-by: Song Liu Acked-by: Jiri Olsa Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240412015019.7060-6-khuey@kylehuey.com --- arch/arm/kernel/hw_breakpoint.c | 8 ++++---- arch/arm64/kernel/hw_breakpoint.c | 4 ++-- include/linux/perf_event.h | 17 +++-------------- 3 files changed, 9 insertions(+), 20 deletions(-) diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index dc0fb7a81371..054e9199f30d 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -626,7 +626,7 @@ int hw_breakpoint_arch_parse(struct perf_event *bp, hw->address &= ~alignment_mask; hw->ctrl.len <<= offset; - if (uses_default_overflow_handler(bp)) { + if (is_default_overflow_handler(bp)) { /* * Mismatch breakpoints are required for single-stepping * breakpoints. @@ -798,7 +798,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr, * Otherwise, insert a temporary mismatch breakpoint so that * we can single-step over the watchpoint trigger. */ - if (!uses_default_overflow_handler(wp)) + if (!is_default_overflow_handler(wp)) continue; step: enable_single_step(wp, instruction_pointer(regs)); @@ -811,7 +811,7 @@ step: info->trigger = addr; pr_debug("watchpoint fired: address = 0x%x\n", info->trigger); perf_bp_event(wp, regs); - if (uses_default_overflow_handler(wp)) + if (is_default_overflow_handler(wp)) enable_single_step(wp, instruction_pointer(regs)); } @@ -886,7 +886,7 @@ static void breakpoint_handler(unsigned long unknown, struct pt_regs *regs) info->trigger = addr; pr_debug("breakpoint fired: address = 0x%x\n", addr); perf_bp_event(bp, regs); - if (uses_default_overflow_handler(bp)) + if (is_default_overflow_handler(bp)) enable_single_step(bp, addr); goto unlock; } diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 2f5755192c2b..722ac45f9f7b 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -655,7 +655,7 @@ static int breakpoint_handler(unsigned long unused, unsigned long esr, perf_bp_event(bp, regs); /* Do we need to handle the stepping? 
*/ - if (uses_default_overflow_handler(bp)) + if (is_default_overflow_handler(bp)) step = 1; unlock: rcu_read_unlock(); @@ -734,7 +734,7 @@ static u64 get_distance_from_watchpoint(unsigned long addr, u64 val, static int watchpoint_report(struct perf_event *wp, unsigned long addr, struct pt_regs *regs) { - int step = uses_default_overflow_handler(wp); + int step = is_default_overflow_handler(wp); struct arch_hw_breakpoint *info = counter_arch_bp(wp); info->trigger = addr; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2ce2fbc02ec6..d5ff0c164875 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1345,8 +1345,10 @@ extern int perf_event_output(struct perf_event *event, struct pt_regs *regs); static inline bool -__is_default_overflow_handler(perf_overflow_handler_t overflow_handler) +is_default_overflow_handler(struct perf_event *event) { + perf_overflow_handler_t overflow_handler = event->overflow_handler; + if (likely(overflow_handler == perf_event_output_forward)) return true; if (unlikely(overflow_handler == perf_event_output_backward)) @@ -1354,19 +1356,6 @@ __is_default_overflow_handler(perf_overflow_handler_t overflow_handler) return false; } -#define is_default_overflow_handler(event) \ - __is_default_overflow_handler((event)->overflow_handler) - -#ifdef CONFIG_BPF_SYSCALL -static inline bool uses_default_overflow_handler(struct perf_event *event) -{ - return is_default_overflow_handler(event); -} -#else -#define uses_default_overflow_handler(event) \ - is_default_overflow_handler(event) -#endif - extern void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, From c4fcc7d1f41532e878087c7c43f4cf247604d68b Mon Sep 17 00:00:00 2001 From: Kyle Huey Date: Thu, 11 Apr 2024 18:50:18 -0700 Subject: [PATCH 15/21] perf/bpf: Allow a BPF program to suppress all sample side effects Returning zero from a BPF program attached to a perf event already suppresses any data output. Return early from __perf_event_overflow() in this case so it will also suppress event_limit accounting, SIGTRAP generation, and F_ASYNC signalling. 
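For reference, a minimal BPF program exercising this (a sketch along
the lines of the selftest added later in this series):

	SEC("perf_event")
	int handler(struct bpf_perf_event_data *data)
	{
		/* returning 0 now suppresses the sample output *and* the
		 * remaining side effects (event_limit accounting, SIGTRAP,
		 * FASYNC); returning 1 keeps the default behavior */
		return 0;
	}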
Signed-off-by: Kyle Huey Signed-off-by: Ingo Molnar Acked-by: Song Liu Acked-by: Jiri Olsa Acked-by: Namhyung Kim Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240412015019.7060-7-khuey@kylehuey.com --- kernel/events/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index c6a6936183d5..2212670cbe9b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9677,6 +9677,9 @@ static int __perf_event_overflow(struct perf_event *event, ret = __perf_event_account_interrupt(event, throttle); + if (event->prog && !bpf_overflow_handler(event, data, regs)) + return ret; + /* * XXX event_limit might not quite work as expected on inherited * events @@ -9726,8 +9729,7 @@ static int __perf_event_overflow(struct perf_event *event, irq_work_queue(&event->pending_irq); } - if (!(event->prog && !bpf_overflow_handler(event, data, regs))) - READ_ONCE(event->overflow_handler)(event, data, regs); + READ_ONCE(event->overflow_handler)(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; From a265c9f6d52ac760e6e572bac73a11b60b998779 Mon Sep 17 00:00:00 2001 From: Kyle Huey Date: Thu, 11 Apr 2024 18:50:19 -0700 Subject: [PATCH 16/21] selftest/bpf: Test a perf BPF program that suppresses side effects The test sets a hardware breakpoint and uses a BPF program to suppress the side effects of a perf event sample, including I/O availability signals, SIGTRAPs, and decrementing the event counter limit, if the IP matches the expected value. Then the function with the breakpoint is executed multiple times to test that all effects behave as expected. Signed-off-by: Kyle Huey Signed-off-by: Ingo Molnar Acked-by: Song Liu Acked-by: Jiri Olsa Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240412015019.7060-8-khuey@kylehuey.com --- .../selftests/bpf/prog_tests/perf_skip.c | 137 ++++++++++++++++++ .../selftests/bpf/progs/test_perf_skip.c | 15 ++ 2 files changed, 152 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/perf_skip.c create mode 100644 tools/testing/selftests/bpf/progs/test_perf_skip.c diff --git a/tools/testing/selftests/bpf/prog_tests/perf_skip.c b/tools/testing/selftests/bpf/prog_tests/perf_skip.c new file mode 100644 index 000000000000..37d8618800e4 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/perf_skip.c @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + +#include +#include "test_perf_skip.skel.h" +#include +#include +#include + +#ifndef TRAP_PERF +#define TRAP_PERF 6 +#endif + +int sigio_count, sigtrap_count; + +static void handle_sigio(int sig __always_unused) +{ + ++sigio_count; +} + +static void handle_sigtrap(int signum __always_unused, + siginfo_t *info, + void *ucontext __always_unused) +{ + ASSERT_EQ(info->si_code, TRAP_PERF, "si_code"); + ++sigtrap_count; +} + +static noinline int test_function(void) +{ + asm volatile (""); + return 0; +} + +void serial_test_perf_skip(void) +{ + struct sigaction action = {}; + struct sigaction previous_sigtrap; + sighandler_t previous_sigio = SIG_ERR; + struct test_perf_skip *skel = NULL; + struct perf_event_attr attr = {}; + int perf_fd = -1; + int err; + struct f_owner_ex owner; + struct bpf_link *prog_link = NULL; + + action.sa_flags = SA_SIGINFO | SA_NODEFER; + action.sa_sigaction = handle_sigtrap; + sigemptyset(&action.sa_mask); + if (!ASSERT_OK(sigaction(SIGTRAP, &action, &previous_sigtrap), "sigaction")) + return; + + previous_sigio = signal(SIGIO, 
handle_sigio); + if (!ASSERT_NEQ(previous_sigio, SIG_ERR, "signal")) + goto cleanup; + + skel = test_perf_skip__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_load")) + goto cleanup; + + attr.type = PERF_TYPE_BREAKPOINT; + attr.size = sizeof(attr); + attr.bp_type = HW_BREAKPOINT_X; + attr.bp_addr = (uintptr_t)test_function; + attr.bp_len = sizeof(long); + attr.sample_period = 1; + attr.sample_type = PERF_SAMPLE_IP; + attr.pinned = 1; + attr.exclude_kernel = 1; + attr.exclude_hv = 1; + attr.precise_ip = 3; + attr.sigtrap = 1; + attr.remove_on_exec = 1; + + perf_fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0); + if (perf_fd < 0 && (errno == ENOENT || errno == EOPNOTSUPP)) { + printf("SKIP:no PERF_TYPE_BREAKPOINT/HW_BREAKPOINT_X\n"); + test__skip(); + goto cleanup; + } + if (!ASSERT_OK(perf_fd < 0, "perf_event_open")) + goto cleanup; + + /* Configure the perf event to signal on sample. */ + err = fcntl(perf_fd, F_SETFL, O_ASYNC); + if (!ASSERT_OK(err, "fcntl(F_SETFL, O_ASYNC)")) + goto cleanup; + + owner.type = F_OWNER_TID; + owner.pid = syscall(__NR_gettid); + err = fcntl(perf_fd, F_SETOWN_EX, &owner); + if (!ASSERT_OK(err, "fcntl(F_SETOWN_EX)")) + goto cleanup; + + /* Allow at most one sample. A sample rejected by bpf should + * not count against this. + */ + err = ioctl(perf_fd, PERF_EVENT_IOC_REFRESH, 1); + if (!ASSERT_OK(err, "ioctl(PERF_EVENT_IOC_REFRESH)")) + goto cleanup; + + prog_link = bpf_program__attach_perf_event(skel->progs.handler, perf_fd); + if (!ASSERT_OK_PTR(prog_link, "bpf_program__attach_perf_event")) + goto cleanup; + + /* Configure the bpf program to suppress the sample. */ + skel->bss->ip = (uintptr_t)test_function; + test_function(); + + ASSERT_EQ(sigio_count, 0, "sigio_count"); + ASSERT_EQ(sigtrap_count, 0, "sigtrap_count"); + + /* Configure the bpf program to allow the sample. */ + skel->bss->ip = 0; + test_function(); + + ASSERT_EQ(sigio_count, 1, "sigio_count"); + ASSERT_EQ(sigtrap_count, 1, "sigtrap_count"); + + /* Test that the sample above is the only one allowed (by perf, not + * by bpf) + */ + test_function(); + + ASSERT_EQ(sigio_count, 1, "sigio_count"); + ASSERT_EQ(sigtrap_count, 1, "sigtrap_count"); + +cleanup: + bpf_link__destroy(prog_link); + if (perf_fd >= 0) + close(perf_fd); + test_perf_skip__destroy(skel); + + if (previous_sigio != SIG_ERR) + signal(SIGIO, previous_sigio); + sigaction(SIGTRAP, &previous_sigtrap, NULL); +} diff --git a/tools/testing/selftests/bpf/progs/test_perf_skip.c b/tools/testing/selftests/bpf/progs/test_perf_skip.c new file mode 100644 index 000000000000..7eb8b6de7a57 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_perf_skip.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include +#include + +uintptr_t ip; + +SEC("perf_event") +int handler(struct bpf_perf_event_data *data) +{ + /* Skip events that have the correct ip. */ + return ip != PT_REGS_IP(&data->regs); +} + +char _license[] SEC("license") = "GPL"; From 93d3fde7fd19c2e2cde7220e7986f9a75e9c5680 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 12 Apr 2024 11:55:00 +0200 Subject: [PATCH 17/21] perf/bpf: Change the !CONFIG_BPF_SYSCALL stubs to static inlines Otherwise the compiler will be unhappy if they go unused, which they do on allnoconfigs. 
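A hedged illustration of the compiler behavior being addressed
(standalone C, not kernel code): an unused plain "static" function
triggers -Wunused-function, while an unused "static inline" one is
silently discarded:

	static int foo(void) { return 0; }        /* warning if unused */
	static inline int bar(void) { return 0; } /* no warning */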
Signed-off-by: Ingo Molnar Cc: Kyle Huey Link: https://lore.kernel.org/r/ZhkE9F4dyfR2dH2D@gmail.com --- kernel/events/core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 2212670cbe9b..6708c1121b9f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9638,21 +9638,21 @@ static void perf_event_free_bpf_handler(struct perf_event *event) bpf_prog_put(prog); } #else -static int bpf_overflow_handler(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) +static inline int bpf_overflow_handler(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) { return 1; } -static int perf_event_set_bpf_handler(struct perf_event *event, - struct bpf_prog *prog, - u64 bpf_cookie) +static inline int perf_event_set_bpf_handler(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) { return -EOPNOTSUPP; } -static void perf_event_free_bpf_handler(struct perf_event *event) +static inline void perf_event_free_bpf_handler(struct perf_event *event) { } #endif From 4a013980666857c1eb2df6a2137817caa21d38a6 Mon Sep 17 00:00:00 2001 From: Kyle Huey Date: Sat, 13 Apr 2024 07:16:16 -0700 Subject: [PATCH 18/21] perf: Move perf_event_fasync() to perf_event.h This will allow it to be called from perf_output_wakeup(). Signed-off-by: Kyle Huey Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240413141618.4160-2-khuey@kylehuey.com --- include/linux/perf_event.h | 8 ++++++++ kernel/events/core.c | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d5ff0c164875..a5304ae8c654 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1686,6 +1686,14 @@ perf_event_addr_filters(struct perf_event *event) return ifh; } +static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) +{ + /* Only the parent has fasync state */ + if (event->parent) + event = event->parent; + return &event->fasync; +} + extern void perf_event_addr_filters_sync(struct perf_event *event); extern void perf_report_aux_output_id(struct perf_event *event, u64 hw_id); diff --git a/kernel/events/core.c b/kernel/events/core.c index 6708c1121b9f..da9d9a1f4dca 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6703,14 +6703,6 @@ static const struct file_operations perf_fops = { * to user-space before waking everybody up. */ -static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) -{ - /* only the parent has fasync state */ - if (event->parent) - event = event->parent; - return &event->fasync; -} - void perf_event_wakeup(struct perf_event *event) { ring_buffer_wakeup(event); From fd20bb51ed3913e0d25085eb79e8c0babfb4ee28 Mon Sep 17 00:00:00 2001 From: Kyle Huey Date: Sat, 13 Apr 2024 07:16:18 -0700 Subject: [PATCH 19/21] perf/ring_buffer: Trigger IO signals for watermark_wakeup perf_output_wakeup() already marks the perf event fd available for polling. Trigger IO signals with FASYNC too. 
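From the user side, a minimal sketch of opting in to those IO signals
(mirroring the fcntl() sequence used by the selftest added later in
this series):

	/* deliver SIGIO to this process whenever the ring buffer
	 * crosses the wakeup watermark */
	fcntl(perf_fd, F_SETFL, FASYNC);
	fcntl(perf_fd, F_SETOWN, getpid());
	fcntl(perf_fd, F_SETSIG, SIGIO);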
Signed-off-by: Kyle Huey Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240413141618.4160-3-khuey@kylehuey.com --- kernel/events/ring_buffer.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 60ed43d1c29e..4013408ce012 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -22,6 +22,10 @@ static void perf_output_wakeup(struct perf_output_handle *handle) atomic_set(&handle->rb->poll, EPOLLIN); handle->event->pending_wakeup = 1; + + if (*perf_event_fasync(handle->event) && !handle->event->pending_kill) + handle->event->pending_kill = POLL_IN; + irq_work_queue(&handle->event->pending_irq); } From e224d1c1fb93f258030186b4878abe105c296ac1 Mon Sep 17 00:00:00 2001 From: Kyle Huey Date: Sat, 13 Apr 2024 07:16:20 -0700 Subject: [PATCH 20/21] selftests/perf_events: Test FASYNC with watermark wakeups The test uses PERF_RECORD_SWITCH records to fill the ring buffer and trigger the watermark wakeup, which in turn should trigger an IO signal. Signed-off-by: Kyle Huey Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240413141618.4160-4-khuey@kylehuey.com --- .../testing/selftests/perf_events/.gitignore | 1 + tools/testing/selftests/perf_events/Makefile | 2 +- .../selftests/perf_events/watermark_signal.c | 146 ++++++++++++++++++ 3 files changed, 148 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/perf_events/watermark_signal.c diff --git a/tools/testing/selftests/perf_events/.gitignore b/tools/testing/selftests/perf_events/.gitignore index 790c47001e77..ee93dc4969b8 100644 --- a/tools/testing/selftests/perf_events/.gitignore +++ b/tools/testing/selftests/perf_events/.gitignore @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only sigtrap_threads remove_on_exec +watermark_signal diff --git a/tools/testing/selftests/perf_events/Makefile b/tools/testing/selftests/perf_events/Makefile index db93c4ff081a..70e3ff211278 100644 --- a/tools/testing/selftests/perf_events/Makefile +++ b/tools/testing/selftests/perf_events/Makefile @@ -2,5 +2,5 @@ CFLAGS += -Wl,-no-as-needed -Wall $(KHDR_INCLUDES) LDFLAGS += -lpthread -TEST_GEN_PROGS := sigtrap_threads remove_on_exec +TEST_GEN_PROGS := sigtrap_threads remove_on_exec watermark_signal include ../lib.mk diff --git a/tools/testing/selftests/perf_events/watermark_signal.c b/tools/testing/selftests/perf_events/watermark_signal.c new file mode 100644 index 000000000000..49dc1e831174 --- /dev/null +++ b/tools/testing/selftests/perf_events/watermark_signal.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +#define __maybe_unused __attribute__((__unused__)) + +static int sigio_count; + +static void handle_sigio(int signum __maybe_unused, + siginfo_t *oh __maybe_unused, + void *uc __maybe_unused) +{ + ++sigio_count; +} + +static void do_child(void) +{ + raise(SIGSTOP); + + for (int i = 0; i < 20; ++i) + sleep(1); + + raise(SIGSTOP); + + exit(0); +} + +TEST(watermark_signal) +{ + struct perf_event_attr attr; + struct perf_event_mmap_page *p = NULL; + struct sigaction previous_sigio, sigio = { 0 }; + pid_t child = -1; + int child_status; + int fd = -1; + long page_size = sysconf(_SC_PAGE_SIZE); + + sigio.sa_sigaction = handle_sigio; + EXPECT_EQ(sigaction(SIGIO, &sigio, &previous_sigio), 0); + + memset(&attr, 0, sizeof(attr)); + attr.size = 
sizeof(attr); + attr.type = PERF_TYPE_SOFTWARE; + attr.config = PERF_COUNT_SW_DUMMY; + attr.sample_period = 1; + attr.disabled = 1; + attr.watermark = 1; + attr.context_switch = 1; + attr.wakeup_watermark = 1; + + child = fork(); + EXPECT_GE(child, 0); + if (child == 0) + do_child(); + else if (child < 0) { + perror("fork()"); + goto cleanup; + } + + if (waitpid(child, &child_status, WSTOPPED) != child || + !(WIFSTOPPED(child_status) && WSTOPSIG(child_status) == SIGSTOP)) { + fprintf(stderr, + "failed to sycnhronize with child errno=%d status=%x\n", + errno, + child_status); + goto cleanup; + } + + fd = syscall(__NR_perf_event_open, &attr, child, -1, -1, + PERF_FLAG_FD_CLOEXEC); + if (fd < 0) { + fprintf(stderr, "failed opening event %llx\n", attr.config); + goto cleanup; + } + + if (fcntl(fd, F_SETFL, FASYNC)) { + perror("F_SETFL FASYNC"); + goto cleanup; + } + + if (fcntl(fd, F_SETOWN, getpid())) { + perror("F_SETOWN getpid()"); + goto cleanup; + } + + if (fcntl(fd, F_SETSIG, SIGIO)) { + perror("F_SETSIG SIGIO"); + goto cleanup; + } + + p = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (p == NULL) { + perror("mmap"); + goto cleanup; + } + + if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0)) { + perror("PERF_EVENT_IOC_ENABLE"); + goto cleanup; + } + + if (kill(child, SIGCONT) < 0) { + perror("SIGCONT"); + goto cleanup; + } + + if (waitpid(child, &child_status, WSTOPPED) != -1 || errno != EINTR) + fprintf(stderr, + "expected SIGIO to terminate wait errno=%d status=%x\n%d", + errno, + child_status, + sigio_count); + + EXPECT_GE(sigio_count, 1); + +cleanup: + if (p != NULL) + munmap(p, 2 * page_size); + + if (fd >= 0) + close(fd); + + if (child > 0) { + kill(child, SIGKILL); + waitpid(child, NULL, 0); + } + + sigaction(SIGIO, &previous_sigio, NULL); +} + +TEST_HARNESS_MAIN From 854dd99b5ddc9d90e31e5f112462a5994dd31810 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Apr 2024 22:33:27 +0200 Subject: [PATCH 21/21] perf/bpf: Mark perf_event_set_bpf_handler() and perf_event_free_bpf_handler() as inline too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit They can be unused with certain Kconfig variations: kernel/events/core.c:9622:13: warning: ‘perf_event_free_bpf_handler’ defined but not used [-Wunused-function] kernel/events/core.c:9586:12: warning: ‘perf_event_set_bpf_handler’ defined but not used [-Wunused-function] Since they are both single-use, mark them inline. Signed-off-by: Ingo Molnar Cc: linux-kernel@vger.kernel.org Cc: Kyle Huey --- kernel/events/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index da9d9a1f4dca..6b0a66ed2ae3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9583,9 +9583,9 @@ out: return ret; } -static int perf_event_set_bpf_handler(struct perf_event *event, - struct bpf_prog *prog, - u64 bpf_cookie) +static inline int perf_event_set_bpf_handler(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) { if (event->overflow_handler_context) /* hw breakpoint or kernel counter */ @@ -9619,7 +9619,7 @@ static int perf_event_set_bpf_handler(struct perf_event *event, return 0; } -static void perf_event_free_bpf_handler(struct perf_event *event) +static inline void perf_event_free_bpf_handler(struct perf_event *event) { struct bpf_prog *prog = event->prog;