From 32ba156df1b1c8804a4e5be5339616945eafea22 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 16 Aug 2022 05:56:11 -0700 Subject: [PATCH 1/5] perf/x86/lbr: Enable the branch type for the Arch LBR by default On the platform with Arch LBR, the HW raw branch type encoding may leak to the perf tool when the SAVE_TYPE option is not set. In the intel_pmu_store_lbr(), the HW raw branch type is stored in lbr_entries[].type. If the SAVE_TYPE option is set, the lbr_entries[].type will be converted into the generic PERF_BR_* type in the intel_pmu_lbr_filter() and exposed to the user tools. But if the SAVE_TYPE option is NOT set by the user, the current perf kernel doesn't clear the field. The HW raw branch type leaks. There are two solutions to fix the issue for the Arch LBR. One is to clear the field if the SAVE_TYPE option is NOT set. The other solution is to unconditionally convert the branch type and expose the generic type to the user tools. The latter is implemented here, because - The branch type is valuable information. I don't see a case where you would not benefit from the branch type. (Stephane Eranian) - Not having the branch type DOES NOT save any space in the branch record (Stephane Eranian) - The Arch LBR HW can retrieve the common branch types from the LBR_INFO. It doesn't require the high overhead SW disassemble. Fixes: 47125db27e47 ("perf/x86/intel/lbr: Support Architectural LBR") Reported-by: Stephane Eranian Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20220816125612.2042397-1-kan.liang@linux.intel.com --- arch/x86/events/intel/lbr.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 4f70fb6c2c1e..47fca6a7a8bc 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1097,6 +1097,14 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) if (static_cpu_has(X86_FEATURE_ARCH_LBR)) { reg->config = mask; + + /* + * The Arch LBR HW can retrieve the common branch types + * from the LBR_INFO. It doesn't require the high overhead + * SW disassemble. + * Enable the branch type by default for the Arch LBR. + */ + reg->reg |= X86_BR_TYPE_SAVE; return 0; } From 7d3598868aaee05eb738d1c3115616b867e7530a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 16 Aug 2022 19:40:57 +0800 Subject: [PATCH 2/5] perf/x86/core: Set pebs_capable and PMU_FL_PEBS_ALL for the Baseline The SDM explicitly states that PEBS Baseline implies Extended PEBS. For cpu model forward compatibility (e.g. on ICX, SPR, ADL), it's safe to stop doing FMS table thing such as setting pebs_capable and PMU_FL_PEBS_ALL since it's already set in the intel_ds_init(). The Goldmont Plus is the only platform which supports extended PEBS but doesn't have Baseline. Keep the status quo. Reported-by: Like Xu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lkml.kernel.org/r/20220816114057.51307-1-likexu@tencent.com --- arch/x86/events/intel/core.c | 4 ---- arch/x86/events/intel/ds.c | 1 + 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 2db93498ff71..cb98a05ee743 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6291,10 +6291,8 @@ __init int intel_pmu_init(void) x86_pmu.pebs_aliases = NULL; x86_pmu.pebs_prec_dist = true; x86_pmu.pebs_block = true; - x86_pmu.pebs_capable = ~0ULL; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - x86_pmu.flags |= PMU_FL_PEBS_ALL; x86_pmu.flags |= PMU_FL_INSTR_LATENCY; x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; @@ -6337,10 +6335,8 @@ __init int intel_pmu_init(void) x86_pmu.pebs_aliases = NULL; x86_pmu.pebs_prec_dist = true; x86_pmu.pebs_block = true; - x86_pmu.pebs_capable = ~0ULL; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - x86_pmu.flags |= PMU_FL_PEBS_ALL; x86_pmu.flags |= PMU_FL_INSTR_LATENCY; x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; x86_pmu.lbr_pt_coexist = true; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index ba60427caa6d..ac6dd4c96dbc 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2262,6 +2262,7 @@ void __init intel_ds_init(void) PERF_SAMPLE_BRANCH_STACK | PERF_SAMPLE_TIME; x86_pmu.flags |= PMU_FL_PEBS_ALL; + x86_pmu.pebs_capable = ~0ULL; pebs_qual = "-baseline"; x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_EXTENDED_REGS; } else { From d4bdb0bebc5ba3299d74f123c782d99cd4e25c49 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 17 Aug 2022 22:46:13 -0700 Subject: [PATCH 3/5] perf/x86/intel/ds: Fix precise store latency handling With the existing code in store_latency_data(), the memory operation (mem_op) returned to the user is always OP_LOAD where in fact, it should be OP_STORE. This comes from the fact that the function is simply grabbing the information from a data source map which covers only load accesses. Intel 12th gen CPU offers precise store sampling that captures both the data source and latency. Therefore it can use the data source mapping table but must override the memory operation to reflect stores instead of loads. Fixes: 61b985e3e775 ("perf/x86/intel: Add perf core PMU support for Sapphire Rapids") Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220818054613.1548130-1-eranian@google.com --- arch/x86/events/intel/ds.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index ac6dd4c96dbc..e5b587499122 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -291,6 +291,7 @@ static u64 load_latency_data(struct perf_event *event, u64 status) static u64 store_latency_data(struct perf_event *event, u64 status) { union intel_x86_pebs_dse dse; + union perf_mem_data_src src; u64 val; dse.val = status; @@ -304,7 +305,14 @@ static u64 store_latency_data(struct perf_event *event, u64 status) val |= P(BLK, NA); - return val; + /* + * the pebs_data_source table is only for loads + * so override the mem_op to say STORE instead + */ + src.val = val; + src.mem_op = P(OP,STORE); + + return src.val; } struct pebs_record_core { From cde643ff75bc20c538dfae787ca3b587bab16b50 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 18 Aug 2022 11:44:29 -0700 Subject: [PATCH 4/5] perf/x86/intel: Fix pebs event constraints for ADL According to the latest event list, the LOAD_LATENCY PEBS event only works on the GP counter 0 and 1 for ADL and RPL. Update the pebs event constraints table. Fixes: f83d2f91d259 ("perf/x86/intel: Add Alder Lake Hybrid support") Reported-by: Ammy Yi Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20220818184429.2355857-1-kan.liang@linux.intel.com --- arch/x86/events/intel/ds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index e5b587499122..de1f55d51784 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -830,7 +830,7 @@ struct event_constraint intel_glm_pebs_event_constraints[] = { struct event_constraint intel_grt_pebs_event_constraints[] = { /* Allow all events as PEBS with no flags */ - INTEL_HYBRID_LAT_CONSTRAINT(0x5d0, 0xf), + INTEL_HYBRID_LAT_CONSTRAINT(0x5d0, 0x3), INTEL_HYBRID_LAT_CONSTRAINT(0x6d0, 0xf), EVENT_CONSTRAINT_END }; From 11745ecfe8fea4b4a4c322967a7605d2ecbd5080 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 3 Aug 2022 09:00:31 -0700 Subject: [PATCH 5/5] perf/x86/intel/uncore: Fix broken read_counter() for SNB IMC PMU Existing code was generating bogus counts for the SNB IMC bandwidth counters: $ perf stat -a -I 1000 -e uncore_imc/data_reads/,uncore_imc/data_writes/ 1.000327813 1,024.03 MiB uncore_imc/data_reads/ 1.000327813 20.73 MiB uncore_imc/data_writes/ 2.000580153 261,120.00 MiB uncore_imc/data_reads/ 2.000580153 23.28 MiB uncore_imc/data_writes/ The problem was introduced by commit: 07ce734dd8ad ("perf/x86/intel/uncore: Clean up client IMC") Where the read_counter callback was replace to point to the generic uncore_mmio_read_counter() function. The SNB IMC counters are freerunnig 32-bit counters laid out contiguously in MMIO. But uncore_mmio_read_counter() is using a readq() call to read from MMIO therefore reading 64-bit from MMIO. Although this is okay for the uncore_perf_event_update() function because it is shifting the value based on the actual counter width to compute a delta, it is not okay for the uncore_pmu_event_start() which is simply reading the counter and therefore priming the event->prev_count with a bogus value which is responsible for causing bogus deltas in the perf stat command above. The fix is to reintroduce the custom callback for read_counter for the SNB IMC PMU and use readl() instead of readq(). With the change the output of perf stat is back to normal: $ perf stat -a -I 1000 -e uncore_imc/data_reads/,uncore_imc/data_writes/ 1.000120987 296.94 MiB uncore_imc/data_reads/ 1.000120987 138.42 MiB uncore_imc/data_writes/ 2.000403144 175.91 MiB uncore_imc/data_reads/ 2.000403144 68.50 MiB uncore_imc/data_writes/ Fixes: 07ce734dd8ad ("perf/x86/intel/uncore: Clean up client IMC") Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20220803160031.1379788-1-eranian@google.com --- arch/x86/events/intel/uncore_snb.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index ce440011cc4e..1ef4f7861e2e 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -841,6 +841,22 @@ int snb_pci2phy_map_init(int devid) return 0; } +static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + /* + * SNB IMC counters are 32-bit and are laid out back to back + * in MMIO space. Therefore we must use a 32-bit accessor function + * using readq() from uncore_mmio_read_counter() causes problems + * because it is reading 64-bit at a time. This is okay for the + * uncore_perf_event_update() function because it drops the upper + * 32-bits but not okay for plain uncore_read_counter() as invoked + * in uncore_pmu_event_start(). + */ + return (u64)readl(box->io_addr + hwc->event_base); +} + static struct pmu snb_uncore_imc_pmu = { .task_ctx_nr = perf_invalid_context, .event_init = snb_uncore_imc_event_init, @@ -860,7 +876,7 @@ static struct intel_uncore_ops snb_uncore_imc_ops = { .disable_event = snb_uncore_imc_disable_event, .enable_event = snb_uncore_imc_enable_event, .hw_config = snb_uncore_imc_hw_config, - .read_counter = uncore_mmio_read_counter, + .read_counter = snb_uncore_imc_read_counter, }; static struct intel_uncore_type snb_uncore_imc = {