Perf events updates for this cycle are:
- Fix Intel Alder Lake PEBS memory access latency & data source profiling info bugs. - Use Intel large-PEBS hardware feature in more circumstances, to reduce PMI overhead & reduce sampling data. - Extend the lost-sample profiling output with the PERF_FORMAT_LOST ABI variant, which tells tooling the exact number of samples lost. - Add new IBS register bits definitions. - AMD uncore events: Add PerfMonV2 DF (Data Fabric) enhancements. Signed-off-by: Ingo Molnar <mingo@kernel.org> -----BEGIN PGP SIGNATURE----- iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAmLn5MARHG1pbmdvQGtl cm5lbC5vcmcACgkQEnMQ0APhK1jAWA/+N48UX35dD0u3k5S2zdYJRHzQkdbivVGc dOuCB3XTJYneaSI5byQkI4Xo8LUuMbF4q2Zi3/+XhTaqn2zYPP65D6ACL5hU9Shh F95TnLWbedIaxSJmjMCsWDlwBob8WgtLhokWvyq+ks66BqaDoBKHRtn+2fi0rwZb MbuN0199Gx/EicWEOeUGBSxoeKbjSix0BApqy+CuXC0DC3+3iwIPk4dbNfHXpHYs nqxjQKhJnoxdlgjiOY3UuYhdCZl1cuQFIu2Ce1N2nXCAgR2FeQD7ZqtcaA2TnsAO 9BwRfLljavzHhOoz0zALR42kF+eOcnH5K9pIBx7py9Hjdmdsx88fUCovWK34MdG5 KTuqiMWNLIUvoP9WBjl7wUtl2+vcjr9XwgCdneOO+zoNsk44qSRyer1RpEP6D9UM e9HvdXBVRzhnIhK9NYugeLJ+3nxvFL+OLvc3ZfUrtm04UzeetCBxMlvMv3y021V7 0fInZjhzh4Dz2tJgNlG7AKXkXlsHlyj6/BH9uKc9wVokK+94g5mbspxW8R4gKPr2 l06pYB7ttSpp26sq9vl5ASHO0rniiYAPsQcr7Ko3y72mmp6kfIe/HzYNhCEvgYe2 6JJ8F9kPgRuKr0CwGvUzxFwBC7PJR80zUtZkRCIpV+rgxQcNmK5YXp/KQFIjQqkI rJfEaDOshl0= =DqaA -----END PGP SIGNATURE----- Merge tag 'perf-core-2022-08-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip Pull perf events updates from Ingo Molnar: - Fix Intel Alder Lake PEBS memory access latency & data source profiling info bugs. - Use Intel large-PEBS hardware feature in more circumstances, to reduce PMI overhead & reduce sampling data. - Extend the lost-sample profiling output with the PERF_FORMAT_LOST ABI variant, which tells tooling the exact number of samples lost. - Add new IBS register bits definitions. - AMD uncore events: Add PerfMonV2 DF (Data Fabric) enhancements. 
* tag 'perf-core-2022-08-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: perf/x86/ibs: Add new IBS register bits into header perf/x86/intel: Fix PEBS data source encoding for ADL perf/x86/intel: Fix PEBS memory access info encoding for ADL perf/core: Add a new read format to get a number of lost samples perf/x86/amd/uncore: Add PerfMonV2 RDPMC assignments perf/x86/amd/uncore: Add PerfMonV2 DF event format perf/x86/amd/uncore: Detect available DF counters perf/x86/amd/uncore: Use attr_update for format attributes perf/x86/amd/uncore: Use dynamic events array x86/events/intel/ds: Enable large PEBS for PERF_SAMPLE_WEIGHT_TYPE
This commit is contained in:
commit
63e6053add
@ -21,7 +21,6 @@
|
||||
#define NUM_COUNTERS_NB 4
|
||||
#define NUM_COUNTERS_L2 4
|
||||
#define NUM_COUNTERS_L3 6
|
||||
#define MAX_COUNTERS 6
|
||||
|
||||
#define RDPMC_BASE_NB 6
|
||||
#define RDPMC_BASE_LLC 10
|
||||
@ -31,6 +30,7 @@
|
||||
#undef pr_fmt
|
||||
#define pr_fmt(fmt) "amd_uncore: " fmt
|
||||
|
||||
static int pmu_version;
|
||||
static int num_counters_llc;
|
||||
static int num_counters_nb;
|
||||
static bool l3_mask;
|
||||
@ -46,7 +46,7 @@ struct amd_uncore {
|
||||
u32 msr_base;
|
||||
cpumask_t *active_mask;
|
||||
struct pmu *pmu;
|
||||
struct perf_event *events[MAX_COUNTERS];
|
||||
struct perf_event **events;
|
||||
struct hlist_node node;
|
||||
};
|
||||
|
||||
@ -158,6 +158,16 @@ out:
|
||||
hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx;
|
||||
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
|
||||
|
||||
/*
|
||||
* The first four DF counters are accessible via RDPMC index 6 to 9
|
||||
* followed by the L3 counters from index 10 to 15. For processors
|
||||
* with more than four DF counters, the DF RDPMC assignments become
|
||||
* discontiguous as the additional counters are accessible starting
|
||||
* from index 16.
|
||||
*/
|
||||
if (is_nb_event(event) && hwc->idx >= NUM_COUNTERS_NB)
|
||||
hwc->event_base_rdpmc += NUM_COUNTERS_L3;
|
||||
|
||||
if (flags & PERF_EF_START)
|
||||
amd_uncore_start(event, PERF_EF_RELOAD);
|
||||
|
||||
@ -209,10 +219,14 @@ static int amd_uncore_event_init(struct perf_event *event)
|
||||
{
|
||||
struct amd_uncore *uncore;
|
||||
struct hw_perf_event *hwc = &event->hw;
|
||||
u64 event_mask = AMD64_RAW_EVENT_MASK_NB;
|
||||
|
||||
if (event->attr.type != event->pmu->type)
|
||||
return -ENOENT;
|
||||
|
||||
if (pmu_version >= 2 && is_nb_event(event))
|
||||
event_mask = AMD64_PERFMON_V2_RAW_EVENT_MASK_NB;
|
||||
|
||||
/*
|
||||
* NB and Last level cache counters (MSRs) are shared across all cores
|
||||
* that share the same NB / Last level cache. On family 16h and below,
|
||||
@ -221,7 +235,7 @@ static int amd_uncore_event_init(struct perf_event *event)
|
||||
* out. So we do not support sampling and per-thread events via
|
||||
* CAP_NO_INTERRUPT, and we do not enable counter overflow interrupts:
|
||||
*/
|
||||
hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB;
|
||||
hwc->config = event->attr.config & event_mask;
|
||||
hwc->idx = -1;
|
||||
|
||||
if (event->cpu < 0)
|
||||
@ -247,6 +261,19 @@ static int amd_uncore_event_init(struct perf_event *event)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static umode_t
|
||||
amd_f17h_uncore_is_visible(struct kobject *kobj, struct attribute *attr, int i)
|
||||
{
|
||||
return boot_cpu_data.x86 >= 0x17 && boot_cpu_data.x86 < 0x19 ?
|
||||
attr->mode : 0;
|
||||
}
|
||||
|
||||
static umode_t
|
||||
amd_f19h_uncore_is_visible(struct kobject *kobj, struct attribute *attr, int i)
|
||||
{
|
||||
return boot_cpu_data.x86 >= 0x19 ? attr->mode : 0;
|
||||
}
|
||||
|
||||
static ssize_t amd_uncore_attr_show_cpumask(struct device *dev,
|
||||
struct device_attribute *attr,
|
||||
char *buf)
|
||||
@ -287,8 +314,10 @@ static struct device_attribute format_attr_##_var = \
|
||||
|
||||
DEFINE_UNCORE_FORMAT_ATTR(event12, event, "config:0-7,32-35");
|
||||
DEFINE_UNCORE_FORMAT_ATTR(event14, event, "config:0-7,32-35,59-60"); /* F17h+ DF */
|
||||
DEFINE_UNCORE_FORMAT_ATTR(event14v2, event, "config:0-7,32-37"); /* PerfMonV2 DF */
|
||||
DEFINE_UNCORE_FORMAT_ATTR(event8, event, "config:0-7"); /* F17h+ L3 */
|
||||
DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
|
||||
DEFINE_UNCORE_FORMAT_ATTR(umask8, umask, "config:8-15");
|
||||
DEFINE_UNCORE_FORMAT_ATTR(umask12, umask, "config:8-15,24-27"); /* PerfMonV2 DF */
|
||||
DEFINE_UNCORE_FORMAT_ATTR(coreid, coreid, "config:42-44"); /* F19h L3 */
|
||||
DEFINE_UNCORE_FORMAT_ATTR(slicemask, slicemask, "config:48-51"); /* F17h L3 */
|
||||
DEFINE_UNCORE_FORMAT_ATTR(threadmask8, threadmask, "config:56-63"); /* F17h L3 */
|
||||
@ -297,20 +326,33 @@ DEFINE_UNCORE_FORMAT_ATTR(enallslices, enallslices, "config:46"); /* F19h L3
|
||||
DEFINE_UNCORE_FORMAT_ATTR(enallcores, enallcores, "config:47"); /* F19h L3 */
|
||||
DEFINE_UNCORE_FORMAT_ATTR(sliceid, sliceid, "config:48-50"); /* F19h L3 */
|
||||
|
||||
/* Common DF and NB attributes */
|
||||
static struct attribute *amd_uncore_df_format_attr[] = {
|
||||
&format_attr_event12.attr, /* event14 if F17h+ */
|
||||
&format_attr_umask.attr,
|
||||
&format_attr_event12.attr, /* event */
|
||||
&format_attr_umask8.attr, /* umask */
|
||||
NULL,
|
||||
};
|
||||
|
||||
/* Common L2 and L3 attributes */
|
||||
static struct attribute *amd_uncore_l3_format_attr[] = {
|
||||
&format_attr_event12.attr, /* event8 if F17h+ */
|
||||
&format_attr_umask.attr,
|
||||
NULL, /* slicemask if F17h, coreid if F19h */
|
||||
NULL, /* threadmask8 if F17h, enallslices if F19h */
|
||||
NULL, /* enallcores if F19h */
|
||||
NULL, /* sliceid if F19h */
|
||||
NULL, /* threadmask2 if F19h */
|
||||
&format_attr_event12.attr, /* event */
|
||||
&format_attr_umask8.attr, /* umask */
|
||||
NULL, /* threadmask */
|
||||
NULL,
|
||||
};
|
||||
|
||||
/* F17h unique L3 attributes */
|
||||
static struct attribute *amd_f17h_uncore_l3_format_attr[] = {
|
||||
&format_attr_slicemask.attr, /* slicemask */
|
||||
NULL,
|
||||
};
|
||||
|
||||
/* F19h unique L3 attributes */
|
||||
static struct attribute *amd_f19h_uncore_l3_format_attr[] = {
|
||||
&format_attr_coreid.attr, /* coreid */
|
||||
&format_attr_enallslices.attr, /* enallslices */
|
||||
&format_attr_enallcores.attr, /* enallcores */
|
||||
&format_attr_sliceid.attr, /* sliceid */
|
||||
NULL,
|
||||
};
|
||||
|
||||
@ -324,6 +366,18 @@ static struct attribute_group amd_uncore_l3_format_group = {
|
||||
.attrs = amd_uncore_l3_format_attr,
|
||||
};
|
||||
|
||||
static struct attribute_group amd_f17h_uncore_l3_format_group = {
|
||||
.name = "format",
|
||||
.attrs = amd_f17h_uncore_l3_format_attr,
|
||||
.is_visible = amd_f17h_uncore_is_visible,
|
||||
};
|
||||
|
||||
static struct attribute_group amd_f19h_uncore_l3_format_group = {
|
||||
.name = "format",
|
||||
.attrs = amd_f19h_uncore_l3_format_attr,
|
||||
.is_visible = amd_f19h_uncore_is_visible,
|
||||
};
|
||||
|
||||
static const struct attribute_group *amd_uncore_df_attr_groups[] = {
|
||||
&amd_uncore_attr_group,
|
||||
&amd_uncore_df_format_group,
|
||||
@ -336,6 +390,12 @@ static const struct attribute_group *amd_uncore_l3_attr_groups[] = {
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const struct attribute_group *amd_uncore_l3_attr_update[] = {
|
||||
&amd_f17h_uncore_l3_format_group,
|
||||
&amd_f19h_uncore_l3_format_group,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static struct pmu amd_nb_pmu = {
|
||||
.task_ctx_nr = perf_invalid_context,
|
||||
.attr_groups = amd_uncore_df_attr_groups,
|
||||
@ -353,6 +413,7 @@ static struct pmu amd_nb_pmu = {
|
||||
static struct pmu amd_llc_pmu = {
|
||||
.task_ctx_nr = perf_invalid_context,
|
||||
.attr_groups = amd_uncore_l3_attr_groups,
|
||||
.attr_update = amd_uncore_l3_attr_update,
|
||||
.name = "amd_l2",
|
||||
.event_init = amd_uncore_event_init,
|
||||
.add = amd_uncore_add,
|
||||
@ -370,11 +431,19 @@ static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
|
||||
cpu_to_node(cpu));
|
||||
}
|
||||
|
||||
static inline struct perf_event **
|
||||
amd_uncore_events_alloc(unsigned int num, unsigned int cpu)
|
||||
{
|
||||
return kzalloc_node(sizeof(struct perf_event *) * num, GFP_KERNEL,
|
||||
cpu_to_node(cpu));
|
||||
}
|
||||
|
||||
static int amd_uncore_cpu_up_prepare(unsigned int cpu)
|
||||
{
|
||||
struct amd_uncore *uncore_nb = NULL, *uncore_llc;
|
||||
struct amd_uncore *uncore_nb = NULL, *uncore_llc = NULL;
|
||||
|
||||
if (amd_uncore_nb) {
|
||||
*per_cpu_ptr(amd_uncore_nb, cpu) = NULL;
|
||||
uncore_nb = amd_uncore_alloc(cpu);
|
||||
if (!uncore_nb)
|
||||
goto fail;
|
||||
@ -384,11 +453,15 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu)
|
||||
uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL;
|
||||
uncore_nb->active_mask = &amd_nb_active_mask;
|
||||
uncore_nb->pmu = &amd_nb_pmu;
|
||||
uncore_nb->events = amd_uncore_events_alloc(num_counters_nb, cpu);
|
||||
if (!uncore_nb->events)
|
||||
goto fail;
|
||||
uncore_nb->id = -1;
|
||||
*per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb;
|
||||
}
|
||||
|
||||
if (amd_uncore_llc) {
|
||||
*per_cpu_ptr(amd_uncore_llc, cpu) = NULL;
|
||||
uncore_llc = amd_uncore_alloc(cpu);
|
||||
if (!uncore_llc)
|
||||
goto fail;
|
||||
@ -398,6 +471,9 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu)
|
||||
uncore_llc->msr_base = MSR_F16H_L2I_PERF_CTL;
|
||||
uncore_llc->active_mask = &amd_llc_active_mask;
|
||||
uncore_llc->pmu = &amd_llc_pmu;
|
||||
uncore_llc->events = amd_uncore_events_alloc(num_counters_llc, cpu);
|
||||
if (!uncore_llc->events)
|
||||
goto fail;
|
||||
uncore_llc->id = -1;
|
||||
*per_cpu_ptr(amd_uncore_llc, cpu) = uncore_llc;
|
||||
}
|
||||
@ -405,9 +481,16 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu)
|
||||
return 0;
|
||||
|
||||
fail:
|
||||
if (amd_uncore_nb)
|
||||
*per_cpu_ptr(amd_uncore_nb, cpu) = NULL;
|
||||
kfree(uncore_nb);
|
||||
if (uncore_nb) {
|
||||
kfree(uncore_nb->events);
|
||||
kfree(uncore_nb);
|
||||
}
|
||||
|
||||
if (uncore_llc) {
|
||||
kfree(uncore_llc->events);
|
||||
kfree(uncore_llc);
|
||||
}
|
||||
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
@ -540,8 +623,11 @@ static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores)
|
||||
if (cpu == uncore->cpu)
|
||||
cpumask_clear_cpu(cpu, uncore->active_mask);
|
||||
|
||||
if (!--uncore->refcnt)
|
||||
if (!--uncore->refcnt) {
|
||||
kfree(uncore->events);
|
||||
kfree(uncore);
|
||||
}
|
||||
|
||||
*per_cpu_ptr(uncores, cpu) = NULL;
|
||||
}
|
||||
|
||||
@ -560,6 +646,7 @@ static int __init amd_uncore_init(void)
|
||||
{
|
||||
struct attribute **df_attr = amd_uncore_df_format_attr;
|
||||
struct attribute **l3_attr = amd_uncore_l3_format_attr;
|
||||
union cpuid_0x80000022_ebx ebx;
|
||||
int ret = -ENODEV;
|
||||
|
||||
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
|
||||
@ -569,6 +656,9 @@ static int __init amd_uncore_init(void)
|
||||
if (!boot_cpu_has(X86_FEATURE_TOPOEXT))
|
||||
return -ENODEV;
|
||||
|
||||
if (boot_cpu_has(X86_FEATURE_PERFMON_V2))
|
||||
pmu_version = 2;
|
||||
|
||||
num_counters_nb = NUM_COUNTERS_NB;
|
||||
num_counters_llc = NUM_COUNTERS_L2;
|
||||
if (boot_cpu_data.x86 >= 0x17) {
|
||||
@ -585,8 +675,12 @@ static int __init amd_uncore_init(void)
|
||||
}
|
||||
|
||||
if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) {
|
||||
if (boot_cpu_data.x86 >= 0x17)
|
||||
if (pmu_version >= 2) {
|
||||
*df_attr++ = &format_attr_event14v2.attr;
|
||||
*df_attr++ = &format_attr_umask12.attr;
|
||||
} else if (boot_cpu_data.x86 >= 0x17) {
|
||||
*df_attr = &format_attr_event14.attr;
|
||||
}
|
||||
|
||||
amd_uncore_nb = alloc_percpu(struct amd_uncore *);
|
||||
if (!amd_uncore_nb) {
|
||||
@ -597,6 +691,11 @@ static int __init amd_uncore_init(void)
|
||||
if (ret)
|
||||
goto fail_nb;
|
||||
|
||||
if (pmu_version >= 2) {
|
||||
ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES);
|
||||
num_counters_nb = ebx.split.num_df_pmc;
|
||||
}
|
||||
|
||||
pr_info("%d %s %s counters detected\n", num_counters_nb,
|
||||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON" : "",
|
||||
amd_nb_pmu.name);
|
||||
@ -607,16 +706,11 @@ static int __init amd_uncore_init(void)
|
||||
if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) {
|
||||
if (boot_cpu_data.x86 >= 0x19) {
|
||||
*l3_attr++ = &format_attr_event8.attr;
|
||||
*l3_attr++ = &format_attr_umask.attr;
|
||||
*l3_attr++ = &format_attr_coreid.attr;
|
||||
*l3_attr++ = &format_attr_enallslices.attr;
|
||||
*l3_attr++ = &format_attr_enallcores.attr;
|
||||
*l3_attr++ = &format_attr_sliceid.attr;
|
||||
*l3_attr++ = &format_attr_umask8.attr;
|
||||
*l3_attr++ = &format_attr_threadmask2.attr;
|
||||
} else if (boot_cpu_data.x86 >= 0x17) {
|
||||
*l3_attr++ = &format_attr_event8.attr;
|
||||
*l3_attr++ = &format_attr_umask.attr;
|
||||
*l3_attr++ = &format_attr_slicemask.attr;
|
||||
*l3_attr++ = &format_attr_umask8.attr;
|
||||
*l3_attr++ = &format_attr_threadmask8.attr;
|
||||
}
|
||||
|
||||
|
@ -4141,6 +4141,8 @@ tnt_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
|
||||
{
|
||||
struct event_constraint *c;
|
||||
|
||||
c = intel_get_event_constraints(cpuc, idx, event);
|
||||
|
||||
/*
|
||||
* :ppp means to do reduced skid PEBS,
|
||||
* which is available on PMC0 and fixed counter 0.
|
||||
@ -4153,8 +4155,6 @@ tnt_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
|
||||
return &counter0_constraint;
|
||||
}
|
||||
|
||||
c = intel_get_event_constraints(cpuc, idx, event);
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
@ -6241,7 +6241,8 @@ __init int intel_pmu_init(void)
|
||||
x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
|
||||
x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
|
||||
x86_pmu.lbr_pt_coexist = true;
|
||||
intel_pmu_pebs_data_source_skl(false);
|
||||
intel_pmu_pebs_data_source_adl();
|
||||
x86_pmu.pebs_latency_data = adl_latency_data_small;
|
||||
x86_pmu.num_topdown_events = 8;
|
||||
x86_pmu.update_topdown_event = adl_update_topdown_event;
|
||||
x86_pmu.set_topdown_event_period = adl_set_topdown_event_period;
|
||||
|
@ -94,15 +94,40 @@ void __init intel_pmu_pebs_data_source_nhm(void)
|
||||
pebs_data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
|
||||
}
|
||||
|
||||
void __init intel_pmu_pebs_data_source_skl(bool pmem)
|
||||
static void __init __intel_pmu_pebs_data_source_skl(bool pmem, u64 *data_source)
|
||||
{
|
||||
u64 pmem_or_l4 = pmem ? LEVEL(PMEM) : LEVEL(L4);
|
||||
|
||||
pebs_data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT);
|
||||
pebs_data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT);
|
||||
pebs_data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE);
|
||||
pebs_data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD);
|
||||
pebs_data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM);
|
||||
data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT);
|
||||
data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT);
|
||||
data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE);
|
||||
data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD);
|
||||
data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM);
|
||||
}
|
||||
|
||||
/*
 * Apply the SKL data source overrides to the global pebs_data_source
 * table. @pmem is forwarded unchanged to the common helper, which uses it
 * to pick between the PMEM and L4 level encodings.
 */
void __init intel_pmu_pebs_data_source_skl(bool pmem)
{
	u64 *table = pebs_data_source;

	__intel_pmu_pebs_data_source_skl(pmem, table);
}
|
||||
|
||||
/*
 * Overwrite entries 0x05, 0x06 and 0x08 of @data_source.
 *
 * All three entries share the OP_LH | P(LVL, L3) | LEVEL(L3) base and
 * differ only in the snoop bits OR-ed on top of it.
 */
static void __init intel_pmu_pebs_data_source_grt(u64 *data_source)
{
	const u64 l3_base = OP_LH | P(LVL, L3) | LEVEL(L3);

	data_source[0x05] = l3_base | P(SNOOP, HIT);
	data_source[0x06] = l3_base | P(SNOOP, HITM);
	data_source[0x08] = l3_base | P(SNOOPX, FWD);
}
|
||||
|
||||
/*
 * Set up the per-PMU PEBS data source tables of a hybrid system.
 *
 * Each hybrid PMU table starts as a copy of the global pebs_data_source
 * table. The core PMU copy then receives the SKL overrides (with
 * pmem == false) and the atom PMU copy receives the GRT overrides.
 */
void __init intel_pmu_pebs_data_source_adl(void)
{
	u64 *core_src = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].pebs_data_source;
	u64 *atom_src = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX].pebs_data_source;

	/* Core PMU: global defaults + SKL overrides. */
	memcpy(core_src, pebs_data_source, sizeof(pebs_data_source));
	__intel_pmu_pebs_data_source_skl(false, core_src);

	/* Atom PMU: global defaults + GRT overrides. */
	memcpy(atom_src, pebs_data_source, sizeof(pebs_data_source));
	intel_pmu_pebs_data_source_grt(atom_src);
}
|
||||
|
||||
static u64 precise_store_data(u64 status)
|
||||
@ -171,7 +196,50 @@ static u64 precise_datala_hsw(struct perf_event *event, u64 status)
|
||||
return dse.val;
|
||||
}
|
||||
|
||||
static u64 load_latency_data(u64 status)
|
||||
/*
 * OR the TLB and lock information into *@val.
 *
 * @tlb:  TLB access
 *          0 = did not miss 2nd level TLB
 *          1 = missed 2nd level TLB
 * @lock: set when the access used a locked prefix
 */
static inline void pebs_set_tlb_lock(u64 *val, bool tlb, bool lock)
{
	u64 bits;

	bits = tlb ? (P(TLB, MISS) | P(TLB, L2))
		   : (P(TLB, HIT) | P(TLB, L1) | P(TLB, L2));

	/* locked prefix */
	if (lock)
		bits |= P(LOCK, LOCKED);

	*val |= bits;
}
|
||||
|
||||
/* Retrieve the latency data for e-core of ADL */
|
||||
u64 adl_latency_data_small(struct perf_event *event, u64 status)
|
||||
{
|
||||
union intel_x86_pebs_dse dse;
|
||||
u64 val;
|
||||
|
||||
WARN_ON_ONCE(hybrid_pmu(event->pmu)->cpu_type == hybrid_big);
|
||||
|
||||
dse.val = status;
|
||||
|
||||
val = hybrid_var(event->pmu, pebs_data_source)[dse.ld_dse];
|
||||
|
||||
/*
|
||||
* For the atom core on ADL,
|
||||
* bit 4: lock, bit 5: TLB access.
|
||||
*/
|
||||
pebs_set_tlb_lock(&val, dse.ld_locked, dse.ld_stlb_miss);
|
||||
|
||||
if (dse.ld_data_blk)
|
||||
val |= P(BLK, DATA);
|
||||
else
|
||||
val |= P(BLK, NA);
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
static u64 load_latency_data(struct perf_event *event, u64 status)
|
||||
{
|
||||
union intel_x86_pebs_dse dse;
|
||||
u64 val;
|
||||
@ -181,7 +249,7 @@ static u64 load_latency_data(u64 status)
|
||||
/*
|
||||
* use the mapping table for bit 0-3
|
||||
*/
|
||||
val = pebs_data_source[dse.ld_dse];
|
||||
val = hybrid_var(event->pmu, pebs_data_source)[dse.ld_dse];
|
||||
|
||||
/*
|
||||
* Nehalem models do not support TLB, Lock infos
|
||||
@ -190,21 +258,8 @@ static u64 load_latency_data(u64 status)
|
||||
val |= P(TLB, NA) | P(LOCK, NA);
|
||||
return val;
|
||||
}
|
||||
/*
|
||||
* bit 4: TLB access
|
||||
* 0 = did not miss 2nd level TLB
|
||||
* 1 = missed 2nd level TLB
|
||||
*/
|
||||
if (dse.ld_stlb_miss)
|
||||
val |= P(TLB, MISS) | P(TLB, L2);
|
||||
else
|
||||
val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
|
||||
|
||||
/*
|
||||
* bit 5: locked prefix
|
||||
*/
|
||||
if (dse.ld_locked)
|
||||
val |= P(LOCK, LOCKED);
|
||||
pebs_set_tlb_lock(&val, dse.ld_stlb_miss, dse.ld_locked);
|
||||
|
||||
/*
|
||||
* Ice Lake and earlier models do not support block infos.
|
||||
@ -233,7 +288,7 @@ static u64 load_latency_data(u64 status)
|
||||
return val;
|
||||
}
|
||||
|
||||
static u64 store_latency_data(u64 status)
|
||||
static u64 store_latency_data(struct perf_event *event, u64 status)
|
||||
{
|
||||
union intel_x86_pebs_dse dse;
|
||||
u64 val;
|
||||
@ -243,23 +298,9 @@ static u64 store_latency_data(u64 status)
|
||||
/*
|
||||
* use the mapping table for bit 0-3
|
||||
*/
|
||||
val = pebs_data_source[dse.st_lat_dse];
|
||||
val = hybrid_var(event->pmu, pebs_data_source)[dse.st_lat_dse];
|
||||
|
||||
/*
|
||||
* bit 4: TLB access
|
||||
* 0 = did not miss 2nd level TLB
|
||||
* 1 = missed 2nd level TLB
|
||||
*/
|
||||
if (dse.st_lat_stlb_miss)
|
||||
val |= P(TLB, MISS) | P(TLB, L2);
|
||||
else
|
||||
val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
|
||||
|
||||
/*
|
||||
* bit 5: locked prefix
|
||||
*/
|
||||
if (dse.st_lat_locked)
|
||||
val |= P(LOCK, LOCKED);
|
||||
pebs_set_tlb_lock(&val, dse.st_lat_stlb_miss, dse.st_lat_locked);
|
||||
|
||||
val |= P(BLK, NA);
|
||||
|
||||
@ -781,8 +822,8 @@ struct event_constraint intel_glm_pebs_event_constraints[] = {
|
||||
|
||||
struct event_constraint intel_grt_pebs_event_constraints[] = {
|
||||
/* Allow all events as PEBS with no flags */
|
||||
INTEL_PLD_CONSTRAINT(0x5d0, 0xf),
|
||||
INTEL_PSD_CONSTRAINT(0x6d0, 0xf),
|
||||
INTEL_HYBRID_LAT_CONSTRAINT(0x5d0, 0xf),
|
||||
INTEL_HYBRID_LAT_CONSTRAINT(0x6d0, 0xf),
|
||||
EVENT_CONSTRAINT_END
|
||||
};
|
||||
|
||||
@ -1443,9 +1484,11 @@ static u64 get_data_src(struct perf_event *event, u64 aux)
|
||||
bool fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
|
||||
|
||||
if (fl & PERF_X86_EVENT_PEBS_LDLAT)
|
||||
val = load_latency_data(aux);
|
||||
val = load_latency_data(event, aux);
|
||||
else if (fl & PERF_X86_EVENT_PEBS_STLAT)
|
||||
val = store_latency_data(aux);
|
||||
val = store_latency_data(event, aux);
|
||||
else if (fl & PERF_X86_EVENT_PEBS_LAT_HYBRID)
|
||||
val = x86_pmu.pebs_latency_data(event, aux);
|
||||
else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
|
||||
val = precise_datala_hsw(event, aux);
|
||||
else if (fst)
|
||||
|
@ -84,6 +84,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode)
|
||||
#define PERF_X86_EVENT_TOPDOWN 0x04000 /* Count Topdown slots/metrics events */
|
||||
#define PERF_X86_EVENT_PEBS_STLAT 0x08000 /* st+stlat data address sampling */
|
||||
#define PERF_X86_EVENT_AMD_BRS 0x10000 /* AMD Branch Sampling */
|
||||
#define PERF_X86_EVENT_PEBS_LAT_HYBRID 0x20000 /* ld and st lat for hybrid */
|
||||
|
||||
static inline bool is_topdown_count(struct perf_event *event)
|
||||
{
|
||||
@ -136,7 +137,8 @@ struct amd_nb {
|
||||
PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
|
||||
PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
|
||||
PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \
|
||||
PERF_SAMPLE_PERIOD | PERF_SAMPLE_CODE_PAGE_SIZE)
|
||||
PERF_SAMPLE_PERIOD | PERF_SAMPLE_CODE_PAGE_SIZE | \
|
||||
PERF_SAMPLE_WEIGHT_TYPE)
|
||||
|
||||
#define PEBS_GP_REGS \
|
||||
((1ULL << PERF_REG_X86_AX) | \
|
||||
@ -460,6 +462,10 @@ struct cpu_hw_events {
|
||||
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
|
||||
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST)
|
||||
|
||||
#define INTEL_HYBRID_LAT_CONSTRAINT(c, n) \
|
||||
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
|
||||
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LAT_HYBRID)
|
||||
|
||||
/* Event constraint, but match on all event flags too. */
|
||||
#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \
|
||||
EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS)
|
||||
@ -638,6 +644,8 @@ enum {
|
||||
x86_lbr_exclusive_max,
|
||||
};
|
||||
|
||||
#define PERF_PEBS_DATA_SOURCE_MAX 0x10
|
||||
|
||||
struct x86_hybrid_pmu {
|
||||
struct pmu pmu;
|
||||
const char *name;
|
||||
@ -665,6 +673,8 @@ struct x86_hybrid_pmu {
|
||||
unsigned int late_ack :1,
|
||||
mid_ack :1,
|
||||
enabled_ack :1;
|
||||
|
||||
u64 pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX];
|
||||
};
|
||||
|
||||
static __always_inline struct x86_hybrid_pmu *hybrid_pmu(struct pmu *pmu)
|
||||
@ -825,6 +835,7 @@ struct x86_pmu {
|
||||
void (*drain_pebs)(struct pt_regs *regs, struct perf_sample_data *data);
|
||||
struct event_constraint *pebs_constraints;
|
||||
void (*pebs_aliases)(struct perf_event *event);
|
||||
u64 (*pebs_latency_data)(struct perf_event *event, u64 status);
|
||||
unsigned long large_pebs_flags;
|
||||
u64 rtm_abort_event;
|
||||
|
||||
@ -1392,6 +1403,8 @@ void intel_pmu_disable_bts(void);
|
||||
|
||||
int intel_pmu_drain_bts_buffer(void);
|
||||
|
||||
u64 adl_latency_data_small(struct perf_event *event, u64 status);
|
||||
|
||||
extern struct event_constraint intel_core2_pebs_event_constraints[];
|
||||
|
||||
extern struct event_constraint intel_atom_pebs_event_constraints[];
|
||||
@ -1499,6 +1512,8 @@ void intel_pmu_pebs_data_source_nhm(void);
|
||||
|
||||
void intel_pmu_pebs_data_source_skl(bool pmem);
|
||||
|
||||
void intel_pmu_pebs_data_source_adl(void);
|
||||
|
||||
int intel_pmu_setup_lbr_filter(struct perf_event *event);
|
||||
|
||||
void intel_pt_interrupt(void);
|
||||
|
@ -29,7 +29,10 @@ union ibs_fetch_ctl {
|
||||
rand_en:1, /* 57: random tagging enable */
|
||||
fetch_l2_miss:1,/* 58: L2 miss for sampled fetch
|
||||
* (needs IbsFetchComp) */
|
||||
reserved:5; /* 59-63: reserved */
|
||||
l3_miss_only:1, /* 59: Collect L3 miss samples only */
|
||||
fetch_oc_miss:1,/* 60: Op cache miss for the sampled fetch */
|
||||
fetch_l3_miss:1,/* 61: L3 cache miss for the sampled fetch */
|
||||
reserved:2; /* 62-63: reserved */
|
||||
};
|
||||
};
|
||||
|
||||
@ -38,14 +41,14 @@ union ibs_op_ctl {
|
||||
__u64 val;
|
||||
struct {
|
||||
__u64 opmaxcnt:16, /* 0-15: periodic op max. count */
|
||||
reserved0:1, /* 16: reserved */
|
||||
l3_miss_only:1, /* 16: Collect L3 miss samples only */
|
||||
op_en:1, /* 17: op sampling enable */
|
||||
op_val:1, /* 18: op sample valid */
|
||||
cnt_ctl:1, /* 19: periodic op counter control */
|
||||
opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */
|
||||
reserved1:5, /* 27-31: reserved */
|
||||
reserved0:5, /* 27-31: reserved */
|
||||
opcurcnt:27, /* 32-58: periodic op counter current count */
|
||||
reserved2:5; /* 59-63: reserved */
|
||||
reserved1:5; /* 59-63: reserved */
|
||||
};
|
||||
};
|
||||
|
||||
@ -71,11 +74,12 @@ union ibs_op_data {
|
||||
union ibs_op_data2 {
|
||||
__u64 val;
|
||||
struct {
|
||||
__u64 data_src:3, /* 0-2: data source */
|
||||
__u64 data_src_lo:3, /* 0-2: data source low */
|
||||
reserved0:1, /* 3: reserved */
|
||||
rmt_node:1, /* 4: destination node */
|
||||
cache_hit_st:1, /* 5: cache hit state */
|
||||
reserved1:57; /* 5-63: reserved */
|
||||
data_src_hi:2, /* 6-7: data source high */
|
||||
reserved1:56; /* 8-63: reserved */
|
||||
};
|
||||
};
|
||||
|
||||
|
@ -89,6 +89,19 @@
|
||||
#define AMD64_RAW_EVENT_MASK_NB \
|
||||
(AMD64_EVENTSEL_EVENT | \
|
||||
ARCH_PERFMON_EVENTSEL_UMASK)
|
||||
|
||||
#define AMD64_PERFMON_V2_EVENTSEL_EVENT_NB \
|
||||
(AMD64_EVENTSEL_EVENT | \
|
||||
GENMASK_ULL(37, 36))
|
||||
|
||||
#define AMD64_PERFMON_V2_EVENTSEL_UMASK_NB \
|
||||
(ARCH_PERFMON_EVENTSEL_UMASK | \
|
||||
GENMASK_ULL(27, 24))
|
||||
|
||||
#define AMD64_PERFMON_V2_RAW_EVENT_MASK_NB \
|
||||
(AMD64_PERFMON_V2_EVENTSEL_EVENT_NB | \
|
||||
AMD64_PERFMON_V2_EVENTSEL_UMASK_NB)
|
||||
|
||||
#define AMD64_NUM_COUNTERS 4
|
||||
#define AMD64_NUM_COUNTERS_CORE 6
|
||||
#define AMD64_NUM_COUNTERS_NB 4
|
||||
@ -194,6 +207,9 @@ union cpuid_0x80000022_ebx {
|
||||
struct {
|
||||
/* Number of Core Performance Counters */
|
||||
unsigned int num_core_pmc:4;
|
||||
unsigned int reserved:6;
|
||||
/* Number of Data Fabric Counters */
|
||||
unsigned int num_df_pmc:6;
|
||||
} split;
|
||||
unsigned int full;
|
||||
};
|
||||
|
@ -759,6 +759,8 @@ struct perf_event {
|
||||
struct pid_namespace *ns;
|
||||
u64 id;
|
||||
|
||||
atomic64_t lost_samples;
|
||||
|
||||
u64 (*clock)(void);
|
||||
perf_overflow_handler_t overflow_handler;
|
||||
void *overflow_handler_context;
|
||||
|
@ -301,6 +301,7 @@ enum {
|
||||
* { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
|
||||
* { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
|
||||
* { u64 id; } && PERF_FORMAT_ID
|
||||
* { u64 lost; } && PERF_FORMAT_LOST
|
||||
* } && !PERF_FORMAT_GROUP
|
||||
*
|
||||
* { u64 nr;
|
||||
@ -308,6 +309,7 @@ enum {
|
||||
* { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
|
||||
* { u64 value;
|
||||
* { u64 id; } && PERF_FORMAT_ID
|
||||
* { u64 lost; } && PERF_FORMAT_LOST
|
||||
* } cntr[nr];
|
||||
* } && PERF_FORMAT_GROUP
|
||||
* };
|
||||
@ -317,8 +319,9 @@ enum perf_event_read_format {
|
||||
PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1,
|
||||
PERF_FORMAT_ID = 1U << 2,
|
||||
PERF_FORMAT_GROUP = 1U << 3,
|
||||
PERF_FORMAT_LOST = 1U << 4,
|
||||
|
||||
PERF_FORMAT_MAX = 1U << 4, /* non-ABI */
|
||||
PERF_FORMAT_MAX = 1U << 5, /* non-ABI */
|
||||
};
|
||||
|
||||
#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */
|
||||
|
@ -1819,6 +1819,9 @@ static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
|
||||
if (event->attr.read_format & PERF_FORMAT_ID)
|
||||
entry += sizeof(u64);
|
||||
|
||||
if (event->attr.read_format & PERF_FORMAT_LOST)
|
||||
entry += sizeof(u64);
|
||||
|
||||
if (event->attr.read_format & PERF_FORMAT_GROUP) {
|
||||
nr += nr_siblings;
|
||||
size += sizeof(u64);
|
||||
@ -5260,11 +5263,15 @@ static int __perf_read_group_add(struct perf_event *leader,
|
||||
values[n++] += perf_event_count(leader);
|
||||
if (read_format & PERF_FORMAT_ID)
|
||||
values[n++] = primary_event_id(leader);
|
||||
if (read_format & PERF_FORMAT_LOST)
|
||||
values[n++] = atomic64_read(&leader->lost_samples);
|
||||
|
||||
for_each_sibling_event(sub, leader) {
|
||||
values[n++] += perf_event_count(sub);
|
||||
if (read_format & PERF_FORMAT_ID)
|
||||
values[n++] = primary_event_id(sub);
|
||||
if (read_format & PERF_FORMAT_LOST)
|
||||
values[n++] = atomic64_read(&sub->lost_samples);
|
||||
}
|
||||
|
||||
raw_spin_unlock_irqrestore(&ctx->lock, flags);
|
||||
@ -5321,7 +5328,7 @@ static int perf_read_one(struct perf_event *event,
|
||||
u64 read_format, char __user *buf)
|
||||
{
|
||||
u64 enabled, running;
|
||||
u64 values[4];
|
||||
u64 values[5];
|
||||
int n = 0;
|
||||
|
||||
values[n++] = __perf_event_read_value(event, &enabled, &running);
|
||||
@ -5331,6 +5338,8 @@ static int perf_read_one(struct perf_event *event,
|
||||
values[n++] = running;
|
||||
if (read_format & PERF_FORMAT_ID)
|
||||
values[n++] = primary_event_id(event);
|
||||
if (read_format & PERF_FORMAT_LOST)
|
||||
values[n++] = atomic64_read(&event->lost_samples);
|
||||
|
||||
if (copy_to_user(buf, values, n * sizeof(u64)))
|
||||
return -EFAULT;
|
||||
@ -6858,7 +6867,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
|
||||
u64 enabled, u64 running)
|
||||
{
|
||||
u64 read_format = event->attr.read_format;
|
||||
u64 values[4];
|
||||
u64 values[5];
|
||||
int n = 0;
|
||||
|
||||
values[n++] = perf_event_count(event);
|
||||
@ -6872,6 +6881,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
|
||||
}
|
||||
if (read_format & PERF_FORMAT_ID)
|
||||
values[n++] = primary_event_id(event);
|
||||
if (read_format & PERF_FORMAT_LOST)
|
||||
values[n++] = atomic64_read(&event->lost_samples);
|
||||
|
||||
__output_copy(handle, values, n * sizeof(u64));
|
||||
}
|
||||
@ -6882,7 +6893,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
|
||||
{
|
||||
struct perf_event *leader = event->group_leader, *sub;
|
||||
u64 read_format = event->attr.read_format;
|
||||
u64 values[5];
|
||||
u64 values[6];
|
||||
int n = 0;
|
||||
|
||||
values[n++] = 1 + leader->nr_siblings;
|
||||
@ -6900,6 +6911,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
|
||||
values[n++] = perf_event_count(leader);
|
||||
if (read_format & PERF_FORMAT_ID)
|
||||
values[n++] = primary_event_id(leader);
|
||||
if (read_format & PERF_FORMAT_LOST)
|
||||
values[n++] = atomic64_read(&leader->lost_samples);
|
||||
|
||||
__output_copy(handle, values, n * sizeof(u64));
|
||||
|
||||
@ -6913,6 +6926,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
|
||||
values[n++] = perf_event_count(sub);
|
||||
if (read_format & PERF_FORMAT_ID)
|
||||
values[n++] = primary_event_id(sub);
|
||||
if (read_format & PERF_FORMAT_LOST)
|
||||
values[n++] = atomic64_read(&sub->lost_samples);
|
||||
|
||||
__output_copy(handle, values, n * sizeof(u64));
|
||||
}
|
||||
|
@ -172,8 +172,10 @@ __perf_output_begin(struct perf_output_handle *handle,
|
||||
goto out;
|
||||
|
||||
if (unlikely(rb->paused)) {
|
||||
if (rb->nr_pages)
|
||||
if (rb->nr_pages) {
|
||||
local_inc(&rb->lost);
|
||||
atomic64_inc(&event->lost_samples);
|
||||
}
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -254,6 +256,7 @@ __perf_output_begin(struct perf_output_handle *handle,
|
||||
|
||||
fail:
|
||||
local_inc(&rb->lost);
|
||||
atomic64_inc(&event->lost_samples);
|
||||
perf_output_put_handle(handle);
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
|
Loading…
Reference in New Issue
Block a user