From 231ce08b662a58d4392da998699b3d4a7e2e87cf Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 21 Apr 2024 12:53:53 -0400 Subject: [PATCH 01/15] tools/power turbostat: Add "snapshot:" Makefile target Kernel developers often need to diagnose remote customer systems with the latest turbostat, yet customers are running binary distros with out-dated turbostat and the customer has no experience cloning linux kernel trees. Add a turbostat "snapshot" makefile target to create a standalone source snapshot from the developer's git tree, appropriately hacked so that the customer can build turbostat without a kernel tree. Include the turbostat binary in the snapshot, for convenience in those situations where the source and destination are trusted, (and have new enough glibc to execute). The snapshot is named with the date it was taken rather than the turbostat VERSION, as it could occur between VERSIONS... Signed-off-by: Len Brown --- tools/power/x86/turbostat/Makefile | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile index 92e139b9c792..2d6dce2c8f77 100644 --- a/tools/power/x86/turbostat/Makefile +++ b/tools/power/x86/turbostat/Makefile @@ -3,6 +3,8 @@ CC = $(CROSS_COMPILE)gcc BUILD_OUTPUT := $(CURDIR) PREFIX ?= /usr DESTDIR ?= +DAY := $(shell date +%Y.%m.%d) +SNAPSHOT = turbostat-$(DAY) ifeq ("$(origin O)", "command line") BUILD_OUTPUT := $(O) @@ -22,9 +24,30 @@ override CFLAGS += -D_FORTIFY_SOURCE=2 .PHONY : clean clean : @rm -f $(BUILD_OUTPUT)/turbostat + @rm -f $(SNAPSHOT).tar.gz install : turbostat - install -d $(DESTDIR)$(PREFIX)/bin + install -d $(DESTDIR)$(PREFIX)/bin install $(BUILD_OUTPUT)/turbostat $(DESTDIR)$(PREFIX)/bin/turbostat - install -d $(DESTDIR)$(PREFIX)/share/man/man8 + install -d $(DESTDIR)$(PREFIX)/share/man/man8 install -m 644 turbostat.8 $(DESTDIR)$(PREFIX)/share/man/man8 + +snapshot: turbostat + @rm -rf $(SNAPSHOT) + @mkdir 
$(SNAPSHOT) + @cp turbostat Makefile turbostat.c turbostat.8 ../../../../arch/x86/include/asm/intel-family.h $(SNAPSHOT) + + @sed -e 's/^#include /#include "bits.h"/' ../../../../arch/x86/include/asm/msr-index.h > $(SNAPSHOT)/msr-index.h + @echo '#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))' >> $(SNAPSHOT)/msr-index.h + @echo "#define BIT(x) (1 << (x))" > $(SNAPSHOT)/bits.h + @echo "#define BIT_ULL(nr) (1ULL << (nr))" >> $(SNAPSHOT)/bits.h + @echo "#define GENMASK(h, l) (((~0UL) << (l)) & (~0UL >> (sizeof(long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h + @echo "#define GENMASK_ULL(h, l) (((~0ULL) << (l)) & (~0ULL >> (sizeof(long long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h + + @echo PWD=. > $(SNAPSHOT)/Makefile + @echo "CFLAGS += -DMSRHEADER='\"msr-index.h\"'" >> $(SNAPSHOT)/Makefile + @echo "CFLAGS += -DINTEL_FAMILY_HEADER='\"intel-family.h\"'" >> $(SNAPSHOT)/Makefile + @sed -e's/.*MSRHEADER.*//' -e's/.*INTEL_FAMILY_HEADER.*//' Makefile >> $(SNAPSHOT)/Makefile + + @rm -f $(SNAPSHOT).tar.gz + tar cvzf $(SNAPSHOT).tar.gz $(SNAPSHOT) From ae3326ac5742506409a03ce5d69716a8dba4eabc Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 21 Apr 2024 14:45:10 -0400 Subject: [PATCH 02/15] tools/power turbostat: Harden probe_intel_uncore_frequency() If sysfs directory "intel_uncore_frequency/uncore00/" exists, then use uncore cluster code (now its own routine). The previous check for "intel_uncore_frequency/package_00_die_00/current_freq_khz" could be unreliable in the face of sparse die IDs. 
Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 98256468e248..ca33fb057d1f 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5294,16 +5294,13 @@ static void dump_sysfs_file(char *path) fprintf(outf, "%s: %s", strrchr(path, '/') + 1, cpuidle_buf); } -static void probe_intel_uncore_frequency(void) +static void probe_intel_uncore_frequency_legacy(void) { int i, j; char path[256]; - if (!genuine_intel) - return; - if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK)) - goto probe_cluster; + return; BIC_PRESENT(BIC_UNCORE_MHZ); @@ -5335,9 +5332,13 @@ static void probe_intel_uncore_frequency(void) fprintf(outf, " %d MHz\n", k / 1000); } } - return; +} + +static void probe_intel_uncore_frequency_cluster(void) +{ + int i; + char path[256]; -probe_cluster: if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00/current_freq_khz", R_OK)) return; @@ -5351,6 +5352,7 @@ probe_cluster: sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", i); + /* uncore## start at 00 and skip no numbers, so stop upon first missing */ if (access(path_base, R_OK)) break; @@ -5382,6 +5384,17 @@ probe_cluster: } } +static void probe_intel_uncore_frequency(void) +{ + if (!genuine_intel) + return; + + if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00", R_OK) == 0) + probe_intel_uncore_frequency_cluster(); + else + probe_intel_uncore_frequency_legacy(); +} + static void probe_graphics(void) { /* Xe graphics sysfs knobs */ From cda203388687aa075db6f8996c3c4549fa518ea8 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 21 Apr 2024 11:56:48 -0400 Subject: [PATCH 03/15] tools/power turbostat: Remember global max_die_id This is necessary to gracefully handle 
sparse die_id's. no functional change Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ca33fb057d1f..e6d643a58cd8 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1415,6 +1415,7 @@ struct topo_params { int allowed_cpus; int allowed_cores; int max_cpu_num; + int max_die_id; int max_node_num; int nodes_per_pkg; int cores_per_node; @@ -6980,7 +6981,6 @@ void topology_probe(bool startup) int i; int max_core_id = 0; int max_package_id = 0; - int max_die_id = 0; int max_siblings = 0; /* Initialize num_cpus, max_cpu_num */ @@ -7097,8 +7097,8 @@ void topology_probe(bool startup) /* get die information */ cpus[i].die_id = get_die_id(i); - if (cpus[i].die_id > max_die_id) - max_die_id = cpus[i].die_id; + if (cpus[i].die_id > topo.max_die_id) + topo.max_die_id = cpus[i].die_id; /* get numa node information */ cpus[i].physical_node_id = get_physical_node_id(&cpus[i]); @@ -7124,9 +7124,9 @@ void topology_probe(bool startup) if (!summary_only && topo.cores_per_node > 1) BIC_PRESENT(BIC_Core); - topo.num_die = max_die_id + 1; + topo.num_die = topo.max_die_id + 1; if (debug > 1) - fprintf(outf, "max_die_id %d, sizing for %d die\n", max_die_id, topo.num_die); + fprintf(outf, "max_die_id %d, sizing for %d die\n", topo.max_die_id, topo.num_die); if (!summary_only && topo.num_die > 1) BIC_PRESENT(BIC_Die); From c8b246ea2ea5771f2a0ca6f6a9a520406e6b6eb7 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 21 Apr 2024 12:02:24 -0400 Subject: [PATCH 04/15] tools/power turbostat: Survive sparse die_id Turbostat assumed that every package had a die_id = 0. 
When this assumption was violated, it exited when looking for the package uncore frequency: turbostat: /sys/.../intel_uncore_frequency/package_01_die_00/current_freq_khz: open failed: No such file or directory Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 45 +++++++++++++++++---------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e6d643a58cd8..4b95fd90e16c 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2992,14 +2992,29 @@ int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) return 0; } -unsigned long long get_uncore_mhz(int package, int die) +unsigned long long get_legacy_uncore_mhz(int package) { char path[128]; + int die; + static int warn_once; - sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz", package, - die); + /* + * for this package, use the first die_id that exists + */ + for (die = 0; die <= topo.max_die_id; ++die) { - return (snapshot_sysfs_counter(path) / 1000); + sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz", + package, die); + + if (access(path, R_OK) == 0) + return (snapshot_sysfs_counter(path) / 1000); + } + if (!warn_once) { + warnx("BUG: %s: No %s", __func__, path); + warn_once = 1; + } + + return 0; } int get_epb(int cpu) @@ -3631,9 +3646,8 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) p->pkg_temp_c = tj_max - ((msr >> 16) & 0x7F); } - /* n.b. 
assume die0 uncore frequency applies to whole package */ if (DO_BIC(BIC_UNCORE_MHZ)) - p->uncore_mhz = get_uncore_mhz(p->package_id, 0); + p->uncore_mhz = get_legacy_uncore_mhz(p->package_id); if (DO_BIC(BIC_GFX_rc6)) p->gfx_rc6_ms = gfx_info[GFX_rc6].val_ull; @@ -5300,22 +5314,22 @@ static void probe_intel_uncore_frequency_legacy(void) int i, j; char path[256]; - if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK)) - return; - - BIC_PRESENT(BIC_UNCORE_MHZ); - - if (quiet) - return; - for (i = 0; i < topo.num_packages; ++i) { - for (j = 0; j < topo.num_die; ++j) { + for (j = 0; j <= topo.max_die_id; ++j) { int k, l; char path_base[128]; sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d", i, j); + if (access(path_base, R_OK)) + continue; + + BIC_PRESENT(BIC_UNCORE_MHZ); + + if (quiet) + return; + sprintf(path, "%s/min_freq_khz", path_base); k = read_sysfs_int(path); sprintf(path, "%s/max_freq_khz", path_base); @@ -5480,7 +5494,6 @@ next: else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) gfx_info[GFX_MHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz"; - if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK)) gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt_act_freq_mhz"; else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) From 196eca020600470ca44da94c65607e7a98aa9d3c Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 27 Mar 2024 22:35:03 +0800 Subject: [PATCH 05/15] tools/power turbostat: Enhance ARL/LNL support ARL/LNL don't have PC8; other than that, they behave the same as CNL. Copy cnl_features for ARL/LNL, except that PC8 support is removed. 
Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4b95fd90e16c..672936015b55 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -663,6 +663,23 @@ static const struct platform_features adl_features = { .enable_tsc_tweak = 1, }; +static const struct platform_features arl_features = { + .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, + .has_config_tdp = 1, + .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC10, + .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, + .has_msr_core_c1_res = 1, + .has_ext_cst_msrs = 1, + .trl_msrs = TRL_BASE, + .tcc_offset_bits = 6, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, + .enable_tsc_tweak = 1, +}; + static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, @@ -905,8 +922,8 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_RAPTORLAKE_S, &adl_features }, { INTEL_FAM6_METEORLAKE, &cnl_features }, { INTEL_FAM6_METEORLAKE_L, &cnl_features }, - { INTEL_FAM6_ARROWLAKE, &cnl_features }, - { INTEL_FAM6_LUNARLAKE_M, &cnl_features }, + { INTEL_FAM6_ARROWLAKE, &arl_features }, + { INTEL_FAM6_LUNARLAKE_M, &arl_features }, { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, { INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features }, { INTEL_FAM6_ATOM_AIRMONT, &amt_features }, From f04fcc7ac8ceb87933244cca28759d0fac6103ce Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 27 Mar 2024 22:36:38 +0800 Subject: [PATCH 06/15] tools/power turbostat: Add ARL-H support Add turbostat support for ARL-H, which behaves the same as ARL. 
[lenb: also add ARL-U] Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 672936015b55..fadf96934f4e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -922,6 +922,8 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_RAPTORLAKE_S, &adl_features }, { INTEL_FAM6_METEORLAKE, &cnl_features }, { INTEL_FAM6_METEORLAKE_L, &cnl_features }, + { INTEL_FAM6_ARROWLAKE_H, &arl_features }, + { INTEL_FAM6_ARROWLAKE_U, &arl_features }, { INTEL_FAM6_ARROWLAKE, &arl_features }, { INTEL_FAM6_LUNARLAKE_M, &arl_features }, { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, From d3e6f6253895f499b63bac261b81732f9efc4902 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Tue, 23 Apr 2024 11:59:49 +0200 Subject: [PATCH 07/15] tools/power turbostat: Replace _Static_assert with BUILD_BUG_ON So it compiles on GCC older than 9.0. 
Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index fadf96934f4e..bd6cb31b7099 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -38,6 +38,7 @@ #include #include #include +#include #define UNUSED(x) (void)(x) @@ -3467,7 +3468,7 @@ int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data } } - _Static_assert(NUM_RAPL_COUNTERS == 7); + BUILD_BUG_ON(NUM_RAPL_COUNTERS != 7); write_rapl_counter(&p->energy_pkg, rci, RAPL_RCI_INDEX_ENERGY_PKG); write_rapl_counter(&p->energy_cores, rci, RAPL_RCI_INDEX_ENERGY_CORES); write_rapl_counter(&p->energy_dram, rci, RAPL_RCI_INDEX_DRAM); From 0e39702fbbcdb16ad349439065d24a3bb5e2f331 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 25 Apr 2024 17:54:18 +0200 Subject: [PATCH 08/15] tools/power turbostat: Enable non-privileged users to read sysfs counters A group of counters called "sysfs" displays software C-state request counts and resulting perceived C-state residency. They are not built-in counters that turbostat knows about ahead of time, rather they are discovered in sysfs when turbostat starts. Thus, they are added dynamically, using the same interface as user-added MSR counters. When turbostat enters "no-msr" mode, such as when running as a non-privileged user, it clears all added counters. Updating that to clear only actual MSR added counters allows regular users to see the sysfs counters. 
[lenb: commit message] Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 56 +++++++++++++++------------ 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bd6cb31b7099..f92b46cfda31 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1377,36 +1377,42 @@ struct sys_counters { struct msr_counter *pp; } sys; -void free_sys_counters(void) +static size_t free_msr_counters_(struct msr_counter **pp) { - struct msr_counter *p = sys.tp, *pnext = NULL; + struct msr_counter *p = NULL; + size_t num_freed = 0; - while (p) { - pnext = p->next; - free(p); - p = pnext; + while (*pp) { + p = *pp; + + if (p->msr_num != 0) { + *pp = p->next; + + free(p); + ++num_freed; + + continue; + } + + pp = &p->next; } - p = sys.cp, pnext = NULL; - while (p) { - pnext = p->next; - free(p); - p = pnext; - } + return num_freed; +} - p = sys.pp, pnext = NULL; - while (p) { - pnext = p->next; - free(p); - p = pnext; - } +/* + * Free all added counters accessed via msr. 
+ */ +static void free_sys_msr_counters(void) +{ + /* Thread counters */ + sys.added_thread_counters -= free_msr_counters_(&sys.tp); - sys.added_thread_counters = 0; - sys.added_core_counters = 0; - sys.added_package_counters = 0; - sys.tp = NULL; - sys.cp = NULL; - sys.pp = NULL; + /* Core counters */ + sys.added_core_counters -= free_msr_counters_(&sys.cp); + + /* Package counters */ + sys.added_package_counters -= free_msr_counters_(&sys.pp); } struct system_summary { @@ -1566,7 +1572,7 @@ static void bic_disable_msr_access(void) bic_enabled &= ~bic_msrs; - free_sys_counters(); + free_sys_msr_counters(); } static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) From 78464d7681f79bb48995c3d29d7e93d27ba69bca Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 24 Apr 2024 21:12:18 -0400 Subject: [PATCH 09/15] tools/power turbostat: Add columns for clustered uncore frequency New machines have multiple uncore frequencies per package, visible in /sys/devices/system/cpu/intel_uncore_frequency/uncore##/ turbostat now samples these frequencies each measurement interval. For each package, turbostat now prints "UMHzX.Y" columns, where X = domain_id, and Y = fabric_cluster_id. The system summary for each UMHzX.Y column is the average value across all of the packages in the system. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 4 +- tools/power/x86/turbostat/turbostat.c | 342 +++++++++++++++++--------- 2 files changed, 226 insertions(+), 120 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 0d3672e5d9ed..8d37acd39201 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -155,7 +155,9 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBRAM_%\fP percent of the interval that RAPL throttling was active on DRAM. 
.PP -\fBUncMHz\fP uncore MHz, instantaneous sample. +\fBUncMHz\fP per-package uncore MHz, instantaneous sample. +.PP +\fBUMHz1.0\fP per-package uncore MHz for domain=1 and fabric_cluster=0, instantaneous sample. System summary is the average of all packages. .SH TOO MUCH INFORMATION EXAMPLE By default, turbostat dumps all possible information -- a system configuration header, followed by columns for all counters. This is ideal for remote debugging, use the "--out" option to save everything to a text file, and get that file to the expert helping you debug. diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f92b46cfda31..abfddff6aebd 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -59,15 +59,21 @@ #define MAX_NOFILE 0x8000 enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; -enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC }; -enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT }; +enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC, COUNTER_K2M }; +enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE }; enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR }; enum rapl_source { RAPL_SOURCE_NONE, RAPL_SOURCE_PERF, RAPL_SOURCE_MSR }; +struct sysfs_path { + char path[PATH_BYTES]; + int id; + struct sysfs_path *next; +}; + struct msr_counter { unsigned int msr_num; char name[NAME_BYTES]; - char path[PATH_BYTES]; + struct sysfs_path *sp; unsigned int width; enum counter_type type; enum counter_format format; @@ -79,64 +85,64 @@ struct msr_counter { }; struct msr_counter bic[] = { - { 0x0, "usec", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Time_Of_Day_Seconds", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Package", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Node", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Avg_MHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Busy%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Bzy_MHz", "", 
0, 0, 0, NULL, 0 }, - { 0x0, "TSC_MHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "IRQ", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SMI", "", 32, 0, FORMAT_DELTA, NULL, 0 }, - { 0x0, "sysfs", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%c1", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%c3", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%c6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%c7", "", 0, 0, 0, NULL, 0 }, - { 0x0, "ThreadC", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CoreTmp", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CoreCnt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PkgTmp", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFX%rc6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFXMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc2", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc3", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc7", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc8", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc9", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pk%pc10", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%LPI", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SYS%LPI", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PkgWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CorWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFXWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PkgCnt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "RAMWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PKG_%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "RAM_%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Cor_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFX_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "RAM_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Mod%c6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Totl%C0", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Any%C0", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFX%C0", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPUGFX%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Core", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU", "", 0, 0, 0, NULL, 0 }, - { 0x0, "APIC", "", 0, 0, 0, NULL, 0 }, - { 0x0, "X2APIC", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Die", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFXAMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "IPC", "", 0, 0, 0, NULL, 
0 }, - { 0x0, "CoreThr", "", 0, 0, 0, NULL, 0 }, - { 0x0, "UncMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SAM%mc6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SAMMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SAMAMHz", "", 0, 0, 0, NULL, 0 }, + { 0x0, "usec", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Time_Of_Day_Seconds", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Package", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Node", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Avg_MHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Busy%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Bzy_MHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "TSC_MHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "IRQ", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SMI", NULL, 32, 0, FORMAT_DELTA, NULL, 0 }, + { 0x0, "sysfs", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c1", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c3", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c7", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "ThreadC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CoreTmp", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CoreCnt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PkgTmp", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFX%rc6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFXMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc2", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc3", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc7", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc8", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc9", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pk%pc10", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%LPI", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SYS%LPI", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PkgWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CorWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFXWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PkgCnt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "RAMWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PKG_%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "RAM_%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg_J", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Cor_J", NULL, 0, 0, 0, 
NULL, 0 }, + { 0x0, "GFX_J", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "RAM_J", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Mod%c6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Totl%C0", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Any%C0", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFX%C0", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPUGFX%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Core", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "APIC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "X2APIC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Die", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFXAMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "IPC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CoreThr", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "UncMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SAM%mc6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SAMMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SAMAMHz", NULL, 0, 0, 0, NULL, 0 }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -300,6 +306,9 @@ struct gfx_sysfs_info { static struct gfx_sysfs_info gfx_info[GFX_MAX]; int get_msr(int cpu, off_t offset, unsigned long long *msr); +int add_counter(unsigned int msr_num, char *path, char *name, + unsigned int width, enum counter_scope scope, + enum counter_type type, enum counter_format format, int flags, int package_num); /* Model specific support Start */ @@ -999,8 +1008,9 @@ char *progname; #define CPU_SUBSET_MAXCPUS 1024 /* need to use before probe... 
*/ cpu_set_t *cpu_present_set, *cpu_effective_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset; size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size; -#define MAX_ADDED_COUNTERS 8 #define MAX_ADDED_THREAD_COUNTERS 24 +#define MAX_ADDED_CORE_COUNTERS 8 +#define MAX_ADDED_PACKAGE_COUNTERS 16 #define BITMASK_SIZE 32 /* Indexes used to map data read from perf and MSRs into global variables */ @@ -1201,7 +1211,7 @@ struct core_data { struct rapl_counter core_energy; /* MSR_CORE_ENERGY_STAT */ unsigned int core_id; unsigned long long core_throt_cnt; - unsigned long long counter[MAX_ADDED_COUNTERS]; + unsigned long long counter[MAX_ADDED_CORE_COUNTERS]; } *core_even, *core_odd; struct pkg_data { @@ -1234,7 +1244,7 @@ struct pkg_data { struct rapl_counter rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ unsigned int pkg_temp_c; unsigned int uncore_mhz; - unsigned long long counter[MAX_ADDED_COUNTERS]; + unsigned long long counter[MAX_ADDED_PACKAGE_COUNTERS]; } *package_even, *package_odd; #define ODD_COUNTERS thread_odd, core_odd, package_odd @@ -1955,13 +1965,15 @@ void print_header(char *delim) if (mp->format == FORMAT_RAW) { if (mp->width == 64) outp += sprintf(outp, "%s%18.18s", delim, mp->name); - else + else if (mp->width == 32) outp += sprintf(outp, "%s%10.10s", delim, mp->name); + else + outp += sprintf(outp, "%s%7.7s", delim, mp->name); } else { if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns) outp += sprintf(outp, "%s%8s", delim, mp->name); else - outp += sprintf(outp, "%s%s", delim, mp->name); + outp += sprintf(outp, "%s%7.7s", delim, mp->name); } } @@ -1993,7 +2005,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { outp += sprintf(outp, "tADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, - t->counter[i], mp->path); + t->counter[i], mp->sp->path); } } @@ -2014,7 +2026,7 @@ int 
dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { outp += sprintf(outp, "cADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, - c->counter[i], mp->path); + c->counter[i], mp->sp->path); } outp += sprintf(outp, "mc6_us: %016llX\n", c->mc6_us); } @@ -2050,7 +2062,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { outp += sprintf(outp, "pADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, - p->counter[i], mp->path); + p->counter[i], mp->sp->path); } } @@ -2415,7 +2427,8 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->counter[i]); } else if (mp->format == FORMAT_PERCENT) { outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->counter[i] / tsc); - } + } else if (mp->type == COUNTER_K2M) + outp += sprintf(outp, "%s%d", (printed++ ? 
delim : ""), (unsigned int)p->counter[i] / 1000); } done: @@ -2525,6 +2538,8 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) old->counter[i] = new->counter[i]; + else if (mp->format == FORMAT_AVERAGE) + old->counter[i] = new->counter[i]; else old->counter[i] = new->counter[i] - old->counter[i]; } @@ -2997,7 +3012,7 @@ unsigned long long snapshot_sysfs_counter(char *path) return counter; } -int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) +int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp, char *counter_path) { if (mp->msr_num != 0) { assert(!no_msr); @@ -3007,11 +3022,11 @@ int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) char path[128 + PATH_BYTES]; if (mp->flags & SYSFS_PERCPU) { - sprintf(path, "/sys/devices/system/cpu/cpu%d/%s", cpu, mp->path); + sprintf(path, "/sys/devices/system/cpu/cpu%d/%s", cpu, mp->sp->path); *counterp = snapshot_sysfs_counter(path); } else { - *counterp = snapshot_sysfs_counter(mp->path); + *counterp = snapshot_sysfs_counter(counter_path); } } @@ -3486,6 +3501,18 @@ int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data return 0; } +char *find_sysfs_path_by_id(struct sysfs_path *sp, int id) +{ + while (sp) { + if (sp->id == id) + return (sp->path); + sp = sp->next; + } + if (debug) + warnx("%s: id%d not found", __func__, id); + return NULL; +} + /* * get_counters(...) 
* migrate to cpu @@ -3547,7 +3574,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) } for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { - if (get_mp(cpu, mp, &t->counter[i])) + if (get_mp(cpu, mp, &t->counter[i], mp->sp->path)) return -10; } @@ -3602,7 +3629,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) get_core_throt_cnt(cpu, &c->core_throt_cnt); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { - if (get_mp(cpu, mp, &c->counter[i])) + if (get_mp(cpu, mp, &c->counter[i], mp->sp->path)) return -10; } @@ -3694,7 +3721,16 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) p->sam_act_mhz = gfx_info[SAM_ACTMHz].val; for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { - if (get_mp(cpu, mp, &p->counter[i])) + char *path = NULL; + + if (mp->msr_num == 0) { + path = find_sysfs_path_by_id(mp->sp, p->package_id); + if (path == NULL) { + warnx("%s: package_id %d not found", __func__, p->package_id); + return -10; + } + } + if (get_mp(cpu, mp, &p->counter[i], path)) return -10; } done: @@ -5377,8 +5413,9 @@ static void probe_intel_uncore_frequency_legacy(void) static void probe_intel_uncore_frequency_cluster(void) { - int i; + int i, uncore_max_id; char path[256]; + char path_base[128]; if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00/current_freq_khz", R_OK)) return; @@ -5386,16 +5423,25 @@ static void probe_intel_uncore_frequency_cluster(void) if (quiet) return; - for (i = 0;; ++i) { + for (uncore_max_id = 0;; ++uncore_max_id) { + + sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", uncore_max_id); + + /* uncore## start at 00 and skips no numbers, so stop upon first missing */ + if (access(path_base, R_OK)) { + uncore_max_id -= 1; + break; + } + } + for (i = uncore_max_id; i >= 0; --i) { int k, l; - char path_base[128]; int package_id, domain_id, cluster_id; + char name_buf[16]; sprintf(path_base, 
"/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", i); - /* uncore## start at 00 and skip no numbers, so stop upon first missing */ if (access(path_base, R_OK)) - break; + err(1, "%s: %s\n", __func__, path_base); sprintf(path, "%s/package_id", path_base); package_id = read_sysfs_int(path); @@ -5422,6 +5468,11 @@ static void probe_intel_uncore_frequency_cluster(void) sprintf(path, "%s/current_freq_khz", path_base); k = read_sysfs_int(path); fprintf(outf, " %d MHz\n", k / 1000); + + sprintf(path, "%s/current_freq_khz", path_base); + sprintf(name_buf, "UMHz%d.%d", domain_id, cluster_id); + + add_counter(0, path, name_buf, 0, SCOPE_PACKAGE, COUNTER_K2M, FORMAT_AVERAGE, 0, package_id); } } @@ -7575,61 +7626,114 @@ void print_bootcmd(void) fclose(fp); } +struct msr_counter *find_msrp_by_name(struct msr_counter *head, char *name) +{ + struct msr_counter *mp; + + for (mp = head; mp; mp = mp->next) { + if (debug) + printf("%s: %s %s\n", __func__, name, mp->name); + if (!strncmp(name, mp->name, strlen(mp->name))) + return mp; + } + return NULL; +} + int add_counter(unsigned int msr_num, char *path, char *name, unsigned int width, enum counter_scope scope, - enum counter_type type, enum counter_format format, int flags) + enum counter_type type, enum counter_format format, int flags, int id) { struct msr_counter *msrp; if (no_msr && msr_num) errx(1, "Requested MSR counter 0x%x, but in --no-msr mode", msr_num); - msrp = calloc(1, sizeof(struct msr_counter)); - if (msrp == NULL) { - perror("calloc"); - exit(1); - } - - msrp->msr_num = msr_num; - strncpy(msrp->name, name, NAME_BYTES - 1); - if (path) - strncpy(msrp->path, path, PATH_BYTES - 1); - msrp->width = width; - msrp->type = type; - msrp->format = format; - msrp->flags = flags; + if (debug) + printf("%s(msr%d, %s, %s, width%d, scope%d, type%d, format%d, flags%x, id%d)\n", __func__, msr_num, + path, name, width, scope, type, format, flags, id); switch (scope) { case SCOPE_CPU: - msrp->next = sys.tp; - sys.tp = 
msrp; - sys.added_thread_counters++; - if (sys.added_thread_counters > MAX_ADDED_THREAD_COUNTERS) { - fprintf(stderr, "exceeded max %d added thread counters\n", MAX_ADDED_COUNTERS); - exit(-1); + msrp = find_msrp_by_name(sys.tp, name); + if (msrp) { + if (debug) + printf("%s: %s FOUND\n", __func__, name); + break; + } + if (sys.added_thread_counters++ >= MAX_ADDED_THREAD_COUNTERS) { + warnx("ignoring thread counter %s", name); + return -1; } break; - case SCOPE_CORE: - msrp->next = sys.cp; - sys.cp = msrp; - sys.added_core_counters++; - if (sys.added_core_counters > MAX_ADDED_COUNTERS) { - fprintf(stderr, "exceeded max %d added core counters\n", MAX_ADDED_COUNTERS); - exit(-1); + msrp = find_msrp_by_name(sys.cp, name); + if (msrp) { + if (debug) + printf("%s: %s FOUND\n", __func__, name); + break; + } + if (sys.added_core_counters++ >= MAX_ADDED_CORE_COUNTERS) { + warnx("ignoring core counter %s", name); + return -1; } break; - case SCOPE_PACKAGE: - msrp->next = sys.pp; - sys.pp = msrp; - sys.added_package_counters++; - if (sys.added_package_counters > MAX_ADDED_COUNTERS) { - fprintf(stderr, "exceeded max %d added package counters\n", MAX_ADDED_COUNTERS); - exit(-1); + msrp = find_msrp_by_name(sys.pp, name); + if (msrp) { + if (debug) + printf("%s: %s FOUND\n", __func__, name); + break; + } + if (sys.added_package_counters++ >= MAX_ADDED_PACKAGE_COUNTERS) { + warnx("ignoring package counter %s", name); + return -1; } break; + default: + warnx("ignoring counter %s with unknown scope", name); + return -1; + } + + if (msrp == NULL) { + msrp = calloc(1, sizeof(struct msr_counter)); + if (msrp == NULL) + err(-1, "calloc msr_counter"); + msrp->msr_num = msr_num; + strncpy(msrp->name, name, NAME_BYTES - 1); + msrp->width = width; + msrp->type = type; + msrp->format = format; + msrp->flags = flags; + + switch (scope) { + case SCOPE_CPU: + msrp->next = sys.tp; + sys.tp = msrp; + break; + case SCOPE_CORE: + msrp->next = sys.cp; + sys.cp = msrp; + break; + case SCOPE_PACKAGE: 
+ msrp->next = sys.pp; + sys.pp = msrp; + break; + } + } + + if (path) { + struct sysfs_path *sp; + + sp = calloc(1, sizeof(struct sysfs_path)); + if (sp == NULL) { + perror("calloc"); + exit(1); + } + strncpy(sp->path, path, PATH_BYTES - 1); + sp->id = id; + sp->next = msrp->sp; + msrp->sp = sp; } return 0; @@ -7731,7 +7835,7 @@ next: sprintf(name_buffer, "M0X%x%s", msr_num, format == FORMAT_PERCENT ? "%" : ""); } - if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0)) + if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0, 0)) fail++; if (fail) { @@ -7796,7 +7900,7 @@ void probe_sysfs(void) if (is_deferred_skip(name_buf)) continue; - add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_USEC, FORMAT_PERCENT, SYSFS_PERCPU); + add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_USEC, FORMAT_PERCENT, SYSFS_PERCPU, 0); } for (state = 10; state >= 0; --state) { @@ -7824,7 +7928,7 @@ void probe_sysfs(void) if (is_deferred_skip(name_buf)) continue; - add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_ITEMS, FORMAT_DELTA, SYSFS_PERCPU); + add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_ITEMS, FORMAT_DELTA, SYSFS_PERCPU, 0); } } From 3559ea813ad3a9627934325c68ad05b18008a077 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Mon, 6 May 2024 15:39:08 +0200 Subject: [PATCH 10/15] tools/power turbostat: Avoid possible memory corruption due to sparse topology IDs Save the highest core and package id when parsing topology to allocate enough memory when get_rapl_counters() is called with a core or a package id as a domain. Note that RAPL domains are per-package on Intel, but per-core on AMD. Thus, the RAPL code effectively runs in different modes on those two product lines. 
Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index abfddff6aebd..9ee9e4e5522c 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1052,6 +1052,7 @@ struct rapl_counter_info_t { /* struct rapl_counter_info_t for each RAPL domain */ struct rapl_counter_info_t *rapl_counter_info_perdomain; +unsigned int rapl_counter_info_perdomain_size; #define RAPL_COUNTER_FLAG_USE_MSR_SUM (1u << 1) @@ -1451,6 +1452,8 @@ struct topo_params { int allowed_cpus; int allowed_cores; int max_cpu_num; + int max_core_id; + int max_package_id; int max_die_id; int max_node_num; int nodes_per_pkg; @@ -3425,15 +3428,18 @@ void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci rc->scale = rci->scale[idx]; } -int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data *p) +int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct pkg_data *p) { unsigned long long perf_data[NUM_RAPL_COUNTERS + 1]; - struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain]; + struct rapl_counter_info_t *rci; if (debug) fprintf(stderr, "%s: cpu%d domain%d\n", __func__, cpu, domain); assert(rapl_counter_info_perdomain); + assert(domain < rapl_counter_info_perdomain_size); + + rci = &rapl_counter_info_perdomain[domain]; /* * If we have any perf counters to read, read them all now, in bulk @@ -4257,7 +4263,7 @@ void free_fd_rapl_percpu(void) if (!rapl_counter_info_perdomain) return; - const int num_domains = platform->has_per_core_rapl ? 
topo.num_cores : topo.num_packages; + const int num_domains = rapl_counter_info_perdomain_size; for (int domain_id = 0; domain_id < num_domains; ++domain_id) { if (rapl_counter_info_perdomain[domain_id].fd_perf != -1) @@ -4265,6 +4271,8 @@ void free_fd_rapl_percpu(void) } free(rapl_counter_info_perdomain); + rapl_counter_info_perdomain = NULL; + rapl_counter_info_perdomain_size = 0; } void free_all_buffers(void) @@ -6582,17 +6590,18 @@ void linux_perf_init(void) void rapl_perf_init(void) { - const int num_domains = platform->has_per_core_rapl ? topo.num_cores : topo.num_packages; + const unsigned int num_domains = (platform->has_per_core_rapl ? topo.max_core_id : topo.max_package_id) + 1; bool *domain_visited = calloc(num_domains, sizeof(bool)); rapl_counter_info_perdomain = calloc(num_domains, sizeof(*rapl_counter_info_perdomain)); if (rapl_counter_info_perdomain == NULL) err(-1, "calloc rapl_counter_info_percpu"); + rapl_counter_info_perdomain_size = num_domains; /* * Initialize rapl_counter_info_percpu */ - for (int domain_id = 0; domain_id < num_domains; ++domain_id) { + for (unsigned int domain_id = 0; domain_id < num_domains; ++domain_id) { struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain_id]; rci->fd_perf = -1; @@ -6612,7 +6621,7 @@ void rapl_perf_init(void) bool has_counter = 0; double scale; enum rapl_unit unit; - int next_domain; + unsigned int next_domain; memset(domain_visited, 0, num_domains * sizeof(*domain_visited)); @@ -6625,6 +6634,8 @@ void rapl_perf_init(void) next_domain = platform->has_per_core_rapl ? 
cpus[cpu].physical_core_id : cpus[cpu].physical_package_id; + assert(next_domain < num_domains); + if (domain_visited[next_domain]) continue; @@ -7207,6 +7218,8 @@ void topology_probe(bool startup) if (cpus[i].thread_id == 0) topo.num_cores++; } + topo.max_core_id = max_core_id; + topo.max_package_id = max_package_id; topo.cores_per_node = max_core_id + 1; if (debug > 1) From 1f9e46da9cba54d12880948fd2adac31bb0eaadb Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Mon, 11 Mar 2024 18:06:16 +0100 Subject: [PATCH 11/15] tools/power turbostat: Read Core-cstates via perf Reading the counters via perf can be done in bulk with a single syscall, making the counter values more accurate with respect to one another by minimizing the time gap between individual counter reads. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 383 ++++++++++++++++++++++---- 1 file changed, 337 insertions(+), 46 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9ee9e4e5522c..9d0278d17965 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -63,6 +63,7 @@ enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE }; enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR }; enum rapl_source { RAPL_SOURCE_NONE, RAPL_SOURCE_PERF, RAPL_SOURCE_MSR }; +enum cstate_source { CSTATE_SOURCE_NONE, CSTATE_SOURCE_PERF, CSTATE_SOURCE_MSR }; struct sysfs_path { char path[PATH_BYTES]; @@ -1183,6 +1184,77 @@ struct rapl_counter { double scale; }; +/* Indexes used to map data read from perf and MSRs into global variables */ +enum ccstate_rci_index { + CCSTATE_RCI_INDEX_C1_RESIDENCY = 0, + CCSTATE_RCI_INDEX_C3_RESIDENCY = 1, + CCSTATE_RCI_INDEX_C6_RESIDENCY = 2, + CCSTATE_RCI_INDEX_C7_RESIDENCY = 3, + NUM_CCSTATE_COUNTERS, +}; + +struct cstate_counter_info_t 
{ + unsigned long long data[NUM_CCSTATE_COUNTERS]; + enum cstate_source source[NUM_CCSTATE_COUNTERS]; + unsigned long long msr[NUM_CCSTATE_COUNTERS]; + int fd_perf; +}; + +struct cstate_counter_info_t *ccstate_counter_info; +unsigned int ccstate_counter_info_size; + +#define CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD (1u << 0) +#define CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY (1u << 1) + +struct cstate_counter_arch_info { + int feature_mask; /* Mask for testing if the counter is supported on host */ + const char *perf_subsys; + const char *perf_name; + unsigned long long msr; + unsigned int rci_index; /* Maps data from perf counters to global variables */ + unsigned long long bic; + unsigned long long flags; +}; + +static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { + { + .feature_mask = CC1, + .perf_subsys = "cstate_core", + .perf_name = "c1-residency", + .msr = MSR_CORE_C1_RES, + .rci_index = CCSTATE_RCI_INDEX_C1_RESIDENCY, + .bic = BIC_CPU_c1, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD, + }, + { + .feature_mask = CC3, + .perf_subsys = "cstate_core", + .perf_name = "c3-residency", + .msr = MSR_CORE_C3_RESIDENCY, + .rci_index = CCSTATE_RCI_INDEX_C3_RESIDENCY, + .bic = BIC_CPU_c3, + .flags = CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + }, + { + .feature_mask = CC6, + .perf_subsys = "cstate_core", + .perf_name = "c6-residency", + .msr = MSR_CORE_C6_RESIDENCY, + .rci_index = CCSTATE_RCI_INDEX_C6_RESIDENCY, + .bic = BIC_CPU_c6, + .flags = CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + }, + { + .feature_mask = CC7, + .perf_subsys = "cstate_core", + .perf_name = "c7-residency", + .msr = MSR_CORE_C7_RESIDENCY, + .rci_index = CCSTATE_RCI_INDEX_C7_RESIDENCY, + .bic = BIC_CPU_c7, + .flags = CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + }, +}; + struct thread_data { struct timeval tv_begin; struct timeval tv_end; @@ -1571,10 +1643,6 @@ static void bic_disable_msr_access(void) { const unsigned long bic_msrs = BIC_SMI | - BIC_CPU_c1 | - BIC_CPU_c3 | - BIC_CPU_c6 | - 
BIC_CPU_c7 | BIC_Mod_c6 | BIC_CoreTmp | BIC_Totl_c0 | @@ -3421,6 +3489,17 @@ size_t rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci) return ret; } +static size_t cstate_counter_info_count_perf(const struct cstate_counter_info_t *cci) +{ + size_t ret = 0; + + for (int i = 0; i < NUM_CCSTATE_COUNTERS; ++i) + if (cci->source[i] == CSTATE_SOURCE_PERF) + ++ret; + + return ret; +} + void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci, unsigned int idx) { rc->raw_value = rci->data[idx]; @@ -3519,6 +3598,90 @@ char *find_sysfs_path_by_id(struct sysfs_path *sp, int id) return NULL; } +int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_data *c) +{ + unsigned long long perf_data[NUM_CCSTATE_COUNTERS + 1]; + struct cstate_counter_info_t *cci; + + if (debug) + fprintf(stderr, "%s: cpu%d\n", __func__, cpu); + + assert(ccstate_counter_info); + assert(cpu <= ccstate_counter_info_size); + + cci = &ccstate_counter_info[cpu]; + + /* + * If we have any perf counters to read, read them all now, in bulk + */ + if (cci->fd_perf != -1) { + const size_t num_perf_counters = cstate_counter_info_count_perf(cci); + const ssize_t expected_read_size = + (num_perf_counters + 1) * sizeof(unsigned long long); + const ssize_t actual_read_size = + read(cci->fd_perf, &perf_data[0], sizeof(perf_data)); + + if (actual_read_size != expected_read_size) + err(-1, "%s: failed to read perf_data (%zu %zu)", + __func__, expected_read_size, actual_read_size); + } + + for (unsigned int i = 0, pi = 1; i < NUM_CCSTATE_COUNTERS; ++i) { + switch (cci->source[i]) { + case CSTATE_SOURCE_NONE: + break; + + case CSTATE_SOURCE_PERF: + assert(pi < ARRAY_SIZE(perf_data)); + assert(cci->fd_perf != -1); + + if (debug) { + fprintf(stderr, "cstate via %s %u: %llu\n", + "perf", i, perf_data[pi]); + } + + cci->data[i] = perf_data[pi]; + + ++pi; + break; + + case CSTATE_SOURCE_MSR: + assert(!no_msr); + if (get_msr(cpu, cci->msr[i], &cci->data[i])) + return 
-13 - i; + + if (debug) { + fprintf(stderr, "cstate via %s0x%llx %u: %llu\n", + "msr", cci->msr[i], i, cci->data[i]); + } + + break; + } + } + + /* + * Helper to write the data only if the source of + * the counter for the current cpu is not none. + * + * Otherwise we would overwrite core data with 0 (default value), + * when invoked for the thread sibling. + */ +#define PERF_COUNTER_WRITE_DATA(out_counter, index) do { \ + if (cci->source[index] != CSTATE_SOURCE_NONE) \ + out_counter = cci->data[index]; \ +} while (0) + + BUILD_BUG_ON(NUM_CCSTATE_COUNTERS != 4); + PERF_COUNTER_WRITE_DATA(t->c1, CCSTATE_RCI_INDEX_C1_RESIDENCY); + PERF_COUNTER_WRITE_DATA(c->c3, CCSTATE_RCI_INDEX_C3_RESIDENCY); + PERF_COUNTER_WRITE_DATA(c->c6, CCSTATE_RCI_INDEX_C6_RESIDENCY); + PERF_COUNTER_WRITE_DATA(c->c7, CCSTATE_RCI_INDEX_C7_RESIDENCY); + +#undef PERF_COUNTER_WRITE_DATA + + return 0; +} + /* * get_counters(...) * migrate to cpu @@ -3574,10 +3737,8 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -5; t->smi_count = msr & 0xFFFFFFFF; } - if (DO_BIC(BIC_CPU_c1) && platform->has_msr_core_c1_res) { - if (get_msr(cpu, MSR_CORE_C1_RES, &t->c1)) - return -6; - } + + get_cstate_counters(cpu, t, c); for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { if (get_mp(cpu, mp, &t->counter[i], mp->sp->path)) @@ -3594,31 +3755,14 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return status; } - if (DO_BIC(BIC_CPU_c3) || soft_c1_residency_display(BIC_CPU_c3)) { - if (get_msr(cpu, MSR_CORE_C3_RESIDENCY, &c->c3)) - return -6; - } - - if ((DO_BIC(BIC_CPU_c6) || soft_c1_residency_display(BIC_CPU_c6)) && !platform->has_msr_knl_core_c6_residency) { - if (get_msr(cpu, MSR_CORE_C6_RESIDENCY, &c->c6)) - return -7; - } else if (platform->has_msr_knl_core_c6_residency && soft_c1_residency_display(BIC_CPU_c6)) { - if (get_msr(cpu, MSR_KNL_CORE_C6_RESIDENCY, &c->c6)) - return -7; - } - - if (DO_BIC(BIC_CPU_c7) || 
soft_c1_residency_display(BIC_CPU_c7)) { - if (get_msr(cpu, MSR_CORE_C7_RESIDENCY, &c->c7)) - return -8; - else if (t->is_atom) { - /* - * For Atom CPUs that has core cstate deeper than c6, - * MSR_CORE_C6_RESIDENCY returns residency of cc6 and deeper. - * Minus CC7 (and deeper cstates) residency to get - * accturate cc6 residency. - */ - c->c6 -= c->c7; - } + if (DO_BIC(BIC_CPU_c7) && t->is_atom) { + /* + * For Atom CPUs that has core cstate deeper than c6, + * MSR_CORE_C6_RESIDENCY returns residency of cc6 and deeper. + * Minus CC7 (and deeper cstates) residency to get + * accturate cc6 residency. + */ + c->c6 -= c->c7; } if (DO_BIC(BIC_Mod_c6)) @@ -4258,6 +4402,23 @@ void free_fd_instr_count_percpu(void) fd_instr_count_percpu = NULL; } +void free_fd_cstate(void) +{ + if (!ccstate_counter_info) + return; + + const int counter_info_num = ccstate_counter_info_size; + + for (int counter_id = 0; counter_id < counter_info_num; ++counter_id) { + if (ccstate_counter_info[counter_id].fd_perf != -1) + close(ccstate_counter_info[counter_id].fd_perf); + } + + free(ccstate_counter_info); + ccstate_counter_info = NULL; + ccstate_counter_info_size = 0; +} + void free_fd_rapl_percpu(void) { if (!rapl_counter_info_perdomain) @@ -4319,6 +4480,7 @@ void free_all_buffers(void) free_fd_instr_count_percpu(); free_fd_amperf_percpu(); free_fd_rapl_percpu(); + free_fd_cstate(); free(irq_column_2_cpu); free(irqs_per_cpu); @@ -4654,6 +4816,7 @@ static void update_effective_set(bool startup) void linux_perf_init(void); void rapl_perf_init(void); +void cstate_perf_init(void); void re_initialize(void) { @@ -4661,6 +4824,7 @@ void re_initialize(void) setup_all_buffers(false); linux_perf_init(); rapl_perf_init(); + cstate_perf_init(); fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } @@ -6517,7 +6681,8 @@ bool is_aperf_access_required(void) return BIC_IS_ENABLED(BIC_Avg_MHz) || BIC_IS_ENABLED(BIC_Busy) || 
BIC_IS_ENABLED(BIC_Bzy_MHz) - || BIC_IS_ENABLED(BIC_IPC); + || BIC_IS_ENABLED(BIC_IPC) + || BIC_IS_ENABLED(BIC_CPU_c1); } int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai, @@ -6749,22 +6914,133 @@ static int has_amperf_access(void) return 0; } +int add_cstate_perf_counter_(int cpu, struct cstate_counter_info_t *cci, + const struct cstate_counter_arch_info *cai) +{ + if (no_perf) + return -1; + + const unsigned int type = read_perf_type(cai->perf_subsys); + const unsigned int config = read_rapl_config(cai->perf_subsys, cai->perf_name); + + const int fd_counter = + open_perf_counter(cpu, type, config, cci->fd_perf, PERF_FORMAT_GROUP); + + if (fd_counter == -1) + return -1; + + /* If it's the first counter opened, make it a group descriptor */ + if (cci->fd_perf == -1) + cci->fd_perf = fd_counter; + + return fd_counter; +} + +int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, + const struct cstate_counter_arch_info *cai) +{ + int ret = add_cstate_perf_counter_(cpu, cci, cai); + + if (debug) + fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu); + + return ret; +} + +void cstate_perf_init_(bool soft_c1) +{ + bool has_counter; + bool *cores_visited; + const int cores_visited_elems = topo.max_core_id + 1; + const int cci_num = topo.max_cpu_num + 1; + + ccstate_counter_info = calloc(cci_num, sizeof(*ccstate_counter_info)); + if (!ccstate_counter_info) + err(1, "calloc ccstate_counter_arch_info"); + ccstate_counter_info_size = cci_num; + + cores_visited = calloc(cores_visited_elems, sizeof(*cores_visited)); + if (!cores_visited) + err(1, "calloc cores_visited"); + + /* Initialize cstate_counter_info_percpu */ + for (int cpu = 0; cpu < cci_num; ++cpu) + ccstate_counter_info[cpu].fd_perf = -1; + + for (int cidx = 0; cidx < NUM_CCSTATE_COUNTERS; ++cidx) { + has_counter = false; + memset(cores_visited, 0, cores_visited_elems * sizeof(*cores_visited)); + + const struct 
cstate_counter_arch_info *cai = &ccstate_counter_arch_infos[cidx]; + + for (int cpu = 0; cpu < cci_num; ++cpu) { + + struct cstate_counter_info_t *const cci = &ccstate_counter_info[cpu]; + + if (cpu_is_not_allowed(cpu)) + continue; + + const int core_id = cpus[cpu].physical_core_id; + + assert(core_id < cores_visited_elems); + + const bool per_thread = cai->flags & CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD; + + if (!per_thread && cores_visited[core_id]) + continue; + + const bool counter_needed = BIC_IS_ENABLED(cai->bic) || + (soft_c1 && (cai->flags & CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY)); + const bool counter_supported = + platform->supported_cstates & cai->feature_mask; + + if (counter_needed && counter_supported) { + /* Use perf API for this counter */ + if (!no_perf && cai->perf_name + && add_cstate_perf_counter(cpu, cci, cai) != -1) { + + cci->source[cai->rci_index] = CSTATE_SOURCE_PERF; + + /* User MSR for this counter */ + } else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) { + cci->source[cai->rci_index] = CSTATE_SOURCE_MSR; + cci->msr[cai->rci_index] = cai->msr; + } + } + + if (cci->source[cai->rci_index] != CSTATE_SOURCE_NONE) { + has_counter = true; + cores_visited[core_id] = true; + } + } + + /* If any CPU has access to the counter, make it present */ + if (has_counter) + BIC_PRESENT(cai->bic); + } + + free(cores_visited); +} + +void cstate_perf_init(void) +{ + /* + * If we don't have a C1 residency MSR, we calculate it "in software", + * but we need APERF, MPERF too. 
+ */ + const bool soft_c1 = !platform->has_msr_core_c1_res && has_amperf_access() + && platform->supported_cstates & CC1; + + if (soft_c1) + BIC_PRESENT(BIC_CPU_c1); + + cstate_perf_init_(soft_c1); +} + void probe_cstates(void) { probe_cst_limit(); - if (platform->supported_cstates & CC1) - BIC_PRESENT(BIC_CPU_c1); - - if (platform->supported_cstates & CC3) - BIC_PRESENT(BIC_CPU_c3); - - if (platform->supported_cstates & CC6) - BIC_PRESENT(BIC_CPU_c6); - - if (platform->supported_cstates & CC7) - BIC_PRESENT(BIC_CPU_c7); - if (platform->supported_cstates & PC2 && (pkg_cstate_limit >= PCL__2)) BIC_PRESENT(BIC_Pkgpc2); @@ -7042,6 +7318,19 @@ void process_cpuid() BIC_PRESENT(BIC_TSC_MHz); } +static void counter_info_init(void) +{ + for (int i = 0; i < NUM_CCSTATE_COUNTERS; ++i) { + struct cstate_counter_arch_info *const cai = &ccstate_counter_arch_infos[i]; + + if (platform->has_msr_knl_core_c6_residency && cai->msr == MSR_CORE_C6_RESIDENCY) + cai->msr = MSR_KNL_CORE_C6_RESIDENCY; + + if (!platform->has_msr_core_c1_res && cai->msr == MSR_CORE_C1_RES) + cai->msr = 0; + } +} + void probe_pm_features(void) { probe_pstates(); @@ -7519,10 +7808,12 @@ void turbostat_init() check_msr_access(); check_perf_access(); process_cpuid(); + counter_info_init(); probe_pm_features(); set_amperf_source(); linux_perf_init(); rapl_perf_init(); + cstate_perf_init(); for_all_cpus(get_cpu_type, ODD_COUNTERS); for_all_cpus(get_cpu_type, EVEN_COUNTERS); From 0451adf4d46d5df91f888a3d010a4109aa23a7ae Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Wed, 8 May 2024 15:00:14 +0200 Subject: [PATCH 12/15] tools/power turbostat: Read Package-cstates via perf Reading the counters via perf can be done in bulk with a single syscall, making the counter values more accurate with respect to one another by minimizing the time gap between individual counter reads. 
Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 376 +++++++++++++++++--------- 1 file changed, 245 insertions(+), 131 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9d0278d17965..a3842e927799 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -224,6 +224,28 @@ unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC #define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT) #define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & COUNTER_BIT) +/* + * MSR_PKG_CST_CONFIG_CONTROL decoding for pkg_cstate_limit: + * If you change the values, note they are used both in comparisons + * (>= PCL__7) and to index pkg_cstate_limit_strings[]. + */ +#define PCLUKN 0 /* Unknown */ +#define PCLRSV 1 /* Reserved */ +#define PCL__0 2 /* PC0 */ +#define PCL__1 3 /* PC1 */ +#define PCL__2 4 /* PC2 */ +#define PCL__3 5 /* PC3 */ +#define PCL__4 6 /* PC4 */ +#define PCL__6 7 /* PC6 */ +#define PCL_6N 8 /* PC6 No Retention */ +#define PCL_6R 9 /* PC6 Retention */ +#define PCL__7 10 /* PC7 */ +#define PCL_7S 11 /* PC7 Shrink */ +#define PCL__8 12 /* PC8 */ +#define PCL__9 13 /* PC9 */ +#define PCL_10 14 /* PC10 */ +#define PCLUNL 15 /* Unlimited */ + struct amperf_group_fd; char *proc_stat = "/proc/stat"; @@ -1190,21 +1212,30 @@ enum ccstate_rci_index { CCSTATE_RCI_INDEX_C3_RESIDENCY = 1, CCSTATE_RCI_INDEX_C6_RESIDENCY = 2, CCSTATE_RCI_INDEX_C7_RESIDENCY = 3, - NUM_CCSTATE_COUNTERS, + PCSTATE_RCI_INDEX_C2_RESIDENCY = 4, + PCSTATE_RCI_INDEX_C3_RESIDENCY = 5, + PCSTATE_RCI_INDEX_C6_RESIDENCY = 6, + PCSTATE_RCI_INDEX_C7_RESIDENCY = 7, + PCSTATE_RCI_INDEX_C8_RESIDENCY = 8, + PCSTATE_RCI_INDEX_C9_RESIDENCY = 9, + PCSTATE_RCI_INDEX_C10_RESIDENCY = 10, + NUM_CSTATE_COUNTERS, }; struct cstate_counter_info_t { - unsigned long long data[NUM_CCSTATE_COUNTERS]; - enum cstate_source source[NUM_CCSTATE_COUNTERS]; - 
unsigned long long msr[NUM_CCSTATE_COUNTERS]; - int fd_perf; + unsigned long long data[NUM_CSTATE_COUNTERS]; + enum cstate_source source[NUM_CSTATE_COUNTERS]; + unsigned long long msr[NUM_CSTATE_COUNTERS]; + int fd_perf_core; + int fd_perf_pkg; }; struct cstate_counter_info_t *ccstate_counter_info; unsigned int ccstate_counter_info_size; -#define CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD (1u << 0) -#define CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY (1u << 1) +#define CSTATE_COUNTER_FLAG_COLLECT_PER_CORE (1u << 0) +#define CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD ((1u << 1) | CSTATE_COUNTER_FLAG_COLLECT_PER_CORE) +#define CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY (1u << 2) struct cstate_counter_arch_info { int feature_mask; /* Mask for testing if the counter is supported on host */ @@ -1214,6 +1245,7 @@ struct cstate_counter_arch_info { unsigned int rci_index; /* Maps data from perf counters to global variables */ unsigned long long bic; unsigned long long flags; + int pkg_cstate_limit; }; static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { @@ -1225,6 +1257,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { .rci_index = CCSTATE_RCI_INDEX_C1_RESIDENCY, .bic = BIC_CPU_c1, .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD, + .pkg_cstate_limit = 0, }, { .feature_mask = CC3, @@ -1233,7 +1266,8 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { .msr = MSR_CORE_C3_RESIDENCY, .rci_index = CCSTATE_RCI_INDEX_C3_RESIDENCY, .bic = BIC_CPU_c3, - .flags = CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .pkg_cstate_limit = 0, }, { .feature_mask = CC6, @@ -1242,7 +1276,8 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { .msr = MSR_CORE_C6_RESIDENCY, .rci_index = CCSTATE_RCI_INDEX_C6_RESIDENCY, .bic = BIC_CPU_c6, - .flags = CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | 
CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .pkg_cstate_limit = 0, }, { .feature_mask = CC7, @@ -1251,7 +1286,78 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { .msr = MSR_CORE_C7_RESIDENCY, .rci_index = CCSTATE_RCI_INDEX_C7_RESIDENCY, .bic = BIC_CPU_c7, - .flags = CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .pkg_cstate_limit = 0, + }, + { + .feature_mask = PC2, + .perf_subsys = "cstate_pkg", + .perf_name = "c2-residency", + .msr = MSR_PKG_C2_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C2_RESIDENCY, + .bic = BIC_Pkgpc2, + .flags = 0, + .pkg_cstate_limit = PCL__2, + }, + { + .feature_mask = PC3, + .perf_subsys = "cstate_pkg", + .perf_name = "c3-residency", + .msr = MSR_PKG_C3_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C3_RESIDENCY, + .bic = BIC_Pkgpc3, + .flags = 0, + .pkg_cstate_limit = PCL__3, + }, + { + .feature_mask = PC6, + .perf_subsys = "cstate_pkg", + .perf_name = "c6-residency", + .msr = MSR_PKG_C6_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C6_RESIDENCY, + .bic = BIC_Pkgpc6, + .flags = 0, + .pkg_cstate_limit = PCL__6, + }, + { + .feature_mask = PC7, + .perf_subsys = "cstate_pkg", + .perf_name = "c7-residency", + .msr = MSR_PKG_C7_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C7_RESIDENCY, + .bic = BIC_Pkgpc7, + .flags = 0, + .pkg_cstate_limit = PCL__7, + }, + { + .feature_mask = PC8, + .perf_subsys = "cstate_pkg", + .perf_name = "c8-residency", + .msr = MSR_PKG_C8_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C8_RESIDENCY, + .bic = BIC_Pkgpc8, + .flags = 0, + .pkg_cstate_limit = PCL__8, + }, + { + .feature_mask = PC9, + .perf_subsys = "cstate_pkg", + .perf_name = "c9-residency", + .msr = MSR_PKG_C9_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C9_RESIDENCY, + .bic = BIC_Pkgpc9, + .flags = 0, + .pkg_cstate_limit = PCL__9, + }, + { + .feature_mask = PC10, + .perf_subsys = "cstate_pkg", + .perf_name = "c10-residency", + .msr = MSR_PKG_C10_RESIDENCY, + 
.rci_index = PCSTATE_RCI_INDEX_C10_RESIDENCY, + .bic = BIC_Pkgpc10, + .flags = 0, + .pkg_cstate_limit = PCL_10, }, }; @@ -1641,15 +1747,8 @@ int get_msr_fd(int cpu) static void bic_disable_msr_access(void) { - const unsigned long bic_msrs = - BIC_SMI | - BIC_Mod_c6 | - BIC_CoreTmp | - BIC_Totl_c0 | - BIC_Any_c0 | - BIC_GFX_c0 | - BIC_CPUGFX | - BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_PkgTmp; + const unsigned long bic_msrs = BIC_SMI | BIC_Mod_c6 | BIC_CoreTmp | + BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_PkgTmp; bic_enabled &= ~bic_msrs; @@ -3493,7 +3592,7 @@ static size_t cstate_counter_info_count_perf(const struct cstate_counter_info_t { size_t ret = 0; - for (int i = 0; i < NUM_CCSTATE_COUNTERS; ++i) + for (int i = 0; i < NUM_CSTATE_COUNTERS; ++i) if (cci->source[i] == CSTATE_SOURCE_PERF) ++ret; @@ -3598,9 +3697,16 @@ char *find_sysfs_path_by_id(struct sysfs_path *sp, int id) return NULL; } -int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_data *c) +int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_data *c, struct pkg_data *p) { - unsigned long long perf_data[NUM_CCSTATE_COUNTERS + 1]; + /* + * Overcommit memory a little bit here, + * but skip calculating exact sizes for the buffers. 
+ */ + unsigned long long perf_data[NUM_CSTATE_COUNTERS]; + unsigned long long perf_data_core[NUM_CSTATE_COUNTERS + 1]; + unsigned long long perf_data_pkg[NUM_CSTATE_COUNTERS + 1]; + struct cstate_counter_info_t *cci; if (debug) @@ -3609,35 +3715,72 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat assert(ccstate_counter_info); assert(cpu <= ccstate_counter_info_size); + memset(perf_data, 0, sizeof(perf_data)); + memset(perf_data_core, 0, sizeof(perf_data_core)); + memset(perf_data_pkg, 0, sizeof(perf_data_pkg)); + cci = &ccstate_counter_info[cpu]; /* * If we have any perf counters to read, read them all now, in bulk */ - if (cci->fd_perf != -1) { - const size_t num_perf_counters = cstate_counter_info_count_perf(cci); - const ssize_t expected_read_size = - (num_perf_counters + 1) * sizeof(unsigned long long); - const ssize_t actual_read_size = - read(cci->fd_perf, &perf_data[0], sizeof(perf_data)); + const size_t num_perf_counters = cstate_counter_info_count_perf(cci); + ssize_t expected_read_size = num_perf_counters * sizeof(unsigned long long); + ssize_t actual_read_size_core = 0, actual_read_size_pkg = 0; - if (actual_read_size != expected_read_size) - err(-1, "%s: failed to read perf_data (%zu %zu)", - __func__, expected_read_size, actual_read_size); + if (cci->fd_perf_core != -1) { + /* Each descriptor read begins with number of counters read. */ + expected_read_size += sizeof(unsigned long long); + + actual_read_size_core = read(cci->fd_perf_core, &perf_data_core[0], sizeof(perf_data_core)); + + if (actual_read_size_core <= 0) + err(-1, "%s: read perf %s: %ld", __func__, "core", actual_read_size_core); } - for (unsigned int i = 0, pi = 1; i < NUM_CCSTATE_COUNTERS; ++i) { + if (cci->fd_perf_pkg != -1) { + /* Each descriptor read begins with number of counters read. 
*/ + expected_read_size += sizeof(unsigned long long); + + actual_read_size_pkg = read(cci->fd_perf_pkg, &perf_data_pkg[0], sizeof(perf_data_pkg)); + + if (actual_read_size_pkg <= 0) + err(-1, "%s: read perf %s: %ld", __func__, "pkg", actual_read_size_pkg); + } + + const ssize_t actual_read_size_total = actual_read_size_core + actual_read_size_pkg; + + if (actual_read_size_total != expected_read_size) + err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size, actual_read_size_total); + + /* + * Copy ccstate and pcstate data into unified buffer. + * + * Skip first element from core and pkg buffers. + * Kernel puts there how many counters were read. + */ + const size_t num_core_counters = perf_data_core[0]; + const size_t num_pkg_counters = perf_data_pkg[0]; + + assert(num_perf_counters == num_core_counters + num_pkg_counters); + + /* Copy ccstate perf data */ + memcpy(&perf_data[0], &perf_data_core[1], num_core_counters * sizeof(unsigned long long)); + + /* Copy pcstate perf data */ + memcpy(&perf_data[num_core_counters], &perf_data_pkg[1], num_pkg_counters * sizeof(unsigned long long)); + + for (unsigned int i = 0, pi = 0; i < NUM_CSTATE_COUNTERS; ++i) { switch (cci->source[i]) { case CSTATE_SOURCE_NONE: break; case CSTATE_SOURCE_PERF: assert(pi < ARRAY_SIZE(perf_data)); - assert(cci->fd_perf != -1); + assert(cci->fd_perf_core != -1 || cci->fd_perf_pkg != -1); if (debug) { - fprintf(stderr, "cstate via %s %u: %llu\n", - "perf", i, perf_data[pi]); + fprintf(stderr, "cstate via %s %u: %llu\n", "perf", i, perf_data[pi]); } cci->data[i] = perf_data[pi]; @@ -3651,8 +3794,7 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat return -13 - i; if (debug) { - fprintf(stderr, "cstate via %s0x%llx %u: %llu\n", - "msr", cci->msr[i], i, cci->data[i]); + fprintf(stderr, "cstate via %s0x%llx %u: %llu\n", "msr", cci->msr[i], i, cci->data[i]); } break; @@ -3671,12 +3813,21 @@ int get_cstate_counters(unsigned int cpu, struct 
thread_data *t, struct core_dat out_counter = cci->data[index]; \ } while (0) - BUILD_BUG_ON(NUM_CCSTATE_COUNTERS != 4); + BUILD_BUG_ON(NUM_CSTATE_COUNTERS != 11); + PERF_COUNTER_WRITE_DATA(t->c1, CCSTATE_RCI_INDEX_C1_RESIDENCY); PERF_COUNTER_WRITE_DATA(c->c3, CCSTATE_RCI_INDEX_C3_RESIDENCY); PERF_COUNTER_WRITE_DATA(c->c6, CCSTATE_RCI_INDEX_C6_RESIDENCY); PERF_COUNTER_WRITE_DATA(c->c7, CCSTATE_RCI_INDEX_C7_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc2, PCSTATE_RCI_INDEX_C2_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc3, PCSTATE_RCI_INDEX_C3_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc6, PCSTATE_RCI_INDEX_C6_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc7, PCSTATE_RCI_INDEX_C7_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc8, PCSTATE_RCI_INDEX_C8_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc9, PCSTATE_RCI_INDEX_C9_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc10, PCSTATE_RCI_INDEX_C10_RESIDENCY); + #undef PERF_COUNTER_WRITE_DATA return 0; @@ -3738,7 +3889,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) t->smi_count = msr & 0xFFFFFFFF; } - get_cstate_counters(cpu, t, c); + get_cstate_counters(cpu, t, c, p); for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { if (get_mp(cpu, mp, &t->counter[i], mp->sp->path)) @@ -3803,34 +3954,6 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_PKG_BOTH_CORE_GFXE_C0_RES, &p->pkg_both_core_gfxe_c0)) return -13; } - if (DO_BIC(BIC_Pkgpc3)) - if (get_msr(cpu, MSR_PKG_C3_RESIDENCY, &p->pc3)) - return -9; - if (DO_BIC(BIC_Pkgpc6)) { - if (platform->has_msr_atom_pkg_c6_residency) { - if (get_msr(cpu, MSR_ATOM_PKG_C6_RESIDENCY, &p->pc6)) - return -10; - } else { - if (get_msr(cpu, MSR_PKG_C6_RESIDENCY, &p->pc6)) - return -10; - } - } - - if (DO_BIC(BIC_Pkgpc2)) - if (get_msr(cpu, MSR_PKG_C2_RESIDENCY, &p->pc2)) - return -11; - if (DO_BIC(BIC_Pkgpc7)) - if (get_msr(cpu, MSR_PKG_C7_RESIDENCY, &p->pc7)) - return -12; - if (DO_BIC(BIC_Pkgpc8)) - if (get_msr(cpu, 
MSR_PKG_C8_RESIDENCY, &p->pc8)) - return -13; - if (DO_BIC(BIC_Pkgpc9)) - if (get_msr(cpu, MSR_PKG_C9_RESIDENCY, &p->pc9)) - return -13; - if (DO_BIC(BIC_Pkgpc10)) - if (get_msr(cpu, MSR_PKG_C10_RESIDENCY, &p->pc10)) - return -13; if (DO_BIC(BIC_CPU_LPI)) p->cpu_lpi = cpuidle_cur_cpu_lpi_us; @@ -3889,29 +4012,6 @@ done: return 0; } -/* - * MSR_PKG_CST_CONFIG_CONTROL decoding for pkg_cstate_limit: - * If you change the values, note they are used both in comparisons - * (>= PCL__7) and to index pkg_cstate_limit_strings[]. - */ - -#define PCLUKN 0 /* Unknown */ -#define PCLRSV 1 /* Reserved */ -#define PCL__0 2 /* PC0 */ -#define PCL__1 3 /* PC1 */ -#define PCL__2 4 /* PC2 */ -#define PCL__3 5 /* PC3 */ -#define PCL__4 6 /* PC4 */ -#define PCL__6 7 /* PC6 */ -#define PCL_6N 8 /* PC6 No Retention */ -#define PCL_6R 9 /* PC6 Retention */ -#define PCL__7 10 /* PC7 */ -#define PCL_7S 11 /* PC7 Shrink */ -#define PCL__8 12 /* PC8 */ -#define PCL__9 13 /* PC9 */ -#define PCL_10 14 /* PC10 */ -#define PCLUNL 15 /* Unlimited */ - int pkg_cstate_limit = PCLUKN; char *pkg_cstate_limit_strings[] = { "reserved", "unknown", "pc0", "pc1", "pc2", "pc3", "pc4", "pc6", "pc6n", "pc6r", "pc7", "pc7s", "pc8", "pc9", "pc10", "unlimited" @@ -4410,8 +4510,11 @@ void free_fd_cstate(void) const int counter_info_num = ccstate_counter_info_size; for (int counter_id = 0; counter_id < counter_info_num; ++counter_id) { - if (ccstate_counter_info[counter_id].fd_perf != -1) - close(ccstate_counter_info[counter_id].fd_perf); + if (ccstate_counter_info[counter_id].fd_perf_core != -1) + close(ccstate_counter_info[counter_id].fd_perf_core); + + if (ccstate_counter_info[counter_id].fd_perf_pkg != -1) + close(ccstate_counter_info[counter_id].fd_perf_pkg); } free(ccstate_counter_info); @@ -6914,30 +7017,43 @@ static int has_amperf_access(void) return 0; } -int add_cstate_perf_counter_(int cpu, struct cstate_counter_info_t *cci, - const struct cstate_counter_arch_info *cai) +int 
*get_cstate_perf_group_fd(struct cstate_counter_info_t *cci, const char *group_name) +{ + if (strcmp(group_name, "cstate_core") == 0) + return &cci->fd_perf_core; + + if (strcmp(group_name, "cstate_pkg") == 0) + return &cci->fd_perf_pkg; + + return NULL; +} + +int add_cstate_perf_counter_(int cpu, struct cstate_counter_info_t *cci, const struct cstate_counter_arch_info *cai) { if (no_perf) return -1; + int *pfd_group = get_cstate_perf_group_fd(cci, cai->perf_subsys); + + if (pfd_group == NULL) + return -1; + const unsigned int type = read_perf_type(cai->perf_subsys); const unsigned int config = read_rapl_config(cai->perf_subsys, cai->perf_name); - const int fd_counter = - open_perf_counter(cpu, type, config, cci->fd_perf, PERF_FORMAT_GROUP); + const int fd_counter = open_perf_counter(cpu, type, config, *pfd_group, PERF_FORMAT_GROUP); if (fd_counter == -1) return -1; /* If it's the first counter opened, make it a group descriptor */ - if (cci->fd_perf == -1) - cci->fd_perf = fd_counter; + if (*pfd_group == -1) + *pfd_group = fd_counter; return fd_counter; } -int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, - const struct cstate_counter_arch_info *cai) +int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, const struct cstate_counter_arch_info *cai) { int ret = add_cstate_perf_counter_(cpu, cci, cai); @@ -6950,8 +7066,9 @@ int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, void cstate_perf_init_(bool soft_c1) { bool has_counter; - bool *cores_visited; + bool *cores_visited = NULL, *pkg_visited = NULL; const int cores_visited_elems = topo.max_core_id + 1; + const int pkg_visited_elems = topo.max_package_id + 1; const int cci_num = topo.max_cpu_num + 1; ccstate_counter_info = calloc(cci_num, sizeof(*ccstate_counter_info)); @@ -6963,13 +7080,20 @@ void cstate_perf_init_(bool soft_c1) if (!cores_visited) err(1, "calloc cores_visited"); - /* Initialize cstate_counter_info_percpu */ - for (int cpu = 0; cpu < 
cci_num; ++cpu) - ccstate_counter_info[cpu].fd_perf = -1; + pkg_visited = calloc(pkg_visited_elems, sizeof(*pkg_visited)); + if (!pkg_visited) + err(1, "calloc pkg_visited"); - for (int cidx = 0; cidx < NUM_CCSTATE_COUNTERS; ++cidx) { + /* Initialize cstate_counter_info_percpu */ + for (int cpu = 0; cpu < cci_num; ++cpu) { + ccstate_counter_info[cpu].fd_perf_core = -1; + ccstate_counter_info[cpu].fd_perf_pkg = -1; + } + + for (int cidx = 0; cidx < NUM_CSTATE_COUNTERS; ++cidx) { has_counter = false; memset(cores_visited, 0, cores_visited_elems * sizeof(*cores_visited)); + memset(pkg_visited, 0, pkg_visited_elems * sizeof(*pkg_visited)); const struct cstate_counter_arch_info *cai = &ccstate_counter_arch_infos[cidx]; @@ -6981,23 +7105,29 @@ void cstate_perf_init_(bool soft_c1) continue; const int core_id = cpus[cpu].physical_core_id; + const int pkg_id = cpus[cpu].physical_package_id; assert(core_id < cores_visited_elems); + assert(pkg_id < pkg_visited_elems); const bool per_thread = cai->flags & CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD; + const bool per_core = cai->flags & CSTATE_COUNTER_FLAG_COLLECT_PER_CORE; if (!per_thread && cores_visited[core_id]) continue; + if (!per_core && pkg_visited[pkg_id]) + continue; + const bool counter_needed = BIC_IS_ENABLED(cai->bic) || (soft_c1 && (cai->flags & CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY)); const bool counter_supported = - platform->supported_cstates & cai->feature_mask; + (platform->supported_cstates & cai->feature_mask) && + (pkg_cstate_limit >= cai->pkg_cstate_limit); if (counter_needed && counter_supported) { /* Use perf API for this counter */ - if (!no_perf && cai->perf_name - && add_cstate_perf_counter(cpu, cci, cai) != -1) { + if (!no_perf && cai->perf_name && add_cstate_perf_counter(cpu, cci, cai) != -1) { cci->source[cai->rci_index] = CSTATE_SOURCE_PERF; @@ -7011,6 +7141,7 @@ void cstate_perf_init_(bool soft_c1) if (cci->source[cai->rci_index] != CSTATE_SOURCE_NONE) { has_counter = true; cores_visited[core_id] = 
true; + pkg_visited[pkg_id] = true; } } @@ -7020,6 +7151,7 @@ void cstate_perf_init_(bool soft_c1) } free(cores_visited); + free(pkg_visited); } void cstate_perf_init(void) @@ -7029,7 +7161,7 @@ void cstate_perf_init(void) * but we need APERF, MPERF too. */ const bool soft_c1 = !platform->has_msr_core_c1_res && has_amperf_access() - && platform->supported_cstates & CC1; + && platform->supported_cstates & CC1; if (soft_c1) BIC_PRESENT(BIC_CPU_c1); @@ -7041,27 +7173,6 @@ void probe_cstates(void) { probe_cst_limit(); - if (platform->supported_cstates & PC2 && (pkg_cstate_limit >= PCL__2)) - BIC_PRESENT(BIC_Pkgpc2); - - if (platform->supported_cstates & PC3 && (pkg_cstate_limit >= PCL__3)) - BIC_PRESENT(BIC_Pkgpc3); - - if (platform->supported_cstates & PC6 && (pkg_cstate_limit >= PCL__6)) - BIC_PRESENT(BIC_Pkgpc6); - - if (platform->supported_cstates & PC7 && (pkg_cstate_limit >= PCL__7)) - BIC_PRESENT(BIC_Pkgpc7); - - if (platform->supported_cstates & PC8 && (pkg_cstate_limit >= PCL__8)) - BIC_PRESENT(BIC_Pkgpc8); - - if (platform->supported_cstates & PC9 && (pkg_cstate_limit >= PCL__9)) - BIC_PRESENT(BIC_Pkgpc9); - - if (platform->supported_cstates & PC10 && (pkg_cstate_limit >= PCL_10)) - BIC_PRESENT(BIC_Pkgpc10); - if (platform->has_msr_module_c6_res_ms) BIC_PRESENT(BIC_Mod_c6); @@ -7320,7 +7431,7 @@ void process_cpuid() static void counter_info_init(void) { - for (int i = 0; i < NUM_CCSTATE_COUNTERS; ++i) { + for (int i = 0; i < NUM_CSTATE_COUNTERS; ++i) { struct cstate_counter_arch_info *const cai = &ccstate_counter_arch_infos[i]; if (platform->has_msr_knl_core_c6_residency && cai->msr == MSR_CORE_C6_RESIDENCY) @@ -7328,6 +7439,9 @@ static void counter_info_init(void) if (!platform->has_msr_core_c1_res && cai->msr == MSR_CORE_C1_RES) cai->msr = 0; + + if (platform->has_msr_atom_pkg_c6_residency && cai->msr == MSR_PKG_C6_RESIDENCY) + cai->msr = MSR_ATOM_PKG_C6_RESIDENCY; } } From 4e7ee02300805d26d9731fd24c4de8e10a43ffea Mon Sep 17 00:00:00 2001 From: Patryk 
Wlazlyn Date: Thu, 9 May 2024 12:24:02 +0200 Subject: [PATCH 13/15] tools/power turbostat: Fix order of strings in pkg_cstate_limit_strings Change the order so that it matches the indexes defined by the PCL* macros (PCLUKN is 0, PCLRSV is 1), which are used to index pkg_cstate_limit_strings[]. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a3842e927799..b45b2f494416 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4013,7 +4013,7 @@ done: } int pkg_cstate_limit = PCLUKN; -char *pkg_cstate_limit_strings[] = { "reserved", "unknown", "pc0", "pc1", "pc2", +char *pkg_cstate_limit_strings[] = { "unknown", "reserved", "pc0", "pc1", "pc2", "pc3", "pc4", "pc6", "pc6n", "pc6r", "pc7", "pc7s", "pc8", "pc9", "pc10", "unlimited" }; From 29fea61cd8d4d0e646022c0479aa35381cf1e990 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 9 May 2024 12:39:47 +0200 Subject: [PATCH 14/15] tools/power turbostat: Ignore pkg_cstate_limit when it is not available When running in no-msr mode, the pkg_cstate_limit is not populated, thus we use perf to determine if given pcstate counter is present on the platform.
Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b45b2f494416..8cb18d42c189 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -7121,9 +7121,7 @@ void cstate_perf_init_(bool soft_c1) const bool counter_needed = BIC_IS_ENABLED(cai->bic) || (soft_c1 && (cai->flags & CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY)); - const bool counter_supported = - (platform->supported_cstates & cai->feature_mask) && - (pkg_cstate_limit >= cai->pkg_cstate_limit); + const bool counter_supported = (platform->supported_cstates & cai->feature_mask); if (counter_needed && counter_supported) { /* Use perf API for this counter */ @@ -7132,7 +7130,8 @@ void cstate_perf_init_(bool soft_c1) cci->source[cai->rci_index] = CSTATE_SOURCE_PERF; /* User MSR for this counter */ - } else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) { + } else if (!no_msr && cai->msr && pkg_cstate_limit >= cai->pkg_cstate_limit + && probe_msr(cpu, cai->msr) == 0) { cci->source[cai->rci_index] = CSTATE_SOURCE_MSR; cci->msr[cai->rci_index] = cai->msr; } From 256d218ec6aea99855dc5c54af550fcff96fc732 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 27 Apr 2024 22:15:48 -0400 Subject: [PATCH 15/15] tools/power turbostat: version 2024.05.10 New since 2024.04.08: Len Brown (6): tools/power turbostat: Add "snapshot:" Makefile target tools/power turbostat: Harden probe_intel_uncore_frequency() tools/power turbostat: Remember global max_die_id tools/power turbostat: Survive sparse die_id tools/power turbostat: Add columns for clustered uncore frequency tools/power turbostat: version 2024.05.10 Patryk Wlazlyn (7): tools/power turbostat: Replace _Static_assert with BUILD_BUG_ON tools/power turbostat: Enable non-privileged users to read sysfs counters tools/power turbostat: 
Avoid possible memory corruption due to sparse topology IDs tools/power turbostat: Read Core-cstates via perf tools/power turbostat: Read Package-cstates via perf tools/power turbostat: Fix order of strings in pkg_cstate_limit_strings tools/power turbostat: Ignore pkg_cstate_limit when it is not available Zhang Rui (2): tools/power turbostat: Enhance ARL/LNL support tools/power turbostat: Add ARL-H support Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 8cb18d42c189..8cdf41906e98 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -8017,7 +8017,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2024.04.08 - Len Brown \n"); + fprintf(outf, "turbostat version 2024.05.10 - Len Brown \n"); } #define COMMAND_LINE_SIZE 2048