Some platforms have a 'cluster' topology, and CPUs in a cluster share resources like the L3 Cache Tag (for HiSilicon Kunpeng SoC) or L2 cache (for Intel Jacobsville). Parsing and building the cluster topology has been supported since [1]. perf stat already supports aggregation for other topologies like die or socket, etc. It'll be useful to aggregate per-cluster to find problems like L3T bandwidth contention. This patch adds support for the "--per-cluster" option for per-cluster aggregation. Also update the docs and the related test. The output will be like: [root@localhost tmp]# perf stat -a -e LLC-load --per-cluster -- sleep 5 Performance counter stats for 'system wide': S56-D0-CLS158 4 1,321,521,570 LLC-load S56-D0-CLS594 4 794,211,453 LLC-load S56-D0-CLS1030 4 41,623 LLC-load S56-D0-CLS1466 4 41,646 LLC-load S56-D0-CLS1902 4 16,863 LLC-load S56-D0-CLS2338 4 15,721 LLC-load S56-D0-CLS2774 4 22,671 LLC-load [...] On a legacy system without clusters or cluster support, the output will look like: [root@localhost perf]# perf stat -a -e cycles --per-cluster -- sleep 1 Performance counter stats for 'system wide': S56-D0-CLS0 64 18,011,485 cycles S7182-D0-CLS0 64 16,548,835 cycles Note that this patch doesn't mix the cluster information into the output of --per-core, to avoid breaking any tools/scripts using it. Note that perf recently added support for "--per-cache" aggregation, but that is not the same as the cluster, although cluster CPUs may share some cache resources. 
For example on my machine all clusters within a die share the same L3 cache: $ cat /sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list 0-31 $ cat /sys/devices/system/cpu/cpu0/topology/cluster_cpus_list 0-3 [1] commit c5e22feffdd7 ("topology: Represent clusters of CPUs within a die") Tested-by: Jie Zhan <zhanjie9@hisilicon.com> Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com> Reviewed-by: Ian Rogers <irogers@google.com> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com> Cc: james.clark@arm.com Cc: 21cnbao@gmail.com Cc: prime.zeng@hisilicon.com Cc: Jonathan.Cameron@huawei.com Cc: fanghao11@huawei.com Cc: linuxarm@huawei.com Cc: tim.c.chen@intel.com Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Namhyung Kim <namhyung@kernel.org> Link: https://lore.kernel.org/r/20240208024026.2691-1-yangyicong@huawei.com
111 lines
2.9 KiB
Bash
Executable File
111 lines
2.9 KiB
Bash
Executable File
#!/bin/bash
# perf stat STD output linter
# SPDX-License-Identifier: GPL-2.0
# Tests various perf stat STD output commands for
# default event and metricgroup

set -e

# shellcheck source=lib/stat_output.sh
. "$(dirname "$0")"/lib/stat_output.sh

# Scratch file: handed to the checks as the "-o" argument and read back by
# commachecker.
stat_output=$(mktemp /tmp/__perf_test.stat_output.std.XXXXX)

# event_name[i] is an event accepted in the output; event_metric[i] is the
# metric text expected after '#' on a line reporting that event.
event_name=(cpu-clock task-clock context-switches cpu-migrations page-faults stalled-cycles-frontend stalled-cycles-backend cycles instructions branches branch-misses)
event_metric=("CPUs utilized" "CPUs utilized" "/sec" "/sec" "/sec" "frontend cycles idle" "backend cycles idle" "GHz" "insn per cycle" "/sec" "of all branches")
# Lines whose metric part contains one of these substrings are not checked.
skip_metric=("stalled cycles per insn" "tma_" "retiring" "frontend_bound" "bad_speculation" "backend_bound")

# Remove the scratch output file and uninstall the EXIT/TERM/INT handlers so
# that a later exit keeps its own status.
# Globals: stat_output (read) - path of the file to delete
cleanup() {
	rm -f "${stat_output}"

	trap - EXIT TERM INT
}

# Handler for an unexpected EXIT/TERM/INT: clean up, then report failure.
# Never returns; always exits the shell with status 1.
trap_cleanup() {
	cleanup
	exit 1
}
# Any exit before the handlers are uninstalled goes through trap_cleanup.
trap trap_cleanup EXIT TERM INT

# Lint the perf stat STD output saved in "${stat_output}".
#
# Globals:
#   stat_output  (read) file holding the output to check
#   event_name   (read) event names accepted in the output
#   event_metric (read) metric text expected for the event at the same index
#   skip_metric  (read) metric substrings whose lines are not checked
# Arguments:
#   $1 - perf stat option under test; it selects the first whitespace-separated
#        field that is kept for checking (field 1 when the option is not listed)
# Returns:
#   0 once the "time elapsed" line or the end of the file is reached.
#   Exits the shell with status 1, with a message on stderr, on an unknown
#   event name or an unexpected metric.
function commachecker()
{
	local prefix=1
	local line main_body counts metric idx skip found
	local -a fields

	case "$1" in
	"--interval" | "--per-thread" | "--system-wide-no-aggr")
		prefix=2
		;;
	"--per-core" | "--per-socket" | "--per-node" | "--per-die" | \
	"--per-cache" | "--per-cluster")
		prefix=3
		;;
	esac

	# read strips leading/trailing whitespace (default IFS), which the
	# prefix comparisons below rely on; -r keeps backslashes intact.
	while read -r line
	do
		# Ignore initial "started on" comment.
		[ "${line:0:1}" = "#" ] && continue
		# Ignore initial blank line.
		[[ -z "$line" ]] && continue
		# Ignore "Performance counter stats"
		[ "${line:0:25}" = "Performance counter stats" ] && continue
		# Ignore "seconds time elapsed" and break
		[[ "$line" == *"time elapsed"* ]] && break

		# Split on whitespace and rejoin with single spaces from field
		# $prefix onwards. Done with read -a so the line is never
		# glob-expanded or taken as echo options.
		read -r -a fields <<< "$line"
		if [ "${#fields[@]}" -le 1 ]
		then
			# A single-token line is kept whole so it still gets
			# validated below.
			main_body="${fields[0]}"
		else
			main_body="${fields[*]:$((prefix - 1))}"
		fi

		# Text before the last '#'; empty means there are no counts.
		counts=${main_body%#*}
		[[ -z "$counts" ]] && continue

		# Skip metrics without event name
		metric=${main_body#*#}
		skip=0
		for idx in "${!skip_metric[@]}"; do
			if [[ "$metric" == *"${skip_metric[$idx]}"* ]]; then
				skip=1
				break
			fi
		done
		(( skip )) && continue

		# Check default event; idx is left on the first matching entry.
		found=0
		for idx in "${!event_name[@]}"; do
			if [[ "$counts" == *"${event_name[$idx]}"* ]]; then
				found=1
				break
			fi
		done
		if (( ! found )); then
			echo "Unknown event name in $line" 1>&2
			exit 1
		fi

		# Check event metric if it exists
		[[ "$main_body" == *"#"* ]] || continue
		if [[ "$main_body" != *"${event_metric[$idx]}"* ]]; then
			echo "wrong event metric. expected ${event_metric[$idx]} in $line" 1>&2
			exit 1
		fi
	done < "${stat_output}"
	return 0
}

# Extra argument string handed to every check below.
perf_cmd="-o ${stat_output}"

# The check_* helpers are not defined in this file; presumably they come from
# the sourced lib/stat_output.sh.
skip_test=$(check_for_topology)
check_no_args "STD" "$perf_cmd"
check_system_wide "STD" "$perf_cmd"
check_interval "STD" "$perf_cmd"
check_per_thread "STD" "$perf_cmd"
check_per_node "STD" "$perf_cmd"
if [ "$skip_test" -ne 1 ]
then
	check_system_wide_no_aggr "STD" "$perf_cmd"
	check_per_core "STD" "$perf_cmd"
	check_per_cache_instance "STD" "$perf_cmd"
	check_per_cluster "STD" "$perf_cmd"
	check_per_die "STD" "$perf_cmd"
	check_per_socket "STD" "$perf_cmd"
else
	echo "[Skip] Skipping tests for system_wide_no_aggr, per_core, per_cache_instance, per_cluster, per_die and per_socket since socket id exposed via topology is invalid"
fi
# Success path: remove the scratch file and the handlers, then exit 0.
cleanup
exit 0