diff --git a/tools/perf/Documentation/perf-c2c.txt b/tools/perf/Documentation/perf-c2c.txt index 4e8c263e1721..856f0dfb8e5a 100644 --- a/tools/perf/Documentation/perf-c2c.txt +++ b/tools/perf/Documentation/perf-c2c.txt @@ -130,6 +130,12 @@ REPORT OPTIONS The known limitations include exception handing such as setjmp/longjmp will have calls/returns not match. +--double-cl:: + Group the detection of shared cacheline events into double cacheline + granularity. Some architectures have an Adjacent Cacheline Prefetch + feature, which causes cacheline sharing to behave like the cacheline + size is doubled. + C2C RECORD ---------- The perf c2c record command setup options related to HITM cacheline analysis diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index 52d94c7dd836..56974eae0638 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -524,7 +524,7 @@ static int dcacheline_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, char buf[20]; if (he->mem_info) - addr = cl_address(he->mem_info->daddr.addr); + addr = cl_address(he->mem_info->daddr.addr, chk_double_cl); return scnprintf(hpp->buf, hpp->size, "%*s", width, HEX_STR(buf, addr)); } @@ -562,7 +562,7 @@ static int offset_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, char buf[20]; if (he->mem_info) - addr = cl_offset(he->mem_info->daddr.al_addr); + addr = cl_offset(he->mem_info->daddr.al_addr, chk_double_cl); return scnprintf(hpp->buf, hpp->size, "%*s", width, HEX_STR(buf, addr)); } @@ -574,9 +574,10 @@ offset_cmp(struct perf_hpp_fmt *fmt __maybe_unused, uint64_t l = 0, r = 0; if (left->mem_info) - l = cl_offset(left->mem_info->daddr.addr); + l = cl_offset(left->mem_info->daddr.addr, chk_double_cl); + if (right->mem_info) - r = cl_offset(right->mem_info->daddr.addr); + r = cl_offset(right->mem_info->daddr.addr, chk_double_cl); return (int64_t)(r - l); } @@ -2590,7 +2591,7 @@ perf_c2c_cacheline_browser__title(struct hist_browser *browser, he = cl_browser->he; if (he->mem_info) - addr = cl_address(he->mem_info->daddr.addr); + addr = cl_address(he->mem_info->daddr.addr, chk_double_cl); scnprintf(bf, size, "Cacheline 0x%lx", addr); return 0; @@ -2788,15 +2789,16 @@ static int ui_quirks(void) if (!c2c.use_stdio) { dim_offset.width = 5; dim_offset.header = header_offset_tui; - nodestr = "CL"; + nodestr = chk_double_cl ? "Double-CL" : "CL"; } dim_percent_costly_snoop.header = percent_costly_snoop_header[c2c.display]; /* Fix the zero line for dcacheline column. */ - buf = fill_line("Cacheline", dim_dcacheline.width + - dim_dcacheline_node.width + - dim_dcacheline_count.width + 4); + buf = fill_line(chk_double_cl ? "Double-Cacheline" : "Cacheline", + dim_dcacheline.width + + dim_dcacheline_node.width + + dim_dcacheline_count.width + 4); if (!buf) return -ENOMEM; @@ -3037,6 +3039,7 @@ static int perf_c2c__report(int argc, const char **argv) OPT_BOOLEAN('f', "force", &symbol_conf.force, "don't complain, do it"), OPT_BOOLEAN(0, "stitch-lbr", &c2c.stitch_lbr, "Enable LBR callgraph stitching approach"), + OPT_BOOLEAN(0, "double-cl", &chk_double_cl, "Detect adjacent cacheline false sharing"), OPT_PARENT(c2c_options), OPT_END() }; diff --git a/tools/perf/util/cacheline.h b/tools/perf/util/cacheline.h index dec8c0fb1f4a..fe6d5b60a031 100644 --- a/tools/perf/util/cacheline.h +++ b/tools/perf/util/cacheline.h @@ -6,16 +6,31 @@ int __pure cacheline_size(void); -static inline u64 cl_address(u64 address) + +/* + * Some architectures have 'Adjacent Cacheline Prefetch' feature, + * which performs like the cacheline size being doubled. + */ +static inline u64 cl_address(u64 address, bool double_cl) { + u64 size = cacheline_size(); + + if (double_cl) + size *= 2; + /* return the cacheline of the address */ - return (address & ~(cacheline_size() - 1)); + return (address & ~(size - 1)); } -static inline u64 cl_offset(u64 address) +static inline u64 cl_offset(u64 address, bool double_cl) { - /* return the cacheline of the address */ - return (address & (cacheline_size() - 1)); + u64 size = cacheline_size(); + + if (double_cl) + size *= 2; + + /* return the offset inside cacheline */ + return (address & (size - 1)); } #endif // PERF_CACHELINE_H diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 4a648231fe72..093a0c8b2e3d 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -53,6 +53,13 @@ enum sort_mode sort__mode = SORT_MODE__NORMAL; static const char *const dynamic_headers[] = {"local_ins_lat", "ins_lat", "local_p_stage_cyc", "p_stage_cyc"}; static const char *const arch_specific_sort_keys[] = {"local_p_stage_cyc", "p_stage_cyc"}; +/* + * Some architectures have Adjacent Cacheline Prefetch feature, which + * behaves like the cacheline size is doubled. Enable this flag to + * check things in double cacheline granularity. + */ +bool chk_double_cl; + /* * Replaces all occurrences of a char used with the: * @@ -1500,8 +1507,8 @@ sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right) addr: /* al_addr does all the right addr - start + offset calculations */ - l = cl_address(left->mem_info->daddr.al_addr); - r = cl_address(right->mem_info->daddr.al_addr); + l = cl_address(left->mem_info->daddr.al_addr, chk_double_cl); + r = cl_address(right->mem_info->daddr.al_addr, chk_double_cl); if (l > r) return -1; if (l < r) return 1; @@ -1520,7 +1527,7 @@ static int hist_entry__dcacheline_snprintf(struct hist_entry *he, char *bf, if (he->mem_info) { struct map *map = he->mem_info->daddr.ms.map; - addr = cl_address(he->mem_info->daddr.al_addr); + addr = cl_address(he->mem_info->daddr.al_addr, chk_double_cl); ms = &he->mem_info->daddr.ms; /* print [s] for shared data mmaps */ diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 9a91d0df2833..d79a100e5999 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -35,6 +35,7 @@ extern struct sort_entry sort_sym_from; extern struct sort_entry sort_sym_to; extern struct sort_entry sort_srcline; extern const char default_mem_sort_order[]; +extern bool chk_double_cl; struct res_sample { u64 time;