68a6772f11
Add 2 benchmarks:

1. Performance of thread creation/exiting in presence of breakpoints.
2. Performance of breakpoint modification in presence of threads.

The benchmarks capture use cases that we are interested in: using inheritable breakpoints in large highly-threaded applications.

The benchmarks show significant slowdown imposed by breakpoints (even when they don't fire).

Testing on Intel 8173M with 112 HW threads shows:

  perf bench --repeat=56 breakpoint thread --breakpoints=0 --parallelism=56 --threads=20
        78.675000 usecs/op
  perf bench --repeat=56 breakpoint thread --breakpoints=4 --parallelism=56 --threads=20
     12967.135714 usecs/op

That's 165x slowdown due to presence of the breakpoints.

  perf bench --repeat=20000 breakpoint enable --passive=0 --active=0
         1.433250 usecs/op
  perf bench --repeat=20000 breakpoint enable --passive=224 --active=0
       585.318400 usecs/op
  perf bench --repeat=20000 breakpoint enable --passive=0 --active=111
       635.953000 usecs/op

That's 408x and 444x slowdown due to presence of threads.
Profiles show some overhead in toggle_bp_slot, but also very high contention:

    90.83%  breakpoint-thre  [kernel.kallsyms]  [k] osq_lock
     4.69%  breakpoint-thre  [kernel.kallsyms]  [k] mutex_spin_on_owner
     2.06%  breakpoint-thre  [kernel.kallsyms]  [k] __reserve_bp_slot
     2.04%  breakpoint-thre  [kernel.kallsyms]  [k] toggle_bp_slot

    79.01%  breakpoint-enab  [kernel.kallsyms]  [k] smp_call_function_single
     9.94%  breakpoint-enab  [kernel.kallsyms]  [k] llist_add_batch
     5.70%  breakpoint-enab  [kernel.kallsyms]  [k] _raw_spin_lock_irq
     1.84%  breakpoint-enab  [kernel.kallsyms]  [k] event_function_call
     1.12%  breakpoint-enab  [kernel.kallsyms]  [k] send_call_function_single_ipi
     0.37%  breakpoint-enab  [kernel.kallsyms]  [k] generic_exec_single
     0.24%  breakpoint-enab  [kernel.kallsyms]  [k] __perf_event_disable
     0.20%  breakpoint-enab  [kernel.kallsyms]  [k] _perf_event_enable
     0.18%  breakpoint-enab  [kernel.kallsyms]  [k] toggle_bp_slot

Committer notes:

Fixup struct init for older compilers:

   3    32.90 alpine:3.5    : FAIL clang version 3.8.1 (tags/RELEASE_381/final)
    bench/breakpoint.c:49:34: error: missing field 'size' initializer [-Werror,-Wmissing-field-initializers]
        struct perf_event_attr attr = {0};
                                        ^
    1 error generated.
   7    37.31 alpine:3.9    : FAIL gcc version 8.3.0 (Alpine 8.3.0)
    bench/breakpoint.c:49:34: error: missing field 'size' initializer [-Werror,-Wmissing-field-initializers]
        struct perf_event_attr attr = {0};
                                        ^
    1 error generated.

Signed-off-by: Dmitriy Vyukov <dvyukov@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Marco Elver <elver@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220505155745.1690906-1-dvyukov@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
77 lines
2.3 KiB
C
77 lines
2.3 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef BENCH_H
|
|
#define BENCH_H
|
|
|
|
#include <sys/time.h>
|
|
|
|
/*
 * struct timeval objects shared between translation units; this header
 * only declares them.
 * NOTE(review): the names suggest the start time, end time and elapsed
 * time of a benchmark run -- confirm against the definitions.
 */
extern struct timeval bench__start;
extern struct timeval bench__end;
extern struct timeval bench__runtime;
|
|
|
|
/*
 * The madvise transparent hugepage constants were added in glibc
 * 2.13. For compatibility with older versions of glibc, define these
 * tokens if they are not already defined.
 *
 * PA-RISC uses different madvise values from other architectures and
 * needs to be special-cased.
 *
 * Each constant is guarded on its own, so a value that is already
 * defined when this header is processed is left untouched.
 */
#ifdef __hppa__
# ifndef MADV_HUGEPAGE
#  define MADV_HUGEPAGE		67
# endif
# ifndef MADV_NOHUGEPAGE
#  define MADV_NOHUGEPAGE	68
# endif
#else
# ifndef MADV_HUGEPAGE
#  define MADV_HUGEPAGE		14
# endif
# ifndef MADV_NOHUGEPAGE
#  define MADV_NOHUGEPAGE	15
# endif
#endif
|
|
|
|
/*
 * Benchmark entry points.  Every function takes an argc/argv pair and
 * returns an int; only the prototypes are provided here, the
 * definitions live outside this header.
 */
int bench_numa(int argc, const char **argv);
int bench_sched_messaging(int argc, const char **argv);
int bench_sched_pipe(int argc, const char **argv);
int bench_syscall_basic(int argc, const char **argv);
int bench_mem_memcpy(int argc, const char **argv);
int bench_mem_memset(int argc, const char **argv);
int bench_mem_find_bit(int argc, const char **argv);
int bench_futex_hash(int argc, const char **argv);
int bench_futex_wake(int argc, const char **argv);
int bench_futex_wake_parallel(int argc, const char **argv);
int bench_futex_requeue(int argc, const char **argv);
/* pi futexes */
int bench_futex_lock_pi(int argc, const char **argv);
int bench_epoll_wait(int argc, const char **argv);
int bench_epoll_ctl(int argc, const char **argv);
int bench_synthesize(int argc, const char **argv);
int bench_kallsyms_parse(int argc, const char **argv);
int bench_inject_build_id(int argc, const char **argv);
int bench_evlist_open_close(int argc, const char **argv);
int bench_breakpoint_thread(int argc, const char **argv);
int bench_breakpoint_enable(int argc, const char **argv);
|
|
|
|
/*
 * Benchmark format identifiers: each format has a string name
 * (*_STR) and a numeric id.
 */
#define BENCH_FORMAT_DEFAULT_STR	"default"
#define BENCH_FORMAT_DEFAULT		0
#define BENCH_FORMAT_SIMPLE_STR		"simple"
#define BENCH_FORMAT_SIMPLE		1

/*
 * Parenthesized so that the negative literal always expands as a single
 * operand, whatever expression the macro is used in.
 */
#define BENCH_FORMAT_UNKNOWN		(-1)
|
|
|
|
/*
 * Globals shared between translation units; this header only declares
 * them.
 * NOTE(review): bench_format presumably holds one of the BENCH_FORMAT_*
 * ids and bench_repeat the requested repeat count -- confirm against
 * the definitions.
 */
extern int bench_format;
extern unsigned int bench_repeat;
|
|
|
|
#ifndef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
|
|
#include <pthread.h>
|
|
#include <linux/compiler.h>
|
|
/*
 * Fallback used when HAVE_PTHREAD_ATTR_SETAFFINITY_NP is not defined.
 *
 * The stub ignores all of its arguments, leaves @attr untouched and
 * always returns 0, so no CPU affinity is recorded in the attribute
 * object.
 *
 * @cpuset is const-qualified to match the prototype documented for the
 * real pthread_attr_setaffinity_np(); callers may pass either a const
 * or a non-const cpu_set_t pointer.
 */
static inline int pthread_attr_setaffinity_np(pthread_attr_t *attr __maybe_unused,
					      size_t cpusetsize __maybe_unused,
					      const cpu_set_t *cpuset __maybe_unused)
{
	return 0;
}
|
|
#endif
|
|
|
|
#endif
|