Add a new benchmark which measures the speed of hashmap lookup operations. A
user can control the following parameters of the benchmark (see the
configuration sketch after the list):
* key_size (max 1024): the key size to use
* max_entries: the hashmap max entries
* nr_entries: the number of entries to insert/lookup
* nr_loops: the number of loops for the benchmark
* map_flags: the hashmap flags passed to BPF_MAP_CREATE
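For illustration, here is a minimal sketch of how userspace could apply these
parameters with libbpf before the object is loaded; this is not the
benchmark's actual userspace code, and the 8-byte value size is an
assumption:

#include <bpf/libbpf.h>

/* Illustrative only: the map definition on the BPF side deliberately
 * leaves these attributes unset, so they must be filled in before load.
 */
static int configure_map(struct bpf_object *obj, __u32 key_size,
			 __u32 max_entries, __u32 map_flags)
{
	struct bpf_map *map = bpf_object__find_map_by_name(obj, "hash_map_bench");

	if (!map)
		return -1;

	bpf_map__set_key_size(map, key_size);       /* --key_size (max 1024) */
	bpf_map__set_value_size(map, 8);            /* assumed value size */
	bpf_map__set_max_entries(map, max_entries); /* --max_entries */
	bpf_map__set_map_flags(map, map_flags);     /* --map_flags */

	return 0;
}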
The BPF program performing the benchmark calls two nested bpf_loop()s:

bpf_loop(nr_loops/nr_entries)
    bpf_loop(nr_entries)
        bpf_map_lookup()

So nr_loops determines the number of actual map lookups. All lookups are
successful, because userspace pre-populates the map with exactly the keys
that the lookup callback regenerates.
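The inserted and looked-up keys line up as follows: on the BPF side,
patch_key() (see the program below) writes i + 1 into the first four key
bytes (effectively in little-endian layout on either byte order, given the
bswap on big-endian), and the rest of the key is a random tail shared with
userspace. A hedged sketch of the userspace insert step, assuming libbpf and
an 8-byte value (the helper name and the value size are illustrative
assumptions):

#include <bpf/bpf.h>
#include <endian.h>
#include <stdint.h>
#include <string.h>

/* Illustrative only: insert keys 1..nr_entries so that every later
 * lookup from the BPF program hits.
 */
static int populate_map(int map_fd, uint32_t nr_entries, uint32_t key_size,
			const uint8_t *random_tail)
{
	uint8_t key[1024] = {};	/* key_size is capped at 1024 */
	uint64_t value = 0;	/* assumed 8-byte value */
	uint32_t i, id;

	if (key_size > 4)
		memcpy(key + 4, random_tail, key_size - 4);

	for (i = 0; i < nr_entries; i++) {
		id = htole32(i + 1);	/* matches patch_key()'s byte layout */
		memcpy(key, &id, sizeof(id));
		if (bpf_map_update_elem(map_fd, key, &value, BPF_ANY))
			return -1;
	}
	return 0;
}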
Example (the output was generated on an AMD Ryzen 9 3950X machine):
for nr_entries in `seq 4096 4096 65536`; do echo -n "$((nr_entries*100/65536))% full: "; sudo ./bench -d2 -a bpf-hashmap-lookup --key_size=4 --nr_entries=$nr_entries --max_entries=65536 --nr_loops=1000000 --map_flags=0x40 | grep cpu; done
6% full: cpu01: lookup 50.739M ± 0.018M events/sec (approximated from 32 samples of ~19ms)
12% full: cpu01: lookup 47.751M ± 0.015M events/sec (approximated from 32 samples of ~20ms)
18% full: cpu01: lookup 45.153M ± 0.013M events/sec (approximated from 32 samples of ~22ms)
25% full: cpu01: lookup 43.826M ± 0.014M events/sec (approximated from 32 samples of ~22ms)
31% full: cpu01: lookup 41.971M ± 0.012M events/sec (approximated from 32 samples of ~23ms)
37% full: cpu01: lookup 41.034M ± 0.015M events/sec (approximated from 32 samples of ~24ms)
43% full: cpu01: lookup 39.946M ± 0.012M events/sec (approximated from 32 samples of ~25ms)
50% full: cpu01: lookup 38.256M ± 0.014M events/sec (approximated from 32 samples of ~26ms)
56% full: cpu01: lookup 36.580M ± 0.018M events/sec (approximated from 32 samples of ~27ms)
62% full: cpu01: lookup 36.252M ± 0.012M events/sec (approximated from 32 samples of ~27ms)
68% full: cpu01: lookup 35.200M ± 0.012M events/sec (approximated from 32 samples of ~28ms)
75% full: cpu01: lookup 34.061M ± 0.009M events/sec (approximated from 32 samples of ~29ms)
81% full: cpu01: lookup 34.374M ± 0.010M events/sec (approximated from 32 samples of ~29ms)
87% full: cpu01: lookup 33.244M ± 0.011M events/sec (approximated from 32 samples of ~30ms)
93% full: cpu01: lookup 32.182M ± 0.013M events/sec (approximated from 32 samples of ~31ms)
100% full: cpu01: lookup 31.497M ± 0.016M events/sec (approximated from 32 samples of ~31ms)
Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230213091519.1202813-8-aspsk@isovalent.com
The BPF program (64 lines, 1.5 KiB, C):
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2023 Isovalent */

#include "vmlinux.h"

#include <bpf/bpf_helpers.h>
#include "bpf_misc.h"

char _license[] SEC("license") = "GPL";

/* The remaining map attributes (key size, value size, max entries, flags)
 * are deliberately left unset here; userspace configures them before the
 * object is loaded.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
} hash_map_bench SEC(".maps");

/* The number of slots to store times */
#define NR_SLOTS 32
#define NR_CPUS 256
#define CPU_MASK (NR_CPUS-1)

/* Configured by userspace */
u64 nr_entries;
u64 nr_loops;
u32 __attribute__((__aligned__(8))) key[NR_CPUS];

/* Filled by us */
u64 __attribute__((__aligned__(256))) percpu_times_index[NR_CPUS];
u64 __attribute__((__aligned__(256))) percpu_times[NR_CPUS][NR_SLOTS];

static inline void patch_key(u32 i)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	key[0] = i + 1;
#else
	key[0] = __builtin_bswap32(i + 1);
#endif

	/* the rest of key is random and is configured by userspace */
}

/* Inner loop body: look up one pre-inserted key; returning 1 aborts the
 * bpf_loop early if the lookup unexpectedly misses.
 */
static int lookup_callback(__u32 index, u32 *unused)
{
	patch_key(index);
	return bpf_map_lookup_elem(&hash_map_bench, key) ? 0 : 1;
}

/* Outer loop body: perform nr_entries lookups per iteration */
static int loop_lookup_callback(__u32 index, u32 *unused)
{
	return bpf_loop(nr_entries, lookup_callback, NULL, 0) ? 0 : 1;
}

/* One getpgid() syscall from userspace triggers one timed sample of
 * nr_loops lookups.
 */
SEC("fentry/" SYS_PREFIX "sys_getpgid")
int benchmark(void *ctx)
{
	u32 cpu = bpf_get_smp_processor_id();
	u32 times_index;
	u64 start_time;

	times_index = percpu_times_index[cpu & CPU_MASK] % NR_SLOTS;
	start_time = bpf_ktime_get_ns();
	bpf_loop(nr_loops, loop_lookup_callback, NULL, 0);
	percpu_times[cpu & CPU_MASK][times_index] = bpf_ktime_get_ns() - start_time;
	percpu_times_index[cpu & CPU_MASK] += 1;

	return 0;
}
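Since the program attaches to fentry of sys_getpgid, userspace drives it by
simply issuing getpgid() syscalls: each call produces one timing sample, and
NR_SLOTS = 32 matches the "32 samples" seen in the output above. A minimal
sketch of the trigger and of the rate computation (illustrative, not the
actual bench harness):

#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Illustrative only: fire one benchmark sample via the fentry hook */
static void trigger_sample(void)
{
	syscall(__NR_getpgid, 0);	/* runs benchmark() once */
}

/* Each slot in percpu_times covers nr_loops lookups */
static double sample_events_per_sec(uint64_t sample_ns, uint64_t nr_loops)
{
	return (double)nr_loops * 1e9 / (double)sample_ns;
}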