Merge branch 'bpf-enable-irq-after-irq_work_raise-completes'
Hou Tao says:

====================
bpf: Enable IRQ after irq_work_raise() completes

From: Hou Tao <houtao1@huawei.com>

Hi,

The patchset aims to fix the problem that bpf_mem_alloc() may return
NULL unexpectedly when multiple bpf_mem_alloc() calls are invoked
concurrently under process context and there is still free memory
available. The problem was found when doing stress tests for qp-trie,
but the same problem also exists for bpf_obj_new(), as demonstrated in
patch #3.

As pointed out by Alexei, the patchset can only fix the ENOMEM problem
for normal process context; it can not fix the problem for irq-disabled
context or an RT-enabled kernel.

Patch #1 fixes the race between unit_alloc() and unit_alloc(). Patch #2
fixes the race between unit_alloc() and unit_free(). And patch #3 adds
a selftest for the problem.

The major change compared with v1 is using the local_irq_{save,restore}()
pair to disable and enable preemption instead of the
preempt_{disable,enable}_notrace() pair. The main reason is to prevent
potential overhead from __preempt_schedule_notrace(). I also ran the
htab_mem benchmark and hash_map_perf on an 8-CPU KVM VM to compare the
performance of local_irq_{save,restore}() and
preempt_{disable,enable}_notrace(), and the results are similar, as
shown below:

(1) use preempt_{disable,enable}_notrace()

[root@hello bpf]# ./map_perf_test 4 8
0:hash_map_perf kmalloc 652179 events per sec
1:hash_map_perf kmalloc 651880 events per sec
2:hash_map_perf kmalloc 651382 events per sec
3:hash_map_perf kmalloc 650791 events per sec
5:hash_map_perf kmalloc 650140 events per sec
6:hash_map_perf kmalloc 652773 events per sec
7:hash_map_perf kmalloc 652751 events per sec
4:hash_map_perf kmalloc 648199 events per sec

[root@hello bpf]# ./benchs/run_bench_htab_mem.sh
normal bpf ma
=============
overwrite            per-prod-op: 110.82 ± 0.02k/s, avg mem: 2.00 ± 0.00MiB, peak mem: 2.73MiB
batch_add_batch_del  per-prod-op: 89.79 ± 0.75k/s, avg mem: 1.68 ± 0.38MiB, peak mem: 2.73MiB
add_del_on_diff_cpu  per-prod-op: 17.83 ± 0.07k/s, avg mem: 25.68 ± 2.92MiB, peak mem: 35.10MiB

(2) use local_irq_{save,restore}()

[root@hello bpf]# ./map_perf_test 4 8
0:hash_map_perf kmalloc 656299 events per sec
1:hash_map_perf kmalloc 656397 events per sec
2:hash_map_perf kmalloc 656046 events per sec
3:hash_map_perf kmalloc 655723 events per sec
5:hash_map_perf kmalloc 655221 events per sec
4:hash_map_perf kmalloc 654617 events per sec
6:hash_map_perf kmalloc 650269 events per sec
7:hash_map_perf kmalloc 653665 events per sec

[root@hello bpf]# ./benchs/run_bench_htab_mem.sh
normal bpf ma
=============
overwrite            per-prod-op: 116.10 ± 0.02k/s, avg mem: 2.00 ± 0.00MiB, peak mem: 2.74MiB
batch_add_batch_del  per-prod-op: 88.76 ± 0.61k/s, avg mem: 1.94 ± 0.33MiB, peak mem: 2.74MiB
add_del_on_diff_cpu  per-prod-op: 18.12 ± 0.08k/s, avg mem: 25.10 ± 2.70MiB, peak mem: 34.78MiB

As usual, comments are always welcome.

Change Log:
v2:
 * Use local_irq_save() to disable preemption instead of the
   preempt_{disable,enable}_notrace() pair, to prevent potential overhead

v1: https://lore.kernel.org/bpf/20230822133807.3198625-1-houtao@huaweicloud.com/
====================

Link: https://lore.kernel.org/r/20230901111954.1804721-1-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
commit 35897c3c52
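The ordering the series moves to relies on a property of irq_work:
irq_work_queue() only marks the work pending, and the self-IPI that
actually runs the callback is delivered once the CPU re-enables
interrupts. Below is a minimal sketch of that pattern; refill_cb,
refill_work, my_alloc and its parameters are hypothetical names for
illustration, not code from this series (the real logic is in the
unit_alloc()/unit_free() hunks that follow):

#include <linux/irq_work.h>
#include <linux/irqflags.h>
#include <linux/llist.h>

/* Hypothetical refill callback: runs in hard-IRQ context as soon as
 * this CPU re-enables IRQs; this is where a per-CPU freelist would be
 * refilled.
 */
static void refill_cb(struct irq_work *work)
{
}

static DEFINE_IRQ_WORK(refill_work, refill_cb);

/* Hypothetical allocation path following the fixed ordering. */
static void *my_alloc(struct llist_head *freelist, int cnt, int low_watermark)
{
	unsigned long flags;
	struct llist_node *obj;

	local_irq_save(flags);
	obj = __llist_del_first(freelist);	/* pop with IRQs disabled */
	if (cnt < low_watermark)
		irq_work_queue(&refill_work);	/* only sets the pending bit */
	/* Raising the work before re-enabling IRQs guarantees refill_cb()
	 * runs immediately below, before any preempting task can observe
	 * "freelist empty + work already pending" -- the state that made
	 * unit_alloc() return NULL.
	 */
	local_irq_restore(flags);
	return obj;
}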
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -732,12 +732,17 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c)
 		}
 	}
 	local_dec(&c->active);
-	local_irq_restore(flags);
 
 	WARN_ON(cnt < 0);
 
 	if (cnt < c->low_watermark)
 		irq_work_raise(c);
+	/* Enable IRQ after the enqueue of the irq work completes, so the
+	 * irq work will run once IRQ is enabled and may refill free_llist
+	 * before another task preempts the current task.
+	 */
+	local_irq_restore(flags);
+
 	return llnode;
 }
 
@@ -773,11 +778,16 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
 		llist_add(llnode, &c->free_llist_extra);
 	}
 	local_dec(&c->active);
-	local_irq_restore(flags);
 
 	if (cnt > c->high_watermark)
 		/* free few objects from current cpu into global kmalloc pool */
 		irq_work_raise(c);
+	/* Enable IRQ after irq_work_raise() completes, otherwise when the
+	 * current task is preempted by a task doing unit_alloc(), that
+	 * unit_alloc() may return NULL unexpectedly: the irq work is already
+	 * pending but can not run yet, so free_llist is not refilled in time.
+	 */
+	local_irq_restore(flags);
 }
 
 static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
@@ -795,10 +805,10 @@ static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
 		llist_add(llnode, &c->free_llist_extra_rcu);
 	}
 	local_dec(&c->active);
-	local_irq_restore(flags);
 
 	if (!atomic_read(&c->call_rcu_in_progress))
 		irq_work_raise(c);
+	local_irq_restore(flags);
 }
 
 /* Called from BPF program or from sys_bpf syscall.
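For context (not part of this diff): in the same file, irq_work_raise()
is a thin wrapper around irq_work_queue(), and the queued callback is
bpf_mem_refill(), which refills or drains the per-CPU cache against the
watermarks. The sketch below is a simplified paraphrase of
kernel/bpf/memalloc.c around this series' vintage, with the RCU-related
handling elided -- not a verbatim quote:

static void irq_work_raise(struct bpf_mem_cache *c)
{
	irq_work_queue(&c->refill_work);
}

/* Runs in hard-IRQ context on this CPU once IRQs are enabled. */
static void bpf_mem_refill(struct irq_work *work)
{
	struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
	int cnt;

	/* Racy read of free_cnt; it does not need to be 100% accurate. */
	cnt = c->free_cnt;
	if (cnt < c->low_watermark)
		alloc_bulk(c, c->batch, NUMA_NO_NODE, true);
	else if (cnt > c->high_watermark)
		free_bulk(c);
}

Hence the fix: once the pending bit is set with IRQs still disabled,
local_irq_restore() immediately triggers bpf_mem_refill() on the same
CPU, so free_llist is refilled before any preempting task can run.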
diff --git a/tools/testing/selftests/bpf/prog_tests/preempted_bpf_ma_op.c b/tools/testing/selftests/bpf/prog_tests/preempted_bpf_ma_op.c
new file mode 100644
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/preempted_bpf_ma_op.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2023. Huawei Technologies Co., Ltd */
+#define _GNU_SOURCE
+#include <sched.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <test_progs.h>
+
+#include "preempted_bpf_ma_op.skel.h"
+
+#define ALLOC_THREAD_NR 4
+#define ALLOC_LOOP_NR 512
+
+struct alloc_ctx {
+	/* output */
+	int run_err;
+	/* input */
+	int fd;
+	bool *nomem_err;
+};
+
+static void *run_alloc_prog(void *data)
+{
+	struct alloc_ctx *ctx = data;
+	cpu_set_t cpu_set;
+	int i;
+
+	CPU_ZERO(&cpu_set);
+	CPU_SET(0, &cpu_set);
+	pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set);
+
+	for (i = 0; i < ALLOC_LOOP_NR && !*ctx->nomem_err; i++) {
+		LIBBPF_OPTS(bpf_test_run_opts, topts);
+		int err;
+
+		err = bpf_prog_test_run_opts(ctx->fd, &topts);
+		ctx->run_err |= err | topts.retval;
+	}
+
+	return NULL;
+}
+
+void test_preempted_bpf_ma_op(void)
+{
+	struct alloc_ctx ctx[ALLOC_THREAD_NR];
+	struct preempted_bpf_ma_op *skel;
+	pthread_t tid[ALLOC_THREAD_NR];
+	int i, err;
+
+	skel = preempted_bpf_ma_op__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		return;
+
+	err = preempted_bpf_ma_op__attach(skel);
+	if (!ASSERT_OK(err, "attach"))
+		goto out;
+
+	for (i = 0; i < ARRAY_SIZE(ctx); i++) {
+		struct bpf_program *prog;
+		char name[8];
+
+		snprintf(name, sizeof(name), "test%d", i);
+		prog = bpf_object__find_program_by_name(skel->obj, name);
+		if (!ASSERT_OK_PTR(prog, "no test prog"))
+			goto out;
+
+		ctx[i].run_err = 0;
+		ctx[i].fd = bpf_program__fd(prog);
+		ctx[i].nomem_err = &skel->bss->nomem_err;
+	}
+
+	memset(tid, 0, sizeof(tid));
+	for (i = 0; i < ARRAY_SIZE(tid); i++) {
+		err = pthread_create(&tid[i], NULL, run_alloc_prog, &ctx[i]);
+		if (!ASSERT_OK(err, "pthread_create"))
+			break;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(tid); i++) {
+		if (!tid[i])
+			break;
+		pthread_join(tid[i], NULL);
+		ASSERT_EQ(ctx[i].run_err, 0, "run prog err");
+	}
+
+	ASSERT_FALSE(skel->bss->nomem_err, "ENOMEM");
+out:
+	preempted_bpf_ma_op__destroy(skel);
+}
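Assuming the usual BPF selftests workflow, the new test can be run on
its own from tools/testing/selftests/bpf/ with
./test_progs -t preempted_bpf_ma_op; it fails the final "ENOMEM"
assertion if any bpf_obj_new() in the programs below observed a NULL
return.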
diff --git a/tools/testing/selftests/bpf/progs/preempted_bpf_ma_op.c b/tools/testing/selftests/bpf/progs/preempted_bpf_ma_op.c
new file mode 100644
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/preempted_bpf_ma_op.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2023. Huawei Technologies Co., Ltd */
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+
+#include "bpf_experimental.h"
+
+struct bin_data {
+	char data[256];
+	struct bpf_spin_lock lock;
+};
+
+struct map_value {
+	struct bin_data __kptr * data;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, int);
+	__type(value, struct map_value);
+	__uint(max_entries, 2048);
+} array SEC(".maps");
+
+char _license[] SEC("license") = "GPL";
+
+bool nomem_err = false;
+
+static int del_array(unsigned int i, int *from)
+{
+	struct map_value *value;
+	struct bin_data *old;
+
+	value = bpf_map_lookup_elem(&array, from);
+	if (!value)
+		return 1;
+
+	old = bpf_kptr_xchg(&value->data, NULL);
+	if (old)
+		bpf_obj_drop(old);
+
+	(*from)++;
+	return 0;
+}
+
+static int add_array(unsigned int i, int *from)
+{
+	struct bin_data *old, *new;
+	struct map_value *value;
+
+	value = bpf_map_lookup_elem(&array, from);
+	if (!value)
+		return 1;
+
+	new = bpf_obj_new(typeof(*new));
+	if (!new) {
+		nomem_err = true;
+		return 1;
+	}
+
+	old = bpf_kptr_xchg(&value->data, new);
+	if (old)
+		bpf_obj_drop(old);
+
+	(*from)++;
+	return 0;
+}
+
+static void del_then_add_array(int from)
+{
+	int i;
+
+	i = from;
+	bpf_loop(512, del_array, &i, 0);
+
+	i = from;
+	bpf_loop(512, add_array, &i, 0);
+}
+
+SEC("fentry/bpf_fentry_test1")
+int BPF_PROG2(test0, int, a)
+{
+	del_then_add_array(0);
+	return 0;
+}
+
+SEC("fentry/bpf_fentry_test2")
+int BPF_PROG2(test1, int, a, u64, b)
+{
+	del_then_add_array(512);
+	return 0;
+}
+
+SEC("fentry/bpf_fentry_test3")
+int BPF_PROG2(test2, char, a, int, b, u64, c)
+{
+	del_then_add_array(1024);
+	return 0;
+}
+
+SEC("fentry/bpf_fentry_test4")
+int BPF_PROG2(test3, void *, a, char, b, int, c, u64, d)
+{
+	del_then_add_array(1536);
+	return 0;
+}
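Two design points worth noting in the selftest above (inferred from the
code): every allocating thread pins itself to CPU 0, so all four
programs contend on the same per-CPU bpf_mem_cache, and each program
works on a disjoint 512-entry slice of the 2048-entry array (offsets 0,
512, 1024 and 1536), so total live objects stay bounded and a NULL
return from bpf_obj_new() signals the preemption race rather than
genuine memory exhaustion.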