From 566f6de3cea3482d75d836a2398792a8be32ec26 Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Fri, 1 Sep 2023 19:19:52 +0800
Subject: [PATCH 1/3] bpf: Enable IRQ after irq_work_raise() completes in
 unit_alloc()

When doing stress tests for qp-trie, bpf_mem_alloc() returned NULL
unexpectedly, even though all qp-trie operations were initiated from
bpf syscalls and there was still free memory available. bpf_obj_new()
has the same problem, as shown by the following selftest.

The failure is due to preemption. irq_work_raise() will invoke
irq_work_claim() first to mark the irq work as pending, and then invoke
__irq_work_queue_local() to raise an IPI. So when the current task,
which is invoking irq_work_raise(), is preempted by another task,
unit_alloc() may return NULL for the preempting task as shown below:

task A         task B

unit_alloc()
  // low_watermark = 32
  // free_cnt = 31 after alloc
  irq_work_raise()
    // mark irq work as IRQ_WORK_PENDING
    irq_work_claim()

               // task B preempts task A
               unit_alloc()
                 // free_cnt = 30 after alloc
                 // irq work is already PENDING,
                 // so just return
                 irq_work_raise()
               // does unit_alloc() 30 times
               ......
               unit_alloc()
                 // free_cnt = 0 before alloc
                 return NULL

Fix it by enabling IRQ after irq_work_raise() completes. An alternative
fix is to use a preempt_{disable|enable}_notrace() pair, but it may
introduce extra overhead. Another feasible fix is to only disable
preemption or IRQ before invoking irq_work_queue() and to enable
preemption or IRQ after the invocation completes, but it can't handle
the case when c->low_watermark is 1.

Signed-off-by: Hou Tao
Link: https://lore.kernel.org/r/20230901111954.1804721-2-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/memalloc.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index cb60445de98a..c5d822d7cfaa 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -732,12 +732,17 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c)
 		}
 	}
 	local_dec(&c->active);
-	local_irq_restore(flags);
 
 	WARN_ON(cnt < 0);
 
 	if (cnt < c->low_watermark)
 		irq_work_raise(c);
+	/* Enable IRQ after the enqueue of irq work completes, so irq work
+	 * will run after IRQ is enabled and free_llist may be refilled by
+	 * irq work before another task preempts the current task.
+	 */
+	local_irq_restore(flags);
+
 	return llnode;
 }
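The effect of the fix is easiest to see in the resulting ordering. The
following is an illustrative, condensed sketch of the tail of
unit_alloc() after this patch; the allocation fast path and variable
declarations are elided:

	local_irq_save(flags);
	/* ... pop llnode from c->free_llist and update cnt ... */
	local_dec(&c->active);

	WARN_ON(cnt < 0);

	if (cnt < c->low_watermark)
		irq_work_raise(c);
	/* IRQs are still disabled here, so no other task can run on this
	 * CPU between the raise above and the restore below; the pending
	 * irq work fires as soon as IRQs are enabled and refills
	 * free_llist before a preempting unit_alloc() can drain it.
	 */
	local_irq_restore(flags);

	return llnode;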
From 62cf51cb0ebe997a9903208e546755b63eb7ff9d Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Fri, 1 Sep 2023 19:19:53 +0800
Subject: [PATCH 2/3] bpf: Enable IRQ after irq_work_raise() completes in
 unit_free{_rcu}()

Both unit_free() and unit_free_rcu() invoke irq_work_raise() to return
freed objects back to the slab, and the invocation may also be
preempted by unit_alloc(), so unit_alloc() may return NULL
unexpectedly, as shown in the following case:

task A         task B

unit_free()
  // high_watermark = 48
  // free_cnt = 49 after free
  irq_work_raise()
    // mark irq work as IRQ_WORK_PENDING
    irq_work_claim()

               // task B preempts task A
               unit_alloc()
                 // free_cnt = 48 after alloc
               // does unit_alloc() 32 times
               ......
               // free_cnt = 16
               unit_alloc()
                 // free_cnt = 15 after alloc
                 // irq work is already PENDING,
                 // so just return
                 irq_work_raise()
               // does unit_alloc() 15 times
               ......
               // free_cnt = 0
               unit_alloc()
                 // free_cnt = 0 before alloc
                 return NULL

Fix it by enabling IRQ after irq_work_raise() completes.

Signed-off-by: Hou Tao
Link: https://lore.kernel.org/r/20230901111954.1804721-3-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/memalloc.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index c5d822d7cfaa..961df89d45f1 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -778,11 +778,16 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
 		llist_add(llnode, &c->free_llist_extra);
 	}
 	local_dec(&c->active);
-	local_irq_restore(flags);
 
 	if (cnt > c->high_watermark)
 		/* free few objects from current cpu into global kmalloc pool */
 		irq_work_raise(c);
+	/* Enable IRQ after irq_work_raise() completes, otherwise when current
+	 * task is preempted by a task which does unit_alloc(), unit_alloc() may
+	 * return NULL unexpectedly because irq work is already pending but can
+	 * not be triggered and free_llist can not be refilled in time.
+	 */
+	local_irq_restore(flags);
 }
 
 static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
@@ -800,10 +805,10 @@ static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
 		llist_add(llnode, &c->free_llist_extra_rcu);
 	}
 	local_dec(&c->active);
-	local_irq_restore(flags);
 
 	if (!atomic_read(&c->call_rcu_in_progress))
 		irq_work_raise(c);
+	local_irq_restore(flags);
 }
 
 /* Called from BPF program or from sys_bpf syscall.
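Both fixes hinge on the same property of irq work: once an irq work is
claimed, a second raise is a no-op until the work has run. The sketch
below is an illustrative model of that behaviour, condensed from the
timelines above; the real irq_work_claim() in kernel/irq_work.c differs
in detail (it operates on the flags embedded in struct irq_work and
also handles remote queueing):

	/* Illustrative model only, not the kernel implementation. */
	static bool irq_work_claim_model(atomic_t *flags)
	{
		/* Atomically set PENDING; only the first caller goes on
		 * to actually queue the work and raise the IPI. If the
		 * claiming task is preempted between claim and queue,
		 * later raises (task B above) see PENDING and return
		 * early, so the refill is delayed until the claiming
		 * task runs again.
		 */
		return !(atomic_fetch_or(IRQ_WORK_PENDING, flags) &
			 IRQ_WORK_PENDING);
	}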
From 29c11aa8082b6dbef2cffbcd5e81be27e9b50a5b Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Fri, 1 Sep 2023 19:19:54 +0800
Subject: [PATCH 3/3] selftests/bpf: Test preemption between bpf_obj_new() and
 bpf_obj_drop()

The test case creates 4 threads and pins them to CPU 0. Each thread
runs a different bpf program through bpf_prog_test_run_opts(), and
these bpf programs use bpf_obj_new() and bpf_obj_drop() to allocate
and free local kptrs concurrently.

Under a preemptible kernel, bpf_obj_new() and bpf_obj_drop() may
preempt each other, so bpf_obj_new() may return NULL; without the
preceding fixes the test fails as shown below:

 test_preempted_bpf_ma_op:PASS:open_and_load 0 nsec
 test_preempted_bpf_ma_op:PASS:attach 0 nsec
 test_preempted_bpf_ma_op:PASS:no test prog 0 nsec
 test_preempted_bpf_ma_op:PASS:no test prog 0 nsec
 test_preempted_bpf_ma_op:PASS:no test prog 0 nsec
 test_preempted_bpf_ma_op:PASS:no test prog 0 nsec
 test_preempted_bpf_ma_op:PASS:pthread_create 0 nsec
 test_preempted_bpf_ma_op:PASS:pthread_create 0 nsec
 test_preempted_bpf_ma_op:PASS:pthread_create 0 nsec
 test_preempted_bpf_ma_op:PASS:pthread_create 0 nsec
 test_preempted_bpf_ma_op:PASS:run prog err 0 nsec
 test_preempted_bpf_ma_op:PASS:run prog err 0 nsec
 test_preempted_bpf_ma_op:PASS:run prog err 0 nsec
 test_preempted_bpf_ma_op:PASS:run prog err 0 nsec
 test_preempted_bpf_ma_op:FAIL:ENOMEM unexpected ENOMEM: got TRUE
 #168 preempted_bpf_ma_op:FAIL
 Summary: 0/0 PASSED, 0 SKIPPED, 1 FAILED

Signed-off-by: Hou Tao
Link: https://lore.kernel.org/r/20230901111954.1804721-4-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov
---
 .../bpf/prog_tests/preempted_bpf_ma_op.c      |  89 +++++++++++++++
 .../selftests/bpf/progs/preempted_bpf_ma_op.c | 106 ++++++++++++++++++
 2 files changed, 195 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/preempted_bpf_ma_op.c
 create mode 100644 tools/testing/selftests/bpf/progs/preempted_bpf_ma_op.c

diff --git a/tools/testing/selftests/bpf/prog_tests/preempted_bpf_ma_op.c b/tools/testing/selftests/bpf/prog_tests/preempted_bpf_ma_op.c
new file mode 100644
index 000000000000..3a2ec3923fca
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/preempted_bpf_ma_op.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2023. Huawei Technologies Co., Ltd */
+#define _GNU_SOURCE
+#include <sched.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <test_progs.h>
+
+#include "preempted_bpf_ma_op.skel.h"
+
+#define ALLOC_THREAD_NR 4
+#define ALLOC_LOOP_NR 512
+
+struct alloc_ctx {
+	/* output */
+	int run_err;
+	/* input */
+	int fd;
+	bool *nomem_err;
+};
+
+static void *run_alloc_prog(void *data)
+{
+	struct alloc_ctx *ctx = data;
+	cpu_set_t cpu_set;
+	int i;
+
+	CPU_ZERO(&cpu_set);
+	CPU_SET(0, &cpu_set);
+	pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set);
+
+	for (i = 0; i < ALLOC_LOOP_NR && !*ctx->nomem_err; i++) {
+		LIBBPF_OPTS(bpf_test_run_opts, topts);
+		int err;
+
+		err = bpf_prog_test_run_opts(ctx->fd, &topts);
+		ctx->run_err |= err | topts.retval;
+	}
+
+	return NULL;
+}
+
+void test_preempted_bpf_ma_op(void)
+{
+	struct alloc_ctx ctx[ALLOC_THREAD_NR];
+	struct preempted_bpf_ma_op *skel;
+	pthread_t tid[ALLOC_THREAD_NR];
+	int i, err;
+
+	skel = preempted_bpf_ma_op__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		return;
+
+	err = preempted_bpf_ma_op__attach(skel);
+	if (!ASSERT_OK(err, "attach"))
+		goto out;
+
+	for (i = 0; i < ARRAY_SIZE(ctx); i++) {
+		struct bpf_program *prog;
+		char name[8];
+
+		snprintf(name, sizeof(name), "test%d", i);
+		prog = bpf_object__find_program_by_name(skel->obj, name);
+		if (!ASSERT_OK_PTR(prog, "no test prog"))
+			goto out;
+
+		ctx[i].run_err = 0;
+		ctx[i].fd = bpf_program__fd(prog);
+		ctx[i].nomem_err = &skel->bss->nomem_err;
+	}
+
+	memset(tid, 0, sizeof(tid));
+	for (i = 0; i < ARRAY_SIZE(tid); i++) {
+		err = pthread_create(&tid[i], NULL, run_alloc_prog, &ctx[i]);
+		if (!ASSERT_OK(err, "pthread_create"))
+			break;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(tid); i++) {
+		if (!tid[i])
+			break;
+		pthread_join(tid[i], NULL);
+		ASSERT_EQ(ctx[i].run_err, 0, "run prog err");
+	}
+
+	ASSERT_FALSE(skel->bss->nomem_err, "ENOMEM");
+out:
+	preempted_bpf_ma_op__destroy(skel);
+}
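The userspace half above and the BPF half below are connected through
the bpf_fentry_test*() kernel functions: running a fentry program with
bpf_prog_test_run_opts() makes the kernel call the attached
bpf_fentry_testN() stub, which in turn fires the program. Each of the
four programs below works on its own 512-entry slice of the 2048-entry
array map (starting at 0, 512, 1024 and 1536), so the threads never
race on map entries themselves, only on the per-CPU free lists inside
bpf_mem_alloc().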
diff --git a/tools/testing/selftests/bpf/progs/preempted_bpf_ma_op.c b/tools/testing/selftests/bpf/progs/preempted_bpf_ma_op.c
new file mode 100644
index 000000000000..55907ef961bf
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/preempted_bpf_ma_op.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2023. Huawei Technologies Co., Ltd */
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+
+#include "bpf_experimental.h"
+
+struct bin_data {
+	char data[256];
+	struct bpf_spin_lock lock;
+};
+
+struct map_value {
+	struct bin_data __kptr * data;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, int);
+	__type(value, struct map_value);
+	__uint(max_entries, 2048);
+} array SEC(".maps");
+
+char _license[] SEC("license") = "GPL";
+
+bool nomem_err = false;
+
+static int del_array(unsigned int i, int *from)
+{
+	struct map_value *value;
+	struct bin_data *old;
+
+	value = bpf_map_lookup_elem(&array, from);
+	if (!value)
+		return 1;
+
+	old = bpf_kptr_xchg(&value->data, NULL);
+	if (old)
+		bpf_obj_drop(old);
+
+	(*from)++;
+	return 0;
+}
+
+static int add_array(unsigned int i, int *from)
+{
+	struct bin_data *old, *new;
+	struct map_value *value;
+
+	value = bpf_map_lookup_elem(&array, from);
+	if (!value)
+		return 1;
+
+	new = bpf_obj_new(typeof(*new));
+	if (!new) {
+		nomem_err = true;
+		return 1;
+	}
+
+	old = bpf_kptr_xchg(&value->data, new);
+	if (old)
+		bpf_obj_drop(old);
+
+	(*from)++;
+	return 0;
+}
+
+static void del_then_add_array(int from)
+{
+	int i;
+
+	i = from;
+	bpf_loop(512, del_array, &i, 0);
+
+	i = from;
+	bpf_loop(512, add_array, &i, 0);
+}
+
+SEC("fentry/bpf_fentry_test1")
+int BPF_PROG2(test0, int, a)
+{
+	del_then_add_array(0);
+	return 0;
+}
+
+SEC("fentry/bpf_fentry_test2")
+int BPF_PROG2(test1, int, a, u64, b)
+{
+	del_then_add_array(512);
+	return 0;
+}
+
+SEC("fentry/bpf_fentry_test3")
+int BPF_PROG2(test2, char, a, int, b, u64, c)
+{
+	del_then_add_array(1024);
+	return 0;
+}
+
+SEC("fentry/bpf_fentry_test4")
+int BPF_PROG2(test3, void *, a, char, b, int, c, u64, d)
+{
+	del_then_add_array(1536);
+	return 0;
+}
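For reference, a typical way to run just this test, assuming the
selftests under tools/testing/selftests/bpf have been built in the
usual way (the -t filter matches the test name from the failure log
above):

	cd tools/testing/selftests/bpf
	./test_progs -t preempted_bpf_ma_op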