bpf: implement sleepable uprobes by chaining gps

uprobes work by raising a trap, setting a task flag from within the
interrupt handler, and processing the actual work for the uprobe on the
way back to userspace. As a result, uprobe handlers already execute in a
might_fault/_sleep context. The primary obstacle to sleepable bpf uprobe
programs is therefore on the bpf side.

Namely, the bpf_prog_array attached to the uprobe is protected by normal
rcu. In order for uprobe bpf programs to become sleepable, it has to be
protected by the tasks_trace rcu flavor instead (and kfree() called after
a corresponding grace period).

Therefore, the free path for bpf_prog_array now chains a tasks_trace grace
period and a normal grace period one after the other.

Users who iterate under a tasks_trace read section would
be safe, as would users who iterate under normal read sections (from
non-sleepable locations).

The downside is that the tasks_trace latency affects all perf_event-attached
bpf programs (and not just uprobe ones). This is deemed safe given the
possible attach rates for kprobe/uprobe/tp programs.

Separately, non-sleepable programs need access to dynamically sized
rcu-protected maps, so bpf_prog_run_array_sleepable now conditionally takes
an rcu read section, in addition to the overarching tasks_trace section.

Signed-off-by: Delyan Kratunov <delyank@fb.com>
Link: https://lore.kernel.org/r/ce844d62a2fd0443b08c5ab02e95bc7149f9aeb1.1655248076.git.delyank@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
This commit is contained in:
Delyan Kratunov 2022-06-14 23:10:46 +00:00 committed by Alexei Starovoitov
parent d687f621c5
commit 8c7dcb84e3
4 changed files with 71 additions and 5 deletions

View File

@ -26,6 +26,7 @@
#include <linux/stddef.h> #include <linux/stddef.h>
#include <linux/bpfptr.h> #include <linux/bpfptr.h>
#include <linux/btf.h> #include <linux/btf.h>
#include <linux/rcupdate_trace.h>
struct bpf_verifier_env; struct bpf_verifier_env;
struct bpf_verifier_log; struct bpf_verifier_log;
@ -1372,6 +1373,8 @@ extern struct bpf_empty_prog_array bpf_empty_prog_array;
struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
void bpf_prog_array_free(struct bpf_prog_array *progs); void bpf_prog_array_free(struct bpf_prog_array *progs);
/*
 * Free a bpf_prog_array whose readers may traverse it under a tasks_trace
 * rcu read section. Use this instead of bpf_prog_array_free() for such arrays.
 */
void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs);
int bpf_prog_array_length(struct bpf_prog_array *progs); int bpf_prog_array_length(struct bpf_prog_array *progs);
bool bpf_prog_array_is_empty(struct bpf_prog_array *array); bool bpf_prog_array_is_empty(struct bpf_prog_array *array);
int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs, int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs,
@ -1463,6 +1466,55 @@ bpf_prog_run_array(const struct bpf_prog_array *array,
return ret; return ret;
} }
/*
 * Run every program of a bpf_prog_array that may contain sleepable programs.
 *
 * RCU design:
 *
 * The array itself is dereferenced and walked inside a tasks_trace rcu read
 * section, with migration disabled for the whole walk. Because readers rely
 * on the tasks_trace flavor, such arrays must be released with
 * bpf_prog_array_free_sleepable() so that a tasks_trace grace period is
 * observed before the memory goes away.
 *
 * Non-sleepable programs found in the array are additionally wrapped in a
 * regular rcu read section, for that one program only, so they can access
 * rcu-protected dynamically sized maps.
 *
 * @array_rcu: rcu-protected pointer to the program array; may be NULL.
 * @ctx:       opaque context handed unchanged to every program.
 * @run_prog:  callback used to execute a single program.
 *
 * Returns the bitwise AND of all @run_prog results, seeded with 1. A NULL
 * or empty array therefore yields 1.
 */
static __always_inline u32
bpf_prog_run_array_sleepable(const struct bpf_prog_array __rcu *array_rcu,
			     const void *ctx, bpf_prog_run_fn run_prog)
{
	const struct bpf_prog_array *progs;
	const struct bpf_prog_array_item *cur;
	const struct bpf_prog *prog;
	struct bpf_trace_run_ctx run_ctx;
	struct bpf_run_ctx *saved_run_ctx;
	u32 verdict = 1;

	might_fault();

	/* The tasks_trace section protects the array for the whole walk. */
	rcu_read_lock_trace();
	migrate_disable();

	progs = rcu_dereference_check(array_rcu, rcu_read_lock_trace_held());
	if (unlikely(!progs))
		goto out;

	saved_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);

	/* The array is terminated by an item whose prog pointer is NULL. */
	for (cur = &progs->items[0]; (prog = READ_ONCE(cur->prog)); cur++) {
		/* Only non-sleepable programs get a classic rcu read section. */
		if (!prog->aux->sleepable)
			rcu_read_lock();

		/* Expose this attachment's cookie through the run context. */
		run_ctx.bpf_cookie = cur->bpf_cookie;
		verdict &= run_prog(prog, ctx);

		if (!prog->aux->sleepable)
			rcu_read_unlock();
	}

	bpf_reset_run_ctx(saved_run_ctx);
out:
	migrate_enable();
	rcu_read_unlock_trace();
	return verdict;
}
#ifdef CONFIG_BPF_SYSCALL #ifdef CONFIG_BPF_SYSCALL
DECLARE_PER_CPU(int, bpf_prog_active); DECLARE_PER_CPU(int, bpf_prog_active);
extern struct mutex bpf_stats_enabled_mutex; extern struct mutex bpf_stats_enabled_mutex;

View File

@ -2279,6 +2279,21 @@ void bpf_prog_array_free(struct bpf_prog_array *progs)
kfree_rcu(progs, rcu); kfree_rcu(progs, rcu);
} }
/*
 * Callback queued by bpf_prog_array_free_sleepable() via
 * call_rcu_tasks_trace(). It hands the array over to kfree_rcu(), so the
 * final free is deferred once more through kfree_rcu().
 */
static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
{
	/* Recover the array that embeds this rcu_head in its 'rcu' member. */
	struct bpf_prog_array *array =
		container_of(rcu, struct bpf_prog_array, rcu);

	/* The second argument names the rcu_head member, not the parameter. */
	kfree_rcu(array, rcu);
}
/*
 * Release a bpf_prog_array that may be traversed under tasks_trace rcu.
 *
 * The array is queued with call_rcu_tasks_trace(); the queued callback,
 * __bpf_prog_array_free_sleepable_cb(), then passes it to kfree_rcu().
 *
 * A NULL pointer and the shared bpf_empty_prog_array are left untouched.
 */
void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
{
	if (progs && progs != &bpf_empty_prog_array.hdr)
		call_rcu_tasks_trace(&progs->rcu,
				     __bpf_prog_array_free_sleepable_cb);
}
int bpf_prog_array_length(struct bpf_prog_array *array) int bpf_prog_array_length(struct bpf_prog_array *array)
{ {
struct bpf_prog_array_item *item; struct bpf_prog_array_item *item;

View File

@ -1936,7 +1936,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
event->prog = prog; event->prog = prog;
event->bpf_cookie = bpf_cookie; event->bpf_cookie = bpf_cookie;
rcu_assign_pointer(event->tp_event->prog_array, new_array); rcu_assign_pointer(event->tp_event->prog_array, new_array);
bpf_prog_array_free(old_array); bpf_prog_array_free_sleepable(old_array);
unlock: unlock:
mutex_unlock(&bpf_event_mutex); mutex_unlock(&bpf_event_mutex);
@ -1962,7 +1962,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
bpf_prog_array_delete_safe(old_array, event->prog); bpf_prog_array_delete_safe(old_array, event->prog);
} else { } else {
rcu_assign_pointer(event->tp_event->prog_array, new_array); rcu_assign_pointer(event->tp_event->prog_array, new_array);
bpf_prog_array_free(old_array); bpf_prog_array_free_sleepable(old_array);
} }
bpf_prog_put(event->prog); bpf_prog_put(event->prog);

View File

@ -16,6 +16,7 @@
#include <linux/namei.h> #include <linux/namei.h>
#include <linux/string.h> #include <linux/string.h>
#include <linux/rculist.h> #include <linux/rculist.h>
#include <linux/filter.h>
#include "trace_dynevent.h" #include "trace_dynevent.h"
#include "trace_probe.h" #include "trace_probe.h"
@ -1346,9 +1347,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
if (bpf_prog_array_valid(call)) { if (bpf_prog_array_valid(call)) {
u32 ret; u32 ret;
preempt_disable(); ret = bpf_prog_run_array_sleepable(call->prog_array, regs, bpf_prog_run);
ret = trace_call_bpf(call, regs);
preempt_enable();
if (!ret) if (!ret)
return; return;
} }