Merge branch 'introduce-bpf_wq'

Benjamin Tissoires says:

====================
Introduce bpf_wq

This is a followup of sleepable bpf_timer[0].

When discussing sleepable bpf_timer, it was thought that we should give
a try to bpf_wq, as the 2 APIs are similar but distinct enough to
justify a new one.

So here it is.

I tried to keep as much as possible common code in kernel/bpf/helpers.c
but I couldn't get away with code duplication in kernel/bpf/verifier.c.

This series introduces a basic bpf_wq support:
- creation is supported
- assignment is supported
- running a simple bpf_wq is also supported.

We will probably need to extend the API further with:
- a full delayed_work API (can be piggy backed on top with a correct
  flag)
- bpf_wq_cancel() <- apparently not, this is shooting ourself in the
  foot
- bpf_wq_cancel_sync() (for sleepable programs)
- documentation
---

For reference, the use cases I have in mind:

---

Basically, I need to be able to defer a HID-BPF program for the
following reasons (from the aforementioned patch):
1. defer an event:
   Sometimes we receive an out of proximity event, but the device can not
   be trusted enough, and we need to ensure that we won't receive another
   one in the following n milliseconds. So we need to wait those n
   milliseconds, and eventually re-inject that event in the stack.

2. inject new events in reaction to one given event:
   We might want to transform one given event into several. This is the
   case for macro keys where a single key press is supposed to send
   a sequence of key presses. But this could also be used to patch a
   faulty behavior, if a device forgets to send a release event.

3. communicate with the device in reaction to one event:
   We might want to communicate back to the device after a given event.
   For example a device might send us an event saying that it came back
   from sleeping state and needs to be re-initialized.

Currently we can achieve that by keeping a userspace program around,
raise a bpf event, and let that userspace program inject the events and
commands.
However, we are just keeping that program alive as a daemon for just
scheduling commands. There is no logic in it, so it doesn't really justify
an actual userspace wakeup. So a kernel workqueue seems simpler to handle.

bpf_timers are currently running in a soft IRQ context, this patch
series implements a sleppable context for them.

Cheers,
Benjamin

[0] https://lore.kernel.org/all/20240408-hid-bpf-sleepable-v6-0-0499ddd91b94@kernel.org/

Changes in v2:
- took previous review into account
- mainly dropped BPF_F_WQ_SLEEPABLE
- Link to v1: https://lore.kernel.org/r/20240416-bpf_wq-v1-0-c9e66092f842@kernel.org

====================

Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-0-6c986a5a741f@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
This commit is contained in:
Alexei Starovoitov 2024-04-23 18:31:26 -07:00
commit 6e10b6350a
16 changed files with 891 additions and 105 deletions

View File

@ -185,7 +185,7 @@ struct bpf_map_ops {
enum {
/* Support at most 10 fields in a BTF type */
BTF_FIELDS_MAX = 10,
BTF_FIELDS_MAX = 11,
};
enum btf_field_type {
@ -202,6 +202,7 @@ enum btf_field_type {
BPF_GRAPH_NODE = BPF_RB_NODE | BPF_LIST_NODE,
BPF_GRAPH_ROOT = BPF_RB_ROOT | BPF_LIST_HEAD,
BPF_REFCOUNT = (1 << 9),
BPF_WORKQUEUE = (1 << 10),
};
typedef void (*btf_dtor_kfunc_t)(void *);
@ -238,6 +239,7 @@ struct btf_record {
u32 field_mask;
int spin_lock_off;
int timer_off;
int wq_off;
int refcount_off;
struct btf_field fields[];
};
@ -312,6 +314,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type)
return "bpf_spin_lock";
case BPF_TIMER:
return "bpf_timer";
case BPF_WORKQUEUE:
return "bpf_wq";
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
return "kptr";
@ -340,6 +344,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type)
return sizeof(struct bpf_spin_lock);
case BPF_TIMER:
return sizeof(struct bpf_timer);
case BPF_WORKQUEUE:
return sizeof(struct bpf_wq);
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
@ -367,6 +373,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type)
return __alignof__(struct bpf_spin_lock);
case BPF_TIMER:
return __alignof__(struct bpf_timer);
case BPF_WORKQUEUE:
return __alignof__(struct bpf_wq);
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
@ -406,6 +414,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr)
/* RB_ROOT_CACHED 0-inits, no need to do anything after memset */
case BPF_SPIN_LOCK:
case BPF_TIMER:
case BPF_WORKQUEUE:
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
@ -525,6 +534,7 @@ static inline void zero_map_value(struct bpf_map *map, void *dst)
void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
bool lock_src);
void bpf_timer_cancel_and_free(void *timer);
void bpf_wq_cancel_and_free(void *timer);
void bpf_list_head_free(const struct btf_field *field, void *list_head,
struct bpf_spin_lock *spin_lock);
void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
@ -2195,6 +2205,7 @@ void bpf_map_free_record(struct bpf_map *map);
struct btf_record *btf_record_dup(const struct btf_record *rec);
bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b);
void bpf_obj_free_timer(const struct btf_record *rec, void *obj);
void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj);
void bpf_obj_free_fields(const struct btf_record *rec, void *obj);
void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu);

View File

@ -426,6 +426,7 @@ struct bpf_verifier_state {
* while they are still in use.
*/
bool used_as_loop_entry;
bool in_sleepable;
/* first and last insn idx of this verifier state */
u32 first_insn_idx;

View File

@ -7306,6 +7306,10 @@ struct bpf_timer {
__u64 __opaque[2];
} __attribute__((aligned(8)));
struct bpf_wq {
__u64 __opaque[2];
} __attribute__((aligned(8)));
struct bpf_dynptr {
__u64 __opaque[2];
} __attribute__((aligned(8)));

View File

@ -428,17 +428,21 @@ static void *array_map_vmalloc_addr(struct bpf_array *array)
return (void *)round_down((unsigned long)array, PAGE_SIZE);
}
static void array_map_free_timers(struct bpf_map *map)
static void array_map_free_timers_wq(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
/* We don't reset or free fields other than timer on uref dropping to zero. */
if (!btf_record_has_field(map->record, BPF_TIMER))
return;
/* We don't reset or free fields other than timer and workqueue
* on uref dropping to zero.
*/
if (btf_record_has_field(map->record, BPF_TIMER))
for (i = 0; i < array->map.max_entries; i++)
bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
for (i = 0; i < array->map.max_entries; i++)
bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
if (btf_record_has_field(map->record, BPF_WORKQUEUE))
for (i = 0; i < array->map.max_entries; i++)
bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i));
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
@ -782,7 +786,7 @@ const struct bpf_map_ops array_map_ops = {
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_release_uref = array_map_free_timers,
.map_release_uref = array_map_free_timers_wq,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,

View File

@ -3464,6 +3464,15 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
goto end;
}
}
if (field_mask & BPF_WORKQUEUE) {
if (!strcmp(name, "bpf_wq")) {
if (*seen_mask & BPF_WORKQUEUE)
return -E2BIG;
*seen_mask |= BPF_WORKQUEUE;
type = BPF_WORKQUEUE;
goto end;
}
}
field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head");
field_mask_test_name(BPF_LIST_NODE, "bpf_list_node");
field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root");
@ -3515,6 +3524,7 @@ static int btf_find_struct_field(const struct btf *btf,
switch (field_type) {
case BPF_SPIN_LOCK:
case BPF_TIMER:
case BPF_WORKQUEUE:
case BPF_LIST_NODE:
case BPF_RB_NODE:
case BPF_REFCOUNT:
@ -3582,6 +3592,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
switch (field_type) {
case BPF_SPIN_LOCK:
case BPF_TIMER:
case BPF_WORKQUEUE:
case BPF_LIST_NODE:
case BPF_RB_NODE:
case BPF_REFCOUNT:
@ -3816,6 +3827,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
rec->spin_lock_off = -EINVAL;
rec->timer_off = -EINVAL;
rec->wq_off = -EINVAL;
rec->refcount_off = -EINVAL;
for (i = 0; i < cnt; i++) {
field_type_size = btf_field_type_size(info_arr[i].type);
@ -3846,6 +3858,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
/* Cache offset for faster lookup at runtime */
rec->timer_off = rec->fields[i].offset;
break;
case BPF_WORKQUEUE:
WARN_ON_ONCE(rec->wq_off >= 0);
/* Cache offset for faster lookup at runtime */
rec->wq_off = rec->fields[i].offset;
break;
case BPF_REFCOUNT:
WARN_ON_ONCE(rec->refcount_off >= 0);
/* Cache offset for faster lookup at runtime */

View File

@ -240,6 +240,26 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab)
}
}
static void htab_free_prealloced_wq(struct bpf_htab *htab)
{
u32 num_entries = htab->map.max_entries;
int i;
if (!btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
return;
if (htab_has_extra_elems(htab))
num_entries += num_possible_cpus();
for (i = 0; i < num_entries; i++) {
struct htab_elem *elem;
elem = get_htab_elem(htab, i);
bpf_obj_free_workqueue(htab->map.record,
elem->key + round_up(htab->map.key_size, 8));
cond_resched();
}
}
static void htab_free_prealloced_fields(struct bpf_htab *htab)
{
u32 num_entries = htab->map.max_entries;
@ -1495,7 +1515,7 @@ static void delete_all_elements(struct bpf_htab *htab)
migrate_enable();
}
static void htab_free_malloced_timers(struct bpf_htab *htab)
static void htab_free_malloced_timers_or_wq(struct bpf_htab *htab, bool is_timer)
{
int i;
@ -1507,24 +1527,35 @@ static void htab_free_malloced_timers(struct bpf_htab *htab)
hlist_nulls_for_each_entry(l, n, head, hash_node) {
/* We only free timer on uref dropping to zero */
bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8));
if (is_timer)
bpf_obj_free_timer(htab->map.record,
l->key + round_up(htab->map.key_size, 8));
else
bpf_obj_free_workqueue(htab->map.record,
l->key + round_up(htab->map.key_size, 8));
}
cond_resched_rcu();
}
rcu_read_unlock();
}
static void htab_map_free_timers(struct bpf_map *map)
static void htab_map_free_timers_and_wq(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
/* We only free timer on uref dropping to zero */
if (!btf_record_has_field(htab->map.record, BPF_TIMER))
return;
if (!htab_is_prealloc(htab))
htab_free_malloced_timers(htab);
else
htab_free_prealloced_timers(htab);
/* We only free timer and workqueue on uref dropping to zero */
if (btf_record_has_field(htab->map.record, BPF_TIMER)) {
if (!htab_is_prealloc(htab))
htab_free_malloced_timers_or_wq(htab, true);
else
htab_free_prealloced_timers(htab);
}
if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) {
if (!htab_is_prealloc(htab))
htab_free_malloced_timers_or_wq(htab, false);
else
htab_free_prealloced_wq(htab);
}
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
@ -2260,7 +2291,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_release_uref = htab_map_free_timers,
.map_release_uref = htab_map_free_timers_and_wq,
.map_lookup_elem = htab_map_lookup_elem,
.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
.map_update_elem = htab_map_update_elem,
@ -2281,7 +2312,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_release_uref = htab_map_free_timers,
.map_release_uref = htab_map_free_timers_and_wq,
.map_lookup_elem = htab_lru_map_lookup_elem,
.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,

View File

@ -1079,11 +1079,20 @@ const struct bpf_func_proto bpf_snprintf_proto = {
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
struct bpf_async_cb {
struct bpf_map *map;
struct bpf_prog *prog;
void __rcu *callback_fn;
void *value;
struct rcu_head rcu;
u64 flags;
};
/* BPF map elements can contain 'struct bpf_timer'.
* Such map owns all of its BPF timers.
* 'struct bpf_timer' is allocated as part of map element allocation
* and it's zero initialized.
* That space is used to keep 'struct bpf_timer_kern'.
* That space is used to keep 'struct bpf_async_kern'.
* bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and
* remembers 'struct bpf_map *' pointer it's part of.
* bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn.
@ -1096,17 +1105,23 @@ const struct bpf_func_proto bpf_snprintf_proto = {
* freeing the timers when inner map is replaced or deleted by user space.
*/
struct bpf_hrtimer {
struct bpf_async_cb cb;
struct hrtimer timer;
struct bpf_map *map;
struct bpf_prog *prog;
void __rcu *callback_fn;
void *value;
struct rcu_head rcu;
};
/* the actual struct hidden inside uapi struct bpf_timer */
struct bpf_timer_kern {
struct bpf_hrtimer *timer;
struct bpf_work {
struct bpf_async_cb cb;
struct work_struct work;
struct work_struct delete_work;
};
/* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */
struct bpf_async_kern {
union {
struct bpf_async_cb *cb;
struct bpf_hrtimer *timer;
struct bpf_work *work;
};
/* bpf_spin_lock is used here instead of spinlock_t to make
* sure that it always fits into space reserved by struct bpf_timer
* regardless of LOCKDEP and spinlock debug flags.
@ -1114,19 +1129,24 @@ struct bpf_timer_kern {
struct bpf_spin_lock lock;
} __attribute__((aligned(8)));
enum bpf_async_type {
BPF_ASYNC_TYPE_TIMER = 0,
BPF_ASYNC_TYPE_WQ,
};
static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
{
struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
struct bpf_map *map = t->map;
void *value = t->value;
struct bpf_map *map = t->cb.map;
void *value = t->cb.value;
bpf_callback_t callback_fn;
void *key;
u32 idx;
BTF_TYPE_EMIT(struct bpf_timer);
callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held());
callback_fn = rcu_dereference_check(t->cb.callback_fn, rcu_read_lock_bh_held());
if (!callback_fn)
goto out;
@ -1155,46 +1175,120 @@ out:
return HRTIMER_NORESTART;
}
BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map,
u64, flags)
static void bpf_wq_work(struct work_struct *work)
{
clockid_t clockid = flags & (MAX_CLOCKS - 1);
struct bpf_hrtimer *t;
int ret = 0;
struct bpf_work *w = container_of(work, struct bpf_work, work);
struct bpf_tramp_run_ctx __maybe_unused run_ctx;
struct bpf_async_cb *cb = &w->cb;
struct bpf_prog *prog = cb->prog;
struct bpf_map *map = cb->map;
bpf_callback_t callback_fn;
void *value = cb->value;
void *key;
u32 idx;
BUILD_BUG_ON(MAX_CLOCKS != 16);
BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer));
BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer));
BTF_TYPE_EMIT(struct bpf_wq);
callback_fn = READ_ONCE(cb->callback_fn);
if (!callback_fn || !prog)
return;
if (map->map_type == BPF_MAP_TYPE_ARRAY) {
struct bpf_array *array = container_of(map, struct bpf_array, map);
/* compute the key */
idx = ((char *)value - array->value) / array->elem_size;
key = &idx;
} else { /* hash or lru */
key = value - round_up(map->key_size, 8);
}
run_ctx.bpf_cookie = 0;
if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) {
/* recursion detected */
__bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx);
return;
}
callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
/* The verifier checked that return value is zero. */
__bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */,
&run_ctx);
}
static void bpf_wq_delete_work(struct work_struct *work)
{
struct bpf_work *w = container_of(work, struct bpf_work, delete_work);
cancel_work_sync(&w->work);
kfree_rcu(w, cb.rcu);
}
static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
enum bpf_async_type type)
{
struct bpf_async_cb *cb;
struct bpf_hrtimer *t;
struct bpf_work *w;
clockid_t clockid;
size_t size;
int ret = 0;
if (in_nmi())
return -EOPNOTSUPP;
if (flags >= MAX_CLOCKS ||
/* similar to timerfd except _ALARM variants are not supported */
(clockid != CLOCK_MONOTONIC &&
clockid != CLOCK_REALTIME &&
clockid != CLOCK_BOOTTIME))
switch (type) {
case BPF_ASYNC_TYPE_TIMER:
size = sizeof(struct bpf_hrtimer);
break;
case BPF_ASYNC_TYPE_WQ:
size = sizeof(struct bpf_work);
break;
default:
return -EINVAL;
__bpf_spin_lock_irqsave(&timer->lock);
t = timer->timer;
}
__bpf_spin_lock_irqsave(&async->lock);
t = async->timer;
if (t) {
ret = -EBUSY;
goto out;
}
/* allocate hrtimer via map_kmalloc to use memcg accounting */
t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node);
if (!t) {
cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
if (!cb) {
ret = -ENOMEM;
goto out;
}
t->value = (void *)timer - map->record->timer_off;
t->map = map;
t->prog = NULL;
rcu_assign_pointer(t->callback_fn, NULL);
hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
t->timer.function = bpf_timer_cb;
WRITE_ONCE(timer->timer, t);
/* Guarantee the order between timer->timer and map->usercnt. So
switch (type) {
case BPF_ASYNC_TYPE_TIMER:
clockid = flags & (MAX_CLOCKS - 1);
t = (struct bpf_hrtimer *)cb;
hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
t->timer.function = bpf_timer_cb;
cb->value = (void *)async - map->record->timer_off;
break;
case BPF_ASYNC_TYPE_WQ:
w = (struct bpf_work *)cb;
INIT_WORK(&w->work, bpf_wq_work);
INIT_WORK(&w->delete_work, bpf_wq_delete_work);
cb->value = (void *)async - map->record->wq_off;
break;
}
cb->map = map;
cb->prog = NULL;
cb->flags = flags;
rcu_assign_pointer(cb->callback_fn, NULL);
WRITE_ONCE(async->cb, cb);
/* Guarantee the order between async->cb and map->usercnt. So
* when there are concurrent uref release and bpf timer init, either
* bpf_timer_cancel_and_free() called by uref release reads a no-NULL
* timer or atomic64_read() below returns a zero usercnt.
@ -1204,15 +1298,34 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map
/* maps with timers must be either held by user space
* or pinned in bpffs.
*/
WRITE_ONCE(timer->timer, NULL);
kfree(t);
WRITE_ONCE(async->cb, NULL);
kfree(cb);
ret = -EPERM;
}
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
__bpf_spin_unlock_irqrestore(&async->lock);
return ret;
}
BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map,
u64, flags)
{
clock_t clockid = flags & (MAX_CLOCKS - 1);
BUILD_BUG_ON(MAX_CLOCKS != 16);
BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer));
BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer));
if (flags >= MAX_CLOCKS ||
/* similar to timerfd except _ALARM variants are not supported */
(clockid != CLOCK_MONOTONIC &&
clockid != CLOCK_REALTIME &&
clockid != CLOCK_BOOTTIME))
return -EINVAL;
return __bpf_async_init(timer, map, flags, BPF_ASYNC_TYPE_TIMER);
}
static const struct bpf_func_proto bpf_timer_init_proto = {
.func = bpf_timer_init,
.gpl_only = true,
@ -1222,22 +1335,23 @@ static const struct bpf_func_proto bpf_timer_init_proto = {
.arg3_type = ARG_ANYTHING,
};
BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn,
struct bpf_prog_aux *, aux)
static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn,
struct bpf_prog_aux *aux, unsigned int flags,
enum bpf_async_type type)
{
struct bpf_prog *prev, *prog = aux->prog;
struct bpf_hrtimer *t;
struct bpf_async_cb *cb;
int ret = 0;
if (in_nmi())
return -EOPNOTSUPP;
__bpf_spin_lock_irqsave(&timer->lock);
t = timer->timer;
if (!t) {
__bpf_spin_lock_irqsave(&async->lock);
cb = async->cb;
if (!cb) {
ret = -EINVAL;
goto out;
}
if (!atomic64_read(&t->map->usercnt)) {
if (!atomic64_read(&cb->map->usercnt)) {
/* maps with timers must be either held by user space
* or pinned in bpffs. Otherwise timer might still be
* running even when bpf prog is detached and user space
@ -1246,7 +1360,7 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb
ret = -EPERM;
goto out;
}
prev = t->prog;
prev = cb->prog;
if (prev != prog) {
/* Bump prog refcnt once. Every bpf_timer_set_callback()
* can pick different callback_fn-s within the same prog.
@ -1259,14 +1373,20 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb
if (prev)
/* Drop prev prog refcnt when swapping with new prog */
bpf_prog_put(prev);
t->prog = prog;
cb->prog = prog;
}
rcu_assign_pointer(t->callback_fn, callback_fn);
rcu_assign_pointer(cb->callback_fn, callback_fn);
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
__bpf_spin_unlock_irqrestore(&async->lock);
return ret;
}
BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn,
struct bpf_prog_aux *, aux)
{
return __bpf_async_set_callback(timer, callback_fn, aux, 0, BPF_ASYNC_TYPE_TIMER);
}
static const struct bpf_func_proto bpf_timer_set_callback_proto = {
.func = bpf_timer_set_callback,
.gpl_only = true,
@ -1275,7 +1395,7 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = {
.arg2_type = ARG_PTR_TO_FUNC,
};
BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags)
BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags)
{
struct bpf_hrtimer *t;
int ret = 0;
@ -1287,7 +1407,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla
return -EINVAL;
__bpf_spin_lock_irqsave(&timer->lock);
t = timer->timer;
if (!t || !t->prog) {
if (!t || !t->cb.prog) {
ret = -EINVAL;
goto out;
}
@ -1315,18 +1435,18 @@ static const struct bpf_func_proto bpf_timer_start_proto = {
.arg3_type = ARG_ANYTHING,
};
static void drop_prog_refcnt(struct bpf_hrtimer *t)
static void drop_prog_refcnt(struct bpf_async_cb *async)
{
struct bpf_prog *prog = t->prog;
struct bpf_prog *prog = async->prog;
if (prog) {
bpf_prog_put(prog);
t->prog = NULL;
rcu_assign_pointer(t->callback_fn, NULL);
async->prog = NULL;
rcu_assign_pointer(async->callback_fn, NULL);
}
}
BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
{
struct bpf_hrtimer *t;
int ret = 0;
@ -1348,7 +1468,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
ret = -EDEADLK;
goto out;
}
drop_prog_refcnt(t);
drop_prog_refcnt(&t->cb);
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
/* Cancel the timer and wait for associated callback to finish
@ -1366,36 +1486,44 @@ static const struct bpf_func_proto bpf_timer_cancel_proto = {
.arg1_type = ARG_PTR_TO_TIMER,
};
static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *async)
{
struct bpf_async_cb *cb;
/* Performance optimization: read async->cb without lock first. */
if (!READ_ONCE(async->cb))
return NULL;
__bpf_spin_lock_irqsave(&async->lock);
/* re-read it under lock */
cb = async->cb;
if (!cb)
goto out;
drop_prog_refcnt(cb);
/* The subsequent bpf_timer_start/cancel() helpers won't be able to use
* this timer, since it won't be initialized.
*/
WRITE_ONCE(async->cb, NULL);
out:
__bpf_spin_unlock_irqrestore(&async->lock);
return cb;
}
/* This function is called by map_delete/update_elem for individual element and
* by ops->map_release_uref when the user space reference to a map reaches zero.
*/
void bpf_timer_cancel_and_free(void *val)
{
struct bpf_timer_kern *timer = val;
struct bpf_hrtimer *t;
/* Performance optimization: read timer->timer without lock first. */
if (!READ_ONCE(timer->timer))
return;
t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val);
__bpf_spin_lock_irqsave(&timer->lock);
/* re-read it under lock */
t = timer->timer;
if (!t)
goto out;
drop_prog_refcnt(t);
/* The subsequent bpf_timer_start/cancel() helpers won't be able to use
* this timer, since it won't be initialized.
*/
WRITE_ONCE(timer->timer, NULL);
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
if (!t)
return;
/* Cancel the timer and wait for callback to complete if it was running.
* If hrtimer_cancel() can be safely called it's safe to call kfree(t)
* right after for both preallocated and non-preallocated maps.
* The timer->timer = NULL was already done and no code path can
* The async->cb = NULL was already done and no code path can
* see address 't' anymore.
*
* Check that bpf_map_delete/update_elem() wasn't called from timer
@ -1404,13 +1532,33 @@ out:
* return -1). Though callback_fn is still running on this cpu it's
* safe to do kfree(t) because bpf_timer_cb() read everything it needed
* from 't'. The bpf subprog callback_fn won't be able to access 't',
* since timer->timer = NULL was already done. The timer will be
* since async->cb = NULL was already done. The timer will be
* effectively cancelled because bpf_timer_cb() will return
* HRTIMER_NORESTART.
*/
if (this_cpu_read(hrtimer_running) != t)
hrtimer_cancel(&t->timer);
kfree_rcu(t, rcu);
kfree_rcu(t, cb.rcu);
}
/* This function is called by map_delete/update_elem for individual element and
* by ops->map_release_uref when the user space reference to a map reaches zero.
*/
void bpf_wq_cancel_and_free(void *val)
{
struct bpf_work *work;
BTF_TYPE_EMIT(struct bpf_wq);
work = (struct bpf_work *)__bpf_async_cancel_and_free(val);
if (!work)
return;
/* Trigger cancel of the sleepable work, but *do not* wait for
* it to finish if it was running as we might not be in a
* sleepable context.
* kfree will be called once the work has finished.
*/
schedule_work(&work->delete_work);
}
BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
@ -2549,6 +2697,51 @@ __bpf_kfunc void bpf_throw(u64 cookie)
WARN(1, "A call to BPF exception callback should never return\n");
}
__bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags)
{
struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
struct bpf_map *map = p__map;
BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_wq));
BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_wq));
if (flags)
return -EINVAL;
return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ);
}
__bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags)
{
struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
struct bpf_work *w;
if (in_nmi())
return -EOPNOTSUPP;
if (flags)
return -EINVAL;
w = READ_ONCE(async->work);
if (!w || !READ_ONCE(w->cb.prog))
return -EINVAL;
schedule_work(&w->work);
return 0;
}
__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq,
int (callback_fn)(void *map, int *key, struct bpf_wq *wq),
unsigned int flags,
void *aux__ign)
{
struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__ign;
struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
if (flags)
return -EINVAL;
return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ);
}
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(generic_btf_ids)
@ -2626,6 +2819,9 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
BTF_ID_FLAGS(func, bpf_dynptr_size)
BTF_ID_FLAGS(func, bpf_dynptr_clone)
BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
BTF_ID_FLAGS(func, bpf_wq_init)
BTF_ID_FLAGS(func, bpf_wq_set_callback_impl)
BTF_ID_FLAGS(func, bpf_wq_start)
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {

View File

@ -559,6 +559,7 @@ void btf_record_free(struct btf_record *rec)
case BPF_SPIN_LOCK:
case BPF_TIMER:
case BPF_REFCOUNT:
case BPF_WORKQUEUE:
/* Nothing to release */
break;
default:
@ -608,6 +609,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
case BPF_SPIN_LOCK:
case BPF_TIMER:
case BPF_REFCOUNT:
case BPF_WORKQUEUE:
/* Nothing to acquire */
break;
default:
@ -659,6 +661,13 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
bpf_timer_cancel_and_free(obj + rec->timer_off);
}
void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj)
{
if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_WORKQUEUE)))
return;
bpf_wq_cancel_and_free(obj + rec->wq_off);
}
void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
{
const struct btf_field *fields;
@ -679,6 +688,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
case BPF_TIMER:
bpf_timer_cancel_and_free(field_ptr);
break;
case BPF_WORKQUEUE:
bpf_wq_cancel_and_free(field_ptr);
break;
case BPF_KPTR_UNREF:
WRITE_ONCE(*(u64 *)field_ptr, 0);
break;
@ -1085,7 +1097,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
map->record = btf_parse_fields(btf, value_type,
BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
BPF_RB_ROOT | BPF_REFCOUNT,
BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE,
map->value_size);
if (!IS_ERR_OR_NULL(map->record)) {
int i;
@ -1115,6 +1127,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
}
break;
case BPF_TIMER:
case BPF_WORKQUEUE:
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY) {

View File

@ -332,6 +332,10 @@ struct bpf_kfunc_call_arg_meta {
u8 spi;
u8 frameno;
} iter;
struct {
struct bpf_map *ptr;
int uid;
} map;
u64 mem_size;
};
@ -497,8 +501,12 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id)
}
static bool is_sync_callback_calling_kfunc(u32 btf_id);
static bool is_async_callback_calling_kfunc(u32 btf_id);
static bool is_callback_calling_kfunc(u32 btf_id);
static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id);
static bool is_sync_callback_calling_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_for_each_map_elem ||
@ -526,7 +534,8 @@ static bool is_sync_callback_calling_insn(struct bpf_insn *insn)
static bool is_async_callback_calling_insn(struct bpf_insn *insn)
{
return bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm);
return (bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm)) ||
(bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm));
}
static bool is_may_goto_insn(struct bpf_insn *insn)
@ -1425,6 +1434,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
}
dst_state->speculative = src->speculative;
dst_state->active_rcu_lock = src->active_rcu_lock;
dst_state->in_sleepable = src->in_sleepable;
dst_state->curframe = src->curframe;
dst_state->active_lock.ptr = src->active_lock.ptr;
dst_state->active_lock.id = src->active_lock.id;
@ -1838,6 +1848,8 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
*/
if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER))
reg->map_uid = reg->id;
if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE))
reg->map_uid = reg->id;
} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
reg->type = PTR_TO_XDP_SOCK;
} else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
@ -2398,7 +2410,7 @@ static void init_func_state(struct bpf_verifier_env *env,
/* Similar to push_stack(), but for async callbacks */
static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
int insn_idx, int prev_insn_idx,
int subprog)
int subprog, bool is_sleepable)
{
struct bpf_verifier_stack_elem *elem;
struct bpf_func_state *frame;
@ -2425,6 +2437,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
* Initialize it similar to do_check_common().
*/
elem->st.branches = 1;
elem->st.in_sleepable = is_sleepable;
frame = kzalloc(sizeof(*frame), GFP_KERNEL);
if (!frame)
goto err;
@ -5272,7 +5285,8 @@ bad_type:
static bool in_sleepable(struct bpf_verifier_env *env)
{
return env->prog->sleepable;
return env->prog->sleepable ||
(env->cur_state && env->cur_state->in_sleepable);
}
/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
@ -7596,6 +7610,23 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
return 0;
}
static int process_wq_func(struct bpf_verifier_env *env, int regno,
struct bpf_kfunc_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
struct bpf_map *map = reg->map_ptr;
u64 val = reg->var_off.value;
if (map->record->wq_off != val + reg->off) {
verbose(env, "off %lld doesn't point to 'struct bpf_wq' that is at %d\n",
val + reg->off, map->record->wq_off);
return -EINVAL;
}
meta->map.uid = reg->map_uid;
meta->map.ptr = map;
return 0;
}
static int process_kptr_func(struct bpf_verifier_env *env, int regno,
struct bpf_call_arg_meta *meta)
{
@ -9490,7 +9521,7 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
*/
env->subprog_info[subprog].is_cb = true;
if (bpf_pseudo_kfunc_call(insn) &&
!is_sync_callback_calling_kfunc(insn->imm)) {
!is_callback_calling_kfunc(insn->imm)) {
verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
func_id_name(insn->imm), insn->imm);
return -EFAULT;
@ -9504,10 +9535,11 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
if (is_async_callback_calling_insn(insn)) {
struct bpf_verifier_state *async_cb;
/* there is no real recursion here. timer callbacks are async */
/* there is no real recursion here. timer and workqueue callbacks are async */
env->subprog_info[subprog].is_async_cb = true;
async_cb = push_async_cb(env, env->subprog_info[subprog].start,
insn_idx, subprog);
insn_idx, subprog,
is_bpf_wq_set_callback_impl_kfunc(insn->imm));
if (!async_cb)
return -EFAULT;
callee = async_cb->frame[0];
@ -10841,6 +10873,7 @@ enum {
KF_ARG_LIST_NODE_ID,
KF_ARG_RB_ROOT_ID,
KF_ARG_RB_NODE_ID,
KF_ARG_WORKQUEUE_ID,
};
BTF_ID_LIST(kf_arg_btf_ids)
@ -10849,6 +10882,7 @@ BTF_ID(struct, bpf_list_head)
BTF_ID(struct, bpf_list_node)
BTF_ID(struct, bpf_rb_root)
BTF_ID(struct, bpf_rb_node)
BTF_ID(struct, bpf_wq)
static bool __is_kfunc_ptr_arg_type(const struct btf *btf,
const struct btf_param *arg, int type)
@ -10892,6 +10926,11 @@ static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_par
return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID);
}
static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg)
{
return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID);
}
static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf,
const struct btf_param *arg)
{
@ -10961,6 +11000,7 @@ enum kfunc_ptr_arg_type {
KF_ARG_PTR_TO_NULL,
KF_ARG_PTR_TO_CONST_STR,
KF_ARG_PTR_TO_MAP,
KF_ARG_PTR_TO_WORKQUEUE,
};
enum special_kfunc_type {
@ -10986,6 +11026,7 @@ enum special_kfunc_type {
KF_bpf_percpu_obj_new_impl,
KF_bpf_percpu_obj_drop_impl,
KF_bpf_throw,
KF_bpf_wq_set_callback_impl,
KF_bpf_iter_css_task_new,
};
@ -11010,6 +11051,7 @@ BTF_ID(func, bpf_dynptr_clone)
BTF_ID(func, bpf_percpu_obj_new_impl)
BTF_ID(func, bpf_percpu_obj_drop_impl)
BTF_ID(func, bpf_throw)
BTF_ID(func, bpf_wq_set_callback_impl)
#ifdef CONFIG_CGROUPS
BTF_ID(func, bpf_iter_css_task_new)
#endif
@ -11038,6 +11080,7 @@ BTF_ID(func, bpf_dynptr_clone)
BTF_ID(func, bpf_percpu_obj_new_impl)
BTF_ID(func, bpf_percpu_obj_drop_impl)
BTF_ID(func, bpf_throw)
BTF_ID(func, bpf_wq_set_callback_impl)
#ifdef CONFIG_CGROUPS
BTF_ID(func, bpf_iter_css_task_new)
#else
@ -11117,6 +11160,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_map(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_MAP;
if (is_kfunc_arg_wq(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_WORKQUEUE;
if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
if (!btf_type_is_struct(ref_t)) {
verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
@ -11368,12 +11414,28 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id)
return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
}
static bool is_async_callback_calling_kfunc(u32 btf_id)
{
return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl];
}
static bool is_bpf_throw_kfunc(struct bpf_insn *insn)
{
return bpf_pseudo_kfunc_call(insn) && insn->off == 0 &&
insn->imm == special_kfunc_list[KF_bpf_throw];
}
static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id)
{
return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl];
}
static bool is_callback_calling_kfunc(u32 btf_id)
{
return is_sync_callback_calling_kfunc(btf_id) ||
is_async_callback_calling_kfunc(btf_id);
}
static bool is_rbtree_lock_required_kfunc(u32 btf_id)
{
return is_bpf_rbtree_api_kfunc(btf_id);
@ -11718,6 +11780,34 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_NULL:
continue;
case KF_ARG_PTR_TO_MAP:
if (!reg->map_ptr) {
verbose(env, "pointer in R%d isn't map pointer\n", regno);
return -EINVAL;
}
if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) {
/* Use map_uid (which is unique id of inner map) to reject:
* inner_map1 = bpf_map_lookup_elem(outer_map, key1)
* inner_map2 = bpf_map_lookup_elem(outer_map, key2)
* if (inner_map1 && inner_map2) {
* wq = bpf_map_lookup_elem(inner_map1);
* if (wq)
* // mismatch would have been allowed
* bpf_wq_init(wq, inner_map2);
* }
*
* Comparing map_ptr is enough to distinguish normal and outer maps.
*/
if (meta->map.ptr != reg->map_ptr ||
meta->map.uid != reg->map_uid) {
verbose(env,
"workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
meta->map.uid, reg->map_uid);
return -EINVAL;
}
}
meta->map.ptr = reg->map_ptr;
meta->map.uid = reg->map_uid;
fallthrough;
case KF_ARG_PTR_TO_ALLOC_BTF_ID:
case KF_ARG_PTR_TO_BTF_ID:
if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
@ -11750,6 +11840,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_CALLBACK:
case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
case KF_ARG_PTR_TO_CONST_STR:
case KF_ARG_PTR_TO_WORKQUEUE:
/* Trusted by default */
break;
default:
@ -12036,6 +12127,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (ret)
return ret;
break;
case KF_ARG_PTR_TO_WORKQUEUE:
if (reg->type != PTR_TO_MAP_VALUE) {
verbose(env, "arg#%d doesn't point to a map value\n", i);
return -EINVAL;
}
ret = process_wq_func(env, regno, meta);
if (ret < 0)
return ret;
break;
}
}
@ -12147,6 +12247,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) {
err = push_callback_call(env, insn, insn_idx, meta.subprogno,
set_timer_callback_state);
if (err) {
verbose(env, "kfunc %s#%d failed callback verification\n",
func_name, meta.func_id);
return err;
}
}
rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
@ -16896,6 +17006,9 @@ static bool states_equal(struct bpf_verifier_env *env,
if (old->active_rcu_lock != cur->active_rcu_lock)
return false;
if (old->in_sleepable != cur->in_sleepable)
return false;
/* for states to be equal callsites have to be the same
* and all frame states need to be equivalent
*/
@ -18141,6 +18254,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
}
if (btf_record_has_field(map->record, BPF_WORKQUEUE)) {
if (is_tracing_prog_type(prog_type)) {
verbose(env, "tracing progs cannot use bpf_wq yet\n");
return -EINVAL;
}
}
if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) &&
!bpf_offload_prog_map_match(prog, map)) {
verbose(env, "offload device mismatch between prog and map\n");
@ -19560,6 +19680,13 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
*cnt = 1;
} else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) {
struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(BPF_REG_4, (long)env->prog->aux) };
insn_buf[0] = ld_addrs[0];
insn_buf[1] = ld_addrs[1];
insn_buf[2] = *insn;
*cnt = 3;
}
return 0;
}

View File

@ -7306,6 +7306,10 @@ struct bpf_timer {
__u64 __opaque[2];
} __attribute__((aligned(8)));
struct bpf_wq {
__u64 __opaque[2];
} __attribute__((aligned(8)));
struct bpf_dynptr {
__u64 __opaque[2];
} __attribute__((aligned(8)));

View File

@ -470,4 +470,11 @@ extern int bpf_iter_css_new(struct bpf_iter_css *it,
extern struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym;
extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym;
extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym;
extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym;
extern int bpf_wq_set_callback_impl(struct bpf_wq *wq,
int (callback_fn)(void *map, int *key, struct bpf_wq *wq),
unsigned int flags__k, void *aux__ign) __ksym;
#define bpf_wq_set_callback(timer, cb, flags) \
bpf_wq_set_callback_impl(timer, cb, flags, NULL)
#endif

View File

@ -494,6 +494,10 @@ __bpf_kfunc static u32 bpf_kfunc_call_test_static_unused_arg(u32 arg, u32 unused
return arg;
}
__bpf_kfunc void bpf_kfunc_call_test_sleepable(void)
{
}
BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids)
BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc)
BTF_ID_FLAGS(func, bpf_kfunc_call_test1)
@ -520,6 +524,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU)
BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE)
BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg)
BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset)
BTF_ID_FLAGS(func, bpf_kfunc_call_test_sleepable, KF_SLEEPABLE)
BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids)
static int bpf_testmod_ops_init(struct btf *btf)

View File

@ -96,6 +96,7 @@ void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p) __ksym;
void bpf_kfunc_call_test_mem_len_fail2(__u64 *mem, int len) __ksym;
void bpf_kfunc_call_test_destructive(void) __ksym;
void bpf_kfunc_call_test_sleepable(void) __ksym;
void bpf_kfunc_call_test_offset(struct prog_test_ref_kfunc *p);
struct prog_test_member *bpf_kfunc_call_memb_acquire(void);

View File

@ -0,0 +1,41 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2024 Benjamin Tissoires */
#include <test_progs.h>
#include "wq.skel.h"
#include "wq_failures.skel.h"
void serial_test_wq(void)
{
struct wq *wq_skel = NULL;
int err, prog_fd;
LIBBPF_OPTS(bpf_test_run_opts, topts);
RUN_TESTS(wq);
/* re-run the success test to check if the timer was actually executed */
wq_skel = wq__open_and_load();
if (!ASSERT_OK_PTR(wq_skel, "wq_skel_load"))
return;
err = wq__attach(wq_skel);
if (!ASSERT_OK(err, "wq_attach"))
return;
prog_fd = bpf_program__fd(wq_skel->progs.test_syscall_array_sleepable);
err = bpf_prog_test_run_opts(prog_fd, &topts);
ASSERT_OK(err, "test_run");
ASSERT_EQ(topts.retval, 0, "test_run");
usleep(50); /* 10 usecs should be enough, but give it extra */
ASSERT_EQ(wq_skel->bss->ok_sleepable, (1 << 1), "ok_sleepable");
}
void serial_test_failures_wq(void)
{
LIBBPF_OPTS(bpf_test_run_opts, topts);
RUN_TESTS(wq_failures);
}

View File

@ -0,0 +1,180 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2024 Benjamin Tissoires
*/
#include "bpf_experimental.h"
#include <bpf/bpf_helpers.h>
#include "bpf_misc.h"
#include "../bpf_testmod/bpf_testmod_kfunc.h"
char _license[] SEC("license") = "GPL";
struct hmap_elem {
int counter;
struct bpf_timer timer; /* unused */
struct bpf_spin_lock lock; /* unused */
struct bpf_wq work;
};
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(max_entries, 1000);
__type(key, int);
__type(value, struct hmap_elem);
} hmap SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(map_flags, BPF_F_NO_PREALLOC);
__uint(max_entries, 1000);
__type(key, int);
__type(value, struct hmap_elem);
} hmap_malloc SEC(".maps");
struct elem {
struct bpf_wq w;
};
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, 2);
__type(key, int);
__type(value, struct elem);
} array SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
__uint(max_entries, 4);
__type(key, int);
__type(value, struct elem);
} lru SEC(".maps");
__u32 ok;
__u32 ok_sleepable;
static int test_elem_callback(void *map, int *key,
int (callback_fn)(void *map, int *key, struct bpf_wq *wq))
{
struct elem init = {}, *val;
struct bpf_wq *wq;
if ((ok & (1 << *key) ||
(ok_sleepable & (1 << *key))))
return -22;
if (map == &lru &&
bpf_map_update_elem(map, key, &init, 0))
return -1;
val = bpf_map_lookup_elem(map, key);
if (!val)
return -2;
wq = &val->w;
if (bpf_wq_init(wq, map, 0) != 0)
return -3;
if (bpf_wq_set_callback(wq, callback_fn, 0))
return -4;
if (bpf_wq_start(wq, 0))
return -5;
return 0;
}
static int test_hmap_elem_callback(void *map, int *key,
int (callback_fn)(void *map, int *key, struct bpf_wq *wq))
{
struct hmap_elem init = {}, *val;
struct bpf_wq *wq;
if ((ok & (1 << *key) ||
(ok_sleepable & (1 << *key))))
return -22;
if (bpf_map_update_elem(map, key, &init, 0))
return -1;
val = bpf_map_lookup_elem(map, key);
if (!val)
return -2;
wq = &val->work;
if (bpf_wq_init(wq, map, 0) != 0)
return -3;
if (bpf_wq_set_callback(wq, callback_fn, 0))
return -4;
if (bpf_wq_start(wq, 0))
return -5;
return 0;
}
/* callback for non sleepable workqueue */
static int wq_callback(void *map, int *key, struct bpf_wq *work)
{
bpf_kfunc_common_test();
ok |= (1 << *key);
return 0;
}
/* callback for sleepable workqueue */
static int wq_cb_sleepable(void *map, int *key, struct bpf_wq *work)
{
bpf_kfunc_call_test_sleepable();
ok_sleepable |= (1 << *key);
return 0;
}
SEC("tc")
/* test that workqueues can be used from an array */
__retval(0)
long test_call_array_sleepable(void *ctx)
{
int key = 0;
return test_elem_callback(&array, &key, wq_cb_sleepable);
}
SEC("syscall")
/* Same test than above but from a sleepable context. */
__retval(0)
long test_syscall_array_sleepable(void *ctx)
{
int key = 1;
return test_elem_callback(&array, &key, wq_cb_sleepable);
}
SEC("tc")
/* test that workqueues can be used from a hashmap */
__retval(0)
long test_call_hash_sleepable(void *ctx)
{
int key = 2;
return test_hmap_elem_callback(&hmap, &key, wq_callback);
}
SEC("tc")
/* test that workqueues can be used from a hashmap with NO_PREALLOC. */
__retval(0)
long test_call_hash_malloc_sleepable(void *ctx)
{
int key = 3;
return test_hmap_elem_callback(&hmap_malloc, &key, wq_callback);
}
SEC("tc")
/* test that workqueues can be used from a LRU map */
__retval(0)
long test_call_lru_sleepable(void *ctx)
{
int key = 4;
return test_elem_callback(&lru, &key, wq_callback);
}

View File

@ -0,0 +1,144 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2024 Benjamin Tissoires
*/
#include "bpf_experimental.h"
#include <bpf/bpf_helpers.h>
#include "bpf_misc.h"
#include "../bpf_testmod/bpf_testmod_kfunc.h"
char _license[] SEC("license") = "GPL";
struct elem {
struct bpf_wq w;
};
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, 2);
__type(key, int);
__type(value, struct elem);
} array SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
__uint(max_entries, 4);
__type(key, int);
__type(value, struct elem);
} lru SEC(".maps");
/* callback for non sleepable workqueue */
static int wq_callback(void *map, int *key, struct bpf_wq *work)
{
bpf_kfunc_common_test();
return 0;
}
/* callback for sleepable workqueue */
static int wq_cb_sleepable(void *map, int *key, struct bpf_wq *work)
{
bpf_kfunc_call_test_sleepable();
return 0;
}
SEC("tc")
/* test that bpf_wq_init takes a map as a second argument
*/
__log_level(2)
__flag(BPF_F_TEST_STATE_FREQ)
__failure
__msg(": (85) call bpf_wq_init#") /* anchor message */
__msg("pointer in R2 isn't map pointer")
long test_wq_init_nomap(void *ctx)
{
struct bpf_wq *wq;
struct elem *val;
int key = 0;
val = bpf_map_lookup_elem(&array, &key);
if (!val)
return -1;
wq = &val->w;
if (bpf_wq_init(wq, &key, 0) != 0)
return -3;
return 0;
}
SEC("tc")
/* test that the workqueue is part of the map in bpf_wq_init
*/
__log_level(2)
__flag(BPF_F_TEST_STATE_FREQ)
__failure
__msg(": (85) call bpf_wq_init#") /* anchor message */
__msg("workqueue pointer in R1 map_uid=0 doesn't match map pointer in R2 map_uid=0")
long test_wq_init_wrong_map(void *ctx)
{
struct bpf_wq *wq;
struct elem *val;
int key = 0;
val = bpf_map_lookup_elem(&array, &key);
if (!val)
return -1;
wq = &val->w;
if (bpf_wq_init(wq, &lru, 0) != 0)
return -3;
return 0;
}
SEC("?tc")
__log_level(2)
__failure
/* check that the first argument of bpf_wq_set_callback()
* is a correct bpf_wq pointer.
*/
__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */
__msg("arg#0 doesn't point to a map value")
long test_wrong_wq_pointer(void *ctx)
{
int key = 0;
struct bpf_wq *wq;
wq = bpf_map_lookup_elem(&array, &key);
if (!wq)
return 1;
if (bpf_wq_init(wq, &array, 0))
return 2;
if (bpf_wq_set_callback((void *)&wq, wq_callback, 0))
return 3;
return -22;
}
SEC("?tc")
__log_level(2)
__failure
/* check that the first argument of bpf_wq_set_callback()
* is a correct bpf_wq pointer.
*/
__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */
__msg("off 1 doesn't point to 'struct bpf_wq' that is at 0")
long test_wrong_wq_pointer_offset(void *ctx)
{
int key = 0;
struct bpf_wq *wq;
wq = bpf_map_lookup_elem(&array, &key);
if (!wq)
return 1;
if (bpf_wq_init(wq, &array, 0))
return 2;
if (bpf_wq_set_callback((void *)wq + 1, wq_cb_sleepable, 0))
return 3;
return -22;
}