diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e0dbe0c0a17e..33e543b86e1a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -451,7 +451,7 @@ enum bpf_type_flag { /* DYNPTR points to memory local to the bpf program. */ DYNPTR_TYPE_LOCAL = BIT(8 + BPF_BASE_TYPE_BITS), - /* DYNPTR points to a ringbuf record. */ + /* DYNPTR points to a kernel-produced ringbuf record. */ DYNPTR_TYPE_RINGBUF = BIT(9 + BPF_BASE_TYPE_BITS), /* Size is known at compile time. */ @@ -656,6 +656,7 @@ enum bpf_reg_type { PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ + PTR_TO_DYNPTR, /* reg points to a dynptr */ __BPF_REG_TYPE_MAX, /* Extended reg_types. */ @@ -1394,6 +1395,11 @@ struct bpf_array { #define BPF_MAP_CAN_READ BIT(0) #define BPF_MAP_CAN_WRITE BIT(1) +/* Maximum number of user-producer ring buffer samples that can be drained in + * a call to bpf_user_ringbuf_drain(). + */ +#define BPF_MAX_USER_RINGBUF_SAMPLES (128 * 1024) + static inline u32 bpf_map_flags_to_cap(struct bpf_map *map) { u32 access_flags = map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); @@ -2495,6 +2501,7 @@ extern const struct bpf_func_proto bpf_loop_proto; extern const struct bpf_func_proto bpf_copy_from_user_task_proto; extern const struct bpf_func_proto bpf_set_retval_proto; extern const struct bpf_func_proto bpf_get_retval_proto; +extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); @@ -2639,7 +2646,7 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_INVALID, /* Points to memory that is local to the bpf program */ BPF_DYNPTR_TYPE_LOCAL, - /* Underlying data is a ringbuf record */ + /* Underlying data is a kernel-produced ringbuf record */ BPF_DYNPTR_TYPE_RINGBUF, }; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2b9112b80171..2c6a4f2562a7 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -126,6 +126,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3df78c56c1bf..ead35f39f185 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -928,6 +928,7 @@ enum bpf_map_type { BPF_MAP_TYPE_INODE_STORAGE, BPF_MAP_TYPE_TASK_STORAGE, BPF_MAP_TYPE_BLOOM_FILTER, + BPF_MAP_TYPE_USER_RINGBUF, }; /* Note that tracing related programs such as @@ -5387,6 +5388,43 @@ union bpf_attr { * Return * Current *ktime*. * + * long bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) + * Description + * Drain samples from the specified user ring buffer, and invoke + * the provided callback for each such sample: + * + * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx); + * + * If **callback_fn** returns 0, the helper will continue to try + * and drain the next sample, up to a maximum of + * BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1, + * the helper will skip the rest of the samples and return. Other + * return values are not used now, and will be rejected by the + * verifier. + * Return + * The number of drained samples if no error was encountered while + * draining samples, or 0 if no samples were present in the ring + * buffer. If a user-space producer was epoll-waiting on this map, + * and at least one sample was drained, they will receive an event + * notification notifying them of available space in the ring + * buffer. If the BPF_RB_NO_WAKEUP flag is passed to this + * function, no wakeup notification will be sent. If the + * BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will + * be sent even if no sample was drained. + * + * On failure, the returned value is one of the following: + * + * **-EBUSY** if the ring buffer is contended, and another calling + * context was concurrently draining the ring buffer. + * + * **-EINVAL** if user-space is not properly tracking the ring + * buffer due to the producer position not being aligned to 8 + * bytes, a sample not being aligned to 8 bytes, or the producer + * position not matching the advertised length of a sample. + * + * **-E2BIG** if user-space has tried to publish a sample which is + * larger than the size of the ring buffer, or which cannot fit + * within a struct bpf_dynptr. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5598,6 +5636,7 @@ union bpf_attr { FN(tcp_raw_check_syncookie_ipv4), \ FN(tcp_raw_check_syncookie_ipv6), \ FN(ktime_get_tai_ns), \ + FN(user_ringbuf_drain), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 41aeaf3862ec..cb5564c77482 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1659,6 +1659,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_for_each_map_elem_proto; case BPF_FUNC_loop: return &bpf_loop_proto; + case BPF_FUNC_user_ringbuf_drain: + return &bpf_user_ringbuf_drain_proto; default: break; } diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index b483aea35f41..9e832acf4692 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -38,10 +38,43 @@ struct bpf_ringbuf { struct page **pages; int nr_pages; spinlock_t spinlock ____cacheline_aligned_in_smp; - /* Consumer and producer counters are put into separate pages to allow - * mapping consumer page as r/w, but restrict producer page to r/o. - * This protects producer position from being modified by user-space - * application and ruining in-kernel position tracking. + /* For user-space producer ring buffers, an atomic_t busy bit is used + * to synchronize access to the ring buffers in the kernel, rather than + * the spinlock that is used for kernel-producer ring buffers. This is + * done because the ring buffer must hold a lock across a BPF program's + * callback: + * + * __bpf_user_ringbuf_peek() // lock acquired + * -> program callback_fn() + * -> __bpf_user_ringbuf_sample_release() // lock released + * + * It is unsafe and incorrect to hold an IRQ spinlock across what could + * be a long execution window, so we instead simply disallow concurrent + * access to the ring buffer by kernel consumers, and return -EBUSY from + * __bpf_user_ringbuf_peek() if the busy bit is held by another task. + */ + atomic_t busy ____cacheline_aligned_in_smp; + /* Consumer and producer counters are put into separate pages to + * allow each position to be mapped with different permissions. + * This prevents a user-space application from modifying the + * position and ruining in-kernel tracking. The permissions of the + * pages depend on who is producing samples: user-space or the + * kernel. + * + * Kernel-producer + * --------------- + * The producer position and data pages are mapped as r/o in + * userspace. For this approach, bits in the header of samples are + * used to signal to user-space, and to other producers, whether a + * sample is currently being written. + * + * User-space producer + * ------------------- + * Only the page containing the consumer position is mapped r/o in + * user-space. User-space producers also use bits of the header to + * communicate to the kernel, but the kernel must carefully check and + * validate each sample to ensure that they're correctly formatted, and + * fully contained within the ring buffer. */ unsigned long consumer_pos __aligned(PAGE_SIZE); unsigned long producer_pos __aligned(PAGE_SIZE); @@ -136,6 +169,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) return NULL; spin_lock_init(&rb->spinlock); + atomic_set(&rb->busy, 0); init_waitqueue_head(&rb->waitq); init_irq_work(&rb->work, bpf_ringbuf_notify); @@ -224,7 +258,7 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, return -ENOTSUPP; } -static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_ringbuf_map *rb_map; @@ -242,6 +276,26 @@ static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) vma->vm_pgoff + RINGBUF_PGOFF); } +static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + + if (vma->vm_flags & VM_WRITE) { + if (vma->vm_pgoff == 0) + /* Disallow writable mappings to the consumer pointer, + * and allow writable mappings to both the producer + * position, and the ring buffer data itself. + */ + return -EPERM; + } else { + vma->vm_flags &= ~VM_MAYWRITE; + } + /* remap_vmalloc_range() checks size and offset constraints */ + return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); +} + static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) { unsigned long cons_pos, prod_pos; @@ -251,8 +305,13 @@ static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) return prod_pos - cons_pos; } -static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, - struct poll_table_struct *pts) +static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb) +{ + return rb->mask + 1; +} + +static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts) { struct bpf_ringbuf_map *rb_map; @@ -264,13 +323,26 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, return 0; } +static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + poll_wait(filp, &rb_map->rb->waitq, pts); + + if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb)) + return EPOLLOUT | EPOLLWRNORM; + return 0; +} + BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map) const struct bpf_map_ops ringbuf_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, - .map_mmap = ringbuf_map_mmap, - .map_poll = ringbuf_map_poll, + .map_mmap = ringbuf_map_mmap_kern, + .map_poll = ringbuf_map_poll_kern, .map_lookup_elem = ringbuf_map_lookup_elem, .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, @@ -278,6 +350,20 @@ const struct bpf_map_ops ringbuf_map_ops = { .map_btf_id = &ringbuf_map_btf_ids[0], }; +BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map) +const struct bpf_map_ops user_ringbuf_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc = ringbuf_map_alloc, + .map_free = ringbuf_map_free, + .map_mmap = ringbuf_map_mmap_user, + .map_poll = ringbuf_map_poll_user, + .map_lookup_elem = ringbuf_map_lookup_elem, + .map_update_elem = ringbuf_map_update_elem, + .map_delete_elem = ringbuf_map_delete_elem, + .map_get_next_key = ringbuf_map_get_next_key, + .map_btf_id = &user_ringbuf_map_btf_ids[0], +}; + /* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, * calculate offset from record metadata to ring buffer in pages, rounded * down. This page offset is stored as part of record metadata and allows to @@ -312,7 +398,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) return NULL; len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); - if (len > rb->mask + 1) + if (len > ringbuf_total_data_sz(rb)) return NULL; cons_pos = smp_load_acquire(&rb->consumer_pos); @@ -459,7 +545,7 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) case BPF_RB_AVAIL_DATA: return ringbuf_avail_data_sz(rb); case BPF_RB_RING_SIZE: - return rb->mask + 1; + return ringbuf_total_data_sz(rb); case BPF_RB_CONS_POS: return smp_load_acquire(&rb->consumer_pos); case BPF_RB_PROD_POS: @@ -553,3 +639,138 @@ const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = { .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; + +static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void **sample, u32 *size) +{ + int err; + u32 hdr_len, sample_len, total_len, flags, *hdr; + u64 cons_pos, prod_pos; + + /* Synchronizes with smp_store_release() in user-space producer. */ + prod_pos = smp_load_acquire(&rb->producer_pos); + if (prod_pos % 8) + return -EINVAL; + + /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */ + cons_pos = smp_load_acquire(&rb->consumer_pos); + if (cons_pos >= prod_pos) + return -ENODATA; + + hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask)); + /* Synchronizes with smp_store_release() in user-space producer. */ + hdr_len = smp_load_acquire(hdr); + flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT); + sample_len = hdr_len & ~flags; + total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8); + + /* The sample must fit within the region advertised by the producer position. */ + if (total_len > prod_pos - cons_pos) + return -EINVAL; + + /* The sample must fit within the data region of the ring buffer. */ + if (total_len > ringbuf_total_data_sz(rb)) + return -E2BIG; + + /* The sample must fit into a struct bpf_dynptr. */ + err = bpf_dynptr_check_size(sample_len); + if (err) + return -E2BIG; + + if (flags & BPF_RINGBUF_DISCARD_BIT) { + /* If the discard bit is set, the sample should be skipped. + * + * Update the consumer pos, and return -EAGAIN so the caller + * knows to skip this sample and try to read the next one. + */ + smp_store_release(&rb->consumer_pos, cons_pos + total_len); + return -EAGAIN; + } + + if (flags & BPF_RINGBUF_BUSY_BIT) + return -ENODATA; + + *sample = (void *)((uintptr_t)rb->data + + (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask)); + *size = sample_len; + return 0; +} + +static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags) +{ + u64 consumer_pos; + u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8); + + /* Using smp_load_acquire() is unnecessary here, as the busy-bit + * prevents another task from writing to consumer_pos after it was read + * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek(). + */ + consumer_pos = rb->consumer_pos; + /* Synchronizes with smp_load_acquire() in user-space producer. */ + smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size); +} + +BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map, + void *, callback_fn, void *, callback_ctx, u64, flags) +{ + struct bpf_ringbuf *rb; + long samples, discarded_samples = 0, ret = 0; + bpf_callback_t callback = (bpf_callback_t)callback_fn; + u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP; + int busy = 0; + + if (unlikely(flags & ~wakeup_flags)) + return -EINVAL; + + rb = container_of(map, struct bpf_ringbuf_map, map)->rb; + + /* If another consumer is already consuming a sample, wait for them to finish. */ + if (!atomic_try_cmpxchg(&rb->busy, &busy, 1)) + return -EBUSY; + + for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) { + int err; + u32 size; + void *sample; + struct bpf_dynptr_kern dynptr; + + err = __bpf_user_ringbuf_peek(rb, &sample, &size); + if (err) { + if (err == -ENODATA) { + break; + } else if (err == -EAGAIN) { + discarded_samples++; + continue; + } else { + ret = err; + goto schedule_work_return; + } + } + + bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size); + ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0); + __bpf_user_ringbuf_sample_release(rb, size, flags); + } + ret = samples - discarded_samples; + +schedule_work_return: + /* Prevent the clearing of the busy-bit from being reordered before the + * storing of any rb consumer or producer positions. + */ + smp_mb__before_atomic(); + atomic_set(&rb->busy, 0); + + if (flags & BPF_RB_FORCE_WAKEUP) + irq_work_queue(&rb->work); + else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0) + irq_work_queue(&rb->work); + return ret; +} + +const struct bpf_func_proto bpf_user_ringbuf_drain_proto = { + .func = bpf_user_ringbuf_drain, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_FUNC, + .arg3_type = ARG_PTR_TO_STACK_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8c6fbcd0afaf..c76fa45a5906 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -563,6 +563,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env, [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", [PTR_TO_MAP_KEY] = "map_key", + [PTR_TO_DYNPTR] = "dynptr_ptr", }; if (type & PTR_MAYBE_NULL) { @@ -5688,6 +5689,12 @@ static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types dynptr_types = { + .types = { + PTR_TO_STACK, + PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL, + } +}; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -5714,7 +5721,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, [ARG_PTR_TO_TIMER] = &timer_types, [ARG_PTR_TO_KPTR] = &kptr_types, - [ARG_PTR_TO_DYNPTR] = &stack_ptr_types, + [ARG_PTR_TO_DYNPTR] = &dynptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -6066,6 +6073,13 @@ skip_type_check: err = check_mem_size_reg(env, reg, regno, true, meta); break; case ARG_PTR_TO_DYNPTR: + /* We only need to check for initialized / uninitialized helper + * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the + * assumption is that if it is, that a helper function + * initialized the dynptr on behalf of the BPF program. + */ + if (base_type(reg->type) == PTR_TO_DYNPTR) + break; if (arg_type & MEM_UNINIT) { if (!is_dynptr_reg_valid_uninit(env, reg)) { verbose(env, "Dynptr has to be an uninitialized dynptr\n"); @@ -6240,6 +6254,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_ringbuf_discard_dynptr) goto error; break; + case BPF_MAP_TYPE_USER_RINGBUF: + if (func_id != BPF_FUNC_user_ringbuf_drain) + goto error; + break; case BPF_MAP_TYPE_STACK_TRACE: if (func_id != BPF_FUNC_get_stackid) goto error; @@ -6359,6 +6377,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_RINGBUF) goto error; break; + case BPF_FUNC_user_ringbuf_drain: + if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF) + goto error; + break; case BPF_FUNC_get_stackid: if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) goto error; @@ -6885,6 +6907,29 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + /* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void + * callback_ctx, u64 flags); + * callback_fn(struct bpf_dynptr_t* dynptr, void *callback_ctx); + */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_0]); + callee->regs[BPF_REG_1].type = PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL; + __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_3]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + + callee->in_callback_fn = true; + return 0; +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; @@ -7344,12 +7389,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn case BPF_FUNC_dynptr_data: for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { if (arg_type_is_dynptr(fn->arg_type[i])) { + struct bpf_reg_state *reg = ®s[BPF_REG_1 + i]; + if (meta.ref_obj_id) { verbose(env, "verifier internal error: meta.ref_obj_id already set\n"); return -EFAULT; } - /* Find the id of the dynptr we're tracking the reference of */ - meta.ref_obj_id = stack_slot_get_id(env, ®s[BPF_REG_1 + i]); + + if (base_type(reg->type) != PTR_TO_DYNPTR) + /* Find the id of the dynptr we're + * tracking the reference of + */ + meta.ref_obj_id = stack_slot_get_id(env, reg); break; } } @@ -7358,6 +7409,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EFAULT; } break; + case BPF_FUNC_user_ringbuf_drain: + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_user_ringbuf_callback_state); + break; } if (err) @@ -12635,6 +12690,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: case BPF_MAP_TYPE_RINGBUF: + case BPF_MAP_TYPE_USER_RINGBUF: case BPF_MAP_TYPE_INODE_STORAGE: case BPF_MAP_TYPE_SK_STORAGE: case BPF_MAP_TYPE_TASK_STORAGE: diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 7c188a598444..7f3b67a8b48f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -55,7 +55,7 @@ MAP COMMANDS | | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** | | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** | | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** -| | **task_storage** | **bloom_filter** } +| | **task_storage** | **bloom_filter** | **user_ringbuf** } DESCRIPTION =========== diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 38b6bc9c26c3..9a6ca9f31133 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -1459,7 +1459,7 @@ static int do_help(int argc, char **argv) " devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n" " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n" " queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n" - " task_storage | bloom_filter }\n" + " task_storage | bloom_filter | user_ringbuf }\n" " " HELP_SPEC_OPTIONS " |\n" " {-f|--bpffs} | {-n|--nomount} }\n" "", diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3df78c56c1bf..ead35f39f185 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -928,6 +928,7 @@ enum bpf_map_type { BPF_MAP_TYPE_INODE_STORAGE, BPF_MAP_TYPE_TASK_STORAGE, BPF_MAP_TYPE_BLOOM_FILTER, + BPF_MAP_TYPE_USER_RINGBUF, }; /* Note that tracing related programs such as @@ -5387,6 +5388,43 @@ union bpf_attr { * Return * Current *ktime*. * + * long bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) + * Description + * Drain samples from the specified user ring buffer, and invoke + * the provided callback for each such sample: + * + * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx); + * + * If **callback_fn** returns 0, the helper will continue to try + * and drain the next sample, up to a maximum of + * BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1, + * the helper will skip the rest of the samples and return. Other + * return values are not used now, and will be rejected by the + * verifier. + * Return + * The number of drained samples if no error was encountered while + * draining samples, or 0 if no samples were present in the ring + * buffer. If a user-space producer was epoll-waiting on this map, + * and at least one sample was drained, they will receive an event + * notification notifying them of available space in the ring + * buffer. If the BPF_RB_NO_WAKEUP flag is passed to this + * function, no wakeup notification will be sent. If the + * BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will + * be sent even if no sample was drained. + * + * On failure, the returned value is one of the following: + * + * **-EBUSY** if the ring buffer is contended, and another calling + * context was concurrently draining the ring buffer. + * + * **-EINVAL** if user-space is not properly tracking the ring + * buffer due to the producer position not being aligned to 8 + * bytes, a sample not being aligned to 8 bytes, or the producer + * position not matching the advertised length of a sample. + * + * **-E2BIG** if user-space has tried to publish a sample which is + * larger than the size of the ring buffer, or which cannot fit + * within a struct bpf_dynptr. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5598,6 +5636,7 @@ union bpf_attr { FN(tcp_raw_check_syncookie_ipv4), \ FN(tcp_raw_check_syncookie_ipv6), \ FN(ktime_get_tai_ns), \ + FN(user_ringbuf_drain), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 2ca30ccc774c..67bc18506150 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -163,6 +163,7 @@ static const char * const map_type_name[] = { [BPF_MAP_TYPE_INODE_STORAGE] = "inode_storage", [BPF_MAP_TYPE_TASK_STORAGE] = "task_storage", [BPF_MAP_TYPE_BLOOM_FILTER] = "bloom_filter", + [BPF_MAP_TYPE_USER_RINGBUF] = "user_ringbuf", }; static const char * const prog_type_name[] = { @@ -2372,6 +2373,12 @@ static size_t adjust_ringbuf_sz(size_t sz) return sz; } +static bool map_is_ringbuf(const struct bpf_map *map) +{ + return map->def.type == BPF_MAP_TYPE_RINGBUF || + map->def.type == BPF_MAP_TYPE_USER_RINGBUF; +} + static void fill_map_from_def(struct bpf_map *map, const struct btf_map_def *def) { map->def.type = def->map_type; @@ -2386,7 +2393,7 @@ static void fill_map_from_def(struct bpf_map *map, const struct btf_map_def *def map->btf_value_type_id = def->value_type_id; /* auto-adjust BPF ringbuf map max_entries to be a multiple of page size */ - if (map->def.type == BPF_MAP_TYPE_RINGBUF) + if (map_is_ringbuf(map)) map->def.max_entries = adjust_ringbuf_sz(map->def.max_entries); if (def->parts & MAP_DEF_MAP_TYPE) @@ -4369,7 +4376,7 @@ int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries) map->def.max_entries = max_entries; /* auto-adjust BPF ringbuf map max_entries to be a multiple of page size */ - if (map->def.type == BPF_MAP_TYPE_RINGBUF) + if (map_is_ringbuf(map)) map->def.max_entries = adjust_ringbuf_sz(map->def.max_entries); return 0; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 88a1ac34b12a..e2d8c17f2e85 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1011,6 +1011,7 @@ LIBBPF_API int bpf_tc_query(const struct bpf_tc_hook *hook, /* Ring buffer APIs */ struct ring_buffer; +struct user_ring_buffer; typedef int (*ring_buffer_sample_fn)(void *ctx, void *data, size_t size); @@ -1030,6 +1031,112 @@ LIBBPF_API int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms); LIBBPF_API int ring_buffer__consume(struct ring_buffer *rb); LIBBPF_API int ring_buffer__epoll_fd(const struct ring_buffer *rb); +struct user_ring_buffer_opts { + size_t sz; /* size of this struct, for forward/backward compatibility */ +}; + +#define user_ring_buffer_opts__last_field sz + +/* @brief **user_ring_buffer__new()** creates a new instance of a user ring + * buffer. + * + * @param map_fd A file descriptor to a BPF_MAP_TYPE_USER_RINGBUF map. + * @param opts Options for how the ring buffer should be created. + * @return A user ring buffer on success; NULL and errno being set on a + * failure. + */ +LIBBPF_API struct user_ring_buffer * +user_ring_buffer__new(int map_fd, const struct user_ring_buffer_opts *opts); + +/* @brief **user_ring_buffer__reserve()** reserves a pointer to a sample in the + * user ring buffer. + * @param rb A pointer to a user ring buffer. + * @param size The size of the sample, in bytes. + * @return A pointer to an 8-byte aligned reserved region of the user ring + * buffer; NULL, and errno being set if a sample could not be reserved. + * + * This function is *not* thread safe, and callers must synchronize accessing + * this function if there are multiple producers. If a size is requested that + * is larger than the size of the entire ring buffer, errno will be set to + * E2BIG and NULL is returned. If the ring buffer could accommodate the size, + * but currently does not have enough space, errno is set to ENOSPC and NULL is + * returned. + * + * After initializing the sample, callers must invoke + * **user_ring_buffer__submit()** to post the sample to the kernel. Otherwise, + * the sample must be freed with **user_ring_buffer__discard()**. + */ +LIBBPF_API void *user_ring_buffer__reserve(struct user_ring_buffer *rb, __u32 size); + +/* @brief **user_ring_buffer__reserve_blocking()** reserves a record in the + * ring buffer, possibly blocking for up to @timeout_ms until a sample becomes + * available. + * @param rb The user ring buffer. + * @param size The size of the sample, in bytes. + * @param timeout_ms The amount of time, in milliseconds, for which the caller + * should block when waiting for a sample. -1 causes the caller to block + * indefinitely. + * @return A pointer to an 8-byte aligned reserved region of the user ring + * buffer; NULL, and errno being set if a sample could not be reserved. + * + * This function is *not* thread safe, and callers must synchronize + * accessing this function if there are multiple producers + * + * If **timeout_ms** is -1, the function will block indefinitely until a sample + * becomes available. Otherwise, **timeout_ms** must be non-negative, or errno + * is set to EINVAL, and NULL is returned. If **timeout_ms** is 0, no blocking + * will occur and the function will return immediately after attempting to + * reserve a sample. + * + * If **size** is larger than the size of the entire ring buffer, errno is set + * to E2BIG and NULL is returned. If the ring buffer could accommodate + * **size**, but currently does not have enough space, the caller will block + * until at most **timeout_ms** has elapsed. If insufficient space is available + * at that time, errno is set to ENOSPC, and NULL is returned. + * + * The kernel guarantees that it will wake up this thread to check if + * sufficient space is available in the ring buffer at least once per + * invocation of the **bpf_ringbuf_drain()** helper function, provided that at + * least one sample is consumed, and the BPF program did not invoke the + * function with BPF_RB_NO_WAKEUP. A wakeup may occur sooner than that, but the + * kernel does not guarantee this. If the helper function is invoked with + * BPF_RB_FORCE_WAKEUP, a wakeup event will be sent even if no sample is + * consumed. + * + * When a sample of size **size** is found within **timeout_ms**, a pointer to + * the sample is returned. After initializing the sample, callers must invoke + * **user_ring_buffer__submit()** to post the sample to the ring buffer. + * Otherwise, the sample must be freed with **user_ring_buffer__discard()**. + */ +LIBBPF_API void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb, + __u32 size, + int timeout_ms); + +/* @brief **user_ring_buffer__submit()** submits a previously reserved sample + * into the ring buffer. + * @param rb The user ring buffer. + * @param sample A reserved sample. + * + * It is not necessary to synchronize amongst multiple producers when invoking + * this function. + */ +LIBBPF_API void user_ring_buffer__submit(struct user_ring_buffer *rb, void *sample); + +/* @brief **user_ring_buffer__discard()** discards a previously reserved sample. + * @param rb The user ring buffer. + * @param sample A reserved sample. + * + * It is not necessary to synchronize amongst multiple producers when invoking + * this function. + */ +LIBBPF_API void user_ring_buffer__discard(struct user_ring_buffer *rb, void *sample); + +/* @brief **user_ring_buffer__free()** frees a ring buffer that was previously + * created with **user_ring_buffer__new()**. + * @param rb The user ring buffer being freed. + */ +LIBBPF_API void user_ring_buffer__free(struct user_ring_buffer *rb); + /* Perf buffer APIs */ struct perf_buffer; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 2b928dc21af0..c1d6aa7c82b6 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -368,3 +368,13 @@ LIBBPF_1.0.0 { libbpf_bpf_prog_type_str; perf_buffer__buffer; }; + +LIBBPF_1.1.0 { + global: + user_ring_buffer__discard; + user_ring_buffer__free; + user_ring_buffer__new; + user_ring_buffer__reserve; + user_ring_buffer__reserve_blocking; + user_ring_buffer__submit; +} LIBBPF_1.0.0; diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 6d495656f554..f3a8e8e74eb8 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -231,6 +231,7 @@ static int probe_map_create(enum bpf_map_type map_type) return btf_fd; break; case BPF_MAP_TYPE_RINGBUF: + case BPF_MAP_TYPE_USER_RINGBUF: key_size = 0; value_size = 0; max_entries = 4096; diff --git a/tools/lib/bpf/libbpf_version.h b/tools/lib/bpf/libbpf_version.h index 2fb2f4290080..e944f5bce728 100644 --- a/tools/lib/bpf/libbpf_version.h +++ b/tools/lib/bpf/libbpf_version.h @@ -4,6 +4,6 @@ #define __LIBBPF_VERSION_H #define LIBBPF_MAJOR_VERSION 1 -#define LIBBPF_MINOR_VERSION 0 +#define LIBBPF_MINOR_VERSION 1 #endif /* __LIBBPF_VERSION_H */ diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index 8bc117bcc7bc..d285171d4b69 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "libbpf.h" #include "libbpf_internal.h" @@ -39,6 +40,23 @@ struct ring_buffer { int ring_cnt; }; +struct user_ring_buffer { + struct epoll_event event; + unsigned long *consumer_pos; + unsigned long *producer_pos; + void *data; + unsigned long mask; + size_t page_size; + int map_fd; + int epoll_fd; +}; + +/* 8-byte ring buffer header structure */ +struct ringbuf_hdr { + __u32 len; + __u32 pad; +}; + static void ringbuf_unmap_ring(struct ring_buffer *rb, struct ring *r) { if (r->consumer_pos) { @@ -300,3 +318,256 @@ int ring_buffer__epoll_fd(const struct ring_buffer *rb) { return rb->epoll_fd; } + +static void user_ringbuf_unmap_ring(struct user_ring_buffer *rb) +{ + if (rb->consumer_pos) { + munmap(rb->consumer_pos, rb->page_size); + rb->consumer_pos = NULL; + } + if (rb->producer_pos) { + munmap(rb->producer_pos, rb->page_size + 2 * (rb->mask + 1)); + rb->producer_pos = NULL; + } +} + +void user_ring_buffer__free(struct user_ring_buffer *rb) +{ + if (!rb) + return; + + user_ringbuf_unmap_ring(rb); + + if (rb->epoll_fd >= 0) + close(rb->epoll_fd); + + free(rb); +} + +static int user_ringbuf_map(struct user_ring_buffer *rb, int map_fd) +{ + struct bpf_map_info info; + __u32 len = sizeof(info); + void *tmp; + struct epoll_event *rb_epoll; + int err; + + memset(&info, 0, sizeof(info)); + + err = bpf_obj_get_info_by_fd(map_fd, &info, &len); + if (err) { + err = -errno; + pr_warn("user ringbuf: failed to get map info for fd=%d: %d\n", map_fd, err); + return err; + } + + if (info.type != BPF_MAP_TYPE_USER_RINGBUF) { + pr_warn("user ringbuf: map fd=%d is not BPF_MAP_TYPE_USER_RINGBUF\n", map_fd); + return -EINVAL; + } + + rb->map_fd = map_fd; + rb->mask = info.max_entries - 1; + + /* Map read-only consumer page */ + tmp = mmap(NULL, rb->page_size, PROT_READ, MAP_SHARED, map_fd, 0); + if (tmp == MAP_FAILED) { + err = -errno; + pr_warn("user ringbuf: failed to mmap consumer page for map fd=%d: %d\n", + map_fd, err); + return err; + } + rb->consumer_pos = tmp; + + /* Map read-write the producer page and data pages. We map the data + * region as twice the total size of the ring buffer to allow the + * simple reading and writing of samples that wrap around the end of + * the buffer. See the kernel implementation for details. + */ + tmp = mmap(NULL, rb->page_size + 2 * info.max_entries, + PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, rb->page_size); + if (tmp == MAP_FAILED) { + err = -errno; + pr_warn("user ringbuf: failed to mmap data pages for map fd=%d: %d\n", + map_fd, err); + return err; + } + + rb->producer_pos = tmp; + rb->data = tmp + rb->page_size; + + rb_epoll = &rb->event; + rb_epoll->events = EPOLLOUT; + if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, rb_epoll) < 0) { + err = -errno; + pr_warn("user ringbuf: failed to epoll add map fd=%d: %d\n", map_fd, err); + return err; + } + + return 0; +} + +struct user_ring_buffer * +user_ring_buffer__new(int map_fd, const struct user_ring_buffer_opts *opts) +{ + struct user_ring_buffer *rb; + int err; + + if (!OPTS_VALID(opts, user_ring_buffer_opts)) + return errno = EINVAL, NULL; + + rb = calloc(1, sizeof(*rb)); + if (!rb) + return errno = ENOMEM, NULL; + + rb->page_size = getpagesize(); + + rb->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (rb->epoll_fd < 0) { + err = -errno; + pr_warn("user ringbuf: failed to create epoll instance: %d\n", err); + goto err_out; + } + + err = user_ringbuf_map(rb, map_fd); + if (err) + goto err_out; + + return rb; + +err_out: + user_ring_buffer__free(rb); + return errno = -err, NULL; +} + +static void user_ringbuf_commit(struct user_ring_buffer *rb, void *sample, bool discard) +{ + __u32 new_len; + struct ringbuf_hdr *hdr; + uintptr_t hdr_offset; + + hdr_offset = rb->mask + 1 + (sample - rb->data) - BPF_RINGBUF_HDR_SZ; + hdr = rb->data + (hdr_offset & rb->mask); + + new_len = hdr->len & ~BPF_RINGBUF_BUSY_BIT; + if (discard) + new_len |= BPF_RINGBUF_DISCARD_BIT; + + /* Synchronizes with smp_load_acquire() in __bpf_user_ringbuf_peek() in + * the kernel. + */ + __atomic_exchange_n(&hdr->len, new_len, __ATOMIC_ACQ_REL); +} + +void user_ring_buffer__discard(struct user_ring_buffer *rb, void *sample) +{ + user_ringbuf_commit(rb, sample, true); +} + +void user_ring_buffer__submit(struct user_ring_buffer *rb, void *sample) +{ + user_ringbuf_commit(rb, sample, false); +} + +void *user_ring_buffer__reserve(struct user_ring_buffer *rb, __u32 size) +{ + __u32 avail_size, total_size, max_size; + /* 64-bit to avoid overflow in case of extreme application behavior */ + __u64 cons_pos, prod_pos; + struct ringbuf_hdr *hdr; + + /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_peek() in + * the kernel. + */ + cons_pos = smp_load_acquire(rb->consumer_pos); + /* Synchronizes with smp_store_release() in user_ringbuf_commit() */ + prod_pos = smp_load_acquire(rb->producer_pos); + + max_size = rb->mask + 1; + avail_size = max_size - (prod_pos - cons_pos); + /* Round up total size to a multiple of 8. */ + total_size = (size + BPF_RINGBUF_HDR_SZ + 7) / 8 * 8; + + if (total_size > max_size) + return errno = E2BIG, NULL; + + if (avail_size < total_size) + return errno = ENOSPC, NULL; + + hdr = rb->data + (prod_pos & rb->mask); + hdr->len = size | BPF_RINGBUF_BUSY_BIT; + hdr->pad = 0; + + /* Synchronizes with smp_load_acquire() in __bpf_user_ringbuf_peek() in + * the kernel. + */ + smp_store_release(rb->producer_pos, prod_pos + total_size); + + return (void *)rb->data + ((prod_pos + BPF_RINGBUF_HDR_SZ) & rb->mask); +} + +static __u64 ns_elapsed_timespec(const struct timespec *start, const struct timespec *end) +{ + __u64 start_ns, end_ns, ns_per_s = 1000000000; + + start_ns = (__u64)start->tv_sec * ns_per_s + start->tv_nsec; + end_ns = (__u64)end->tv_sec * ns_per_s + end->tv_nsec; + + return end_ns - start_ns; +} + +void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb, __u32 size, int timeout_ms) +{ + void *sample; + int err, ms_remaining = timeout_ms; + struct timespec start; + + if (timeout_ms < 0 && timeout_ms != -1) + return errno = EINVAL, NULL; + + if (timeout_ms != -1) { + err = clock_gettime(CLOCK_MONOTONIC, &start); + if (err) + return NULL; + } + + do { + int cnt, ms_elapsed; + struct timespec curr; + __u64 ns_per_ms = 1000000; + + sample = user_ring_buffer__reserve(rb, size); + if (sample) + return sample; + else if (errno != ENOSPC) + return NULL; + + /* The kernel guarantees at least one event notification + * delivery whenever at least one sample is drained from the + * ring buffer in an invocation to bpf_ringbuf_drain(). Other + * additional events may be delivered at any time, but only one + * event is guaranteed per bpf_ringbuf_drain() invocation, + * provided that a sample is drained, and the BPF program did + * not pass BPF_RB_NO_WAKEUP to bpf_ringbuf_drain(). If + * BPF_RB_FORCE_WAKEUP is passed to bpf_ringbuf_drain(), a + * wakeup event will be delivered even if no samples are + * drained. + */ + cnt = epoll_wait(rb->epoll_fd, &rb->event, 1, ms_remaining); + if (cnt < 0) + return NULL; + + if (timeout_ms == -1) + continue; + + err = clock_gettime(CLOCK_MONOTONIC, &curr); + if (err) + return NULL; + + ms_elapsed = ns_elapsed_timespec(&start, &curr) / ns_per_ms; + ms_remaining = timeout_ms - ms_elapsed; + } while (ms_remaining > 0); + + /* Try one more time to reserve a sample after the specified timeout has elapsed. */ + return user_ring_buffer__reserve(rb, size); +} diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index 168c5b287b5c..981c2be922f4 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -71,3 +71,4 @@ cb_refs # expected error message unexpected err cgroup_hierarchical_stats # JIT does not support calling kernel function (kfunc) htab_update # failed to attach: ERROR: strerror_r(-524)=22 (trampoline) tracing_struct # failed to auto-attach: -524 (trampoline) +user_ringbuf # failed to find kernel BTF type ID of '__s390x_sys_prctl': -3 (?) diff --git a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c new file mode 100644 index 000000000000..02b18d018b36 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c @@ -0,0 +1,754 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "user_ringbuf_fail.skel.h" +#include "user_ringbuf_success.skel.h" + +#include "../progs/test_user_ringbuf.h" + +static size_t log_buf_sz = 1 << 20; /* 1 MB */ +static char obj_log_buf[1048576]; +static const long c_sample_size = sizeof(struct sample) + BPF_RINGBUF_HDR_SZ; +static const long c_ringbuf_size = 1 << 12; /* 1 small page */ +static const long c_max_entries = c_ringbuf_size / c_sample_size; + +static void drain_current_samples(void) +{ + syscall(__NR_getpgid); +} + +static int write_samples(struct user_ring_buffer *ringbuf, uint32_t num_samples) +{ + int i, err = 0; + + /* Write some number of samples to the ring buffer. */ + for (i = 0; i < num_samples; i++) { + struct sample *entry; + int read; + + entry = user_ring_buffer__reserve(ringbuf, sizeof(*entry)); + if (!entry) { + err = -errno; + goto done; + } + + entry->pid = getpid(); + entry->seq = i; + entry->value = i * i; + + read = snprintf(entry->comm, sizeof(entry->comm), "%u", i); + if (read <= 0) { + /* Assert on the error path to avoid spamming logs with + * mostly success messages. + */ + ASSERT_GT(read, 0, "snprintf_comm"); + err = read; + user_ring_buffer__discard(ringbuf, entry); + goto done; + } + + user_ring_buffer__submit(ringbuf, entry); + } + +done: + drain_current_samples(); + + return err; +} + +static struct user_ringbuf_success *open_load_ringbuf_skel(void) +{ + struct user_ringbuf_success *skel; + int err; + + skel = user_ringbuf_success__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return NULL; + + err = bpf_map__set_max_entries(skel->maps.user_ringbuf, c_ringbuf_size); + if (!ASSERT_OK(err, "set_max_entries")) + goto cleanup; + + err = bpf_map__set_max_entries(skel->maps.kernel_ringbuf, c_ringbuf_size); + if (!ASSERT_OK(err, "set_max_entries")) + goto cleanup; + + err = user_ringbuf_success__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + return skel; + +cleanup: + user_ringbuf_success__destroy(skel); + return NULL; +} + +static void test_user_ringbuf_mappings(void) +{ + int err, rb_fd; + int page_size = getpagesize(); + void *mmap_ptr; + struct user_ringbuf_success *skel; + + skel = open_load_ringbuf_skel(); + if (!skel) + return; + + rb_fd = bpf_map__fd(skel->maps.user_ringbuf); + /* cons_pos can be mapped R/O, can't add +X with mprotect. */ + mmap_ptr = mmap(NULL, page_size, PROT_READ, MAP_SHARED, rb_fd, 0); + ASSERT_OK_PTR(mmap_ptr, "ro_cons_pos"); + ASSERT_ERR(mprotect(mmap_ptr, page_size, PROT_WRITE), "write_cons_pos_protect"); + ASSERT_ERR(mprotect(mmap_ptr, page_size, PROT_EXEC), "exec_cons_pos_protect"); + ASSERT_ERR_PTR(mremap(mmap_ptr, 0, 4 * page_size, MREMAP_MAYMOVE), "wr_prod_pos"); + err = -errno; + ASSERT_ERR(err, "wr_prod_pos_err"); + ASSERT_OK(munmap(mmap_ptr, page_size), "unmap_ro_cons"); + + /* prod_pos can be mapped RW, can't add +X with mprotect. */ + mmap_ptr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, + rb_fd, page_size); + ASSERT_OK_PTR(mmap_ptr, "rw_prod_pos"); + ASSERT_ERR(mprotect(mmap_ptr, page_size, PROT_EXEC), "exec_prod_pos_protect"); + err = -errno; + ASSERT_ERR(err, "wr_prod_pos_err"); + ASSERT_OK(munmap(mmap_ptr, page_size), "unmap_rw_prod"); + + /* data pages can be mapped RW, can't add +X with mprotect. */ + mmap_ptr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, rb_fd, + 2 * page_size); + ASSERT_OK_PTR(mmap_ptr, "rw_data"); + ASSERT_ERR(mprotect(mmap_ptr, page_size, PROT_EXEC), "exec_data_protect"); + err = -errno; + ASSERT_ERR(err, "exec_data_err"); + ASSERT_OK(munmap(mmap_ptr, page_size), "unmap_rw_data"); + + user_ringbuf_success__destroy(skel); +} + +static int load_skel_create_ringbufs(struct user_ringbuf_success **skel_out, + struct ring_buffer **kern_ringbuf_out, + ring_buffer_sample_fn callback, + struct user_ring_buffer **user_ringbuf_out) +{ + struct user_ringbuf_success *skel; + struct ring_buffer *kern_ringbuf = NULL; + struct user_ring_buffer *user_ringbuf = NULL; + int err = -ENOMEM, rb_fd; + + skel = open_load_ringbuf_skel(); + if (!skel) + return err; + + /* only trigger BPF program for current process */ + skel->bss->pid = getpid(); + + if (kern_ringbuf_out) { + rb_fd = bpf_map__fd(skel->maps.kernel_ringbuf); + kern_ringbuf = ring_buffer__new(rb_fd, callback, skel, NULL); + if (!ASSERT_OK_PTR(kern_ringbuf, "kern_ringbuf_create")) + goto cleanup; + + *kern_ringbuf_out = kern_ringbuf; + } + + if (user_ringbuf_out) { + rb_fd = bpf_map__fd(skel->maps.user_ringbuf); + user_ringbuf = user_ring_buffer__new(rb_fd, NULL); + if (!ASSERT_OK_PTR(user_ringbuf, "user_ringbuf_create")) + goto cleanup; + + *user_ringbuf_out = user_ringbuf; + ASSERT_EQ(skel->bss->read, 0, "no_reads_after_load"); + } + + err = user_ringbuf_success__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + *skel_out = skel; + return 0; + +cleanup: + if (kern_ringbuf_out) + *kern_ringbuf_out = NULL; + if (user_ringbuf_out) + *user_ringbuf_out = NULL; + ring_buffer__free(kern_ringbuf); + user_ring_buffer__free(user_ringbuf); + user_ringbuf_success__destroy(skel); + return err; +} + +static int load_skel_create_user_ringbuf(struct user_ringbuf_success **skel_out, + struct user_ring_buffer **ringbuf_out) +{ + return load_skel_create_ringbufs(skel_out, NULL, NULL, ringbuf_out); +} + +static void manually_write_test_invalid_sample(struct user_ringbuf_success *skel, + __u32 size, __u64 producer_pos, int err) +{ + void *data_ptr; + __u64 *producer_pos_ptr; + int rb_fd, page_size = getpagesize(); + + rb_fd = bpf_map__fd(skel->maps.user_ringbuf); + + ASSERT_EQ(skel->bss->read, 0, "num_samples_before_bad_sample"); + + /* Map the producer_pos as RW. */ + producer_pos_ptr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, + MAP_SHARED, rb_fd, page_size); + ASSERT_OK_PTR(producer_pos_ptr, "producer_pos_ptr"); + + /* Map the data pages as RW. */ + data_ptr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, rb_fd, 2 * page_size); + ASSERT_OK_PTR(data_ptr, "rw_data"); + + memset(data_ptr, 0, BPF_RINGBUF_HDR_SZ); + *(__u32 *)data_ptr = size; + + /* Synchronizes with smp_load_acquire() in __bpf_user_ringbuf_peek() in the kernel. */ + smp_store_release(producer_pos_ptr, producer_pos + BPF_RINGBUF_HDR_SZ); + + drain_current_samples(); + ASSERT_EQ(skel->bss->read, 0, "num_samples_after_bad_sample"); + ASSERT_EQ(skel->bss->err, err, "err_after_bad_sample"); + + ASSERT_OK(munmap(producer_pos_ptr, page_size), "unmap_producer_pos"); + ASSERT_OK(munmap(data_ptr, page_size), "unmap_data_ptr"); +} + +static void test_user_ringbuf_post_misaligned(void) +{ + struct user_ringbuf_success *skel; + struct user_ring_buffer *ringbuf; + int err; + __u32 size = (1 << 5) + 7; + + err = load_skel_create_user_ringbuf(&skel, &ringbuf); + if (!ASSERT_OK(err, "misaligned_skel")) + return; + + manually_write_test_invalid_sample(skel, size, size, -EINVAL); + user_ring_buffer__free(ringbuf); + user_ringbuf_success__destroy(skel); +} + +static void test_user_ringbuf_post_producer_wrong_offset(void) +{ + struct user_ringbuf_success *skel; + struct user_ring_buffer *ringbuf; + int err; + __u32 size = (1 << 5); + + err = load_skel_create_user_ringbuf(&skel, &ringbuf); + if (!ASSERT_OK(err, "wrong_offset_skel")) + return; + + manually_write_test_invalid_sample(skel, size, size - 8, -EINVAL); + user_ring_buffer__free(ringbuf); + user_ringbuf_success__destroy(skel); +} + +static void test_user_ringbuf_post_larger_than_ringbuf_sz(void) +{ + struct user_ringbuf_success *skel; + struct user_ring_buffer *ringbuf; + int err; + __u32 size = c_ringbuf_size; + + err = load_skel_create_user_ringbuf(&skel, &ringbuf); + if (!ASSERT_OK(err, "huge_sample_skel")) + return; + + manually_write_test_invalid_sample(skel, size, size, -E2BIG); + user_ring_buffer__free(ringbuf); + user_ringbuf_success__destroy(skel); +} + +static void test_user_ringbuf_basic(void) +{ + struct user_ringbuf_success *skel; + struct user_ring_buffer *ringbuf; + int err; + + err = load_skel_create_user_ringbuf(&skel, &ringbuf); + if (!ASSERT_OK(err, "ringbuf_basic_skel")) + return; + + ASSERT_EQ(skel->bss->read, 0, "num_samples_read_before"); + + err = write_samples(ringbuf, 2); + if (!ASSERT_OK(err, "write_samples")) + goto cleanup; + + ASSERT_EQ(skel->bss->read, 2, "num_samples_read_after"); + +cleanup: + user_ring_buffer__free(ringbuf); + user_ringbuf_success__destroy(skel); +} + +static void test_user_ringbuf_sample_full_ring_buffer(void) +{ + struct user_ringbuf_success *skel; + struct user_ring_buffer *ringbuf; + int err; + void *sample; + + err = load_skel_create_user_ringbuf(&skel, &ringbuf); + if (!ASSERT_OK(err, "ringbuf_full_sample_skel")) + return; + + sample = user_ring_buffer__reserve(ringbuf, c_ringbuf_size - BPF_RINGBUF_HDR_SZ); + if (!ASSERT_OK_PTR(sample, "full_sample")) + goto cleanup; + + user_ring_buffer__submit(ringbuf, sample); + ASSERT_EQ(skel->bss->read, 0, "num_samples_read_before"); + drain_current_samples(); + ASSERT_EQ(skel->bss->read, 1, "num_samples_read_after"); + +cleanup: + user_ring_buffer__free(ringbuf); + user_ringbuf_success__destroy(skel); +} + +static void test_user_ringbuf_post_alignment_autoadjust(void) +{ + struct user_ringbuf_success *skel; + struct user_ring_buffer *ringbuf; + struct sample *sample; + int err; + + err = load_skel_create_user_ringbuf(&skel, &ringbuf); + if (!ASSERT_OK(err, "ringbuf_align_autoadjust_skel")) + return; + + /* libbpf should automatically round any sample up to an 8-byte alignment. */ + sample = user_ring_buffer__reserve(ringbuf, sizeof(*sample) + 1); + ASSERT_OK_PTR(sample, "reserve_autoaligned"); + user_ring_buffer__submit(ringbuf, sample); + + ASSERT_EQ(skel->bss->read, 0, "num_samples_read_before"); + drain_current_samples(); + ASSERT_EQ(skel->bss->read, 1, "num_samples_read_after"); + + user_ring_buffer__free(ringbuf); + user_ringbuf_success__destroy(skel); +} + +static void test_user_ringbuf_overfill(void) +{ + struct user_ringbuf_success *skel; + struct user_ring_buffer *ringbuf; + int err; + + err = load_skel_create_user_ringbuf(&skel, &ringbuf); + if (err) + return; + + err = write_samples(ringbuf, c_max_entries * 5); + ASSERT_ERR(err, "write_samples"); + ASSERT_EQ(skel->bss->read, c_max_entries, "max_entries"); + + user_ring_buffer__free(ringbuf); + user_ringbuf_success__destroy(skel); +} + +static void test_user_ringbuf_discards_properly_ignored(void) +{ + struct user_ringbuf_success *skel; + struct user_ring_buffer *ringbuf; + int err, num_discarded = 0; + __u64 *token; + + err = load_skel_create_user_ringbuf(&skel, &ringbuf); + if (err) + return; + + ASSERT_EQ(skel->bss->read, 0, "num_samples_read_before"); + + while (1) { + /* Write samples until the buffer is full. */ + token = user_ring_buffer__reserve(ringbuf, sizeof(*token)); + if (!token) + break; + + user_ring_buffer__discard(ringbuf, token); + num_discarded++; + } + + if (!ASSERT_GE(num_discarded, 0, "num_discarded")) + goto cleanup; + + /* Should not read any samples, as they are all discarded. */ + ASSERT_EQ(skel->bss->read, 0, "num_pre_kick"); + drain_current_samples(); + ASSERT_EQ(skel->bss->read, 0, "num_post_kick"); + + /* Now that the ring buffer has been drained, we should be able to + * reserve another token. + */ + token = user_ring_buffer__reserve(ringbuf, sizeof(*token)); + + if (!ASSERT_OK_PTR(token, "new_token")) + goto cleanup; + + user_ring_buffer__discard(ringbuf, token); +cleanup: + user_ring_buffer__free(ringbuf); + user_ringbuf_success__destroy(skel); +} + +static void test_user_ringbuf_loop(void) +{ + struct user_ringbuf_success *skel; + struct user_ring_buffer *ringbuf; + uint32_t total_samples = 8192; + uint32_t remaining_samples = total_samples; + int err; + + BUILD_BUG_ON(total_samples <= c_max_entries); + err = load_skel_create_user_ringbuf(&skel, &ringbuf); + if (err) + return; + + do { + uint32_t curr_samples; + + curr_samples = remaining_samples > c_max_entries + ? c_max_entries : remaining_samples; + err = write_samples(ringbuf, curr_samples); + if (err != 0) { + /* Assert inside of if statement to avoid flooding logs + * on the success path. + */ + ASSERT_OK(err, "write_samples"); + goto cleanup; + } + + remaining_samples -= curr_samples; + ASSERT_EQ(skel->bss->read, total_samples - remaining_samples, + "current_batched_entries"); + } while (remaining_samples > 0); + ASSERT_EQ(skel->bss->read, total_samples, "total_batched_entries"); + +cleanup: + user_ring_buffer__free(ringbuf); + user_ringbuf_success__destroy(skel); +} + +static int send_test_message(struct user_ring_buffer *ringbuf, + enum test_msg_op op, s64 operand_64, + s32 operand_32) +{ + struct test_msg *msg; + + msg = user_ring_buffer__reserve(ringbuf, sizeof(*msg)); + if (!msg) { + /* Assert on the error path to avoid spamming logs with mostly + * success messages. + */ + ASSERT_OK_PTR(msg, "reserve_msg"); + return -ENOMEM; + } + + msg->msg_op = op; + + switch (op) { + case TEST_MSG_OP_INC64: + case TEST_MSG_OP_MUL64: + msg->operand_64 = operand_64; + break; + case TEST_MSG_OP_INC32: + case TEST_MSG_OP_MUL32: + msg->operand_32 = operand_32; + break; + default: + PRINT_FAIL("Invalid operand %d\n", op); + user_ring_buffer__discard(ringbuf, msg); + return -EINVAL; + } + + user_ring_buffer__submit(ringbuf, msg); + + return 0; +} + +static void kick_kernel_read_messages(void) +{ + syscall(__NR_prctl); +} + +static int handle_kernel_msg(void *ctx, void *data, size_t len) +{ + struct user_ringbuf_success *skel = ctx; + struct test_msg *msg = data; + + switch (msg->msg_op) { + case TEST_MSG_OP_INC64: + skel->bss->user_mutated += msg->operand_64; + return 0; + case TEST_MSG_OP_INC32: + skel->bss->user_mutated += msg->operand_32; + return 0; + case TEST_MSG_OP_MUL64: + skel->bss->user_mutated *= msg->operand_64; + return 0; + case TEST_MSG_OP_MUL32: + skel->bss->user_mutated *= msg->operand_32; + return 0; + default: + fprintf(stderr, "Invalid operand %d\n", msg->msg_op); + return -EINVAL; + } +} + +static void drain_kernel_messages_buffer(struct ring_buffer *kern_ringbuf, + struct user_ringbuf_success *skel) +{ + int cnt; + + cnt = ring_buffer__consume(kern_ringbuf); + ASSERT_EQ(cnt, 8, "consume_kern_ringbuf"); + ASSERT_OK(skel->bss->err, "consume_kern_ringbuf_err"); +} + +static void test_user_ringbuf_msg_protocol(void) +{ + struct user_ringbuf_success *skel; + struct user_ring_buffer *user_ringbuf; + struct ring_buffer *kern_ringbuf; + int err, i; + __u64 expected_kern = 0; + + err = load_skel_create_ringbufs(&skel, &kern_ringbuf, handle_kernel_msg, &user_ringbuf); + if (!ASSERT_OK(err, "create_ringbufs")) + return; + + for (i = 0; i < 64; i++) { + enum test_msg_op op = i % TEST_MSG_OP_NUM_OPS; + __u64 operand_64 = TEST_OP_64; + __u32 operand_32 = TEST_OP_32; + + err = send_test_message(user_ringbuf, op, operand_64, operand_32); + if (err) { + /* Only assert on a failure to avoid spamming success logs. */ + ASSERT_OK(err, "send_test_message"); + goto cleanup; + } + + switch (op) { + case TEST_MSG_OP_INC64: + expected_kern += operand_64; + break; + case TEST_MSG_OP_INC32: + expected_kern += operand_32; + break; + case TEST_MSG_OP_MUL64: + expected_kern *= operand_64; + break; + case TEST_MSG_OP_MUL32: + expected_kern *= operand_32; + break; + default: + PRINT_FAIL("Unexpected op %d\n", op); + goto cleanup; + } + + if (i % 8 == 0) { + kick_kernel_read_messages(); + ASSERT_EQ(skel->bss->kern_mutated, expected_kern, "expected_kern"); + ASSERT_EQ(skel->bss->err, 0, "bpf_prog_err"); + drain_kernel_messages_buffer(kern_ringbuf, skel); + } + } + +cleanup: + ring_buffer__free(kern_ringbuf); + user_ring_buffer__free(user_ringbuf); + user_ringbuf_success__destroy(skel); +} + +static void *kick_kernel_cb(void *arg) +{ + /* Kick the kernel, causing it to drain the ring buffer and then wake + * up the test thread waiting on epoll. + */ + syscall(__NR_getrlimit); + + return NULL; +} + +static int spawn_kick_thread_for_poll(void) +{ + pthread_t thread; + + return pthread_create(&thread, NULL, kick_kernel_cb, NULL); +} + +static void test_user_ringbuf_blocking_reserve(void) +{ + struct user_ringbuf_success *skel; + struct user_ring_buffer *ringbuf; + int err, num_written = 0; + __u64 *token; + + err = load_skel_create_user_ringbuf(&skel, &ringbuf); + if (err) + return; + + ASSERT_EQ(skel->bss->read, 0, "num_samples_read_before"); + + while (1) { + /* Write samples until the buffer is full. */ + token = user_ring_buffer__reserve(ringbuf, sizeof(*token)); + if (!token) + break; + + *token = 0xdeadbeef; + + user_ring_buffer__submit(ringbuf, token); + num_written++; + } + + if (!ASSERT_GE(num_written, 0, "num_written")) + goto cleanup; + + /* Should not have read any samples until the kernel is kicked. */ + ASSERT_EQ(skel->bss->read, 0, "num_pre_kick"); + + /* We correctly time out after 1 second, without a sample. */ + token = user_ring_buffer__reserve_blocking(ringbuf, sizeof(*token), 1000); + if (!ASSERT_EQ(token, NULL, "pre_kick_timeout_token")) + goto cleanup; + + err = spawn_kick_thread_for_poll(); + if (!ASSERT_EQ(err, 0, "deferred_kick_thread\n")) + goto cleanup; + + /* After spawning another thread that asychronously kicks the kernel to + * drain the messages, we're able to block and successfully get a + * sample once we receive an event notification. + */ + token = user_ring_buffer__reserve_blocking(ringbuf, sizeof(*token), 10000); + + if (!ASSERT_OK_PTR(token, "block_token")) + goto cleanup; + + ASSERT_GT(skel->bss->read, 0, "num_post_kill"); + ASSERT_LE(skel->bss->read, num_written, "num_post_kill"); + ASSERT_EQ(skel->bss->err, 0, "err_post_poll"); + user_ring_buffer__discard(ringbuf, token); + +cleanup: + user_ring_buffer__free(ringbuf); + user_ringbuf_success__destroy(skel); +} + +static struct { + const char *prog_name; + const char *expected_err_msg; +} failure_tests[] = { + /* failure cases */ + {"user_ringbuf_callback_bad_access1", "negative offset dynptr_ptr ptr"}, + {"user_ringbuf_callback_bad_access2", "dereference of modified dynptr_ptr ptr"}, + {"user_ringbuf_callback_write_forbidden", "invalid mem access 'dynptr_ptr'"}, + {"user_ringbuf_callback_null_context_write", "invalid mem access 'scalar'"}, + {"user_ringbuf_callback_null_context_read", "invalid mem access 'scalar'"}, + {"user_ringbuf_callback_discard_dynptr", "arg 1 is an unacquired reference"}, + {"user_ringbuf_callback_submit_dynptr", "arg 1 is an unacquired reference"}, + {"user_ringbuf_callback_invalid_return", "At callback return the register R0 has value"}, +}; + +#define SUCCESS_TEST(_func) { _func, #_func } + +static struct { + void (*test_callback)(void); + const char *test_name; +} success_tests[] = { + SUCCESS_TEST(test_user_ringbuf_mappings), + SUCCESS_TEST(test_user_ringbuf_post_misaligned), + SUCCESS_TEST(test_user_ringbuf_post_producer_wrong_offset), + SUCCESS_TEST(test_user_ringbuf_post_larger_than_ringbuf_sz), + SUCCESS_TEST(test_user_ringbuf_basic), + SUCCESS_TEST(test_user_ringbuf_sample_full_ring_buffer), + SUCCESS_TEST(test_user_ringbuf_post_alignment_autoadjust), + SUCCESS_TEST(test_user_ringbuf_overfill), + SUCCESS_TEST(test_user_ringbuf_discards_properly_ignored), + SUCCESS_TEST(test_user_ringbuf_loop), + SUCCESS_TEST(test_user_ringbuf_msg_protocol), + SUCCESS_TEST(test_user_ringbuf_blocking_reserve), +}; + +static void verify_fail(const char *prog_name, const char *expected_err_msg) +{ + LIBBPF_OPTS(bpf_object_open_opts, opts); + struct bpf_program *prog; + struct user_ringbuf_fail *skel; + int err; + + opts.kernel_log_buf = obj_log_buf; + opts.kernel_log_size = log_buf_sz; + opts.kernel_log_level = 1; + + skel = user_ringbuf_fail__open_opts(&opts); + if (!ASSERT_OK_PTR(skel, "dynptr_fail__open_opts")) + goto cleanup; + + prog = bpf_object__find_program_by_name(skel->obj, prog_name); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto cleanup; + + bpf_program__set_autoload(prog, true); + + bpf_map__set_max_entries(skel->maps.user_ringbuf, getpagesize()); + + err = user_ringbuf_fail__load(skel); + if (!ASSERT_ERR(err, "unexpected load success")) + goto cleanup; + + if (!ASSERT_OK_PTR(strstr(obj_log_buf, expected_err_msg), "expected_err_msg")) { + fprintf(stderr, "Expected err_msg: %s\n", expected_err_msg); + fprintf(stderr, "Verifier output: %s\n", obj_log_buf); + } + +cleanup: + user_ringbuf_fail__destroy(skel); +} + +void test_user_ringbuf(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(success_tests); i++) { + if (!test__start_subtest(success_tests[i].test_name)) + continue; + + success_tests[i].test_callback(); + } + + for (i = 0; i < ARRAY_SIZE(failure_tests); i++) { + if (!test__start_subtest(failure_tests[i].prog_name)) + continue; + + verify_fail(failure_tests[i].prog_name, failure_tests[i].expected_err_msg); + } +} diff --git a/tools/testing/selftests/bpf/progs/test_user_ringbuf.h b/tools/testing/selftests/bpf/progs/test_user_ringbuf.h new file mode 100644 index 000000000000..1643b4d59ba7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_user_ringbuf.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ + +#ifndef _TEST_USER_RINGBUF_H +#define _TEST_USER_RINGBUF_H + +#define TEST_OP_64 4 +#define TEST_OP_32 2 + +enum test_msg_op { + TEST_MSG_OP_INC64, + TEST_MSG_OP_INC32, + TEST_MSG_OP_MUL64, + TEST_MSG_OP_MUL32, + + // Must come last. + TEST_MSG_OP_NUM_OPS, +}; + +struct test_msg { + enum test_msg_op msg_op; + union { + __s64 operand_64; + __s32 operand_32; + }; +}; + +struct sample { + int pid; + int seq; + long value; + char comm[16]; +}; + +#endif /* _TEST_USER_RINGBUF_H */ diff --git a/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c b/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c new file mode 100644 index 000000000000..82aba4529aa9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +struct sample { + int pid; + int seq; + long value; + char comm[16]; +}; + +struct { + __uint(type, BPF_MAP_TYPE_USER_RINGBUF); +} user_ringbuf SEC(".maps"); + +static long +bad_access1(struct bpf_dynptr *dynptr, void *context) +{ + const struct sample *sample; + + sample = bpf_dynptr_data(dynptr - 1, 0, sizeof(*sample)); + bpf_printk("Was able to pass bad pointer %lx\n", (__u64)dynptr - 1); + + return 0; +} + +/* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should + * not be able to read before the pointer. + */ +SEC("?raw_tp/sys_nanosleep") +int user_ringbuf_callback_bad_access1(void *ctx) +{ + bpf_user_ringbuf_drain(&user_ringbuf, bad_access1, NULL, 0); + + return 0; +} + +static long +bad_access2(struct bpf_dynptr *dynptr, void *context) +{ + const struct sample *sample; + + sample = bpf_dynptr_data(dynptr + 1, 0, sizeof(*sample)); + bpf_printk("Was able to pass bad pointer %lx\n", (__u64)dynptr + 1); + + return 0; +} + +/* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should + * not be able to read past the end of the pointer. + */ +SEC("?raw_tp/sys_nanosleep") +int user_ringbuf_callback_bad_access2(void *ctx) +{ + bpf_user_ringbuf_drain(&user_ringbuf, bad_access2, NULL, 0); + + return 0; +} + +static long +write_forbidden(struct bpf_dynptr *dynptr, void *context) +{ + *((long *)dynptr) = 0; + + return 0; +} + +/* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should + * not be able to write to that pointer. + */ +SEC("?raw_tp/sys_nanosleep") +int user_ringbuf_callback_write_forbidden(void *ctx) +{ + bpf_user_ringbuf_drain(&user_ringbuf, write_forbidden, NULL, 0); + + return 0; +} + +static long +null_context_write(struct bpf_dynptr *dynptr, void *context) +{ + *((__u64 *)context) = 0; + + return 0; +} + +/* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should + * not be able to write to that pointer. + */ +SEC("?raw_tp/sys_nanosleep") +int user_ringbuf_callback_null_context_write(void *ctx) +{ + bpf_user_ringbuf_drain(&user_ringbuf, null_context_write, NULL, 0); + + return 0; +} + +static long +null_context_read(struct bpf_dynptr *dynptr, void *context) +{ + __u64 id = *((__u64 *)context); + + bpf_printk("Read id %lu\n", id); + + return 0; +} + +/* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should + * not be able to write to that pointer. + */ +SEC("?raw_tp/sys_nanosleep") +int user_ringbuf_callback_null_context_read(void *ctx) +{ + bpf_user_ringbuf_drain(&user_ringbuf, null_context_read, NULL, 0); + + return 0; +} + +static long +try_discard_dynptr(struct bpf_dynptr *dynptr, void *context) +{ + bpf_ringbuf_discard_dynptr(dynptr, 0); + + return 0; +} + +/* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should + * not be able to read past the end of the pointer. + */ +SEC("?raw_tp/sys_nanosleep") +int user_ringbuf_callback_discard_dynptr(void *ctx) +{ + bpf_user_ringbuf_drain(&user_ringbuf, try_discard_dynptr, NULL, 0); + + return 0; +} + +static long +try_submit_dynptr(struct bpf_dynptr *dynptr, void *context) +{ + bpf_ringbuf_submit_dynptr(dynptr, 0); + + return 0; +} + +/* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should + * not be able to read past the end of the pointer. + */ +SEC("?raw_tp/sys_nanosleep") +int user_ringbuf_callback_submit_dynptr(void *ctx) +{ + bpf_user_ringbuf_drain(&user_ringbuf, try_submit_dynptr, NULL, 0); + + return 0; +} + +static long +invalid_drain_callback_return(struct bpf_dynptr *dynptr, void *context) +{ + return 2; +} + +/* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should + * not be able to write to that pointer. + */ +SEC("?raw_tp/sys_nanosleep") +int user_ringbuf_callback_invalid_return(void *ctx) +{ + bpf_user_ringbuf_drain(&user_ringbuf, invalid_drain_callback_return, NULL, 0); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/user_ringbuf_success.c b/tools/testing/selftests/bpf/progs/user_ringbuf_success.c new file mode 100644 index 000000000000..099c23d9aa21 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/user_ringbuf_success.c @@ -0,0 +1,218 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "bpf_misc.h" +#include "test_user_ringbuf.h" + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_USER_RINGBUF); +} user_ringbuf SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); +} kernel_ringbuf SEC(".maps"); + +/* inputs */ +int pid, err, val; + +int read = 0; + +/* Counter used for end-to-end protocol test */ +__u64 kern_mutated = 0; +__u64 user_mutated = 0; +__u64 expected_user_mutated = 0; + +static int +is_test_process(void) +{ + int cur_pid = bpf_get_current_pid_tgid() >> 32; + + return cur_pid == pid; +} + +static long +record_sample(struct bpf_dynptr *dynptr, void *context) +{ + const struct sample *sample = NULL; + struct sample stack_sample; + int status; + static int num_calls; + + if (num_calls++ % 2 == 0) { + status = bpf_dynptr_read(&stack_sample, sizeof(stack_sample), dynptr, 0, 0); + if (status) { + bpf_printk("bpf_dynptr_read() failed: %d\n", status); + err = 1; + return 0; + } + } else { + sample = bpf_dynptr_data(dynptr, 0, sizeof(*sample)); + if (!sample) { + bpf_printk("Unexpectedly failed to get sample\n"); + err = 2; + return 0; + } + stack_sample = *sample; + } + + __sync_fetch_and_add(&read, 1); + return 0; +} + +static void +handle_sample_msg(const struct test_msg *msg) +{ + switch (msg->msg_op) { + case TEST_MSG_OP_INC64: + kern_mutated += msg->operand_64; + break; + case TEST_MSG_OP_INC32: + kern_mutated += msg->operand_32; + break; + case TEST_MSG_OP_MUL64: + kern_mutated *= msg->operand_64; + break; + case TEST_MSG_OP_MUL32: + kern_mutated *= msg->operand_32; + break; + default: + bpf_printk("Unrecognized op %d\n", msg->msg_op); + err = 2; + } +} + +static long +read_protocol_msg(struct bpf_dynptr *dynptr, void *context) +{ + const struct test_msg *msg = NULL; + + msg = bpf_dynptr_data(dynptr, 0, sizeof(*msg)); + if (!msg) { + err = 1; + bpf_printk("Unexpectedly failed to get msg\n"); + return 0; + } + + handle_sample_msg(msg); + + return 0; +} + +static int publish_next_kern_msg(__u32 index, void *context) +{ + struct test_msg *msg = NULL; + int operand_64 = TEST_OP_64; + int operand_32 = TEST_OP_32; + + msg = bpf_ringbuf_reserve(&kernel_ringbuf, sizeof(*msg), 0); + if (!msg) { + err = 4; + return 1; + } + + switch (index % TEST_MSG_OP_NUM_OPS) { + case TEST_MSG_OP_INC64: + msg->operand_64 = operand_64; + msg->msg_op = TEST_MSG_OP_INC64; + expected_user_mutated += operand_64; + break; + case TEST_MSG_OP_INC32: + msg->operand_32 = operand_32; + msg->msg_op = TEST_MSG_OP_INC32; + expected_user_mutated += operand_32; + break; + case TEST_MSG_OP_MUL64: + msg->operand_64 = operand_64; + msg->msg_op = TEST_MSG_OP_MUL64; + expected_user_mutated *= operand_64; + break; + case TEST_MSG_OP_MUL32: + msg->operand_32 = operand_32; + msg->msg_op = TEST_MSG_OP_MUL32; + expected_user_mutated *= operand_32; + break; + default: + bpf_ringbuf_discard(msg, 0); + err = 5; + return 1; + } + + bpf_ringbuf_submit(msg, 0); + + return 0; +} + +static void +publish_kern_messages(void) +{ + if (expected_user_mutated != user_mutated) { + bpf_printk("%lu != %lu\n", expected_user_mutated, user_mutated); + err = 3; + return; + } + + bpf_loop(8, publish_next_kern_msg, NULL, 0); +} + +SEC("fentry/" SYS_PREFIX "sys_prctl") +int test_user_ringbuf_protocol(void *ctx) +{ + long status = 0; + struct sample *sample = NULL; + struct bpf_dynptr ptr; + + if (!is_test_process()) + return 0; + + status = bpf_user_ringbuf_drain(&user_ringbuf, read_protocol_msg, NULL, 0); + if (status < 0) { + bpf_printk("Drain returned: %ld\n", status); + err = 1; + return 0; + } + + publish_kern_messages(); + + return 0; +} + +SEC("fentry/" SYS_PREFIX "sys_getpgid") +int test_user_ringbuf(void *ctx) +{ + int status = 0; + struct sample *sample = NULL; + struct bpf_dynptr ptr; + + if (!is_test_process()) + return 0; + + err = bpf_user_ringbuf_drain(&user_ringbuf, record_sample, NULL, 0); + + return 0; +} + +static long +do_nothing_cb(struct bpf_dynptr *dynptr, void *context) +{ + __sync_fetch_and_add(&read, 1); + return 0; +} + +SEC("fentry/" SYS_PREFIX "sys_getrlimit") +int test_user_ringbuf_epoll(void *ctx) +{ + long num_samples; + + if (!is_test_process()) + return 0; + + num_samples = bpf_user_ringbuf_drain(&user_ringbuf, do_nothing_cb, NULL, 0); + if (num_samples <= 0) + err = 1; + + return 0; +}