Merge branch 'bpf-introduce-bpf-arena'

Alexei Starovoitov says:

====================
bpf: Introduce BPF arena.

From: Alexei Starovoitov <ast@kernel.org>

v2->v3:
- contains bpf bits only, but cc-ing past audience for continuity
- since prerequisite patches landed, this series focuses on the main
  functionality of bpf_arena.
- adopted Andrii's approach to support arena in libbpf.
- simplified LLVM support. Instead of two instructions it's now only one.
- switched to cond_break (instead of open coded iters) in selftests
- implemented several follow-ups that will be sent after this set
  . remember first IP and bpf insn that faulted in arena.
    report to user space via bpftool
  . copy paste and tweak glob_match() aka mini-regex as a selftests/bpf helper
- see patch 1 for detailed description of bpf_arena

v1->v2:
- Improved commit log with reasons for using vmap_pages_range() in arena.
  Thanks to Johannes
- Added support for __arena global variables in bpf programs
- Fixed race conditions spotted by Barret
- Fixed wrap32 issue spotted by Barret
- Fixed bpf_map_mmap_sz() the way Andrii suggested

The work on bpf_arena was inspired by Barret's work:
https://github.com/google/ghost-userspace/blob/main/lib/queue.bpf.h
that implements queues, lists and AVL trees completely as bpf programs
using a giant bpf array map and integer indices instead of pointers.
bpf_arena is a sparse array that allows using normal C pointers to
build such data structures. The last few patches implement a page_frag
allocator, a linked list and a hash table as bpf programs.

v1:
bpf programs have multiple options to communicate with user space:
- Various ring buffers (perf, ftrace, bpf): The data is streamed
  unidirectionally from bpf to user space.
- Hash map: The bpf program populates elements, and user space consumes
  them via bpf syscall.
- mmap()-ed array map: Libbpf creates an array map that is directly
  accessed by the bpf program and mmap-ed to user space. It's the fastest
  way. Its disadvantage is that memory for the whole array is reserved at
  the start.
====================
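
To make the quoted description concrete, here is a minimal sketch of the bpf program side, loosely modeled on the selftests added later in this series. The __arena macro and the __ksym prototype are assumptions of the sketch, not part of this excerpt:

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

/* Assumptions of this sketch: the selftest-style __arena address-space macro
 * and a __ksym declaration for the kfunc added in kernel/bpf/arena.c below.
 */
#define __arena __attribute__((address_space(1)))

void *bpf_arena_alloc_pages(void *map, void *addr, __u32 page_cnt,
			    int node_id, __u64 flags) __ksym;

struct {
	__uint(type, BPF_MAP_TYPE_ARENA);
	__uint(map_flags, BPF_F_MMAPABLE);
	__uint(max_entries, 100);	/* for ARENA maps, max_entries counts pages */
} arena SEC(".maps");

SEC("syscall")
int alloc_and_write(void *ctx)
{
	int __arena *val;

	val = bpf_arena_alloc_pages(&arena, NULL, 1, -1 /* NUMA_NO_NODE */, 0);
	if (!val)
		return 0;
	*val = 42;	/* plain C store; verifier/JIT turn it into a PROBE_MEM32 access */
	return 0;
}

char _license[] SEC("license") = "GPL";

User space can then mmap() the same arena map and read the value through a regular pointer at the matching offset; a user-space sketch follows the kernel/bpf/arena.c diff below.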

Link: https://lore.kernel.org/r/20240308010812.89848-1-alexei.starovoitov@gmail.com
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Commit 08701e306e by Andrii Nakryiko, 2024-03-11 15:37:26 -07:00
37 changed files with 2028 additions and 40 deletions

View File

@ -113,6 +113,7 @@ static int bpf_size_to_x86_bytes(int bpf_size)
/* Pick a register outside of BPF range for JIT internal work */
#define AUX_REG (MAX_BPF_JIT_REG + 1)
#define X86_REG_R9 (MAX_BPF_JIT_REG + 2)
#define X86_REG_R12 (MAX_BPF_JIT_REG + 3)
/*
* The following table maps BPF registers to x86-64 registers.
@ -139,6 +140,7 @@ static const int reg2hex[] = {
[BPF_REG_AX] = 2, /* R10 temp register */
[AUX_REG] = 3, /* R11 temp register */
[X86_REG_R9] = 1, /* R9 register, 6th function argument */
[X86_REG_R12] = 4, /* R12 callee saved */
};
static const int reg2pt_regs[] = {
@ -167,6 +169,7 @@ static bool is_ereg(u32 reg)
BIT(BPF_REG_8) |
BIT(BPF_REG_9) |
BIT(X86_REG_R9) |
BIT(X86_REG_R12) |
BIT(BPF_REG_AX));
}
@ -205,6 +208,17 @@ static u8 add_2mod(u8 byte, u32 r1, u32 r2)
return byte;
}
static u8 add_3mod(u8 byte, u32 r1, u32 r2, u32 index)
{
if (is_ereg(r1))
byte |= 1;
if (is_ereg(index))
byte |= 2;
if (is_ereg(r2))
byte |= 4;
return byte;
}
/* Encode 'dst_reg' register into x86-64 opcode 'byte' */
static u8 add_1reg(u8 byte, u32 dst_reg)
{
@ -645,6 +659,8 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
pop_r12(&prog);
} else {
pop_callee_regs(&prog, callee_regs_used);
if (bpf_arena_get_kern_vm_start(bpf_prog->aux->arena))
pop_r12(&prog);
}
EMIT1(0x58); /* pop rax */
@ -704,6 +720,8 @@ static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
pop_r12(&prog);
} else {
pop_callee_regs(&prog, callee_regs_used);
if (bpf_arena_get_kern_vm_start(bpf_prog->aux->arena))
pop_r12(&prog);
}
EMIT1(0x58); /* pop rax */
@ -887,6 +905,18 @@ static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off)
*pprog = prog;
}
static void emit_insn_suffix_SIB(u8 **pprog, u32 ptr_reg, u32 val_reg, u32 index_reg, int off)
{
u8 *prog = *pprog;
if (is_imm8(off)) {
EMIT3(add_2reg(0x44, BPF_REG_0, val_reg), add_2reg(0, ptr_reg, index_reg) /* SIB */, off);
} else {
EMIT2_off32(add_2reg(0x84, BPF_REG_0, val_reg), add_2reg(0, ptr_reg, index_reg) /* SIB */, off);
}
*pprog = prog;
}
/*
* Emit a REX byte if it will be necessary to address these registers
*/
@ -968,6 +998,37 @@ static void emit_ldsx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
*pprog = prog;
}
static void emit_ldx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off)
{
u8 *prog = *pprog;
switch (size) {
case BPF_B:
/* movzx rax, byte ptr [rax + r12 + off] */
EMIT3(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x0F, 0xB6);
break;
case BPF_H:
/* movzx rax, word ptr [rax + r12 + off] */
EMIT3(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x0F, 0xB7);
break;
case BPF_W:
/* mov eax, dword ptr [rax + r12 + off] */
EMIT2(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x8B);
break;
case BPF_DW:
/* mov rax, qword ptr [rax + r12 + off] */
EMIT2(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x8B);
break;
}
emit_insn_suffix_SIB(&prog, src_reg, dst_reg, index_reg, off);
*pprog = prog;
}
static void emit_ldx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
emit_ldx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off);
}
/* STX: *(u8*)(dst_reg + off) = src_reg */
static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
@ -1002,6 +1063,71 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
*pprog = prog;
}
/* STX: *(u8*)(dst_reg + index_reg + off) = src_reg */
static void emit_stx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off)
{
u8 *prog = *pprog;
switch (size) {
case BPF_B:
/* mov byte ptr [rax + r12 + off], al */
EMIT2(add_3mod(0x40, dst_reg, src_reg, index_reg), 0x88);
break;
case BPF_H:
/* mov word ptr [rax + r12 + off], ax */
EMIT3(0x66, add_3mod(0x40, dst_reg, src_reg, index_reg), 0x89);
break;
case BPF_W:
/* mov dword ptr [rax + r12 + 1], eax */
EMIT2(add_3mod(0x40, dst_reg, src_reg, index_reg), 0x89);
break;
case BPF_DW:
/* mov qword ptr [rax + r12 + 1], rax */
EMIT2(add_3mod(0x48, dst_reg, src_reg, index_reg), 0x89);
break;
}
emit_insn_suffix_SIB(&prog, dst_reg, src_reg, index_reg, off);
*pprog = prog;
}
static void emit_stx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
emit_stx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off);
}
/* ST: *(u8*)(dst_reg + index_reg + off) = imm32 */
static void emit_st_index(u8 **pprog, u32 size, u32 dst_reg, u32 index_reg, int off, int imm)
{
u8 *prog = *pprog;
switch (size) {
case BPF_B:
/* mov byte ptr [rax + r12 + off], imm8 */
EMIT2(add_3mod(0x40, dst_reg, 0, index_reg), 0xC6);
break;
case BPF_H:
/* mov word ptr [rax + r12 + off], imm16 */
EMIT3(0x66, add_3mod(0x40, dst_reg, 0, index_reg), 0xC7);
break;
case BPF_W:
/* mov dword ptr [rax + r12 + 1], imm32 */
EMIT2(add_3mod(0x40, dst_reg, 0, index_reg), 0xC7);
break;
case BPF_DW:
/* mov qword ptr [rax + r12 + 1], imm32 */
EMIT2(add_3mod(0x48, dst_reg, 0, index_reg), 0xC7);
break;
}
emit_insn_suffix_SIB(&prog, dst_reg, 0, index_reg, off);
EMIT(imm, bpf_size_to_x86_bytes(size));
*pprog = prog;
}
static void emit_st_r12(u8 **pprog, u32 size, u32 dst_reg, int off, int imm)
{
emit_st_index(pprog, size, dst_reg, X86_REG_R12, off, imm);
}
static int emit_atomic(u8 **pprog, u8 atomic_op,
u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size)
{
@ -1043,12 +1169,15 @@ static int emit_atomic(u8 **pprog, u8 atomic_op,
return 0;
}
#define DONT_CLEAR 1
bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs)
{
u32 reg = x->fixup >> 8;
/* jump over faulting load and clear dest register */
*(unsigned long *)((void *)regs + reg) = 0;
if (reg != DONT_CLEAR)
*(unsigned long *)((void *)regs + reg) = 0;
regs->ip += x->fixup & 0xff;
return true;
}
@ -1147,11 +1276,15 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
bool tail_call_seen = false;
bool seen_exit = false;
u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
u64 arena_vm_start, user_vm_start;
int i, excnt = 0;
int ilen, proglen = 0;
u8 *prog = temp;
int err;
arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena);
detect_reg_usage(insn, insn_cnt, callee_regs_used,
&tail_call_seen);
@ -1172,8 +1305,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
push_r12(&prog);
push_callee_regs(&prog, all_callee_regs_used);
} else {
if (arena_vm_start)
push_r12(&prog);
push_callee_regs(&prog, callee_regs_used);
}
if (arena_vm_start)
emit_mov_imm64(&prog, X86_REG_R12,
arena_vm_start >> 32, (u32) arena_vm_start);
ilen = prog - temp;
if (rw_image)
@ -1213,6 +1351,40 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
break;
case BPF_ALU64 | BPF_MOV | BPF_X:
if (insn->off == BPF_ADDR_SPACE_CAST &&
insn->imm == 1U << 16) {
if (dst_reg != src_reg)
/* 32-bit mov */
emit_mov_reg(&prog, false, dst_reg, src_reg);
/* shl dst_reg, 32 */
maybe_emit_1mod(&prog, dst_reg, true);
EMIT3(0xC1, add_1reg(0xE0, dst_reg), 32);
/* or dst_reg, user_vm_start */
maybe_emit_1mod(&prog, dst_reg, true);
if (is_axreg(dst_reg))
EMIT1_off32(0x0D, user_vm_start >> 32);
else
EMIT2_off32(0x81, add_1reg(0xC8, dst_reg), user_vm_start >> 32);
/* rol dst_reg, 32 */
maybe_emit_1mod(&prog, dst_reg, true);
EMIT3(0xC1, add_1reg(0xC0, dst_reg), 32);
/* xor r11, r11 */
EMIT3(0x4D, 0x31, 0xDB);
/* test dst_reg32, dst_reg32; check if lower 32-bit are zero */
maybe_emit_mod(&prog, dst_reg, dst_reg, false);
EMIT2(0x85, add_2reg(0xC0, dst_reg, dst_reg));
/* cmove r11, dst_reg; if so, set dst_reg to zero */
/* WARNING: Intel swapped src/dst register encoding in CMOVcc !!! */
maybe_emit_mod(&prog, AUX_REG, dst_reg, true);
EMIT3(0x0F, 0x44, add_2reg(0xC0, AUX_REG, dst_reg));
break;
}
fallthrough;
case BPF_ALU | BPF_MOV | BPF_X:
if (insn->off == 0)
emit_mov_reg(&prog,
@ -1564,6 +1736,56 @@ st: if (is_imm8(insn->off))
emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
break;
case BPF_ST | BPF_PROBE_MEM32 | BPF_B:
case BPF_ST | BPF_PROBE_MEM32 | BPF_H:
case BPF_ST | BPF_PROBE_MEM32 | BPF_W:
case BPF_ST | BPF_PROBE_MEM32 | BPF_DW:
start_of_ldx = prog;
emit_st_r12(&prog, BPF_SIZE(insn->code), dst_reg, insn->off, insn->imm);
goto populate_extable;
/* LDX: dst_reg = *(u8*)(src_reg + r12 + off) */
case BPF_LDX | BPF_PROBE_MEM32 | BPF_B:
case BPF_LDX | BPF_PROBE_MEM32 | BPF_H:
case BPF_LDX | BPF_PROBE_MEM32 | BPF_W:
case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW:
case BPF_STX | BPF_PROBE_MEM32 | BPF_B:
case BPF_STX | BPF_PROBE_MEM32 | BPF_H:
case BPF_STX | BPF_PROBE_MEM32 | BPF_W:
case BPF_STX | BPF_PROBE_MEM32 | BPF_DW:
start_of_ldx = prog;
if (BPF_CLASS(insn->code) == BPF_LDX)
emit_ldx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
else
emit_stx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
populate_extable:
{
struct exception_table_entry *ex;
u8 *_insn = image + proglen + (start_of_ldx - temp);
s64 delta;
if (!bpf_prog->aux->extable)
break;
if (excnt >= bpf_prog->aux->num_exentries) {
pr_err("mem32 extable bug\n");
return -EFAULT;
}
ex = &bpf_prog->aux->extable[excnt++];
delta = _insn - (u8 *)&ex->insn;
/* switch ex to rw buffer for writes */
ex = (void *)rw_image + ((void *)ex - (void *)image);
ex->insn = delta;
ex->data = EX_TYPE_BPF;
ex->fixup = (prog - start_of_ldx) |
((BPF_CLASS(insn->code) == BPF_LDX ? reg2pt_regs[dst_reg] : DONT_CLEAR) << 8);
}
break;
/* LDX: dst_reg = *(u8*)(src_reg + off) */
case BPF_LDX | BPF_MEM | BPF_B:
case BPF_LDX | BPF_PROBE_MEM | BPF_B:
@ -2036,6 +2258,8 @@ emit_jmp:
pop_r12(&prog);
} else {
pop_callee_regs(&prog, callee_regs_used);
if (arena_vm_start)
pop_r12(&prog);
}
EMIT1(0xC9); /* leave */
emit_return(&prog, image + addrs[i - 1] + (prog - temp));
@ -3243,6 +3467,11 @@ void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
}
}
bool bpf_jit_supports_arena(void)
{
return true;
}
bool bpf_jit_supports_ptr_xchg(void)
{
return true;
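
The shl/or/rol/cmove sequence emitted for addr_space_cast(rX, 1, 0) in the hunk above is easier to follow as plain C. This is an editorial model of its effect, not code from the patch:

/* Keep the low 32 bits (the arena offset), splice in the upper half of
 * user_vm_start, and map NULL to NULL -- the xor/test/cmove against r11
 * implements the NULL check.
 */
static u64 cast_to_user_ptr(u64 arena_ptr, u64 user_vm_start)
{
	u32 lo = (u32)arena_ptr;

	return lo ? (user_vm_start & 0xffffffff00000000ULL) | lo : 0;
}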

View File

@ -37,6 +37,7 @@ struct perf_event;
struct bpf_prog;
struct bpf_prog_aux;
struct bpf_map;
struct bpf_arena;
struct sock;
struct seq_file;
struct btf;
@ -528,8 +529,8 @@ void bpf_list_head_free(const struct btf_field *field, void *list_head,
struct bpf_spin_lock *spin_lock);
void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
struct bpf_spin_lock *spin_lock);
u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena);
u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena);
int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size);
struct bpf_offload_dev;
@ -711,6 +712,7 @@ enum bpf_arg_type {
* on eBPF program stack
*/
ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */
ARG_PTR_TO_ARENA,
ARG_CONST_SIZE, /* number of bytes accessed from memory */
ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */
@ -882,6 +884,7 @@ enum bpf_reg_type {
* an explicit null check is required for this struct.
*/
PTR_TO_MEM, /* reg points to valid memory region */
PTR_TO_ARENA,
PTR_TO_BUF, /* reg points to a read/write buffer */
PTR_TO_FUNC, /* reg points to a bpf program function */
CONST_PTR_TO_DYNPTR, /* reg points to a const struct bpf_dynptr */
@ -1457,6 +1460,7 @@ struct bpf_prog_aux {
bool xdp_has_frags;
bool exception_cb;
bool exception_boundary;
struct bpf_arena *arena;
/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
const struct btf_type *attach_func_proto;
/* function name for valid attach_btf_id */
@ -2215,6 +2219,8 @@ int generic_map_delete_batch(struct bpf_map *map,
struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
unsigned long nr_pages, struct page **page_array);
#ifdef CONFIG_MEMCG_KMEM
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
int node);

View File

@ -132,6 +132,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_ARENA, arena_map_ops)
BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)

View File

@ -548,6 +548,7 @@ struct bpf_insn_aux_data {
u32 seen; /* this insn was processed by the verifier at env->pass_cnt */
bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
bool zext_dst; /* this insn zero extends dst reg */
bool needs_zext; /* alu op needs to clear upper bits */
bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */

View File

@ -72,6 +72,9 @@ struct ctl_table_header;
/* unused opcode to mark special ldsx instruction. Same as BPF_IND */
#define BPF_PROBE_MEMSX 0x40
/* unused opcode to mark special load instruction. Same as BPF_MSH */
#define BPF_PROBE_MEM32 0xa0
/* unused opcode to mark call to interpreter with arguments */
#define BPF_CALL_ARGS 0xe0
@ -959,6 +962,7 @@ bool bpf_jit_supports_kfunc_call(void);
bool bpf_jit_supports_far_kfunc_call(void);
bool bpf_jit_supports_exceptions(void);
bool bpf_jit_supports_ptr_xchg(void);
bool bpf_jit_supports_arena(void);
void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
bool bpf_helper_changes_pkt_data(void *func);

View File

@ -1009,6 +1009,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_BLOOM_FILTER,
BPF_MAP_TYPE_USER_RINGBUF,
BPF_MAP_TYPE_CGRP_STORAGE,
BPF_MAP_TYPE_ARENA,
__MAX_BPF_MAP_TYPE
};
@ -1338,6 +1339,10 @@ enum {
*/
#define BPF_PSEUDO_KFUNC_CALL 2
enum bpf_addr_space_cast {
BPF_ADDR_SPACE_CAST = 1,
};
/* flags for BPF_MAP_UPDATE_ELEM command */
enum {
BPF_ANY = 0, /* create new element or update existing */
@ -1396,6 +1401,12 @@ enum {
/* BPF token FD is passed in a corresponding command's token_fd field */
BPF_F_TOKEN_FD = (1U << 16),
/* When user space page faults in bpf_arena send SIGSEGV instead of inserting new page */
BPF_F_SEGV_ON_FAULT = (1U << 17),
/* Do not translate kernel bpf_arena pointers to user pointers */
BPF_F_NO_USER_CONV = (1U << 18),
};
/* Flags for BPF_PROG_QUERY. */
@ -1467,6 +1478,9 @@ union bpf_attr {
* BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the
* number of hash functions (if 0, the bloom filter will default
* to using 5 hash functions).
*
* BPF_MAP_TYPE_ARENA - contains the address where user space
* is going to mmap() the arena. It has to be page aligned.
*/
__u64 map_extra;

View File

@ -15,6 +15,9 @@ obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o
ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy)
obj-$(CONFIG_BPF_SYSCALL) += arena.o
endif
obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o

kernel/bpf/arena.c (new file, 558 lines)
View File

@ -0,0 +1,558 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/btf_ids.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
/*
* bpf_arena is a sparsely populated shared memory region between bpf program and
* user space process.
*
* For example on x86-64 the values could be:
* user_vm_start 7f7d26200000 // picked by mmap()
* kern_vm_start ffffc90001e69000 // picked by get_vm_area()
* For user space all pointers within the arena are normal 8-byte addresses.
* In this example 7f7d26200000 is the address of the first page (pgoff=0).
* The bpf program will access it as: kern_vm_start + lower_32bit_of_user_ptr
* (u32)7f7d26200000 -> 26200000
* hence
* ffffc90001e69000 + 26200000 == ffffc90028069000 is "pgoff=0" within 4Gb
* kernel memory region.
*
* BPF JITs generate the following code to access arena:
* mov eax, eax // eax has lower 32-bit of user pointer
* mov word ptr [rax + r12 + off], bx
* where r12 == kern_vm_start and off is s16.
* Hence allocate 4Gb + GUARD_SZ/2 on each side.
*
* Initially kernel vm_area and user vma are not populated.
* User space can fault-in any address which will insert the page
* into kernel and user vma.
* bpf program can allocate a page via bpf_arena_alloc_pages() kfunc
* which will insert it into kernel vm_area.
* The later fault-in from user space will populate that page into user vma.
*/
/* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */
#define GUARD_SZ (1ull << sizeof(((struct bpf_insn *)0)->off) * 8)
#define KERN_VM_SZ ((1ull << 32) + GUARD_SZ)
struct bpf_arena {
struct bpf_map map;
u64 user_vm_start;
u64 user_vm_end;
struct vm_struct *kern_vm;
struct maple_tree mt;
struct list_head vma_list;
struct mutex lock;
};
u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
{
return arena ? (u64) (long) arena->kern_vm->addr + GUARD_SZ / 2 : 0;
}
u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
{
return arena ? arena->user_vm_start : 0;
}
static long arena_map_peek_elem(struct bpf_map *map, void *value)
{
return -EOPNOTSUPP;
}
static long arena_map_push_elem(struct bpf_map *map, void *value, u64 flags)
{
return -EOPNOTSUPP;
}
static long arena_map_pop_elem(struct bpf_map *map, void *value)
{
return -EOPNOTSUPP;
}
static long arena_map_delete_elem(struct bpf_map *map, void *value)
{
return -EOPNOTSUPP;
}
static int arena_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
return -EOPNOTSUPP;
}
static long compute_pgoff(struct bpf_arena *arena, long uaddr)
{
return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT;
}
static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
{
struct vm_struct *kern_vm;
int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_arena *arena;
u64 vm_range;
int err = -ENOMEM;
if (attr->key_size || attr->value_size || attr->max_entries == 0 ||
/* BPF_F_MMAPABLE must be set */
!(attr->map_flags & BPF_F_MMAPABLE) ||
/* No unsupported flags present */
(attr->map_flags & ~(BPF_F_SEGV_ON_FAULT | BPF_F_MMAPABLE | BPF_F_NO_USER_CONV)))
return ERR_PTR(-EINVAL);
if (attr->map_extra & ~PAGE_MASK)
/* If non-zero the map_extra is an expected user VMA start address */
return ERR_PTR(-EINVAL);
vm_range = (u64)attr->max_entries * PAGE_SIZE;
if (vm_range > (1ull << 32))
return ERR_PTR(-E2BIG);
if ((attr->map_extra >> 32) != ((attr->map_extra + vm_range - 1) >> 32))
/* user vma must not cross 32-bit boundary */
return ERR_PTR(-ERANGE);
kern_vm = get_vm_area(KERN_VM_SZ, VM_SPARSE | VM_USERMAP);
if (!kern_vm)
return ERR_PTR(-ENOMEM);
arena = bpf_map_area_alloc(sizeof(*arena), numa_node);
if (!arena)
goto err;
arena->kern_vm = kern_vm;
arena->user_vm_start = attr->map_extra;
if (arena->user_vm_start)
arena->user_vm_end = arena->user_vm_start + vm_range;
INIT_LIST_HEAD(&arena->vma_list);
bpf_map_init_from_attr(&arena->map, attr);
mt_init_flags(&arena->mt, MT_FLAGS_ALLOC_RANGE);
mutex_init(&arena->lock);
return &arena->map;
err:
free_vm_area(kern_vm);
return ERR_PTR(err);
}
static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
{
struct page *page;
pte_t pte;
pte = ptep_get(ptep);
if (!pte_present(pte)) /* sanity check */
return 0;
page = pte_page(pte);
/*
* We do not update pte here:
* 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
* 2. TLB flushing is batched or deferred. Even if we clear pte,
* the TLB entries can stick around and continue to permit access to
* the freed page. So it all relies on 1.
*/
__free_page(page);
return 0;
}
static void arena_map_free(struct bpf_map *map)
{
struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
/*
* Check that user vma-s are not around when bpf map is freed.
* mmap() holds vm_file which holds bpf_map refcnt.
* munmap() must have happened on vma followed by arena_vm_close()
* which would clear arena->vma_list.
*/
if (WARN_ON_ONCE(!list_empty(&arena->vma_list)))
return;
/*
* free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area().
* It unmaps everything from vmalloc area and clears pgtables.
* Call apply_to_existing_page_range() first to find populated ptes and
* free those pages.
*/
apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
free_vm_area(arena->kern_vm);
mtree_destroy(&arena->mt);
bpf_map_area_free(arena);
}
static void *arena_map_lookup_elem(struct bpf_map *map, void *key)
{
return ERR_PTR(-EINVAL);
}
static long arena_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 flags)
{
return -EOPNOTSUPP;
}
static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf,
const struct btf_type *key_type, const struct btf_type *value_type)
{
return 0;
}
static u64 arena_map_mem_usage(const struct bpf_map *map)
{
return 0;
}
struct vma_list {
struct vm_area_struct *vma;
struct list_head head;
};
static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
{
struct vma_list *vml;
vml = kmalloc(sizeof(*vml), GFP_KERNEL);
if (!vml)
return -ENOMEM;
vma->vm_private_data = vml;
vml->vma = vma;
list_add(&vml->head, &arena->vma_list);
return 0;
}
static void arena_vm_close(struct vm_area_struct *vma)
{
struct bpf_map *map = vma->vm_file->private_data;
struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
struct vma_list *vml;
guard(mutex)(&arena->lock);
vml = vma->vm_private_data;
list_del(&vml->head);
vma->vm_private_data = NULL;
kfree(vml);
}
#define MT_ENTRY ((void *)&arena_map_ops) /* unused. has to be valid pointer */
static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
{
struct bpf_map *map = vmf->vma->vm_file->private_data;
struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
struct page *page;
long kbase, kaddr;
int ret;
kbase = bpf_arena_get_kern_vm_start(arena);
kaddr = kbase + (u32)(vmf->address & PAGE_MASK);
guard(mutex)(&arena->lock);
page = vmalloc_to_page((void *)kaddr);
if (page)
/* already have a page vmap-ed */
goto out;
if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
/* User space requested to segfault when page is not allocated by bpf prog */
return VM_FAULT_SIGSEGV;
ret = mtree_insert(&arena->mt, vmf->pgoff, MT_ENTRY, GFP_KERNEL);
if (ret)
return VM_FAULT_SIGSEGV;
/* Account into memcg of the process that created bpf_arena */
ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page);
if (ret) {
mtree_erase(&arena->mt, vmf->pgoff);
return VM_FAULT_SIGSEGV;
}
ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page);
if (ret) {
mtree_erase(&arena->mt, vmf->pgoff);
__free_page(page);
return VM_FAULT_SIGSEGV;
}
out:
page_ref_add(page, 1);
vmf->page = page;
return 0;
}
static const struct vm_operations_struct arena_vm_ops = {
.close = arena_vm_close,
.fault = arena_vm_fault,
};
static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
unsigned long flags)
{
struct bpf_map *map = filp->private_data;
struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
long ret;
if (pgoff)
return -EINVAL;
if (len > (1ull << 32))
return -E2BIG;
/* if user_vm_start was specified at arena creation time */
if (arena->user_vm_start) {
if (len > arena->user_vm_end - arena->user_vm_start)
return -E2BIG;
if (len != arena->user_vm_end - arena->user_vm_start)
return -EINVAL;
if (addr != arena->user_vm_start)
return -EINVAL;
}
ret = current->mm->get_unmapped_area(filp, addr, len * 2, 0, flags);
if (IS_ERR_VALUE(ret))
return ret;
if ((ret >> 32) == ((ret + len - 1) >> 32))
return ret;
if (WARN_ON_ONCE(arena->user_vm_start))
/* checks at map creation time should prevent this */
return -EFAULT;
return round_up(ret, 1ull << 32);
}
static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
{
struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
guard(mutex)(&arena->lock);
if (arena->user_vm_start && arena->user_vm_start != vma->vm_start)
/*
* If map_extra was not specified at arena creation time then
* 1st user process can do mmap(NULL, ...) to pick user_vm_start
* 2nd user process must pass the same addr to mmap(addr, MAP_FIXED..);
* or
* specify addr in map_extra and
* use the same addr later with mmap(addr, MAP_FIXED..);
*/
return -EBUSY;
if (arena->user_vm_end && arena->user_vm_end != vma->vm_end)
/* all user processes must have the same size of mmap-ed region */
return -EBUSY;
/* Earlier checks should prevent this */
if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > (1ull << 32) || vma->vm_pgoff))
return -EFAULT;
if (remember_vma(arena, vma))
return -ENOMEM;
arena->user_vm_start = vma->vm_start;
arena->user_vm_end = vma->vm_end;
/*
* bpf_map_mmap() checks that it's being mmaped as VM_SHARED and
* clears VM_MAYEXEC. Set VM_DONTEXPAND as well to avoid
* potential change of user_vm_start.
*/
vm_flags_set(vma, VM_DONTEXPAND);
vma->vm_ops = &arena_vm_ops;
return 0;
}
static int arena_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off)
{
struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
if ((u64)off > arena->user_vm_end - arena->user_vm_start)
return -ERANGE;
*imm = (unsigned long)arena->user_vm_start;
return 0;
}
BTF_ID_LIST_SINGLE(bpf_arena_map_btf_ids, struct, bpf_arena)
const struct bpf_map_ops arena_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = arena_map_alloc,
.map_free = arena_map_free,
.map_direct_value_addr = arena_map_direct_value_addr,
.map_mmap = arena_map_mmap,
.map_get_unmapped_area = arena_get_unmapped_area,
.map_get_next_key = arena_map_get_next_key,
.map_push_elem = arena_map_push_elem,
.map_peek_elem = arena_map_peek_elem,
.map_pop_elem = arena_map_pop_elem,
.map_lookup_elem = arena_map_lookup_elem,
.map_update_elem = arena_map_update_elem,
.map_delete_elem = arena_map_delete_elem,
.map_check_btf = arena_map_check_btf,
.map_mem_usage = arena_map_mem_usage,
.map_btf_id = &bpf_arena_map_btf_ids[0],
};
static u64 clear_lo32(u64 val)
{
return val & ~(u64)~0U;
}
/*
* Allocate pages and vmap them into kernel vmalloc area.
* Later the pages will be mmaped into user space vma.
*/
static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id)
{
/* user_vm_end/start are fixed before bpf prog runs */
long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
struct page **pages;
long pgoff = 0;
u32 uaddr32;
int ret, i;
if (page_cnt > page_cnt_max)
return 0;
if (uaddr) {
if (uaddr & ~PAGE_MASK)
return 0;
pgoff = compute_pgoff(arena, uaddr);
if (pgoff + page_cnt > page_cnt_max)
/* requested address will be outside of user VMA */
return 0;
}
/* zeroing is needed, since alloc_pages_bulk_array() only fills in non-zero entries */
pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL);
if (!pages)
return 0;
guard(mutex)(&arena->lock);
if (uaddr)
ret = mtree_insert_range(&arena->mt, pgoff, pgoff + page_cnt - 1,
MT_ENTRY, GFP_KERNEL);
else
ret = mtree_alloc_range(&arena->mt, &pgoff, MT_ENTRY,
page_cnt, 0, page_cnt_max - 1, GFP_KERNEL);
if (ret)
goto out_free_pages;
ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO,
node_id, page_cnt, pages);
if (ret)
goto out;
uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE);
/* Earlier checks make sure that uaddr32 + page_cnt * PAGE_SIZE will not overflow 32-bit */
ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32,
kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages);
if (ret) {
for (i = 0; i < page_cnt; i++)
__free_page(pages[i]);
goto out;
}
kvfree(pages);
return clear_lo32(arena->user_vm_start) + uaddr32;
out:
mtree_erase(&arena->mt, pgoff);
out_free_pages:
kvfree(pages);
return 0;
}
/*
* If page is present in vmalloc area, unmap it from vmalloc area,
* unmap it from all user space vma-s,
* and free it.
*/
static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
{
struct vma_list *vml;
list_for_each_entry(vml, &arena->vma_list, head)
zap_page_range_single(vml->vma, uaddr,
PAGE_SIZE * page_cnt, NULL);
}
static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
{
u64 full_uaddr, uaddr_end;
long kaddr, pgoff, i;
struct page *page;
/* only aligned lower 32-bit are relevant */
uaddr = (u32)uaddr;
uaddr &= PAGE_MASK;
full_uaddr = clear_lo32(arena->user_vm_start) + uaddr;
uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT));
if (full_uaddr >= uaddr_end)
return;
page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT;
guard(mutex)(&arena->lock);
pgoff = compute_pgoff(arena, uaddr);
/* clear range */
mtree_store_range(&arena->mt, pgoff, pgoff + page_cnt - 1, NULL, GFP_KERNEL);
if (page_cnt > 1)
/* bulk zap if multiple pages being freed */
zap_pages(arena, full_uaddr, page_cnt);
kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr;
for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) {
page = vmalloc_to_page((void *)kaddr);
if (!page)
continue;
if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */
zap_pages(arena, full_uaddr, 1);
vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE);
__free_page(page);
}
}
__bpf_kfunc_start_defs();
__bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
int node_id, u64 flags)
{
struct bpf_map *map = p__map;
struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
return NULL;
return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id);
}
__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
{
struct bpf_map *map = p__map;
struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign)
return;
arena_free_pages(arena, (long)ptr__ign, page_cnt);
}
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(arena_kfuncs)
BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE)
BTF_KFUNCS_END(arena_kfuncs)
static const struct btf_kfunc_id_set common_kfunc_set = {
.owner = THIS_MODULE,
.set = &arena_kfuncs,
};
static int __init kfunc_init(void)
{
return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
}
late_initcall(kfunc_init);
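
Putting the constraints of arena_map_alloc() and arena_map_mmap() together, a minimal user-space sketch (assuming libbpf's low-level API; error handling elided) looks roughly like this:

#include <bpf/bpf.h>
#include <sys/mman.h>
#include <unistd.h>

int create_and_map_arena(void)
{
	const int nr_pages = 256;	/* max_entries counts pages for ARENA */
	LIBBPF_OPTS(bpf_map_create_opts, opts,
		    .map_flags = BPF_F_MMAPABLE,	/* mandatory for ARENA */
		    .map_extra = 0);			/* 0: let mmap() pick user_vm_start */
	int fd;
	void *base;

	fd = bpf_map_create(BPF_MAP_TYPE_ARENA, "arena",
			    /* key_size */ 0, /* value_size */ 0, nr_pages, &opts);
	if (fd < 0)
		return fd;

	/* Faulting pages in from user space (or bpf_arena_alloc_pages() on the
	 * bpf side) populates both the user vma and the kernel vm_area.
	 */
	base = mmap(NULL, (size_t)nr_pages * sysconf(_SC_PAGESIZE),
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	return base == MAP_FAILED ? -1 : fd;
}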

View File

@ -7111,10 +7111,11 @@ cand_cache_unlock:
}
enum btf_arg_tag {
ARG_TAG_CTX = 0x1,
ARG_TAG_NONNULL = 0x2,
ARG_TAG_TRUSTED = 0x4,
ARG_TAG_NULLABLE = 0x8,
ARG_TAG_CTX = BIT_ULL(0),
ARG_TAG_NONNULL = BIT_ULL(1),
ARG_TAG_TRUSTED = BIT_ULL(2),
ARG_TAG_NULLABLE = BIT_ULL(3),
ARG_TAG_ARENA = BIT_ULL(4),
};
/* Process BTF of a function to produce high-level expectation of function
@ -7226,6 +7227,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
tags |= ARG_TAG_NONNULL;
} else if (strcmp(tag, "nullable") == 0) {
tags |= ARG_TAG_NULLABLE;
} else if (strcmp(tag, "arena") == 0) {
tags |= ARG_TAG_ARENA;
} else {
bpf_log(log, "arg#%d has unsupported set of tags\n", i);
return -EOPNOTSUPP;
@ -7280,6 +7283,14 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
sub->args[i].btf_id = kern_type_id;
continue;
}
if (tags & ARG_TAG_ARENA) {
if (tags & ~ARG_TAG_ARENA) {
bpf_log(log, "arg#%d arena cannot be combined with any other tags\n", i);
return -EINVAL;
}
sub->args[i].arg_type = ARG_PTR_TO_ARENA;
continue;
}
if (is_global) { /* generic user data pointer */
u32 mem_size;

View File

@ -2932,6 +2932,11 @@ bool __weak bpf_jit_supports_far_kfunc_call(void)
return false;
}
bool __weak bpf_jit_supports_arena(void)
{
return false;
}
/* Return TRUE if the JIT backend satisfies the following two conditions:
* 1) JIT backend supports atomic_xchg() on pointer-sized words.
* 2) Under the specific arch, the implementation of xchg() is the same
@ -2976,6 +2981,17 @@ void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp,
{
}
/* for configs without MMU or 32-bit */
__weak const struct bpf_map_ops arena_map_ops;
__weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
{
return 0;
}
__weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
{
return 0;
}
#ifdef CONFIG_BPF_SYSCALL
static int __init bpf_global_ma_init(void)
{

View File

@ -166,6 +166,12 @@ static bool is_movsx(const struct bpf_insn *insn)
(insn->off == 8 || insn->off == 16 || insn->off == 32);
}
static bool is_addr_space_cast(const struct bpf_insn *insn)
{
return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) &&
insn->off == BPF_ADDR_SPACE_CAST;
}
void print_bpf_insn(const struct bpf_insn_cbs *cbs,
const struct bpf_insn *insn,
bool allow_ptr_leaks)
@ -184,6 +190,10 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
insn->code, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg);
} else if (is_addr_space_cast(insn)) {
verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n",
insn->code, insn->dst_reg,
insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm);
} else if (BPF_SRC(insn->code) == BPF_X) {
verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n",
insn->code, class == BPF_ALU ? 'w' : 'r',

View File

@ -458,6 +458,7 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type)
[PTR_TO_XDP_SOCK] = "xdp_sock",
[PTR_TO_BTF_ID] = "ptr_",
[PTR_TO_MEM] = "mem",
[PTR_TO_ARENA] = "arena",
[PTR_TO_BUF] = "buf",
[PTR_TO_FUNC] = "func",
[PTR_TO_MAP_KEY] = "map_key",
@ -693,6 +694,8 @@ static void print_reg_state(struct bpf_verifier_env *env,
}
verbose(env, "%s", reg_type_str(env, t));
if (t == PTR_TO_ARENA)
return;
if (t == PTR_TO_STACK) {
if (state->frameno != reg->frameno)
verbose(env, "[%d]", reg->frameno);

View File

@ -164,6 +164,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
if (bpf_map_is_offloaded(map)) {
return bpf_map_offload_update_elem(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
map->map_type == BPF_MAP_TYPE_ARENA ||
map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
return map->ops->map_update_elem(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
@ -479,6 +480,39 @@ static void bpf_map_release_memcg(struct bpf_map *map)
}
#endif
int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
unsigned long nr_pages, struct page **pages)
{
unsigned long i, j;
struct page *pg;
int ret = 0;
#ifdef CONFIG_MEMCG_KMEM
struct mem_cgroup *memcg, *old_memcg;
memcg = bpf_map_get_memcg(map);
old_memcg = set_active_memcg(memcg);
#endif
for (i = 0; i < nr_pages; i++) {
pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0);
if (pg) {
pages[i] = pg;
continue;
}
for (j = 0; j < i; j++)
__free_page(pages[j]);
ret = -ENOMEM;
break;
}
#ifdef CONFIG_MEMCG_KMEM
set_active_memcg(old_memcg);
mem_cgroup_put(memcg);
#endif
return ret;
}
static int btf_field_cmp(const void *a, const void *b)
{
const struct btf_field *f1 = a, *f2 = b;
@ -1176,6 +1210,7 @@ static int map_create(union bpf_attr *attr)
}
if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
attr->map_type != BPF_MAP_TYPE_ARENA &&
attr->map_extra != 0)
return -EINVAL;
@ -1265,6 +1300,7 @@ static int map_create(union bpf_attr *attr)
case BPF_MAP_TYPE_LRU_PERCPU_HASH:
case BPF_MAP_TYPE_STRUCT_OPS:
case BPF_MAP_TYPE_CPUMAP:
case BPF_MAP_TYPE_ARENA:
if (!bpf_token_capable(token, CAP_BPF))
goto put_token;
break;
@ -4417,6 +4453,12 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog,
continue;
}
if ((BPF_CLASS(code) == BPF_LDX || BPF_CLASS(code) == BPF_STX ||
BPF_CLASS(code) == BPF_ST) && BPF_MODE(code) == BPF_PROBE_MEM32) {
insns[i].code = BPF_CLASS(code) | BPF_SIZE(code) | BPF_MEM;
continue;
}
if (code != (BPF_LD | BPF_IMM | BPF_DW))
continue;

View File

@ -4386,6 +4386,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_MEM:
case PTR_TO_FUNC:
case PTR_TO_MAP_KEY:
case PTR_TO_ARENA:
return true;
default:
return false;
@ -5828,6 +5829,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
case PTR_TO_XDP_SOCK:
pointer_desc = "xdp_sock ";
break;
case PTR_TO_ARENA:
return 0;
default:
break;
}
@ -6937,6 +6940,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_ARENA) {
if (t == BPF_READ && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
} else {
verbose(env, "R%d invalid mem access '%s'\n", regno,
reg_type_str(env, reg->type));
@ -8408,6 +8414,7 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env,
case PTR_TO_MEM | MEM_RINGBUF:
case PTR_TO_BUF:
case PTR_TO_BUF | MEM_RDONLY:
case PTR_TO_ARENA:
case SCALAR_VALUE:
return 0;
/* All the rest must be rejected, except PTR_TO_BTF_ID which allows
@ -9372,6 +9379,18 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
bpf_log(log, "arg#%d is expected to be non-NULL\n", i);
return -EINVAL;
}
} else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) {
/*
* Can pass any value and the kernel won't crash, but
* only PTR_TO_ARENA or SCALAR make sense. Everything
* else is a bug in the bpf program. Point it out to
* the user at the verification time instead of
* run-time debug nightmare.
*/
if (reg->type != PTR_TO_ARENA && reg->type != SCALAR_VALUE) {
bpf_log(log, "R%d is not a pointer to arena or scalar.\n", regno);
return -EINVAL;
}
} else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0);
if (ret)
@ -13852,6 +13871,21 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
dst_reg = &regs[insn->dst_reg];
src_reg = NULL;
if (dst_reg->type == PTR_TO_ARENA) {
struct bpf_insn_aux_data *aux = cur_aux(env);
if (BPF_CLASS(insn->code) == BPF_ALU64)
/*
* 32-bit operations zero upper bits automatically.
* 64-bit operations need to be converted to 32.
*/
aux->needs_zext = true;
/* Any arithmetic operations are allowed on arena pointers */
return 0;
}
if (dst_reg->type != SCALAR_VALUE)
ptr_reg = dst_reg;
else
@ -13969,19 +14003,20 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
} else if (opcode == BPF_MOV) {
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0) {
verbose(env, "BPF_MOV uses reserved fields\n");
return -EINVAL;
}
if (BPF_CLASS(insn->code) == BPF_ALU) {
if (insn->off != 0 && insn->off != 8 && insn->off != 16) {
if ((insn->off != 0 && insn->off != 8 && insn->off != 16) ||
insn->imm) {
verbose(env, "BPF_MOV uses reserved fields\n");
return -EINVAL;
}
} else if (insn->off == BPF_ADDR_SPACE_CAST) {
if (insn->imm != 1 && insn->imm != 1u << 16) {
verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n");
return -EINVAL;
}
} else {
if (insn->off != 0 && insn->off != 8 && insn->off != 16 &&
insn->off != 32) {
if ((insn->off != 0 && insn->off != 8 && insn->off != 16 &&
insn->off != 32) || insn->imm) {
verbose(env, "BPF_MOV uses reserved fields\n");
return -EINVAL;
}
@ -14008,7 +14043,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
struct bpf_reg_state *dst_reg = regs + insn->dst_reg;
if (BPF_CLASS(insn->code) == BPF_ALU64) {
if (insn->off == 0) {
if (insn->imm) {
/* off == BPF_ADDR_SPACE_CAST */
mark_reg_unknown(env, regs, insn->dst_reg);
if (insn->imm == 1) /* cast from as(1) to as(0) */
dst_reg->type = PTR_TO_ARENA;
} else if (insn->off == 0) {
/* case: R1 = R2
* copy register state to dest reg
*/
@ -15182,6 +15222,10 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (insn->src_reg == BPF_PSEUDO_MAP_VALUE ||
insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) {
if (map->map_type == BPF_MAP_TYPE_ARENA) {
__mark_reg_unknown(env, dst_reg);
return 0;
}
dst_reg->type = PTR_TO_MAP_VALUE;
dst_reg->off = aux->map_off;
WARN_ON_ONCE(map->max_entries != 1);
@ -16568,6 +16612,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
* the same stack frame, since fp-8 in foo != fp-8 in bar
*/
return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno;
case PTR_TO_ARENA:
return true;
default:
return regs_exact(rold, rcur, idmap);
}
@ -17443,6 +17489,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
case PTR_TO_TCP_SOCK:
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
case PTR_TO_ARENA:
return false;
default:
return true;
@ -18108,6 +18155,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_CGRP_STORAGE:
case BPF_MAP_TYPE_QUEUE:
case BPF_MAP_TYPE_STACK:
case BPF_MAP_TYPE_ARENA:
break;
default:
verbose(env,
@ -18295,6 +18343,31 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
fdput(f);
return -EBUSY;
}
if (map->map_type == BPF_MAP_TYPE_ARENA) {
if (env->prog->aux->arena) {
verbose(env, "Only one arena per program\n");
fdput(f);
return -EBUSY;
}
if (!env->allow_ptr_leaks || !env->bpf_capable) {
verbose(env, "CAP_BPF and CAP_PERFMON are required to use arena\n");
fdput(f);
return -EPERM;
}
if (!env->prog->jit_requested) {
verbose(env, "JIT is required to use arena\n");
return -EOPNOTSUPP;
}
if (!bpf_jit_supports_arena()) {
verbose(env, "JIT doesn't support arena\n");
return -EOPNOTSUPP;
}
env->prog->aux->arena = (void *)map;
if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) {
verbose(env, "arena's user address must be set via map_extra or mmap()\n");
return -EINVAL;
}
}
fdput(f);
next_insn:
@ -18916,6 +18989,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
env->prog->aux->num_exentries++;
}
continue;
case PTR_TO_ARENA:
if (BPF_MODE(insn->code) == BPF_MEMSX) {
verbose(env, "sign extending loads from arena are not supported yet\n");
return -EOPNOTSUPP;
}
insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code);
env->prog->aux->num_exentries++;
continue;
default:
continue;
}
@ -19101,13 +19182,19 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->nr_linfo = prog->aux->nr_linfo;
func[i]->aux->jited_linfo = prog->aux->jited_linfo;
func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
func[i]->aux->arena = prog->aux->arena;
num_exentries = 0;
insn = func[i]->insnsi;
for (j = 0; j < func[i]->len; j++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
(BPF_MODE(insn->code) == BPF_PROBE_MEM ||
BPF_MODE(insn->code) == BPF_PROBE_MEM32 ||
BPF_MODE(insn->code) == BPF_PROBE_MEMSX))
num_exentries++;
if ((BPF_CLASS(insn->code) == BPF_STX ||
BPF_CLASS(insn->code) == BPF_ST) &&
BPF_MODE(insn->code) == BPF_PROBE_MEM32)
num_exentries++;
}
func[i]->aux->num_exentries = num_exentries;
func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
@ -19506,6 +19593,21 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
}
for (i = 0; i < insn_cnt;) {
if (insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->imm) {
if ((insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1) ||
(((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) {
/* convert to 32-bit mov that clears upper 32-bit */
insn->code = BPF_ALU | BPF_MOV | BPF_X;
/* clear off, so it's a normal 'wX = wY' from JIT pov */
insn->off = 0;
} /* cast from as(0) to as(1) should be handled by JIT */
goto next_insn;
}
if (env->insn_aux_data[i + delta].needs_zext)
/* Convert BPF_CLASS(insn->code) == BPF_ALU64 to 32-bit ALU */
insn->code = BPF_ALU | BPF_OP(insn->code) | BPF_SRC(insn->code);
/* Make divide-by-zero exceptions impossible. */
if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
@ -20358,6 +20460,9 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
reg->btf = bpf_get_btf_vmlinux(); /* can't fail at this point */
reg->btf_id = arg->btf_id;
reg->id = ++env->id_gen;
} else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) {
/* caller can pass either PTR_TO_ARENA or SCALAR */
mark_reg_unknown(env, regs, i);
} else {
WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n",
i - BPF_REG_1, arg->arg_type);
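
The adjust_reg_min_max_vals() change above allows any arithmetic on PTR_TO_ARENA and flags 64-bit ALU ops for conversion to their 32-bit form in do_misc_fixups(). A hedged BPF C fragment that would exercise this path (again assuming the selftest-style __arena macro):

/* Editorial example. 'p + idx' is expected to compile to a BPF_ALU64 add on a
 * PTR_TO_ARENA register; the verifier marks it needs_zext so only the low
 * 32 bits (the arena offset) survive, and the JIT adds r12 (kern_vm_start)
 * at the actual memory access.
 */
static inline void set_slot(int __arena *p, unsigned int idx, int v)
{
	p[idx] = v;
}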

View File

@ -55,7 +55,7 @@ MAP COMMANDS
| | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash**
| | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage**
| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage**
| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** }
| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena** }
DESCRIPTION
===========

View File

@ -120,6 +120,12 @@ static bool get_datasec_ident(const char *sec_name, char *buf, size_t buf_sz)
static const char *pfxs[] = { ".data", ".rodata", ".bss", ".kconfig" };
int i, n;
/* recognize hard coded LLVM section name */
if (strcmp(sec_name, ".arena.1") == 0) {
/* this is the name to use in skeleton */
snprintf(buf, buf_sz, "arena");
return true;
}
for (i = 0, n = ARRAY_SIZE(pfxs); i < n; i++) {
const char *pfx = pfxs[i];
@ -250,6 +256,13 @@ static const struct btf_type *find_type_for_map(struct btf *btf, const char *map
static bool is_mmapable_map(const struct bpf_map *map, char *buf, size_t sz)
{
size_t tmp_sz;
if (bpf_map__type(map) == BPF_MAP_TYPE_ARENA && bpf_map__initial_value(map, &tmp_sz)) {
snprintf(buf, sz, "arena");
return true;
}
if (!bpf_map__is_internal(map) || !(bpf_map__map_flags(map) & BPF_F_MMAPABLE))
return false;

View File

@ -1463,7 +1463,7 @@ static int do_help(int argc, char **argv)
" devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n"
" cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n"
" queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n"
" task_storage | bloom_filter | user_ringbuf | cgrp_storage }\n"
" task_storage | bloom_filter | user_ringbuf | cgrp_storage | arena }\n"
" " HELP_SPEC_OPTIONS " |\n"
" {-f|--bpffs} | {-n|--nomount} }\n"
"",

View File

@ -1009,6 +1009,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_BLOOM_FILTER,
BPF_MAP_TYPE_USER_RINGBUF,
BPF_MAP_TYPE_CGRP_STORAGE,
BPF_MAP_TYPE_ARENA,
__MAX_BPF_MAP_TYPE
};
@ -1338,6 +1339,10 @@ enum {
*/
#define BPF_PSEUDO_KFUNC_CALL 2
enum bpf_addr_space_cast {
BPF_ADDR_SPACE_CAST = 1,
};
/* flags for BPF_MAP_UPDATE_ELEM command */
enum {
BPF_ANY = 0, /* create new element or update existing */
@ -1396,6 +1401,12 @@ enum {
/* BPF token FD is passed in a corresponding command's token_fd field */
BPF_F_TOKEN_FD = (1U << 16),
/* When user space page faults in bpf_arena send SIGSEGV instead of inserting new page */
BPF_F_SEGV_ON_FAULT = (1U << 17),
/* Do not translate kernel bpf_arena pointers to user pointers */
BPF_F_NO_USER_CONV = (1U << 18),
};
/* Flags for BPF_PROG_QUERY. */
@ -1467,6 +1478,9 @@ union bpf_attr {
* BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the
* number of hash functions (if 0, the bloom filter will default
* to using 5 hash functions).
*
* BPF_MAP_TYPE_ARENA - contains the address where user space
* is going to mmap() the arena. It has to be page aligned.
*/
__u64 map_extra;

View File

@ -193,6 +193,7 @@ enum libbpf_tristate {
#define __arg_nonnull __attribute((btf_decl_tag("arg:nonnull")))
#define __arg_nullable __attribute((btf_decl_tag("arg:nullable")))
#define __arg_trusted __attribute((btf_decl_tag("arg:trusted")))
#define __arg_arena __attribute((btf_decl_tag("arg:arena")))
#ifndef ___bpf_concat
#define ___bpf_concat(a, b) a ## b
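
A short, hedged example of how the new __arg_arena tag is meant to be used on a global subprog; the __arena macro is an assumption borrowed from the selftest side, and the verifier changes in this set accept either PTR_TO_ARENA or a plain scalar for such an argument:

#define __arena __attribute__((address_space(1)))	/* assumption, not from this hunk */

__noinline int store_val(int __arena *slot __arg_arena, int v)
{
	if (!slot)
		return 0;
	*slot = v;	/* the dereference goes through addr_space_cast + PROBE_MEM32 */
	return 1;
}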

View File

@ -185,6 +185,7 @@ static const char * const map_type_name[] = {
[BPF_MAP_TYPE_BLOOM_FILTER] = "bloom_filter",
[BPF_MAP_TYPE_USER_RINGBUF] = "user_ringbuf",
[BPF_MAP_TYPE_CGRP_STORAGE] = "cgrp_storage",
[BPF_MAP_TYPE_ARENA] = "arena",
};
static const char * const prog_type_name[] = {
@ -497,6 +498,7 @@ struct bpf_struct_ops {
#define KSYMS_SEC ".ksyms"
#define STRUCT_OPS_SEC ".struct_ops"
#define STRUCT_OPS_LINK_SEC ".struct_ops.link"
#define ARENA_SEC ".arena.1"
enum libbpf_map_type {
LIBBPF_MAP_UNSPEC,
@ -628,6 +630,7 @@ struct elf_state {
Elf *elf;
Elf64_Ehdr *ehdr;
Elf_Data *symbols;
Elf_Data *arena_data;
size_t shstrndx; /* section index for section name strings */
size_t strtabidx;
struct elf_sec_desc *secs;
@ -637,6 +640,7 @@ struct elf_state {
int text_shndx;
int symbols_shndx;
bool has_st_ops;
int arena_data_shndx;
};
struct usdt_manager;
@ -696,6 +700,10 @@ struct bpf_object {
struct usdt_manager *usdt_man;
struct bpf_map *arena_map;
void *arena_data;
size_t arena_data_sz;
struct kern_feature_cache *feat_cache;
char *token_path;
int token_fd;
@ -1442,6 +1450,7 @@ static void bpf_object__elf_finish(struct bpf_object *obj)
elf_end(obj->efile.elf);
obj->efile.elf = NULL;
obj->efile.symbols = NULL;
obj->efile.arena_data = NULL;
zfree(&obj->efile.secs);
obj->efile.sec_cnt = 0;
@ -1684,7 +1693,7 @@ static struct bpf_map *bpf_object__add_map(struct bpf_object *obj)
return map;
}
static size_t bpf_map_mmap_sz(unsigned int value_sz, unsigned int max_entries)
static size_t array_map_mmap_sz(unsigned int value_sz, unsigned int max_entries)
{
const long page_sz = sysconf(_SC_PAGE_SIZE);
size_t map_sz;
@ -1694,6 +1703,20 @@ static size_t bpf_map_mmap_sz(unsigned int value_sz, unsigned int max_entries)
return map_sz;
}
static size_t bpf_map_mmap_sz(const struct bpf_map *map)
{
const long page_sz = sysconf(_SC_PAGE_SIZE);
switch (map->def.type) {
case BPF_MAP_TYPE_ARRAY:
return array_map_mmap_sz(map->def.value_size, map->def.max_entries);
case BPF_MAP_TYPE_ARENA:
return page_sz * map->def.max_entries;
default:
return 0; /* not supported */
}
}
static int bpf_map_mmap_resize(struct bpf_map *map, size_t old_sz, size_t new_sz)
{
void *mmaped;
@ -1836,7 +1859,7 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type,
def->value_size = data_sz;
def->max_entries = 1;
def->map_flags = type == LIBBPF_MAP_RODATA || type == LIBBPF_MAP_KCONFIG
? BPF_F_RDONLY_PROG : 0;
? BPF_F_RDONLY_PROG : 0;
/* failures are fine because of maps like .rodata.str1.1 */
(void) map_fill_btf_type_info(obj, map);
@ -1847,7 +1870,7 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type,
pr_debug("map '%s' (global data): at sec_idx %d, offset %zu, flags %x.\n",
map->name, map->sec_idx, map->sec_offset, def->map_flags);
mmap_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries);
mmap_sz = bpf_map_mmap_sz(map);
map->mmaped = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS, -1, 0);
if (map->mmaped == MAP_FAILED) {
@ -2828,6 +2851,32 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj,
return 0;
}
static int init_arena_map_data(struct bpf_object *obj, struct bpf_map *map,
const char *sec_name, int sec_idx,
void *data, size_t data_sz)
{
const long page_sz = sysconf(_SC_PAGE_SIZE);
size_t mmap_sz;
mmap_sz = bpf_map_mmap_sz(obj->arena_map);
if (roundup(data_sz, page_sz) > mmap_sz) {
pr_warn("elf: sec '%s': declared ARENA map size (%zu) is too small to hold global __arena variables of size %zu\n",
sec_name, mmap_sz, data_sz);
return -E2BIG;
}
obj->arena_data = malloc(data_sz);
if (!obj->arena_data)
return -ENOMEM;
memcpy(obj->arena_data, data, data_sz);
obj->arena_data_sz = data_sz;
/* make bpf_map__init_value() work for ARENA maps */
map->mmaped = obj->arena_data;
return 0;
}
static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict,
const char *pin_root_path)
{
@ -2877,6 +2926,33 @@ static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict,
return err;
}
for (i = 0; i < obj->nr_maps; i++) {
struct bpf_map *map = &obj->maps[i];
if (map->def.type != BPF_MAP_TYPE_ARENA)
continue;
if (obj->arena_map) {
pr_warn("map '%s': only single ARENA map is supported (map '%s' is also ARENA)\n",
map->name, obj->arena_map->name);
return -EINVAL;
}
obj->arena_map = map;
if (obj->efile.arena_data) {
err = init_arena_map_data(obj, map, ARENA_SEC, obj->efile.arena_data_shndx,
obj->efile.arena_data->d_buf,
obj->efile.arena_data->d_size);
if (err)
return err;
}
}
if (obj->efile.arena_data && !obj->arena_map) {
pr_warn("elf: sec '%s': to use global __arena variables the ARENA map should be explicitly declared in SEC(\".maps\")\n",
ARENA_SEC);
return -ENOENT;
}
return 0;
}
@ -3756,6 +3832,9 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
sec_desc->shdr = sh;
sec_desc->data = data;
obj->efile.has_st_ops = true;
} else if (strcmp(name, ARENA_SEC) == 0) {
obj->efile.arena_data = data;
obj->efile.arena_data_shndx = idx;
} else {
pr_info("elf: skipping unrecognized data section(%d) %s\n",
idx, name);
@ -4385,6 +4464,15 @@ static int bpf_program__record_reloc(struct bpf_program *prog,
type = bpf_object__section_to_libbpf_map_type(obj, shdr_idx);
sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx));
/* arena data relocation */
if (shdr_idx == obj->efile.arena_data_shndx) {
reloc_desc->type = RELO_DATA;
reloc_desc->insn_idx = insn_idx;
reloc_desc->map_idx = obj->arena_map - obj->maps;
reloc_desc->sym_off = sym->st_value;
return 0;
}
/* generic map reference relocation */
if (type == LIBBPF_MAP_UNSPEC) {
if (!bpf_object__shndx_is_maps(obj, shdr_idx)) {
@ -4925,6 +5013,7 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map)
bpf_gen__map_freeze(obj->gen_loader, map - obj->maps);
return 0;
}
err = bpf_map_update_elem(map->fd, &zero, map->mmaped, 0);
if (err) {
err = -errno;
@ -5017,6 +5106,7 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b
case BPF_MAP_TYPE_SOCKHASH:
case BPF_MAP_TYPE_QUEUE:
case BPF_MAP_TYPE_STACK:
case BPF_MAP_TYPE_ARENA:
create_attr.btf_fd = 0;
create_attr.btf_key_type_id = 0;
create_attr.btf_value_type_id = 0;
@ -5261,7 +5351,23 @@ retry:
if (err < 0)
goto err_out;
}
if (map->def.type == BPF_MAP_TYPE_ARENA) {
map->mmaped = mmap((void *)map->map_extra, bpf_map_mmap_sz(map),
PROT_READ | PROT_WRITE,
map->map_extra ? MAP_SHARED | MAP_FIXED : MAP_SHARED,
map->fd, 0);
if (map->mmaped == MAP_FAILED) {
err = -errno;
map->mmaped = NULL;
pr_warn("map '%s': failed to mmap arena: %d\n",
map->name, err);
return err;
}
if (obj->arena_data) {
memcpy(map->mmaped, obj->arena_data, obj->arena_data_sz);
zfree(&obj->arena_data);
}
}
if (map->init_slots_sz && map->def.type != BPF_MAP_TYPE_PROG_ARRAY) {
err = init_map_in_map_slots(obj, map);
if (err < 0)
@ -8758,13 +8864,9 @@ static void bpf_map__destroy(struct bpf_map *map)
zfree(&map->init_slots);
map->init_slots_sz = 0;
if (map->mmaped) {
size_t mmap_sz;
mmap_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries);
munmap(map->mmaped, mmap_sz);
map->mmaped = NULL;
}
if (map->mmaped && map->mmaped != map->obj->arena_data)
munmap(map->mmaped, bpf_map_mmap_sz(map));
map->mmaped = NULL;
if (map->st_ops) {
zfree(&map->st_ops->data);
@ -8824,6 +8926,8 @@ void bpf_object__close(struct bpf_object *obj)
if (obj->token_fd > 0)
close(obj->token_fd);
zfree(&obj->arena_data);
free(obj);
}
@ -9995,11 +10099,14 @@ int bpf_map__set_value_size(struct bpf_map *map, __u32 size)
return libbpf_err(-EBUSY);
if (map->mmaped) {
int err;
size_t mmap_old_sz, mmap_new_sz;
int err;
mmap_old_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries);
mmap_new_sz = bpf_map_mmap_sz(size, map->def.max_entries);
if (map->def.type != BPF_MAP_TYPE_ARRAY)
return -EOPNOTSUPP;
mmap_old_sz = bpf_map_mmap_sz(map);
mmap_new_sz = array_map_mmap_sz(size, map->def.max_entries);
err = bpf_map_mmap_resize(map, mmap_old_sz, mmap_new_sz);
if (err) {
pr_warn("map '%s': failed to resize memory-mapped region: %d\n",
@ -10032,18 +10139,26 @@ __u32 bpf_map__btf_value_type_id(const struct bpf_map *map)
int bpf_map__set_initial_value(struct bpf_map *map,
const void *data, size_t size)
{
size_t actual_sz;
if (map->obj->loaded || map->reused)
return libbpf_err(-EBUSY);
if (!map->mmaped || map->libbpf_type == LIBBPF_MAP_KCONFIG ||
size != map->def.value_size)
if (!map->mmaped || map->libbpf_type == LIBBPF_MAP_KCONFIG)
return libbpf_err(-EINVAL);
if (map->def.type == BPF_MAP_TYPE_ARENA)
actual_sz = map->obj->arena_data_sz;
else
actual_sz = map->def.value_size;
if (size != actual_sz)
return libbpf_err(-EINVAL);
memcpy(map->mmaped, data, size);
return 0;
}
void *bpf_map__initial_value(struct bpf_map *map, size_t *psize)
void *bpf_map__initial_value(const struct bpf_map *map, size_t *psize)
{
if (bpf_map__is_struct_ops(map)) {
if (psize)
@ -10053,7 +10168,12 @@ void *bpf_map__initial_value(struct bpf_map *map, size_t *psize)
if (!map->mmaped)
return NULL;
*psize = map->def.value_size;
if (map->def.type == BPF_MAP_TYPE_ARENA)
*psize = map->obj->arena_data_sz;
else
*psize = map->def.value_size;
return map->mmaped;
}
@ -13530,7 +13650,7 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s)
for (i = 0; i < s->map_cnt; i++) {
struct bpf_map *map = *s->maps[i].map;
size_t mmap_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries);
size_t mmap_sz = bpf_map_mmap_sz(map);
int prot, map_fd = map->fd;
void **mmaped = s->maps[i].mmaped;
@ -13542,6 +13662,11 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s)
continue;
}
if (map->def.type == BPF_MAP_TYPE_ARENA) {
*mmaped = map->mmaped;
continue;
}
if (map->def.map_flags & BPF_F_RDONLY_PROG)
prot = PROT_READ;
else

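To illustrate the user-space side of these libbpf changes: after open-and-load, an ARENA map is already mmap()-ed (at map_extra if one was given) and the skeleton exposes the region like any other mmapable map. A minimal, hypothetical sketch; the skeleton name my_prog and the map name arena are illustrative only:

#include <bpf/libbpf.h>
#include "my_prog.skel.h"	/* hypothetical skeleton with an ARENA map named 'arena' */

int use_arena(void)
{
	struct my_prog *skel;
	size_t sz;
	void *area;

	skel = my_prog__open_and_load();
	if (!skel)
		return -1;

	/* the arena was mmap()-ed during load; 'sz' reports the size of the
	 * initial __arena data copied from the ".arena.1" section, if any
	 */
	area = bpf_map__initial_value(skel->maps.arena, &sz);
	if (!area) {
		my_prog__destroy(skel);
		return -1;
	}

	/* bpf programs and user space can now exchange plain pointers into 'area' */
	my_prog__destroy(skel);
	return 0;
}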
View File

@ -1014,7 +1014,7 @@ LIBBPF_API int bpf_map__set_map_extra(struct bpf_map *map, __u64 map_extra);
LIBBPF_API int bpf_map__set_initial_value(struct bpf_map *map,
const void *data, size_t size);
LIBBPF_API void *bpf_map__initial_value(struct bpf_map *map, size_t *psize);
LIBBPF_API void *bpf_map__initial_value(const struct bpf_map *map, size_t *psize);
/**
* @brief **bpf_map__is_internal()** tells the caller whether or not the

View File

@ -338,6 +338,13 @@ static int probe_map_create(enum bpf_map_type map_type)
key_size = 0;
max_entries = 1;
break;
case BPF_MAP_TYPE_ARENA:
key_size = 0;
value_size = 0;
max_entries = 1; /* one page */
opts.map_extra = 0; /* can mmap() at any address */
opts.map_flags = BPF_F_MMAPABLE;
break;
case BPF_MAP_TYPE_HASH:
case BPF_MAP_TYPE_ARRAY:
case BPF_MAP_TYPE_PROG_ARRAY:

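Since probe_map_create() now knows about arena maps, feature detection works through the existing public probe API. A small sketch (plain libbpf calls, nothing new assumed):

#include <stdio.h>
#include <bpf/libbpf.h>

int main(void)
{
	/* 1 if the kernel can create BPF_MAP_TYPE_ARENA maps, 0 if not,
	 * negative error if probing itself failed
	 */
	int ret = libbpf_probe_bpf_map_type(BPF_MAP_TYPE_ARENA, NULL);

	printf("arena maps supported: %s\n", ret == 1 ? "yes" : "no");
	return ret < 0 ? 1 : 0;
}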
View File

@ -10,3 +10,5 @@ fill_link_info/kprobe_multi_link_info # bpf_program__attach_kprobe_mu
fill_link_info/kretprobe_multi_link_info # bpf_program__attach_kprobe_multi_opts unexpected error: -95
fill_link_info/kprobe_multi_invalid_ubuff # bpf_program__attach_kprobe_multi_opts unexpected error: -95
missed/kprobe_recursion # missed_kprobe_recursion__attach unexpected error: -95 (errno 95)
verifier_arena # JIT does not support arena
arena_htab # JIT does not support arena

View File

@ -4,3 +4,5 @@ exceptions # JIT does not support calling kfunc bpf_throw (excepti
get_stack_raw_tp # user_stack corrupted user stack (no backchain userspace)
stacktrace_build_id # compare_map_keys stackid_hmap vs. stackmap err -2 errno 2 (?)
verifier_iterating_callbacks
verifier_arena # JIT does not support arena
arena_htab # JIT does not support arena

View File

@ -0,0 +1,67 @@
/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#pragma once
#include "bpf_arena_common.h"
#ifndef __round_mask
#define __round_mask(x, y) ((__typeof__(x))((y)-1))
#endif
#ifndef round_up
#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
#endif
#ifdef __BPF__
#define NR_CPUS (sizeof(struct cpumask) * 8)
static void __arena * __arena page_frag_cur_page[NR_CPUS];
static int __arena page_frag_cur_offset[NR_CPUS];
/* Simple per-cpu page_frag allocator: objects are carved from the end of a
 * page toward its start, and the last 8 bytes of each page hold a count of
 * live objects so bpf_free() knows when the page can be released.
 */
static inline void __arena* bpf_alloc(unsigned int size)
{
__u64 __arena *obj_cnt;
__u32 cpu = bpf_get_smp_processor_id();
void __arena *page = page_frag_cur_page[cpu];
int __arena *cur_offset = &page_frag_cur_offset[cpu];
int offset;
size = round_up(size, 8);
if (size >= PAGE_SIZE - 8)
return NULL;
if (!page) {
refill:
page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
if (!page)
return NULL;
cast_kern(page);
page_frag_cur_page[cpu] = page;
*cur_offset = PAGE_SIZE - 8;
obj_cnt = page + PAGE_SIZE - 8;
*obj_cnt = 0;
} else {
cast_kern(page);
obj_cnt = page + PAGE_SIZE - 8;
}
offset = *cur_offset - size;
if (offset < 0)
goto refill;
(*obj_cnt)++;
*cur_offset = offset;
return page + offset;
}
static inline void bpf_free(void __arena *addr)
{
__u64 __arena *obj_cnt;
addr = (void __arena *)(((long)addr) & ~(PAGE_SIZE - 1));
obj_cnt = addr + PAGE_SIZE - 8;
if (--(*obj_cnt) == 0)
bpf_arena_free_pages(&arena, addr, 1);
}
#else
static inline void __arena* bpf_alloc(unsigned int size) { return NULL; }
static inline void bpf_free(void __arena *addr) {}
#endif
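
A minimal sketch of how a bpf program would use this allocator, assuming the usual vmlinux.h/bpf_helpers.h includes and an arena map named 'arena' declared in SEC(".maps") as in the selftests below; the struct and program names are illustrative:

struct foo {
	long a;
	long b;
};

SEC("syscall")
int alloc_free_example(void *ctx)
{
	/* size is rounded up to 8 bytes and carved from a per-cpu page */
	struct foo __arena *obj = bpf_alloc(sizeof(*obj));

	if (!obj)
		return 1;
	obj->a = 1;
	obj->b = 2;
	/* the backing page is returned to the arena once its live-object count hits zero */
	bpf_free(obj);
	return 0;
}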

View File

@ -0,0 +1,70 @@
/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#pragma once
#ifndef WRITE_ONCE
#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *) &(x)) = (val))
#endif
#ifndef NUMA_NO_NODE
#define NUMA_NO_NODE (-1)
#endif
#ifndef arena_container_of
#define arena_container_of(ptr, type, member) \
({ \
void __arena *__mptr = (void __arena *)(ptr); \
((type *)(__mptr - offsetof(type, member))); \
})
#endif
#ifdef __BPF__ /* when compiled as bpf program */
#ifndef PAGE_SIZE
#define PAGE_SIZE __PAGE_SIZE
/*
 * For older kernels, fall back to sizeof(struct genradix_node),
 * or use a more flexible helper:
 * static inline long __bpf_page_size(void) {
 *     return bpf_core_enum_value(enum page_size_enum___l, __PAGE_SIZE___l) ?: sizeof(struct genradix_node);
 * }
 * but the generated code is not great.
 */
#endif
#if defined(__BPF_FEATURE_ARENA_CAST) && !defined(BPF_ARENA_FORCE_ASM)
#define __arena __attribute__((address_space(1)))
#define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */
#define cast_user(ptr) /* nop for bpf prog. emitted by LLVM */
#else
#define __arena
#define cast_kern(ptr) bpf_addr_space_cast(ptr, 0, 1)
#define cast_user(ptr) bpf_addr_space_cast(ptr, 1, 0)
#endif
void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt,
int node_id, __u64 flags) __ksym __weak;
void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak;
#else /* when compiled as user space code */
#define __arena
#define __arg_arena
#define cast_kern(ptr) /* nop for user space */
#define cast_user(ptr) /* nop for user space */
__weak char arena[1];
#ifndef offsetof
#define offsetof(type, member) ((unsigned long)&((type *)0)->member)
#endif
static inline void __arena* bpf_arena_alloc_pages(void *map, void *addr, __u32 page_cnt,
int node_id, __u64 flags)
{
return NULL;
}
static inline void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt)
{
}
#endif
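
A hedged sketch of the cast_kern()/cast_user() discipline from a bpf program's point of view, again assuming an arena map named 'arena' and the usual selftest includes; with a compiler that has __BPF_FEATURE_ARENA_CAST both casts compile to nothing and LLVM emits the address-space casts itself:

void __arena *shared_for_user;	/* consumed by user space after the program runs */

SEC("syscall")
int publish_page(void *ctx)
{
	void __arena *page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);

	if (!page)
		return 1;
	cast_kern(page);		/* switch to the kernel view before dereferencing */
	*(long __arena *)page = 42;
	cast_user(page);		/* switch to the user-space view before publishing */
	shared_for_user = page;
	return 0;
}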

View File

@ -0,0 +1,100 @@
/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#pragma once
#include <errno.h>
#include "bpf_arena_alloc.h"
#include "bpf_arena_list.h"
struct htab_bucket {
struct arena_list_head head;
};
typedef struct htab_bucket __arena htab_bucket_t;
struct htab {
htab_bucket_t *buckets;
int n_buckets;
};
typedef struct htab __arena htab_t;
static inline htab_bucket_t *__select_bucket(htab_t *htab, __u32 hash)
{
htab_bucket_t *b = htab->buckets;
cast_kern(b);
return &b[hash & (htab->n_buckets - 1)];
}
static inline arena_list_head_t *select_bucket(htab_t *htab, __u32 hash)
{
return &__select_bucket(htab, hash)->head;
}
struct hashtab_elem {
int hash;
int key;
int value;
struct arena_list_node hash_node;
};
typedef struct hashtab_elem __arena hashtab_elem_t;
static hashtab_elem_t *lookup_elem_raw(arena_list_head_t *head, __u32 hash, int key)
{
hashtab_elem_t *l;
list_for_each_entry(l, head, hash_node)
if (l->hash == hash && l->key == key)
return l;
return NULL;
}
static int htab_hash(int key)
{
return key;
}
__weak int htab_lookup_elem(htab_t *htab __arg_arena, int key)
{
hashtab_elem_t *l_old;
arena_list_head_t *head;
cast_kern(htab);
head = select_bucket(htab, key);
l_old = lookup_elem_raw(head, htab_hash(key), key);
if (l_old)
return l_old->value;
return 0;
}
__weak int htab_update_elem(htab_t *htab __arg_arena, int key, int value)
{
hashtab_elem_t *l_new = NULL, *l_old;
arena_list_head_t *head;
cast_kern(htab);
head = select_bucket(htab, key);
l_old = lookup_elem_raw(head, htab_hash(key), key);
l_new = bpf_alloc(sizeof(*l_new));
if (!l_new)
return -ENOMEM;
l_new->key = key;
l_new->hash = htab_hash(key);
l_new->value = value;
list_add_head(&l_new->hash_node, head);
if (l_old) {
list_del(&l_old->hash_node);
bpf_free(l_old);
}
return 0;
}
void htab_init(htab_t *htab)
{
void __arena *buckets = bpf_arena_alloc_pages(&arena, NULL, 2, NUMA_NO_NODE, 0);
cast_user(buckets);
htab->buckets = buckets;
htab->n_buckets = 2 * PAGE_SIZE / sizeof(struct htab_bucket);
}
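
Note that this header is meant to be compiled twice: the same htab_lookup_elem() runs as part of a bpf program and, via the user-space fallbacks in bpf_arena_common.h, directly in the test binary that shares the mmap()-ed arena. A sketch of the user-space side, mirroring the arena_htab selftest below (PAGE_SIZE and the build environment come from that selftest):

#define PAGE_SIZE 4096
#include "bpf_arena_htab.h"

/* 'ht' is the pointer the bpf program published, e.g. skel->bss->htab_for_user */
static int lookup_from_user_space(struct htab *ht, int key)
{
	/* cast_kern()/cast_user() are nops here; the arena pages are visible
	 * at the same user-space addresses the bpf program stored
	 */
	return htab_lookup_elem(ht, key);
}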

View File

@ -0,0 +1,92 @@
/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#pragma once
#include "bpf_arena_common.h"
struct arena_list_node;
typedef struct arena_list_node __arena arena_list_node_t;
struct arena_list_node {
arena_list_node_t *next;
arena_list_node_t * __arena *pprev;
};
struct arena_list_head {
struct arena_list_node __arena *first;
};
typedef struct arena_list_head __arena arena_list_head_t;
#define list_entry(ptr, type, member) arena_container_of(ptr, type, member)
#define list_entry_safe(ptr, type, member) \
({ typeof(*ptr) * ___ptr = (ptr); \
___ptr ? ({ cast_kern(___ptr); list_entry(___ptr, type, member); }) : NULL; \
})
#ifndef __BPF__
static inline void *bpf_iter_num_new(struct bpf_iter_num *it, int i, int j) { return NULL; }
static inline void bpf_iter_num_destroy(struct bpf_iter_num *it) {}
static inline bool bpf_iter_num_next(struct bpf_iter_num *it) { return true; }
#define cond_break ({})
#endif
/* Safely walk linked list elements. Deletion of elements is allowed. */
#define list_for_each_entry(pos, head, member) \
for (void * ___tmp = (pos = list_entry_safe((head)->first, \
typeof(*(pos)), member), \
(void *)0); \
pos && ({ ___tmp = (void *)pos->member.next; 1; }); \
cond_break, \
pos = list_entry_safe((void __arena *)___tmp, typeof(*(pos)), member))
static inline void list_add_head(arena_list_node_t *n, arena_list_head_t *h)
{
arena_list_node_t *first = h->first, * __arena *tmp;
cast_user(first);
cast_kern(n);
WRITE_ONCE(n->next, first);
cast_kern(first);
if (first) {
tmp = &n->next;
cast_user(tmp);
WRITE_ONCE(first->pprev, tmp);
}
cast_user(n);
WRITE_ONCE(h->first, n);
tmp = &h->first;
cast_user(tmp);
cast_kern(n);
WRITE_ONCE(n->pprev, tmp);
}
static inline void __list_del(arena_list_node_t *n)
{
arena_list_node_t *next = n->next, *tmp;
arena_list_node_t * __arena *pprev = n->pprev;
cast_user(next);
cast_kern(pprev);
tmp = *pprev;
cast_kern(tmp);
WRITE_ONCE(tmp, next);
if (next) {
cast_user(pprev);
cast_kern(next);
WRITE_ONCE(next->pprev, pprev);
}
}
#define POISON_POINTER_DELTA 0
#define LIST_POISON1 ((void __arena *) 0x100 + POISON_POINTER_DELTA)
#define LIST_POISON2 ((void __arena *) 0x122 + POISON_POINTER_DELTA)
static inline void list_del(arena_list_node_t *n)
{
__list_del(n);
n->next = LIST_POISON1;
n->pprev = LIST_POISON2;
}
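
A compressed bpf-side sketch of the list API, assuming a compiler with __BPF_FEATURE_ARENA_CAST, an arena map named 'arena', plus bpf_arena_alloc.h and bpf_experimental.h (for bpf_alloc() and cond_break); the full version is the arena_list selftest further below:

struct item {
	struct arena_list_node node;
	__u64 value;
};

struct arena_list_head __arena global_list;

SEC("syscall")
int list_example(void *ctx)
{
	struct item __arena *it = bpf_alloc(sizeof(*it));

	if (!it)
		return 1;
	it->value = 42;
	list_add_head(&it->node, &global_list);

	/* deleting the current element while walking is allowed */
	list_for_each_entry(it, &global_list, node)
		if (it->value == 42)
			list_del(&it->node);
	return 0;
}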

View File

@ -343,6 +343,49 @@ l_true: \
asm volatile("%[reg]=%[reg]"::[reg]"r"((short)var))
#endif
/* emit instruction:
* rX = rX .off = BPF_ADDR_SPACE_CAST .imm32 = (dst_as << 16) | src_as
*/
#ifndef bpf_addr_space_cast
#define bpf_addr_space_cast(var, dst_as, src_as)\
asm volatile(".byte 0xBF; \
.ifc %[reg], r0; \
.byte 0x00; \
.endif; \
.ifc %[reg], r1; \
.byte 0x11; \
.endif; \
.ifc %[reg], r2; \
.byte 0x22; \
.endif; \
.ifc %[reg], r3; \
.byte 0x33; \
.endif; \
.ifc %[reg], r4; \
.byte 0x44; \
.endif; \
.ifc %[reg], r5; \
.byte 0x55; \
.endif; \
.ifc %[reg], r6; \
.byte 0x66; \
.endif; \
.ifc %[reg], r7; \
.byte 0x77; \
.endif; \
.ifc %[reg], r8; \
.byte 0x88; \
.endif; \
.ifc %[reg], r9; \
.byte 0x99; \
.endif; \
.short %[off]; \
.long %[as]" \
: [reg]"+r"(var) \
: [off]"i"(BPF_ADDR_SPACE_CAST) \
, [as]"i"((dst_as << 16) | src_as));
#endif
/* Description
* Assert that a conditional expression is true.
* Returns

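For context, bpf_arena_common.h open-codes its casts with this macro when the compiler lacks __BPF_FEATURE_ARENA_CAST (so __arena expands to nothing): cast_kern(p) is bpf_addr_space_cast(p, 0, 1) and cast_user(p) is bpf_addr_space_cast(p, 1, 0). A minimal sketch for that case, assuming an arena map named 'arena' as in the selftests:

SEC("syscall")
int addr_space_cast_example(void *ctx)
{
	void __arena *p = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);

	if (!p)
		return 1;
	bpf_addr_space_cast(p, 0, 1);	/* to the kernel view, as cast_kern() does */
	*(char __arena *)p = 1;
	bpf_addr_space_cast(p, 1, 0);	/* back to the user-space view, as cast_user() does */
	return 0;
}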
View File

@ -0,0 +1,88 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#include <test_progs.h>
#include <sys/mman.h>
#include <network_helpers.h>
#include "arena_htab_asm.skel.h"
#include "arena_htab.skel.h"
#define PAGE_SIZE 4096
#include "bpf_arena_htab.h"
static void test_arena_htab_common(struct htab *htab)
{
int i;
printf("htab %p buckets %p n_buckets %d\n", htab, htab->buckets, htab->n_buckets);
ASSERT_OK_PTR(htab->buckets, "htab->buckets shouldn't be NULL");
for (i = 0; htab->buckets && i < 16; i += 4) {
/*
 * Walk htab buckets and linked lists directly from user space;
 * all pointers are valid even though they were written by the bpf program.
 */
int val = htab_lookup_elem(htab, i);
ASSERT_EQ(i, val, "key == value");
}
}
static void test_arena_htab_llvm(void)
{
LIBBPF_OPTS(bpf_test_run_opts, opts);
struct arena_htab *skel;
struct htab *htab;
size_t arena_sz;
void *area;
int ret;
skel = arena_htab__open_and_load();
if (!ASSERT_OK_PTR(skel, "arena_htab__open_and_load"))
return;
area = bpf_map__initial_value(skel->maps.arena, &arena_sz);
/* fault-in a page with pgoff == 0 as sanity check */
*(volatile int *)area = 0x55aa;
/* bpf prog will allocate more pages */
ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_htab_llvm), &opts);
ASSERT_OK(ret, "ret");
ASSERT_OK(opts.retval, "retval");
if (skel->bss->skip) {
printf("%s:SKIP:compiler doesn't support arena_cast\n", __func__);
test__skip();
goto out;
}
htab = skel->bss->htab_for_user;
test_arena_htab_common(htab);
out:
arena_htab__destroy(skel);
}
static void test_arena_htab_asm(void)
{
LIBBPF_OPTS(bpf_test_run_opts, opts);
struct arena_htab_asm *skel;
struct htab *htab;
int ret;
skel = arena_htab_asm__open_and_load();
if (!ASSERT_OK_PTR(skel, "arena_htab_asm__open_and_load"))
return;
ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_htab_asm), &opts);
ASSERT_OK(ret, "ret");
ASSERT_OK(opts.retval, "retval");
htab = skel->bss->htab_for_user;
test_arena_htab_common(htab);
arena_htab_asm__destroy(skel);
}
void test_arena_htab(void)
{
if (test__start_subtest("arena_htab_llvm"))
test_arena_htab_llvm();
if (test__start_subtest("arena_htab_asm"))
test_arena_htab_asm();
}

View File

@ -0,0 +1,68 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#include <test_progs.h>
#include <sys/mman.h>
#include <network_helpers.h>
#define PAGE_SIZE 4096
#include "bpf_arena_list.h"
#include "arena_list.skel.h"
struct elem {
struct arena_list_node node;
__u64 value;
};
static int list_sum(struct arena_list_head *head)
{
struct elem __arena *n;
int sum = 0;
list_for_each_entry(n, head, node)
sum += n->value;
return sum;
}
static void test_arena_list_add_del(int cnt)
{
LIBBPF_OPTS(bpf_test_run_opts, opts);
struct arena_list *skel;
int expected_sum = (u64)cnt * (cnt - 1) / 2;
int ret, sum;
skel = arena_list__open_and_load();
if (!ASSERT_OK_PTR(skel, "arena_list__open_and_load"))
return;
skel->bss->cnt = cnt;
ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_list_add), &opts);
ASSERT_OK(ret, "ret_add");
ASSERT_OK(opts.retval, "retval");
if (skel->bss->skip) {
printf("%s:SKIP:compiler doesn't support arena_cast\n", __func__);
test__skip();
goto out;
}
sum = list_sum(skel->bss->list_head);
ASSERT_EQ(sum, expected_sum, "sum of elems");
ASSERT_EQ(skel->arena->arena_sum, expected_sum, "__arena sum of elems");
ASSERT_EQ(skel->arena->test_val, cnt + 1, "num of elems");
ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_list_del), &opts);
ASSERT_OK(ret, "ret_del");
sum = list_sum(skel->bss->list_head);
ASSERT_EQ(sum, 0, "sum of list elems after del");
ASSERT_EQ(skel->bss->list_sum, expected_sum, "sum of list elems computed by prog");
ASSERT_EQ(skel->arena->arena_sum, expected_sum, "__arena sum of elems");
out:
arena_list__destroy(skel);
}
void test_arena_list(void)
{
if (test__start_subtest("arena_list_1"))
test_arena_list_add_del(1);
if (test__start_subtest("arena_list_1000"))
test_arena_list_add_del(1000);
}

View File

@ -4,6 +4,7 @@
#include "cap_helpers.h"
#include "verifier_and.skel.h"
#include "verifier_arena.skel.h"
#include "verifier_array_access.skel.h"
#include "verifier_basic_stack.skel.h"
#include "verifier_bitfield_write.skel.h"
@ -118,6 +119,7 @@ static void run_tests_aux(const char *skel_name,
#define RUN(skel) run_tests_aux(#skel, skel##__elf_bytes, NULL)
void test_verifier_and(void) { RUN(verifier_and); }
void test_verifier_arena(void) { RUN(verifier_arena); }
void test_verifier_basic_stack(void) { RUN(verifier_basic_stack); }
void test_verifier_bitfield_write(void) { RUN(verifier_bitfield_write); }
void test_verifier_bounds(void) { RUN(verifier_bounds); }

View File

@ -0,0 +1,48 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
#include "bpf_experimental.h"
struct {
__uint(type, BPF_MAP_TYPE_ARENA);
__uint(map_flags, BPF_F_MMAPABLE);
__uint(max_entries, 100); /* number of pages */
} arena SEC(".maps");
#include "bpf_arena_htab.h"
void __arena *htab_for_user;
bool skip = false;
int zero = 0;
SEC("syscall")
int arena_htab_llvm(void *ctx)
{
#if defined(__BPF_FEATURE_ARENA_CAST) || defined(BPF_ARENA_FORCE_ASM)
struct htab __arena *htab;
__u64 i;
htab = bpf_alloc(sizeof(*htab));
cast_kern(htab);
htab_init(htab);
/* first run. No old elems in the table */
for (i = zero; i < 1000; i++)
htab_update_elem(htab, i, i);
/* should replace all elems with new ones */
for (i = zero; i < 1000; i++)
htab_update_elem(htab, i, i);
cast_user(htab);
htab_for_user = htab;
#else
skip = true;
#endif
return 0;
}
char _license[] SEC("license") = "GPL";

View File

@ -0,0 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#define BPF_ARENA_FORCE_ASM
#define arena_htab_llvm arena_htab_asm
#include "arena_htab.c"

View File

@ -0,0 +1,87 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
#include "bpf_experimental.h"
struct {
__uint(type, BPF_MAP_TYPE_ARENA);
__uint(map_flags, BPF_F_MMAPABLE);
__uint(max_entries, 100); /* number of pages */
#ifdef __TARGET_ARCH_arm64
__ulong(map_extra, 0x1ull << 32); /* start of mmap() region */
#else
__ulong(map_extra, 0x1ull << 44); /* start of mmap() region */
#endif
} arena SEC(".maps");
#include "bpf_arena_alloc.h"
#include "bpf_arena_list.h"
struct elem {
struct arena_list_node node;
__u64 value;
};
struct arena_list_head __arena *list_head;
int list_sum;
int cnt;
bool skip = false;
#ifdef __BPF_FEATURE_ARENA_CAST
long __arena arena_sum;
int __arena test_val = 1;
struct arena_list_head __arena global_head;
#else
long arena_sum SEC(".arena.1");
int test_val SEC(".arena.1");
#endif
int zero;
SEC("syscall")
int arena_list_add(void *ctx)
{
#ifdef __BPF_FEATURE_ARENA_CAST
__u64 i;
list_head = &global_head;
for (i = zero; i < cnt; cond_break, i++) {
struct elem __arena *n = bpf_alloc(sizeof(*n));
test_val++;
n->value = i;
arena_sum += i;
list_add_head(&n->node, list_head);
}
#else
skip = true;
#endif
return 0;
}
SEC("syscall")
int arena_list_del(void *ctx)
{
#ifdef __BPF_FEATURE_ARENA_CAST
struct elem __arena *n;
int sum = 0;
arena_sum = 0;
list_for_each_entry(n, list_head, node) {
sum += n->value;
arena_sum += n->value;
list_del(&n->node);
bpf_free(n);
}
list_sum = sum;
#else
skip = true;
#endif
return 0;
}
char _license[] SEC("license") = "GPL";

View File

@ -0,0 +1,146 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "bpf_misc.h"
#include "bpf_experimental.h"
#include "bpf_arena_common.h"
struct {
__uint(type, BPF_MAP_TYPE_ARENA);
__uint(map_flags, BPF_F_MMAPABLE);
__uint(max_entries, 2); /* arena of two pages close to 32-bit boundary */
__ulong(map_extra, (1ull << 44) | (~0u - __PAGE_SIZE * 2 + 1)); /* start of mmap() region */
} arena SEC(".maps");
SEC("syscall")
__success __retval(0)
int basic_alloc1(void *ctx)
{
#if defined(__BPF_FEATURE_ARENA_CAST)
volatile int __arena *page1, *page2, *no_page, *page3;
page1 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
if (!page1)
return 1;
*page1 = 1;
page2 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
if (!page2)
return 2;
*page2 = 2;
no_page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
if (no_page)
return 3;
if (*page1 != 1)
return 4;
if (*page2 != 2)
return 5;
bpf_arena_free_pages(&arena, (void __arena *)page2, 1);
if (*page1 != 1)
return 6;
if (*page2 != 0) /* use-after-free should return 0 */
return 7;
page3 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
if (!page3)
return 8;
*page3 = 3;
if (page2 != page3)
return 9;
if (*page1 != 1)
return 10;
#endif
return 0;
}
SEC("syscall")
__success __retval(0)
int basic_alloc2(void *ctx)
{
#if defined(__BPF_FEATURE_ARENA_CAST)
volatile char __arena *page1, *page2, *page3, *page4;
page1 = bpf_arena_alloc_pages(&arena, NULL, 2, NUMA_NO_NODE, 0);
if (!page1)
return 1;
page2 = page1 + __PAGE_SIZE;
page3 = page1 + __PAGE_SIZE * 2;
page4 = page1 - __PAGE_SIZE;
*page1 = 1;
*page2 = 2;
*page3 = 3;
*page4 = 4;
if (*page1 != 1)
return 1;
if (*page2 != 2)
return 2;
if (*page3 != 0)
return 3;
if (*page4 != 0)
return 4;
bpf_arena_free_pages(&arena, (void __arena *)page1, 2);
if (*page1 != 0)
return 5;
if (*page2 != 0)
return 6;
if (*page3 != 0)
return 7;
if (*page4 != 0)
return 8;
#endif
return 0;
}
struct bpf_arena___l {
struct bpf_map map;
} __attribute__((preserve_access_index));
SEC("syscall")
__success __retval(0) __log_level(2)
int basic_alloc3(void *ctx)
{
struct bpf_arena___l *ar = (struct bpf_arena___l *)&arena;
volatile char __arena *pages;
pages = bpf_arena_alloc_pages(&ar->map, NULL, ar->map.max_entries, NUMA_NO_NODE, 0);
if (!pages)
return 1;
return 0;
}
SEC("iter.s/bpf_map")
__success __log_level(2)
int iter_maps1(struct bpf_iter__bpf_map *ctx)
{
struct bpf_map *map = ctx->map;
if (!map)
return 0;
bpf_arena_alloc_pages(map, NULL, map->max_entries, 0, 0);
return 0;
}
SEC("iter.s/bpf_map")
__failure __msg("expected pointer to STRUCT bpf_map")
int iter_maps2(struct bpf_iter__bpf_map *ctx)
{
struct seq_file *seq = ctx->meta->seq;
bpf_arena_alloc_pages((void *)seq, NULL, 1, 0, 0);
return 0;
}
SEC("iter.s/bpf_map")
__failure __msg("untrusted_ptr_bpf_map")
int iter_maps3(struct bpf_iter__bpf_map *ctx)
{
struct bpf_map *map = ctx->map;
if (!map)
return 0;
bpf_arena_alloc_pages(map->inner_map_meta, NULL, map->max_entries, 0, 0);
return 0;
}
char _license[] SEC("license") = "GPL";

View File

@ -501,7 +501,7 @@ static bool is_unpriv_capable_map(struct bpf_map *map)
}
}
static int do_prog_test_run(int fd_prog, int *retval)
static int do_prog_test_run(int fd_prog, int *retval, bool empty_opts)
{
__u8 tmp_out[TEST_DATA_LEN << 2] = {};
__u8 tmp_in[TEST_DATA_LEN] = {};
@ -514,6 +514,10 @@ static int do_prog_test_run(int fd_prog, int *retval)
.repeat = 1,
);
if (empty_opts) {
memset(&topts, 0, sizeof(struct bpf_test_run_opts));
topts.sz = sizeof(struct bpf_test_run_opts);
}
err = bpf_prog_test_run_opts(fd_prog, &topts);
saved_errno = errno;
@ -649,7 +653,8 @@ void run_subtest(struct test_loader *tester,
}
}
do_prog_test_run(bpf_program__fd(tprog), &retval);
do_prog_test_run(bpf_program__fd(tprog), &retval,
bpf_program__type(tprog) == BPF_PROG_TYPE_SYSCALL ? true : false);
if (retval != subspec->retval && subspec->retval != POINTER_VALUE) {
PRINT_FAIL("Unexpected retval: %d != %d\n", retval, subspec->retval);
goto tobj_cleanup;