diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6fddb63271d9..e0e0d00cf103 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -159,6 +159,7 @@ config X86 select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE + select HAVE_ARCH_HUGE_VMALLOC if HAVE_ARCH_HUGE_VMAP select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN if X86_64 diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index b7421780e4e9..4cc18ba1b75e 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -44,6 +44,7 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len); extern void *text_poke(void *addr, const void *opcode, size_t len); extern void text_poke_sync(void); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); +extern void *text_poke_copy(void *addr, const void *opcode, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 5007c3ffe96f..018b61febf0e 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1102,6 +1102,40 @@ void *text_poke_kgdb(void *addr, const void *opcode, size_t len) return __text_poke(addr, opcode, len); } +/** + * text_poke_copy - Copy instructions into (an unused part of) RX memory + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy, could be more than 2x PAGE_SIZE + * + * Not safe against concurrent execution; useful for JITs to dump + * new code blocks into unused regions of RX memory. Can be used in + * conjunction with synchronize_rcu_tasks() to wait for existing + * execution to quiesce after having made sure no existing functions + * pointers are live. + */ +void *text_poke_copy(void *addr, const void *opcode, size_t len) +{ + unsigned long start = (unsigned long)addr; + size_t patched = 0; + + if (WARN_ON_ONCE(core_kernel_text(start))) + return NULL; + + mutex_lock(&text_mutex); + while (patched < len) { + unsigned long ptr = start + patched; + size_t s; + + s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); + + __text_poke((void *)ptr, opcode + patched, s); + patched += s; + } + mutex_unlock(&text_mutex); + return addr; +} + static void do_sync_core(void *info) { sync_core(); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 36f6fc3e6e69..643f38b91e30 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -330,8 +330,7 @@ static int emit_jump(u8 **pprog, void *func, void *ip) } static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, - void *old_addr, void *new_addr, - const bool text_live) + void *old_addr, void *new_addr) { const u8 *nop_insn = x86_nops[5]; u8 old_insn[X86_PATCH_SIZE]; @@ -365,10 +364,7 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, goto out; ret = 1; if (memcmp(ip, new_insn, X86_PATCH_SIZE)) { - if (text_live) - text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); - else - memcpy(ip, new_insn, X86_PATCH_SIZE); + text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); ret = 0; } out: @@ -384,7 +380,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, /* BPF poking in modules is not supported */ return -EINVAL; - return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true); + return __bpf_arch_text_poke(ip, t, old_addr, new_addr); } #define EMIT_LFENCE() EMIT3(0x0F, 0xAE, 0xE8) @@ -558,24 +554,15 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) mutex_lock(&array->aux->poke_mutex); target = array->ptrs[poke->tail_call.key]; if (target) { - /* Plain memcpy is used when image is not live yet - * and still not locked as read-only. Once poke - * location is active (poke->tailcall_target_stable), - * any parallel bpf_arch_text_poke() might occur - * still on the read-write image until we finally - * locked it as read-only. Both modifications on - * the given image are under text_mutex to avoid - * interference. - */ ret = __bpf_arch_text_poke(poke->tailcall_target, BPF_MOD_JUMP, NULL, (u8 *)target->bpf_func + - poke->adj_off, false); + poke->adj_off); BUG_ON(ret < 0); ret = __bpf_arch_text_poke(poke->tailcall_bypass, BPF_MOD_JUMP, (u8 *)poke->tailcall_target + - X86_PATCH_SIZE, NULL, false); + X86_PATCH_SIZE, NULL); BUG_ON(ret < 0); } WRITE_ONCE(poke->tailcall_target_stable, true); @@ -866,7 +853,7 @@ static void emit_nops(u8 **pprog, int len) #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp))) -static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, +static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image, int oldproglen, struct jit_context *ctx, bool jmp_padding) { bool tail_call_reachable = bpf_prog->aux->tail_call_reachable; @@ -893,8 +880,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, push_callee_regs(&prog, callee_regs_used); ilen = prog - temp; - if (image) - memcpy(image + proglen, temp, ilen); + if (rw_image) + memcpy(rw_image + proglen, temp, ilen); proglen += ilen; addrs[0] = proglen; prog = temp; @@ -1323,6 +1310,9 @@ st: if (is_imm8(insn->off)) pr_err("extable->insn doesn't fit into 32-bit\n"); return -EFAULT; } + /* switch ex to rw buffer for writes */ + ex = (void *)rw_image + ((void *)ex - (void *)image); + ex->insn = delta; ex->data = EX_TYPE_BPF; @@ -1705,7 +1695,7 @@ emit_jmp: pr_err("bpf_jit: fatal error\n"); return -EFAULT; } - memcpy(image + proglen, temp, ilen); + memcpy(rw_image + proglen, temp, ilen); } proglen += ilen; addrs[i] = proglen; @@ -2246,6 +2236,7 @@ int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs) } struct x64_jit_data { + struct bpf_binary_header *rw_header; struct bpf_binary_header *header; int *addrs; u8 *image; @@ -2258,6 +2249,7 @@ struct x64_jit_data { struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { + struct bpf_binary_header *rw_header = NULL; struct bpf_binary_header *header = NULL; struct bpf_prog *tmp, *orig_prog = prog; struct x64_jit_data *jit_data; @@ -2266,6 +2258,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) bool tmp_blinded = false; bool extra_pass = false; bool padding = false; + u8 *rw_image = NULL; u8 *image = NULL; int *addrs; int pass; @@ -2301,6 +2294,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) oldproglen = jit_data->proglen; image = jit_data->image; header = jit_data->header; + rw_header = jit_data->rw_header; + rw_image = (void *)rw_header + ((void *)image - (void *)header); extra_pass = true; padding = true; goto skip_init_addrs; @@ -2331,12 +2326,12 @@ skip_init_addrs: for (pass = 0; pass < MAX_PASSES || image; pass++) { if (!padding && pass >= PADDING_PASSES) padding = true; - proglen = do_jit(prog, addrs, image, oldproglen, &ctx, padding); + proglen = do_jit(prog, addrs, image, rw_image, oldproglen, &ctx, padding); if (proglen <= 0) { out_image: image = NULL; if (header) - bpf_jit_binary_free(header); + bpf_jit_binary_pack_free(header, rw_header); prog = orig_prog; goto out_addrs; } @@ -2360,8 +2355,9 @@ out_image: sizeof(struct exception_table_entry); /* allocate module memory for x86 insns and extable */ - header = bpf_jit_binary_alloc(roundup(proglen, align) + extable_size, - &image, align, jit_fill_hole); + header = bpf_jit_binary_pack_alloc(roundup(proglen, align) + extable_size, + &image, align, &rw_header, &rw_image, + jit_fill_hole); if (!header) { prog = orig_prog; goto out_addrs; @@ -2377,14 +2373,22 @@ out_image: if (image) { if (!prog->is_func || extra_pass) { + /* + * bpf_jit_binary_pack_finalize fails in two scenarios: + * 1) header is not pointing to proper module memory; + * 2) the arch doesn't support bpf_arch_text_copy(). + * + * Both cases are serious bugs that we should not continue. + */ + BUG_ON(bpf_jit_binary_pack_finalize(prog, header, rw_header)); bpf_tail_call_direct_fixup(prog); - bpf_jit_binary_lock_ro(header); } else { jit_data->addrs = addrs; jit_data->ctx = ctx; jit_data->proglen = proglen; jit_data->image = image; jit_data->header = header; + jit_data->rw_header = rw_header; } prog->bpf_func = (void *)image; prog->jited = 1; @@ -2412,3 +2416,10 @@ bool bpf_jit_supports_kfunc_call(void) { return true; } + +void *bpf_arch_text_copy(void *dst, void *src, size_t len) +{ + if (text_poke_copy(dst, src, len) == NULL) + return ERR_PTR(-EINVAL); + return dst; +} diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6eb0b180d33b..2fc7e5c5ef41 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -846,8 +846,8 @@ void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); -int bpf_jit_charge_modmem(u32 pages); -void bpf_jit_uncharge_modmem(u32 pages); +int bpf_jit_charge_modmem(u32 size); +void bpf_jit_uncharge_modmem(u32 size); bool bpf_prog_has_trampoline(const struct bpf_prog *prog); #else static inline int bpf_trampoline_link_prog(struct bpf_prog *prog, @@ -953,6 +953,7 @@ struct bpf_prog_aux { bool sleepable; bool tail_call_reachable; bool xdp_has_frags; + bool use_bpf_prog_pack; struct hlist_node tramp_hlist; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; @@ -2362,6 +2363,8 @@ enum bpf_text_poke_type { int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); +void *bpf_arch_text_copy(void *dst, void *src, size_t len); + struct btf_id_set; bool btf_id_set_contains(const struct btf_id_set *set, u32 id); diff --git a/include/linux/filter.h b/include/linux/filter.h index d23e999dc032..1cb1af917617 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -548,7 +548,7 @@ struct sock_fprog_kern { #define BPF_IMAGE_ALIGNMENT 8 struct bpf_binary_header { - u32 pages; + u32 size; u8 image[] __aligned(BPF_IMAGE_ALIGNMENT); }; @@ -886,17 +886,8 @@ static inline void bpf_prog_lock_ro(struct bpf_prog *fp) static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) { set_vm_flush_reset_perms(hdr); - set_memory_ro((unsigned long)hdr, hdr->pages); - set_memory_x((unsigned long)hdr, hdr->pages); -} - -static inline struct bpf_binary_header * -bpf_jit_binary_hdr(const struct bpf_prog *fp) -{ - unsigned long real_start = (unsigned long)fp->bpf_func; - unsigned long addr = real_start & PAGE_MASK; - - return (void *)addr; + set_memory_ro((unsigned long)hdr, hdr->size >> PAGE_SHIFT); + set_memory_x((unsigned long)hdr, hdr->size >> PAGE_SHIFT); } int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); @@ -1068,6 +1059,18 @@ void *bpf_jit_alloc_exec(unsigned long size); void bpf_jit_free_exec(void *addr); void bpf_jit_free(struct bpf_prog *fp); +struct bpf_binary_header * +bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **ro_image, + unsigned int alignment, + struct bpf_binary_header **rw_hdr, + u8 **rw_image, + bpf_jit_fill_hole_t bpf_fill_ill_insns); +int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, + struct bpf_binary_header *ro_header, + struct bpf_binary_header *rw_header); +void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, + struct bpf_binary_header *rw_header); + int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_jit_poke_descriptor *poke); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 04a8d5bea552..306aa63fa58e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -537,13 +537,10 @@ long bpf_jit_limit_max __read_mostly; static void bpf_prog_ksym_set_addr(struct bpf_prog *prog) { - const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog); - unsigned long addr = (unsigned long)hdr; - WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); prog->aux->ksym.start = (unsigned long) prog->bpf_func; - prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE; + prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len; } static void @@ -808,6 +805,133 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, return slot; } +/* + * BPF program pack allocator. + * + * Most BPF programs are pretty small. Allocating a hole page for each + * program is sometime a waste. Many small bpf program also adds pressure + * to instruction TLB. To solve this issue, we introduce a BPF program pack + * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86) + * to host BPF programs. + */ +#define BPF_PROG_PACK_SIZE HPAGE_PMD_SIZE +#define BPF_PROG_CHUNK_SHIFT 6 +#define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT) +#define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1)) +#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE) + +struct bpf_prog_pack { + struct list_head list; + void *ptr; + unsigned long bitmap[BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)]; +}; + +#define BPF_PROG_MAX_PACK_PROG_SIZE HPAGE_PMD_SIZE +#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE) + +static DEFINE_MUTEX(pack_mutex); +static LIST_HEAD(pack_list); + +static struct bpf_prog_pack *alloc_new_pack(void) +{ + struct bpf_prog_pack *pack; + + pack = kzalloc(sizeof(*pack), GFP_KERNEL); + if (!pack) + return NULL; + pack->ptr = module_alloc(BPF_PROG_PACK_SIZE); + if (!pack->ptr) { + kfree(pack); + return NULL; + } + bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE); + list_add_tail(&pack->list, &pack_list); + + set_vm_flush_reset_perms(pack->ptr); + set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); + set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); + return pack; +} + +static void *bpf_prog_pack_alloc(u32 size) +{ + unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size); + struct bpf_prog_pack *pack; + unsigned long pos; + void *ptr = NULL; + + if (size > BPF_PROG_MAX_PACK_PROG_SIZE) { + size = round_up(size, PAGE_SIZE); + ptr = module_alloc(size); + if (ptr) { + set_vm_flush_reset_perms(ptr); + set_memory_ro((unsigned long)ptr, size / PAGE_SIZE); + set_memory_x((unsigned long)ptr, size / PAGE_SIZE); + } + return ptr; + } + mutex_lock(&pack_mutex); + list_for_each_entry(pack, &pack_list, list) { + pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, + nbits, 0); + if (pos < BPF_PROG_CHUNK_COUNT) + goto found_free_area; + } + + pack = alloc_new_pack(); + if (!pack) + goto out; + + pos = 0; + +found_free_area: + bitmap_set(pack->bitmap, pos, nbits); + ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT); + +out: + mutex_unlock(&pack_mutex); + return ptr; +} + +static void bpf_prog_pack_free(struct bpf_binary_header *hdr) +{ + struct bpf_prog_pack *pack = NULL, *tmp; + unsigned int nbits; + unsigned long pos; + void *pack_ptr; + + if (hdr->size > BPF_PROG_MAX_PACK_PROG_SIZE) { + module_memfree(hdr); + return; + } + + pack_ptr = (void *)((unsigned long)hdr & ~(BPF_PROG_PACK_SIZE - 1)); + mutex_lock(&pack_mutex); + + list_for_each_entry(tmp, &pack_list, list) { + if (tmp->ptr == pack_ptr) { + pack = tmp; + break; + } + } + + if (WARN_ONCE(!pack, "bpf_prog_pack bug\n")) + goto out; + + nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size); + pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT; + + bitmap_clear(pack->bitmap, pos, nbits); + if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, + BPF_PROG_CHUNK_COUNT, 0) == 0) { + list_del(&pack->list); + module_memfree(pack->ptr); + kfree(pack); + } +out: + mutex_unlock(&pack_mutex); +} + static atomic_long_t bpf_jit_current; /* Can be overridden by an arch's JIT compiler if it has a custom, @@ -833,12 +957,11 @@ static int __init bpf_jit_charge_init(void) } pure_initcall(bpf_jit_charge_init); -int bpf_jit_charge_modmem(u32 pages) +int bpf_jit_charge_modmem(u32 size) { - if (atomic_long_add_return(pages, &bpf_jit_current) > - (bpf_jit_limit >> PAGE_SHIFT)) { + if (atomic_long_add_return(size, &bpf_jit_current) > bpf_jit_limit) { if (!bpf_capable()) { - atomic_long_sub(pages, &bpf_jit_current); + atomic_long_sub(size, &bpf_jit_current); return -EPERM; } } @@ -846,9 +969,9 @@ int bpf_jit_charge_modmem(u32 pages) return 0; } -void bpf_jit_uncharge_modmem(u32 pages) +void bpf_jit_uncharge_modmem(u32 size) { - atomic_long_sub(pages, &bpf_jit_current); + atomic_long_sub(size, &bpf_jit_current); } void *__weak bpf_jit_alloc_exec(unsigned long size) @@ -867,7 +990,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_binary_header *hdr; - u32 size, hole, start, pages; + u32 size, hole, start; WARN_ON_ONCE(!is_power_of_2(alignment) || alignment > BPF_IMAGE_ALIGNMENT); @@ -877,20 +1000,19 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, * random section of illegal instructions. */ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); - pages = size / PAGE_SIZE; - if (bpf_jit_charge_modmem(pages)) + if (bpf_jit_charge_modmem(size)) return NULL; hdr = bpf_jit_alloc_exec(size); if (!hdr) { - bpf_jit_uncharge_modmem(pages); + bpf_jit_uncharge_modmem(size); return NULL; } /* Fill space with illegal/arch-dep instructions. */ bpf_fill_ill_insns(hdr, size); - hdr->pages = pages; + hdr->size = size; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = (get_random_int() % hole) & ~(alignment - 1); @@ -903,10 +1025,113 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, void bpf_jit_binary_free(struct bpf_binary_header *hdr) { - u32 pages = hdr->pages; + u32 size = hdr->size; bpf_jit_free_exec(hdr); - bpf_jit_uncharge_modmem(pages); + bpf_jit_uncharge_modmem(size); +} + +/* Allocate jit binary from bpf_prog_pack allocator. + * Since the allocated memory is RO+X, the JIT engine cannot write directly + * to the memory. To solve this problem, a RW buffer is also allocated at + * as the same time. The JIT engine should calculate offsets based on the + * RO memory address, but write JITed program to the RW buffer. Once the + * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies + * the JITed program to the RO memory. + */ +struct bpf_binary_header * +bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, + unsigned int alignment, + struct bpf_binary_header **rw_header, + u8 **rw_image, + bpf_jit_fill_hole_t bpf_fill_ill_insns) +{ + struct bpf_binary_header *ro_header; + u32 size, hole, start; + + WARN_ON_ONCE(!is_power_of_2(alignment) || + alignment > BPF_IMAGE_ALIGNMENT); + + /* add 16 bytes for a random section of illegal instructions */ + size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE); + + if (bpf_jit_charge_modmem(size)) + return NULL; + ro_header = bpf_prog_pack_alloc(size); + if (!ro_header) { + bpf_jit_uncharge_modmem(size); + return NULL; + } + + *rw_header = kvmalloc(size, GFP_KERNEL); + if (!*rw_header) { + bpf_prog_pack_free(ro_header); + bpf_jit_uncharge_modmem(size); + return NULL; + } + + /* Fill space with illegal/arch-dep instructions. */ + bpf_fill_ill_insns(*rw_header, size); + (*rw_header)->size = size; + + hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)), + BPF_PROG_CHUNK_SIZE - sizeof(*ro_header)); + start = (get_random_int() % hole) & ~(alignment - 1); + + *image_ptr = &ro_header->image[start]; + *rw_image = &(*rw_header)->image[start]; + + return ro_header; +} + +/* Copy JITed text from rw_header to its final location, the ro_header. */ +int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, + struct bpf_binary_header *ro_header, + struct bpf_binary_header *rw_header) +{ + void *ptr; + + ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size); + + kvfree(rw_header); + + if (IS_ERR(ptr)) { + bpf_prog_pack_free(ro_header); + return PTR_ERR(ptr); + } + prog->aux->use_bpf_prog_pack = true; + return 0; +} + +/* bpf_jit_binary_pack_free is called in two different scenarios: + * 1) when the program is freed after; + * 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize). + * For case 2), we need to free both the RO memory and the RW buffer. + * Also, ro_header->size in 2) is not properly set yet, so rw_header->size + * is used for uncharge. + */ +void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, + struct bpf_binary_header *rw_header) +{ + u32 size = rw_header ? rw_header->size : ro_header->size; + + bpf_prog_pack_free(ro_header); + kvfree(rw_header); + bpf_jit_uncharge_modmem(size); +} + +static inline struct bpf_binary_header * +bpf_jit_binary_hdr(const struct bpf_prog *fp) +{ + unsigned long real_start = (unsigned long)fp->bpf_func; + unsigned long addr; + + if (fp->aux->use_bpf_prog_pack) + addr = real_start & BPF_PROG_CHUNK_MASK; + else + addr = real_start & PAGE_MASK; + + return (void *)addr; } /* This symbol is only overridden by archs that have different @@ -918,7 +1143,10 @@ void __weak bpf_jit_free(struct bpf_prog *fp) if (fp->jited) { struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); - bpf_jit_binary_free(hdr); + if (fp->aux->use_bpf_prog_pack) + bpf_jit_binary_pack_free(hdr, NULL /* rw_buffer */); + else + bpf_jit_binary_free(hdr); WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); } @@ -2445,6 +2673,11 @@ int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, return -ENOTSUPP; } +void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len) +{ + return ERR_PTR(-ENOTSUPP); +} + DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 4b6974a195c1..e76a488c09c3 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -213,7 +213,7 @@ static void __bpf_tramp_image_put_deferred(struct work_struct *work) im = container_of(work, struct bpf_tramp_image, work); bpf_image_ksym_del(&im->ksym); bpf_jit_free_exec(im->image); - bpf_jit_uncharge_modmem(1); + bpf_jit_uncharge_modmem(PAGE_SIZE); percpu_ref_exit(&im->pcref); kfree_rcu(im, rcu); } @@ -310,7 +310,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) if (!im) goto out; - err = bpf_jit_charge_modmem(1); + err = bpf_jit_charge_modmem(PAGE_SIZE); if (err) goto out_free_im; @@ -332,7 +332,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) out_free_image: bpf_jit_free_exec(im->image); out_uncharge: - bpf_jit_uncharge_modmem(1); + bpf_jit_uncharge_modmem(PAGE_SIZE); out_free_im: kfree(im); out: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1ae41d0cf96c..bbef86cb4e72 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13067,6 +13067,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->jited = 1; prog->bpf_func = func[0]->bpf_func; + prog->jited_len = func[0]->jited_len; prog->aux->func = func; prog->aux->func_cnt = env->subprog_cnt; bpf_prog_jit_attempt_done(prog);