From 5d946c5abbaf68083fa6a41824dd79e1f06286d8 Mon Sep 17 00:00:00 2001 From: Luc Van Oostenryck Date: Wed, 20 Nov 2019 01:10:42 +0100 Subject: [PATCH 01/27] xsk: Fix xsk_poll()'s return type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xsk_poll() is defined as returning 'unsigned int' but the .poll method is declared as returning '__poll_t', a bitwise type. Fix this by using the proper return type and using the EPOLL constants instead of the POLL ones, as required for __poll_t. Signed-off-by: Luc Van Oostenryck Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20191120001042.30830-1-luc.vanoostenryck@gmail.com --- net/xdp/xsk.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 6040bc2b0088..956793893c9d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -447,10 +447,10 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) return __xsk_sendmsg(sk); } -static unsigned int xsk_poll(struct file *file, struct socket *sock, +static __poll_t xsk_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { - unsigned int mask = datagram_poll(file, sock, wait); + __poll_t mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct net_device *dev; @@ -472,9 +472,9 @@ static unsigned int xsk_poll(struct file *file, struct socket *sock, } if (xs->rx && !xskq_empty_desc(xs->rx)) - mask |= POLLIN | POLLRDNORM; + mask |= EPOLLIN | EPOLLRDNORM; if (xs->tx && !xskq_full_desc(xs->tx)) - mask |= POLLOUT | POLLWRNORM; + mask |= EPOLLOUT | EPOLLWRNORM; return mask; } From a0f17cc6665c80ab2765f9244c41ec127821f343 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 19 Nov 2019 11:17:06 +0000 Subject: [PATCH 02/27] tools, bpftool: Fix warning on ignored return value for 'read' When building bpftool, a warning was introduced by commit a94364603610 
("bpftool: Allow to read btf as raw data"), because the return value from a call to 'read()' is ignored. Let's address it. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Kicinski Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191119111706.22440-1-quentin.monnet@netronome.com --- tools/bpf/bpftool/btf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index a7b8bf233cf5..e5bc97b71ceb 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -428,15 +428,15 @@ exit_close: static bool is_btf_raw(const char *file) { __u16 magic = 0; - int fd; + int fd, nb_read; fd = open(file, O_RDONLY); if (fd < 0) return false; - read(fd, &magic, sizeof(magic)); + nb_read = read(fd, &magic, sizeof(magic)); close(fd); - return magic == BTF_MAGIC; + return nb_read == sizeof(magic) && magic == BTF_MAGIC; } static int do_dump(int argc, char **argv) From ffc88174cdcf5f51fb7f6298fe9203a36c904f1f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 20 Nov 2019 23:07:40 -0800 Subject: [PATCH 03/27] selftests/bpf: Ensure no DWARF relocations for BPF object files Add -mattr=dwarfris attribute to llc to avoid having relocations against DWARF data. These relocations make it impossible to inspect DWARF contents: all strings are invalid. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191121070743.1309473-2-andriin@fb.com --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 4fe4aec0367c..085678d88ef8 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -161,7 +161,7 @@ $(OUTPUT)/flow_dissector_load.o: flow_dissector_load.h define CLANG_BPF_BUILD_RULE ($(CLANG) $3 -O2 -target bpf -emit-llvm \ -c $1 -o - || echo "BPF obj compilation failed") | \ - $(LLC) -march=bpf -mcpu=probe $4 -filetype=obj -o $2 + $(LLC) -mattr=dwarfris -march=bpf -mcpu=probe $4 -filetype=obj -o $2 endef # Similar to CLANG_BPF_BUILD_RULE, but with disabled alu32 define CLANG_NOALU32_BPF_BUILD_RULE From a89b2cbf71d64b61e79bbe5cb7ff4664797eeaaf Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 19 Nov 2019 10:56:26 +0000 Subject: [PATCH 04/27] tools, bpf: Fix build for 'make -s tools/bpf O=' Building selftests with 'make TARGETS=bpf kselftest' was fixed in commit 55d554f5d140 ("tools: bpf: Use !building_out_of_srctree to determine srctree"). However, by updating $(srctree) in tools/bpf/Makefile for in-tree builds only, we leave out the case where we pass an output directory to build BPF tools, but $(srctree) is not set. This typically happens for: $ make -s tools/bpf O=/tmp/foo Makefile:40: /tools/build/Makefile.feature: No such file or directory Fix it by updating $(srctree) in the Makefile not only for out-of-tree builds, but also if $(srctree) is empty. Detected with test_bpftool_build.sh. 
Fixes: 55d554f5d140 ("tools: bpf: Use !building_out_of_srctree to determine srctree") Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Acked-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/20191119105626.21453-1-quentin.monnet@netronome.com --- tools/bpf/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index 5d1995fd369c..5535650800ab 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -16,7 +16,13 @@ CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/include/uapi -I$(srctree)/include # isn't set and when invoked from selftests build, where srctree # is set to ".". building_out_of_srctree is undefined for in srctree # builds +ifeq ($(srctree),) +update_srctree := 1 +endif ifndef building_out_of_srctree +update_srctree := 1 +endif +ifeq ($(update_srctree),1) srctree := $(patsubst %/,%,$(dir $(CURDIR))) srctree := $(patsubst %/,%,$(dir $(srctree))) endif From 1f8e2bcb2cd5ee1a731fb625a5438e2c305f6a7c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 20 Nov 2019 23:07:41 -0800 Subject: [PATCH 05/27] libbpf: Refactor relocation handling Relocation handling code is convoluted and unnecessarily deeply nested. Split out per-relocation logic into a separate function. Also refactor the logic to be more of a sequence of per-relocation type checks and processing steps, making it simpler to follow control flow. This makes it easier to further extend it to new kinds of relocations (e.g., support for extern variables). This patch also makes relocation's section verification more robust. Previously relocations against not yet supported externs were silently ignored because obj->efile.text_shndx was zero, when all BPF programs had custom section names and there was no .text section. Also, invalid LDIMM64 relocations against non-map sections were passed through, if they were pointing to a .text section (or 0, which is an invalid section). 
All these bugs are fixed within this refactoring and checks are made more appropriate for each type of relocation. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191121070743.1309473-3-andriin@fb.com --- tools/lib/bpf/libbpf.c | 261 ++++++++++++++++++++++------------------- 1 file changed, 143 insertions(+), 118 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index a7d183f7ac72..4c3592c4ec5d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -276,8 +276,8 @@ struct bpf_object { struct { GElf_Shdr shdr; Elf_Data *data; - } *reloc; - int nr_reloc; + } *reloc_sects; + int nr_reloc_sects; int maps_shndx; int btf_maps_shndx; int text_shndx; @@ -575,8 +575,8 @@ static void bpf_object__elf_finish(struct bpf_object *obj) obj->efile.rodata = NULL; obj->efile.bss = NULL; - zfree(&obj->efile.reloc); - obj->efile.nr_reloc = 0; + zfree(&obj->efile.reloc_sects); + obj->efile.nr_reloc_sects = 0; zclose(obj->efile.fd); obj->efile.obj_buf = NULL; obj->efile.obj_buf_sz = 0; @@ -1693,8 +1693,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj, bool relaxed_maps, pr_debug("skip section(%d) %s\n", idx, name); } } else if (sh.sh_type == SHT_REL) { - int nr_reloc = obj->efile.nr_reloc; - void *reloc = obj->efile.reloc; + int nr_sects = obj->efile.nr_reloc_sects; + void *sects = obj->efile.reloc_sects; int sec = sh.sh_info; /* points to other section */ /* Only do relo for section with exec instructions */ @@ -1704,18 +1704,18 @@ static int bpf_object__elf_collect(struct bpf_object *obj, bool relaxed_maps, continue; } - reloc = reallocarray(reloc, nr_reloc + 1, - sizeof(*obj->efile.reloc)); - if (!reloc) { - pr_warn("realloc failed\n"); + sects = reallocarray(sects, nr_sects + 1, + sizeof(*obj->efile.reloc_sects)); + if (!sects) { + pr_warn("reloc_sects realloc failed\n"); return -ENOMEM; } - obj->efile.reloc = reloc; - obj->efile.nr_reloc++; + obj->efile.reloc_sects = sects; + 
obj->efile.nr_reloc_sects++; - obj->efile.reloc[nr_reloc].shdr = sh; - obj->efile.reloc[nr_reloc].data = data; + obj->efile.reloc_sects[nr_sects].shdr = sh; + obj->efile.reloc_sects[nr_sects].data = data; } else if (sh.sh_type == SHT_NOBITS && strcmp(name, ".bss") == 0) { obj->efile.bss = data; obj->efile.bss_shndx = idx; @@ -1780,14 +1780,6 @@ static bool bpf_object__shndx_is_maps(const struct bpf_object *obj, shndx == obj->efile.btf_maps_shndx; } -static bool bpf_object__relo_in_known_section(const struct bpf_object *obj, - int shndx) -{ - return shndx == obj->efile.text_shndx || - bpf_object__shndx_is_maps(obj, shndx) || - bpf_object__shndx_is_data(obj, shndx); -} - static enum libbpf_map_type bpf_object__section_to_libbpf_map_type(const struct bpf_object *obj, int shndx) { @@ -1801,14 +1793,124 @@ bpf_object__section_to_libbpf_map_type(const struct bpf_object *obj, int shndx) return LIBBPF_MAP_UNSPEC; } +static int bpf_program__record_reloc(struct bpf_program *prog, + struct reloc_desc *reloc_desc, + __u32 insn_idx, const char *name, + const GElf_Sym *sym, const GElf_Rel *rel) +{ + struct bpf_insn *insn = &prog->insns[insn_idx]; + size_t map_idx, nr_maps = prog->obj->nr_maps; + struct bpf_object *obj = prog->obj; + __u32 shdr_idx = sym->st_shndx; + enum libbpf_map_type type; + struct bpf_map *map; + + /* sub-program call relocation */ + if (insn->code == (BPF_JMP | BPF_CALL)) { + if (insn->src_reg != BPF_PSEUDO_CALL) { + pr_warn("incorrect bpf_call opcode\n"); + return -LIBBPF_ERRNO__RELOC; + } + /* text_shndx can be 0, if no default "main" program exists */ + if (!shdr_idx || shdr_idx != obj->efile.text_shndx) { + pr_warn("bad call relo against section %u\n", shdr_idx); + return -LIBBPF_ERRNO__RELOC; + } + if (sym->st_value % 8) { + pr_warn("bad call relo offset: %lu\n", sym->st_value); + return -LIBBPF_ERRNO__RELOC; + } + reloc_desc->type = RELO_CALL; + reloc_desc->insn_idx = insn_idx; + reloc_desc->text_off = sym->st_value / 8; + obj->has_pseudo_calls = 
true; + return 0; + } + + if (insn->code != (BPF_LD | BPF_IMM | BPF_DW)) { + pr_warn("bpf: relocation: invalid relo for insns[%d].code 0x%x\n", + insn_idx, insn->code); + return -LIBBPF_ERRNO__RELOC; + } + if (!shdr_idx || shdr_idx >= SHN_LORESERVE) { + pr_warn("relocation: not yet supported relo for non-static global \'%s\' variable in special section (0x%x) found in insns[%d].code 0x%x\n", + name, shdr_idx, insn_idx, insn->code); + return -LIBBPF_ERRNO__RELOC; + } + + type = bpf_object__section_to_libbpf_map_type(obj, shdr_idx); + + /* generic map reference relocation */ + if (type == LIBBPF_MAP_UNSPEC) { + if (!bpf_object__shndx_is_maps(obj, shdr_idx)) { + pr_warn("bad map relo against section %u\n", + shdr_idx); + return -LIBBPF_ERRNO__RELOC; + } + for (map_idx = 0; map_idx < nr_maps; map_idx++) { + map = &obj->maps[map_idx]; + if (map->libbpf_type != type || + map->sec_idx != sym->st_shndx || + map->sec_offset != sym->st_value) + continue; + pr_debug("found map %zd (%s, sec %d, off %zu) for insn %u\n", + map_idx, map->name, map->sec_idx, + map->sec_offset, insn_idx); + break; + } + if (map_idx >= nr_maps) { + pr_warn("map relo failed to find map for sec %u, off %llu\n", + shdr_idx, (__u64)sym->st_value); + return -LIBBPF_ERRNO__RELOC; + } + reloc_desc->type = RELO_LD64; + reloc_desc->insn_idx = insn_idx; + reloc_desc->map_idx = map_idx; + return 0; + } + + /* global data map relocation */ + if (!bpf_object__shndx_is_data(obj, shdr_idx)) { + pr_warn("bad data relo against section %u\n", shdr_idx); + return -LIBBPF_ERRNO__RELOC; + } + if (GELF_ST_BIND(sym->st_info) == STB_GLOBAL) { + pr_warn("relocation: not yet supported relo for non-static global \'%s\' variable found in insns[%d].code 0x%x\n", + name, insn_idx, insn->code); + return -LIBBPF_ERRNO__RELOC; + } + if (!obj->caps.global_data) { + pr_warn("relocation: kernel does not support global \'%s\' variable access in insns[%d]\n", + name, insn_idx); + return -LIBBPF_ERRNO__RELOC; + } + for (map_idx = 0; 
map_idx < nr_maps; map_idx++) { + map = &obj->maps[map_idx]; + if (map->libbpf_type != type) + continue; + pr_debug("found data map %zd (%s, sec %d, off %zu) for insn %u\n", + map_idx, map->name, map->sec_idx, map->sec_offset, + insn_idx); + break; + } + if (map_idx >= nr_maps) { + pr_warn("data relo failed to find map for sec %u\n", + shdr_idx); + return -LIBBPF_ERRNO__RELOC; + } + + reloc_desc->type = RELO_DATA; + reloc_desc->insn_idx = insn_idx; + reloc_desc->map_idx = map_idx; + return 0; +} + static int bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr, Elf_Data *data, struct bpf_object *obj) { Elf_Data *symbols = obj->efile.symbols; - struct bpf_map *maps = obj->maps; - size_t nr_maps = obj->nr_maps; - int i, nrels; + int err, i, nrels; pr_debug("collecting relocating info for: '%s'\n", prog->section_name); nrels = shdr->sh_size / shdr->sh_entsize; @@ -1821,12 +1923,8 @@ bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr, prog->nr_reloc = nrels; for (i = 0; i < nrels; i++) { - struct bpf_insn *insns = prog->insns; - enum libbpf_map_type type; - unsigned int insn_idx; - unsigned int shdr_idx; const char *name; - size_t map_idx; + __u32 insn_idx; GElf_Sym sym; GElf_Rel rel; @@ -1834,101 +1932,28 @@ bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr, pr_warn("relocation: failed to get %d reloc\n", i); return -LIBBPF_ERRNO__FORMAT; } - if (!gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym)) { pr_warn("relocation: symbol %"PRIx64" not found\n", GELF_R_SYM(rel.r_info)); return -LIBBPF_ERRNO__FORMAT; } + if (rel.r_offset % sizeof(struct bpf_insn)) + return -LIBBPF_ERRNO__FORMAT; + insn_idx = rel.r_offset / sizeof(struct bpf_insn); name = elf_strptr(obj->efile.elf, obj->efile.strtabidx, sym.st_name) ? 
: ""; - pr_debug("relo for %lld value %lld name %d (\'%s\')\n", - (long long) (rel.r_info >> 32), - (long long) sym.st_value, sym.st_name, name); + pr_debug("relo for shdr %u, symb %llu, value %llu, type %d, bind %d, name %d (\'%s\'), insn %u\n", + (__u32)sym.st_shndx, (__u64)GELF_R_SYM(rel.r_info), + (__u64)sym.st_value, GELF_ST_TYPE(sym.st_info), + GELF_ST_BIND(sym.st_info), sym.st_name, name, + insn_idx); - shdr_idx = sym.st_shndx; - insn_idx = rel.r_offset / sizeof(struct bpf_insn); - pr_debug("relocation: insn_idx=%u, shdr_idx=%u\n", - insn_idx, shdr_idx); - - if (shdr_idx >= SHN_LORESERVE) { - pr_warn("relocation: not yet supported relo for non-static global \'%s\' variable in special section (0x%x) found in insns[%d].code 0x%x\n", - name, shdr_idx, insn_idx, - insns[insn_idx].code); - return -LIBBPF_ERRNO__RELOC; - } - if (!bpf_object__relo_in_known_section(obj, shdr_idx)) { - pr_warn("Program '%s' contains unrecognized relo data pointing to section %u\n", - prog->section_name, shdr_idx); - return -LIBBPF_ERRNO__RELOC; - } - - if (insns[insn_idx].code == (BPF_JMP | BPF_CALL)) { - if (insns[insn_idx].src_reg != BPF_PSEUDO_CALL) { - pr_warn("incorrect bpf_call opcode\n"); - return -LIBBPF_ERRNO__RELOC; - } - if (sym.st_value % 8) { - pr_warn("bad call relo offset: %lu\n", sym.st_value); - return -LIBBPF_ERRNO__RELOC; - } - prog->reloc_desc[i].type = RELO_CALL; - prog->reloc_desc[i].insn_idx = insn_idx; - prog->reloc_desc[i].text_off = sym.st_value / 8; - obj->has_pseudo_calls = true; - continue; - } - - if (insns[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) { - pr_warn("bpf: relocation: invalid relo for insns[%d].code 0x%x\n", - insn_idx, insns[insn_idx].code); - return -LIBBPF_ERRNO__RELOC; - } - - if (bpf_object__shndx_is_maps(obj, shdr_idx) || - bpf_object__shndx_is_data(obj, shdr_idx)) { - type = bpf_object__section_to_libbpf_map_type(obj, shdr_idx); - if (type != LIBBPF_MAP_UNSPEC) { - if (GELF_ST_BIND(sym.st_info) == STB_GLOBAL) { - pr_warn("bpf: 
relocation: not yet supported relo for non-static global \'%s\' variable found in insns[%d].code 0x%x\n", - name, insn_idx, insns[insn_idx].code); - return -LIBBPF_ERRNO__RELOC; - } - if (!obj->caps.global_data) { - pr_warn("bpf: relocation: kernel does not support global \'%s\' variable access in insns[%d]\n", - name, insn_idx); - return -LIBBPF_ERRNO__RELOC; - } - } - - for (map_idx = 0; map_idx < nr_maps; map_idx++) { - if (maps[map_idx].libbpf_type != type) - continue; - if (type != LIBBPF_MAP_UNSPEC || - (maps[map_idx].sec_idx == sym.st_shndx && - maps[map_idx].sec_offset == sym.st_value)) { - pr_debug("relocation: found map %zd (%s, sec_idx %d, offset %zu) for insn %u\n", - map_idx, maps[map_idx].name, - maps[map_idx].sec_idx, - maps[map_idx].sec_offset, - insn_idx); - break; - } - } - - if (map_idx >= nr_maps) { - pr_warn("bpf relocation: map_idx %d larger than %d\n", - (int)map_idx, (int)nr_maps - 1); - return -LIBBPF_ERRNO__RELOC; - } - - prog->reloc_desc[i].type = type != LIBBPF_MAP_UNSPEC ? 
- RELO_DATA : RELO_LD64; - prog->reloc_desc[i].insn_idx = insn_idx; - prog->reloc_desc[i].map_idx = map_idx; - } + err = bpf_program__record_reloc(prog, &prog->reloc_desc[i], + insn_idx, name, &sym, &rel); + if (err) + return err; } return 0; } @@ -3671,9 +3696,9 @@ static int bpf_object__collect_reloc(struct bpf_object *obj) return -LIBBPF_ERRNO__INTERNAL; } - for (i = 0; i < obj->efile.nr_reloc; i++) { - GElf_Shdr *shdr = &obj->efile.reloc[i].shdr; - Elf_Data *data = obj->efile.reloc[i].data; + for (i = 0; i < obj->efile.nr_reloc_sects; i++) { + GElf_Shdr *shdr = &obj->efile.reloc_sects[i].shdr; + Elf_Data *data = obj->efile.reloc_sects[i].data; int idx = shdr->sh_info; struct bpf_program *prog; From 31f8b8295bb8997f139fe34b68654f8f1408f0da Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 19 Nov 2019 10:50:09 +0000 Subject: [PATCH 06/27] selftests, bpftool: Set EXIT trap after usage function The trap on EXIT is used to clean up any temporary directory left by the build attempts. It is not needed when the user simply calls the script with its --help option, and may not be needed either if we add checks (e.g. on the availability of bpftool files) before the build attempts. Let's move this trap and related variables lower down in the code, so that we don't accidentally change the value returned from the script on early exits at pre-checks. 
Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/20191119105010.19189-2-quentin.monnet@netronome.com --- .../selftests/bpf/test_bpftool_build.sh | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/bpf/test_bpftool_build.sh b/tools/testing/selftests/bpf/test_bpftool_build.sh index 4ba5a34bff56..1fc6f6247f9b 100755 --- a/tools/testing/selftests/bpf/test_bpftool_build.sh +++ b/tools/testing/selftests/bpf/test_bpftool_build.sh @@ -1,6 +1,26 @@ #!/bin/bash # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +case $1 in + -h|--help) + echo -e "$0 [-j ]" + echo -e "\tTest the different ways of building bpftool." + echo -e "" + echo -e "\tOptions:" + echo -e "\t\t-j :\tPass -j flag to 'make'." + exit 0 + ;; +esac + +J=$* + +# Assume script is located under tools/testing/selftests/bpf/. We want to start +# build attempts from the top of kernel repository. +SCRIPT_REL_PATH=$(realpath --relative-to=$PWD $0) +SCRIPT_REL_DIR=$(dirname $SCRIPT_REL_PATH) +KDIR_ROOT_DIR=$(realpath $PWD/$SCRIPT_REL_DIR/../../../../) +cd $KDIR_ROOT_DIR + ERROR=0 TMPDIR= @@ -13,26 +33,6 @@ return_value() { } trap return_value EXIT -case $1 in - -h|--help) - echo -e "$0 [-j ]" - echo -e "\tTest the different ways of building bpftool." - echo -e "" - echo -e "\tOptions:" - echo -e "\t\t-j :\tPass -j flag to 'make'." - exit - ;; -esac - -J=$* - -# Assume script is located under tools/testing/selftests/bpf/. We want to start -# build attempts from the top of kernel repository. 
-SCRIPT_REL_PATH=$(realpath --relative-to=$PWD $0) -SCRIPT_REL_DIR=$(dirname $SCRIPT_REL_PATH) -KDIR_ROOT_DIR=$(realpath $PWD/$SCRIPT_REL_DIR/../../../../) -cd $KDIR_ROOT_DIR - check() { local dir=$(realpath $1) From 8983b731ceb42939acaa6158abcf8adb56f834bf Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 20 Nov 2019 23:07:42 -0800 Subject: [PATCH 07/27] libbpf: Fix various errors and warning reported by checkpatch.pl Fix a bunch of warnings and errors reported by checkpatch.pl, to make it easier to spot new problems. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191121070743.1309473-4-andriin@fb.com --- tools/lib/bpf/libbpf.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 4c3592c4ec5d..64bc75fc6723 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -105,7 +105,7 @@ void libbpf_print(enum libbpf_print_level level, const char *format, ...) err = action; \ if (err) \ goto out; \ -} while(0) +} while (0) /* Copied from tools/perf/util/util.h */ @@ -965,8 +965,7 @@ static int bpf_object__init_user_maps(struct bpf_object *obj, bool strict) obj->path, nr_maps, data->d_size); if (!data->d_size || nr_maps == 0 || (data->d_size % nr_maps) != 0) { - pr_warn("unable to determine map definition size " - "section %s, %d maps in %zd bytes\n", + pr_warn("unable to determine map definition size section %s, %d maps in %zd bytes\n", obj->path, nr_maps, data->d_size); return -EINVAL; } @@ -1030,12 +1029,11 @@ static int bpf_object__init_user_maps(struct bpf_object *obj, bool strict) * incompatible. 
*/ char *b; + for (b = ((char *)def) + sizeof(struct bpf_map_def); b < ((char *)def) + map_def_sz; b++) { if (*b != 0) { - pr_warn("maps section in %s: \"%s\" " - "has unrecognized, non-zero " - "options\n", + pr_warn("maps section in %s: \"%s\" has unrecognized, non-zero options\n", obj->path, map_name); if (strict) return -EINVAL; @@ -1073,7 +1071,8 @@ skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id) */ static bool get_map_field_int(const char *map_name, const struct btf *btf, const struct btf_type *def, - const struct btf_member *m, __u32 *res) { + const struct btf_member *m, __u32 *res) +{ const struct btf_type *t = skip_mods_and_typedefs(btf, m->type, NULL); const char *name = btf__name_by_offset(btf, m->name_off); const struct btf_array *arr_info; @@ -1387,7 +1386,8 @@ static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, for (i = 0; i < vlen; i++) { err = bpf_object__init_user_btf_map(obj, sec, i, obj->efile.btf_maps_shndx, - data, strict, pin_root_path); + data, strict, + pin_root_path); if (err) return err; } @@ -1673,12 +1673,14 @@ static int bpf_object__elf_collect(struct bpf_object *obj, bool relaxed_maps, if (strcmp(name, ".text") == 0) obj->efile.text_shndx = idx; err = bpf_object__add_program(obj, data->d_buf, - data->d_size, name, idx); + data->d_size, + name, idx); if (err) { char errmsg[STRERR_BUFSIZE]; - char *cp = libbpf_strerror_r(-err, errmsg, - sizeof(errmsg)); + char *cp; + cp = libbpf_strerror_r(-err, errmsg, + sizeof(errmsg)); pr_warn("failed to alloc program %s (%s): %s", name, obj->path, cp); return err; @@ -1828,7 +1830,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog, } if (insn->code != (BPF_LD | BPF_IMM | BPF_DW)) { - pr_warn("bpf: relocation: invalid relo for insns[%d].code 0x%x\n", + pr_warn("invalid relo for insns[%d].code 0x%x\n", insn_idx, insn->code); return -LIBBPF_ERRNO__RELOC; } @@ -2145,7 +2147,7 @@ bpf_object__probe_global_data(struct bpf_object *obj) static 
int bpf_object__probe_btf_func(struct bpf_object *obj) { - const char strs[] = "\0int\0x\0a"; + static const char strs[] = "\0int\0x\0a"; /* void x(int a) {} */ __u32 types[] = { /* int */ @@ -2171,7 +2173,7 @@ static int bpf_object__probe_btf_func(struct bpf_object *obj) static int bpf_object__probe_btf_datasec(struct bpf_object *obj) { - const char strs[] = "\0x\0.data"; + static const char strs[] = "\0x\0.data"; /* static int a; */ __u32 types[] = { /* int */ @@ -5112,7 +5114,7 @@ int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, *expected_attach_type = section_names[i].expected_attach_type; return 0; } - pr_warn("failed to guess program type based on ELF section name '%s'\n", name); + pr_warn("failed to guess program type from ELF section '%s'\n", name); type_names = libbpf_get_type_names(false); if (type_names != NULL) { pr_info("supported section(type) names are:%s\n", type_names); @@ -6338,7 +6340,8 @@ static struct bpf_prog_info_array_desc bpf_prog_info_array_desc[] = { }; -static __u32 bpf_prog_info_read_offset_u32(struct bpf_prog_info *info, int offset) +static __u32 bpf_prog_info_read_offset_u32(struct bpf_prog_info *info, + int offset) { __u32 *array = (__u32 *)info; @@ -6347,7 +6350,8 @@ static __u32 bpf_prog_info_read_offset_u32(struct bpf_prog_info *info, int offse return -(int)offset; } -static __u64 bpf_prog_info_read_offset_u64(struct bpf_prog_info *info, int offset) +static __u64 bpf_prog_info_read_offset_u64(struct bpf_prog_info *info, + int offset) { __u64 *array = (__u64 *)info; From 5940c5bf6504f66f57f03f1d0046abfaf2198b3a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 19 Nov 2019 10:50:10 +0000 Subject: [PATCH 08/27] selftests, bpftool: Skip the build test if not in tree If selftests are copied over to another machine/location for execution the build test of bpftool will obviously not work, since the sources are not copied. Skip it if we can't find bpftool's Makefile. 
Reported-by: Naresh Kamboju Signed-off-by: Jakub Kicinski Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191119105010.19189-3-quentin.monnet@netronome.com --- tools/testing/selftests/bpf/test_bpftool_build.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/bpf/test_bpftool_build.sh b/tools/testing/selftests/bpf/test_bpftool_build.sh index 1fc6f6247f9b..ac349a5cea7e 100755 --- a/tools/testing/selftests/bpf/test_bpftool_build.sh +++ b/tools/testing/selftests/bpf/test_bpftool_build.sh @@ -20,6 +20,10 @@ SCRIPT_REL_PATH=$(realpath --relative-to=$PWD $0) SCRIPT_REL_DIR=$(dirname $SCRIPT_REL_PATH) KDIR_ROOT_DIR=$(realpath $PWD/$SCRIPT_REL_DIR/../../../../) cd $KDIR_ROOT_DIR +if [ ! -e tools/bpf/bpftool/Makefile ]; then + echo -e "skip: bpftool files not found!\n" + exit 0 +fi ERROR=0 TMPDIR= From 393cdfbee809891dc6ba859a44cc6441fa8dce9e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 20 Nov 2019 23:07:43 -0800 Subject: [PATCH 09/27] libbpf: Support initialized global variables Initialized global variables are no different in ELF from static variables, and don't require any extra support from libbpf. But they are matching semantics of global data (backed by BPF maps) more closely, preventing LLVM/Clang from aggressively inlining constant values and not requiring volatile incantations to prevent those. This patch enables global variables. It still disables uninitialized variables, which will be put into special COM (common) ELF section, because BPF doesn't allow uninitialized data to be accessed. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191121070743.1309473-5-andriin@fb.com --- tools/lib/bpf/libbpf.c | 9 ++------- .../testing/selftests/bpf/progs/test_core_reloc_arrays.c | 4 ++-- .../bpf/progs/test_core_reloc_bitfields_direct.c | 4 ++-- .../bpf/progs/test_core_reloc_bitfields_probed.c | 4 ++-- .../selftests/bpf/progs/test_core_reloc_existence.c | 4 ++-- .../selftests/bpf/progs/test_core_reloc_flavors.c | 4 ++-- tools/testing/selftests/bpf/progs/test_core_reloc_ints.c | 4 ++-- .../testing/selftests/bpf/progs/test_core_reloc_kernel.c | 4 ++-- tools/testing/selftests/bpf/progs/test_core_reloc_misc.c | 4 ++-- tools/testing/selftests/bpf/progs/test_core_reloc_mods.c | 4 ++-- .../selftests/bpf/progs/test_core_reloc_nesting.c | 4 ++-- .../selftests/bpf/progs/test_core_reloc_primitives.c | 4 ++-- .../selftests/bpf/progs/test_core_reloc_ptr_as_arr.c | 4 ++-- tools/testing/selftests/bpf/progs/test_core_reloc_size.c | 4 ++-- 14 files changed, 28 insertions(+), 33 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 64bc75fc6723..a4e250a369c6 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1835,8 +1835,8 @@ static int bpf_program__record_reloc(struct bpf_program *prog, return -LIBBPF_ERRNO__RELOC; } if (!shdr_idx || shdr_idx >= SHN_LORESERVE) { - pr_warn("relocation: not yet supported relo for non-static global \'%s\' variable in special section (0x%x) found in insns[%d].code 0x%x\n", - name, shdr_idx, insn_idx, insn->code); + pr_warn("invalid relo for \'%s\' in special section 0x%x; forgot to initialize global var?..\n", + name, shdr_idx); return -LIBBPF_ERRNO__RELOC; } @@ -1876,11 +1876,6 @@ static int bpf_program__record_reloc(struct bpf_program *prog, pr_warn("bad data relo against section %u\n", shdr_idx); return -LIBBPF_ERRNO__RELOC; } - if (GELF_ST_BIND(sym->st_info) == STB_GLOBAL) { - pr_warn("relocation: not yet supported relo for non-static global \'%s\' 
variable found in insns[%d].code 0x%x\n", - name, insn_idx, insn->code); - return -LIBBPF_ERRNO__RELOC; - } if (!obj->caps.global_data) { pr_warn("relocation: kernel does not support global \'%s\' variable access in insns[%d]\n", name, insn_idx); diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c b/tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c index 96b1f5f3b07a..89951b684282 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; struct core_reloc_arrays_output { int a2; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_direct.c b/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_direct.c index 738b34b72655..edc0f7c9e56d 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_direct.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_direct.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; struct core_reloc_bitfields { /* unsigned bitfields */ diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_probed.c b/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_probed.c index e466e3ab7de4..6c20e433558b 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_probed.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_probed.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; struct core_reloc_bitfields { /* unsigned bitfields */ diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_existence.c b/tools/testing/selftests/bpf/progs/test_core_reloc_existence.c index 
c3cac95a19f1..1b7f0ae49cfb 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_existence.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_existence.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; struct core_reloc_existence_output { int a_exists; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_flavors.c b/tools/testing/selftests/bpf/progs/test_core_reloc_flavors.c index 71fd7cebc9d7..b5dbeef540fd 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_flavors.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_flavors.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; struct core_reloc_flavors { int a; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_ints.c b/tools/testing/selftests/bpf/progs/test_core_reloc_ints.c index ad5c3f59c9c6..c78ab6d28a14 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_ints.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_ints.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; struct core_reloc_ints { uint8_t u8_field; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c b/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c index a4b5e0562ed5..5d499ebdc4bd 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; struct core_reloc_kernel_output { int valid[10]; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_misc.c b/tools/testing/selftests/bpf/progs/test_core_reloc_misc.c index 
1a36b0856653..292a5c4ee76a 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_misc.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_misc.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; struct core_reloc_misc_output { int a, b, c; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_mods.c b/tools/testing/selftests/bpf/progs/test_core_reloc_mods.c index 3199fafede2c..0b28bfacc8fd 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_mods.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_mods.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; struct core_reloc_mods_output { int a, b, c, d, e, f, g, h; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_nesting.c b/tools/testing/selftests/bpf/progs/test_core_reloc_nesting.c index 98238cb64fbd..39279bf0c9db 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_nesting.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_nesting.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; struct core_reloc_nesting_substruct { int a; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_primitives.c b/tools/testing/selftests/bpf/progs/test_core_reloc_primitives.c index 4f3ecb9127bb..ea57973cdd19 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_primitives.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_primitives.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; enum core_reloc_primitives_enum { A = 0, diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_ptr_as_arr.c 
b/tools/testing/selftests/bpf/progs/test_core_reloc_ptr_as_arr.c index 27f602f00419..d1eb59d4ea64 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_ptr_as_arr.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_ptr_as_arr.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; struct core_reloc_ptr_as_arr { int a; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_size.c b/tools/testing/selftests/bpf/progs/test_core_reloc_size.c index 9a92998d9107..9e091124d3bd 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_size.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_size.c @@ -8,10 +8,10 @@ char _license[] SEC("license") = "GPL"; -static volatile struct data { +struct { char in[256]; char out[256]; -} data; +} data = {}; struct core_reloc_size_output { int int_sz; From a8fdaad5cfd250b9effcec942b3bf7bc5a6c8b17 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 19 Nov 2019 16:35:48 -0800 Subject: [PATCH 10/27] selftests/bpf: Integrate verbose verifier log into test_progs Add extra level of verboseness, activated by -vvv argument. When -vv is specified, verbose libbpf and verifier log (level 1) is output, even for successful tests. With -vvv, verifier log goes to level 2. This is extremely useful to debug verifier failures, as well as just see the state and flow of verification. Before this, you'd have to go and modify load_program()'s source code inside libbpf to specify extra log_level flags, which is suboptimal to say the least. Currently -vv and -vvv triggering verifier output is integrated into test_stub's bpf_prog_load as well as bpf_verif_scale.c tests.
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191120003548.4159797-1-andriin@fb.com --- .../selftests/bpf/prog_tests/bpf_verif_scale.c | 4 +++- tools/testing/selftests/bpf/test_progs.c | 18 ++++++++++++------ tools/testing/selftests/bpf/test_progs.h | 10 ++++++++-- tools/testing/selftests/bpf/test_stub.c | 4 ++++ 4 files changed, 27 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c index 1c01ee2600a9..9486c13af6b2 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c @@ -15,6 +15,8 @@ static int libbpf_debug_print(enum libbpf_print_level level, return 0; } +extern int extra_prog_load_log_flags; + static int check_load(const char *file, enum bpf_prog_type type) { struct bpf_prog_load_attr attr; @@ -24,7 +26,7 @@ static int check_load(const char *file, enum bpf_prog_type type) memset(&attr, 0, sizeof(struct bpf_prog_load_attr)); attr.file = file; attr.prog_type = type; - attr.log_level = 4; + attr.log_level = 4 | extra_prog_load_log_flags; attr.prog_flags = BPF_F_TEST_RND_HI32; err = bpf_prog_load_xattr(&attr, &obj, &prog_fd); bpf_object__close(obj); diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index a05a807840c0..7fa7d08a8104 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -45,7 +45,7 @@ static void dump_test_log(const struct prog_test_def *test, bool failed) fflush(stdout); /* exports env.log_buf & env.log_cnt */ - if (env.verbose || test->force_log || failed) { + if (env.verbosity > VERBOSE_NONE || test->force_log || failed) { if (env.log_cnt) { env.log_buf[env.log_cnt] = '\0'; fprintf(env.stdout, "%s", env.log_buf); @@ -346,14 +346,14 @@ static const struct argp_option opts[] = { { "verifier-stats", ARG_VERIFIER_STATS, 
NULL, 0, "Output verifier statistics", }, { "verbose", ARG_VERBOSE, "LEVEL", OPTION_ARG_OPTIONAL, - "Verbose output (use -vv for extra verbose output)" }, + "Verbose output (use -vv or -vvv for progressively verbose output)" }, {}, }; static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) { - if (!env.very_verbose && level == LIBBPF_DEBUG) + if (env.verbosity < VERBOSE_VERY && level == LIBBPF_DEBUG) return 0; vprintf(format, args); return 0; @@ -419,6 +419,8 @@ int parse_num_list(const char *s, struct test_selector *sel) return 0; } +extern int extra_prog_load_log_flags; + static error_t parse_arg(int key, char *arg, struct argp_state *state) { struct test_env *env = state->input; @@ -460,9 +462,14 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) env->verifier_stats = true; break; case ARG_VERBOSE: + env->verbosity = VERBOSE_NORMAL; if (arg) { if (strcmp(arg, "v") == 0) { - env->very_verbose = true; + env->verbosity = VERBOSE_VERY; + extra_prog_load_log_flags = 1; + } else if (strcmp(arg, "vv") == 0) { + env->verbosity = VERBOSE_SUPER; + extra_prog_load_log_flags = 2; } else { fprintf(stderr, "Unrecognized verbosity setting ('%s'), only -v and -vv are supported\n", @@ -470,7 +477,6 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) return -EINVAL; } } - env->verbose = true; break; case ARGP_KEY_ARG: argp_usage(state); @@ -489,7 +495,7 @@ static void stdio_hijack(void) env.stdout = stdout; env.stderr = stderr; - if (env.verbose) { + if (env.verbosity > VERBOSE_NONE) { /* nothing to do, output to stdout by default */ return; } diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 0c48f64f732b..8477df835979 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -39,6 +39,13 @@ typedef __u16 __sum16; #include "trace_helpers.h" #include "flow_dissector_load.h" +enum verbosity { + 
VERBOSE_NONE, + VERBOSE_NORMAL, + VERBOSE_VERY, + VERBOSE_SUPER, +}; + struct test_selector { const char *name; bool *num_set; @@ -49,8 +56,7 @@ struct test_env { struct test_selector test_selector; struct test_selector subtest_selector; bool verifier_stats; - bool verbose; - bool very_verbose; + enum verbosity verbosity; bool jit_enabled; diff --git a/tools/testing/selftests/bpf/test_stub.c b/tools/testing/selftests/bpf/test_stub.c index 84e81a89e2f9..47e132726203 100644 --- a/tools/testing/selftests/bpf/test_stub.c +++ b/tools/testing/selftests/bpf/test_stub.c @@ -5,6 +5,8 @@ #include #include +int extra_prog_load_log_flags = 0; + int bpf_prog_test_load(const char *file, enum bpf_prog_type type, struct bpf_object **pobj, int *prog_fd) { @@ -15,6 +17,7 @@ int bpf_prog_test_load(const char *file, enum bpf_prog_type type, attr.prog_type = type; attr.expected_attach_type = 0; attr.prog_flags = BPF_F_TEST_RND_HI32; + attr.log_level = extra_prog_load_log_flags; return bpf_prog_load_xattr(&attr, pobj, prog_fd); } @@ -35,6 +38,7 @@ int bpf_test_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, load_attr.license = license; load_attr.kern_version = kern_version; load_attr.prog_flags = BPF_F_TEST_RND_HI32; + load_attr.log_level = extra_prog_load_log_flags; return bpf_load_program_xattr(&load_attr, log_buf, log_buf_sz); } From 071cdecec57fb5d5df78e6a12114ad7bccea5b0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 21 Nov 2019 14:36:12 +0100 Subject: [PATCH 11/27] xdp: Fix cleanup on map free for devmap_hash map type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tetsuo pointed out that it was not only the device unregister hook that was broken for devmap_hash types, it was also cleanup on map free. So better fix this as well. While we're at it, there's no reason to allocate the netdev_map array for DEVMAP_HASH, so skip that and adjust the cost accordingly. 
Fixes: 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking up devices by hashed index") Reported-by: Tetsuo Handa Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20191121133612.430414-1-toke@redhat.com --- kernel/bpf/devmap.c | 74 ++++++++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 28 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 3867864cdc2f..3d3d61b5985b 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -74,7 +74,7 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; - struct bpf_dtab_netdev **netdev_map; + struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */ struct list_head __percpu *flush_list; struct list_head list; @@ -101,6 +101,12 @@ static struct hlist_head *dev_map_create_hash(unsigned int entries) return hash; } +static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, + int idx) +{ + return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; +} + static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { int err, cpu; @@ -120,8 +126,7 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) bpf_map_init_from_attr(&dtab->map, attr); /* make sure page count doesn't overflow */ - cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); - cost += sizeof(struct list_head) * num_possible_cpus(); + cost = (u64) sizeof(struct list_head) * num_possible_cpus(); if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); @@ -129,6 +134,8 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (!dtab->n_buckets) /* Overflow check */ return -EINVAL; cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets; + } else { + cost += (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); } /* if map size is larger than memlock limit, reject it */ @@ 
-143,24 +150,22 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) for_each_possible_cpu(cpu) INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); - dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * - sizeof(struct bpf_dtab_netdev *), - dtab->map.numa_node); - if (!dtab->netdev_map) - goto free_percpu; - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); if (!dtab->dev_index_head) - goto free_map_area; + goto free_percpu; spin_lock_init(&dtab->index_lock); + } else { + dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * + sizeof(struct bpf_dtab_netdev *), + dtab->map.numa_node); + if (!dtab->netdev_map) + goto free_percpu; } return 0; -free_map_area: - bpf_map_area_free(dtab->netdev_map); free_percpu: free_percpu(dtab->flush_list); free_charge: @@ -228,21 +233,40 @@ static void dev_map_free(struct bpf_map *map) cond_resched(); } - for (i = 0; i < dtab->map.max_entries; i++) { - struct bpf_dtab_netdev *dev; + if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + for (i = 0; i < dtab->n_buckets; i++) { + struct bpf_dtab_netdev *dev; + struct hlist_head *head; + struct hlist_node *next; - dev = dtab->netdev_map[i]; - if (!dev) - continue; + head = dev_map_index_hash(dtab, i); - free_percpu(dev->bulkq); - dev_put(dev->dev); - kfree(dev); + hlist_for_each_entry_safe(dev, next, head, index_hlist) { + hlist_del_rcu(&dev->index_hlist); + free_percpu(dev->bulkq); + dev_put(dev->dev); + kfree(dev); + } + } + + kfree(dtab->dev_index_head); + } else { + for (i = 0; i < dtab->map.max_entries; i++) { + struct bpf_dtab_netdev *dev; + + dev = dtab->netdev_map[i]; + if (!dev) + continue; + + free_percpu(dev->bulkq); + dev_put(dev->dev); + kfree(dev); + } + + bpf_map_area_free(dtab->netdev_map); } free_percpu(dtab->flush_list); - bpf_map_area_free(dtab->netdev_map); - kfree(dtab->dev_index_head); kfree(dtab); } @@ -263,12 +287,6 @@ static int dev_map_get_next_key(struct 
bpf_map *map, void *key, void *next_key) return 0; } -static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, - int idx) -{ - return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; -} - struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); From 581738a681b6faae5725c2555439189ca81c0f1f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Nov 2019 09:06:50 -0800 Subject: [PATCH 12/27] bpf: Provide better register bounds after jmp32 instructions With latest llvm (trunk https://github.com/llvm/llvm-project), test_progs, which has +alu32 enabled, failed for strobemeta.o. The verifier output looks like below with edit to replace large decimal numbers with hex ones. 193: (85) call bpf_probe_read_user_str#114 R0=inv(id=0) 194: (26) if w0 > 0x1 goto pc+4 R0_w=inv(id=0,umax_value=0xffffffff00000001) 195: (6b) *(u16 *)(r7 +80) = r0 196: (bc) w6 = w0 R6_w=inv(id=0,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) 197: (67) r6 <<= 32 R6_w=inv(id=0,smax_value=0x7fffffff00000000,umax_value=0xffffffff00000000, var_off=(0x0; 0xffffffff00000000)) 198: (77) r6 >>= 32 R6=inv(id=0,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) ... 201: (79) r8 = *(u64 *)(r10 -416) R8_w=map_value(id=0,off=40,ks=4,vs=13872,imm=0) 202: (0f) r8 += r6 R8_w=map_value(id=0,off=40,ks=4,vs=13872,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) 203: (07) r8 += 9696 R8_w=map_value(id=0,off=9736,ks=4,vs=13872,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) ... 255: (bf) r1 = r8 R1_w=map_value(id=0,off=9736,ks=4,vs=13872,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) ... 257: (85) call bpf_probe_read_user_str#114 R1 unbounded memory access, make sure to bounds check any array access into a map The value range for register r6 at insn 198 should be really just 0/1. The umax_value=0xffffffff caused later verification failure. 
After jmp instructions, the current verifier already tries to use the just-obtained information to get a better register range. The current mechanism is for 64bit registers only. This patch tightens the range for 32bit sub-registers after jmp32 instructions. With the patch, we have the below ranges for the above code sequence: 193: (85) call bpf_probe_read_user_str#114 R0=inv(id=0) 194: (26) if w0 > 0x1 goto pc+4 R0_w=inv(id=0,smax_value=0x7fffffff00000001,umax_value=0xffffffff00000001, var_off=(0x0; 0xffffffff00000001)) 195: (6b) *(u16 *)(r7 +80) = r0 196: (bc) w6 = w0 R6_w=inv(id=0,umax_value=0xffffffff,var_off=(0x0; 0x1)) 197: (67) r6 <<= 32 R6_w=inv(id=0,umax_value=0x100000000,var_off=(0x0; 0x100000000)) 198: (77) r6 >>= 32 R6=inv(id=0,umax_value=1,var_off=(0x0; 0x1)) ... 201: (79) r8 = *(u64 *)(r10 -416) R8_w=map_value(id=0,off=40,ks=4,vs=13872,imm=0) 202: (0f) r8 += r6 R8_w=map_value(id=0,off=40,ks=4,vs=13872,umax_value=1,var_off=(0x0; 0x1)) 203: (07) r8 += 9696 R8_w=map_value(id=0,off=9736,ks=4,vs=13872,umax_value=1,var_off=(0x0; 0x1)) ... 255: (bf) r1 = r8 R1_w=map_value(id=0,off=9736,ks=4,vs=13872,umax_value=1,var_off=(0x0; 0x1)) ... 257: (85) call bpf_probe_read_user_str#114 ... At insn 194, the register R0 has better var_off.mask and smax_value. Especially, the var_off.mask ensures later lshift and rshift maintain a proper value range.
Suggested-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191121170650.449030-1-yhs@fb.com --- kernel/bpf/verifier.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9f59f7a19dd0..fc85714428c7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1007,6 +1007,17 @@ static void __reg_bound_offset(struct bpf_reg_state *reg) reg->umax_value)); } +static void __reg_bound_offset32(struct bpf_reg_state *reg) +{ + u64 mask = 0xffffFFFF; + struct tnum range = tnum_range(reg->umin_value & mask, + reg->umax_value & mask); + struct tnum lo32 = tnum_cast(reg->var_off, 4); + struct tnum hi32 = tnum_lshift(tnum_rshift(reg->var_off, 32), 32); + + reg->var_off = tnum_or(hi32, tnum_intersect(lo32, range)); +} + /* Reset the min/max bounds of a register */ static void __mark_reg_unbounded(struct bpf_reg_state *reg) { @@ -5589,6 +5600,10 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, /* We might have learned some bits from the bounds. */ __reg_bound_offset(false_reg); __reg_bound_offset(true_reg); + if (is_jmp32) { + __reg_bound_offset32(false_reg); + __reg_bound_offset32(true_reg); + } /* Intersecting with the old var_off might have improved our bounds * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), * then new var_off is (0; 0x7f...fc) which improves our umax. @@ -5698,6 +5713,10 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, /* We might have learned some bits from the bounds. */ __reg_bound_offset(false_reg); __reg_bound_offset(true_reg); + if (is_jmp32) { + __reg_bound_offset32(false_reg); + __reg_bound_offset32(true_reg); + } /* Intersecting with the old var_off might have improved our bounds * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), * then new var_off is (0; 0x7f...fc) which improves our umax. 
From 1aace10f41adf1080d1cc54de9b3db98b8b8b0fb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 21 Nov 2019 16:35:27 -0800 Subject: [PATCH 13/27] libbpf: Fix bpf_object name determination for bpf_object__open_file() If bpf_object__open_file() gets a path like "some/dir/obj.o", it should derive BPF object's name as "obj" (unless overridden through opts->object_name). Instead, due to using `path` as a fallback value for opts->object_name, path is used as is for object name, so for the above example BPF object's name will be verbatim "some/dir/obj", which leads to all sorts of troubles, especially when internal maps are concerned (they are using up to 8 characters of object name). Fix that by ensuring object_name stays NULL, unless overridden. Fixes: 291ee02b5e40 ("libbpf: Refactor bpf_object__open APIs to use common opts") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191122003527.551556-1-andriin@fb.com --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index a4e250a369c6..e1698461c6b3 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -3940,7 +3940,7 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, if (!OPTS_VALID(opts, bpf_object_open_opts)) return ERR_PTR(-EINVAL); - obj_name = OPTS_GET(opts, object_name, path); + obj_name = OPTS_GET(opts, object_name, NULL); if (obj_buf) { if (!obj_name) { snprintf(tmp_name, sizeof(tmp_name), "%lx-%lx", From 260cb5df9d16c5715b32d73cc8af26ad9a17a792 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Nov 2019 09:06:51 -0800 Subject: [PATCH 14/27] selftests/bpf: Add verifier tests for better jmp32 register bounds Three test cases are added. Test 1: jmp32 'reg op imm'.
Test 2: jmp32 'reg op reg' where dst 'reg' has unknown constant and src 'reg' has known constant Test 3: jmp32 'reg op reg' where dst 'reg' has known constant and src 'reg' has unknown constant Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191121170651.449096-1-yhs@fb.com --- tools/testing/selftests/bpf/verifier/jmp32.c | 83 ++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/jmp32.c b/tools/testing/selftests/bpf/verifier/jmp32.c index f0961c58581e..bf0322eb5346 100644 --- a/tools/testing/selftests/bpf/verifier/jmp32.c +++ b/tools/testing/selftests/bpf/verifier/jmp32.c @@ -744,3 +744,86 @@ .result = ACCEPT, .retval = 2, }, +{ + "jgt32: range bound deduction, reg op imm", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_8), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_get_cgroup_classid), + BPF_JMP32_IMM(BPF_JGT, BPF_REG_0, 1, 5), + BPF_MOV32_REG(BPF_REG_6, BPF_REG_0), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 32), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_6, 32), + BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_6), + BPF_ST_MEM(BPF_B, BPF_REG_8, 0, 0), + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .fixup_map_hash_48b = { 4 }, + .result = ACCEPT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, +}, +{ + "jgt32: range bound deduction, reg1 op reg2, reg1 unknown", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, 
BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_8), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_get_cgroup_classid), + BPF_MOV32_IMM(BPF_REG_2, 1), + BPF_JMP32_REG(BPF_JGT, BPF_REG_0, BPF_REG_2, 5), + BPF_MOV32_REG(BPF_REG_6, BPF_REG_0), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 32), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_6, 32), + BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_6), + BPF_ST_MEM(BPF_B, BPF_REG_8, 0, 0), + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .fixup_map_hash_48b = { 4 }, + .result = ACCEPT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, +}, +{ + "jle32: range bound deduction, reg1 op reg2, reg2 unknown", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_8), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_get_cgroup_classid), + BPF_MOV32_IMM(BPF_REG_2, 1), + BPF_JMP32_REG(BPF_JLE, BPF_REG_2, BPF_REG_0, 5), + BPF_MOV32_REG(BPF_REG_6, BPF_REG_0), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 32), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_6, 32), + BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_6), + BPF_ST_MEM(BPF_B, BPF_REG_8, 0, 0), + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .fixup_map_hash_48b = { 4 }, + .result = ACCEPT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, +}, From 6147a140c99f1ded2b519dfbed17e781e5861bf3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 21 Nov 2019 09:59:00 -0800 Subject: [PATCH 15/27] selftests/bpf: Ensure core_reloc_kernel is reading test_progs's data only test_core_reloc_kernel.c selftest is the only CO-RE test that reads and returns for validation 
calling thread's information (pid, tgid, comm). Thus it has to make sure that only test_prog's invocations are honored. Fixes: df36e621418b ("selftests/bpf: add CO-RE relocs testing setup") Reported-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20191121175900.3486133-1-andriin@fb.com --- .../selftests/bpf/prog_tests/core_reloc.c | 16 +++++++++++----- .../selftests/bpf/progs/test_core_reloc_kernel.c | 4 ++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index ec9e2fdd6b89..05fe85281ff7 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -2,6 +2,7 @@ #include #include "progs/core_reloc_types.h" #include +#include #define STRUCT_TO_CHAR_PTR(struct_name) (const char *)&(struct struct_name) @@ -452,6 +453,7 @@ static struct core_reloc_test_case test_cases[] = { struct data { char in[256]; char out[256]; + uint64_t my_pid_tgid; }; static size_t roundup_page(size_t sz) @@ -471,9 +473,12 @@ void test_core_reloc(void) struct bpf_map *data_map; struct bpf_program *prog; struct bpf_object *obj; + uint64_t my_pid_tgid; struct data *data; void *mmap_data = NULL; + my_pid_tgid = getpid() | ((uint64_t)syscall(SYS_gettid) << 32); + for (i = 0; i < ARRAY_SIZE(test_cases); i++) { test_case = &test_cases[i]; if (!test__start_subtest(test_case->case_name)) @@ -517,11 +522,6 @@ void test_core_reloc(void) goto cleanup; } - link = bpf_program__attach_raw_tracepoint(prog, tp_name); - if (CHECK(IS_ERR(link), "attach_raw_tp", "err %ld\n", - PTR_ERR(link))) - goto cleanup; - data_map = bpf_object__find_map_by_name(obj, "test_cor.bss"); if (CHECK(!data_map, "find_data_map", "data map not found\n")) goto cleanup; @@ -537,6 +537,12 @@ void test_core_reloc(void) memset(mmap_data, 0, sizeof(*data)); 
memcpy(data->in, test_case->input, test_case->input_len); + data->my_pid_tgid = my_pid_tgid; + + link = bpf_program__attach_raw_tracepoint(prog, tp_name); + if (CHECK(IS_ERR(link), "attach_raw_tp", "err %ld\n", + PTR_ERR(link))) + goto cleanup; /* trigger test run */ usleep(1); diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c b/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c index 5d499ebdc4bd..270de441b60a 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c @@ -11,6 +11,7 @@ char _license[] SEC("license") = "GPL"; struct { char in[256]; char out[256]; + uint64_t my_pid_tgid; } data = {}; struct core_reloc_kernel_output { @@ -38,6 +39,9 @@ int test_core_kernel(void *ctx) uint32_t real_tgid = (uint32_t)pid_tgid; int pid, tgid; + if (data.my_pid_tgid != pid_tgid) + return 0; + if (CORE_READ(&pid, &task->pid) || CORE_READ(&tgid, &task->tgid)) return 1; From c4781e37c6a22c39cb4a57411d14f42aca124f04 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 21 Nov 2019 17:15:15 -0800 Subject: [PATCH 16/27] selftests/bpf: Add BPF trampoline performance test Add a test that benchmarks different ways of attaching BPF program to a kernel function. 
Here are the results for 2.4Ghz x86 cpu on a kernel without mitigations: $ ./test_progs -n 49 -v|grep events task_rename base 2743K events per sec task_rename kprobe 2419K events per sec task_rename kretprobe 1876K events per sec task_rename raw_tp 2578K events per sec task_rename fentry 2710K events per sec task_rename fexit 2685K events per sec On a kernel with retpoline: $ ./test_progs -n 49 -v|grep events task_rename base 2401K events per sec task_rename kprobe 1930K events per sec task_rename kretprobe 1485K events per sec task_rename raw_tp 2053K events per sec task_rename fentry 2351K events per sec task_rename fexit 2185K events per sec All 5 approaches: - kprobe/kretprobe in __set_task_comm() - raw tracepoint in trace_task_rename() - fentry/fexit in __set_task_comm() are roughly equivalent. __set_task_comm() by itself is quite fast, so any extra instructions add up. Until BPF trampoline was introduced the fastest mechanism was raw tracepoint. kprobe via ftrace was second best. kretprobe is slow due to trap. New fentry/fexit methods via BPF trampoline are clearly the fastest and the difference is more pronounced with retpoline on, since BPF trampoline doesn't use indirect jumps. 
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20191122011515.255371-1-ast@kernel.org --- .../selftests/bpf/prog_tests/test_overhead.c | 142 ++++++++++++++++++ .../selftests/bpf/progs/test_overhead.c | 43 ++++++ 2 files changed, 185 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/test_overhead.c create mode 100644 tools/testing/selftests/bpf/progs/test_overhead.c diff --git a/tools/testing/selftests/bpf/prog_tests/test_overhead.c b/tools/testing/selftests/bpf/prog_tests/test_overhead.c new file mode 100644 index 000000000000..c32aa28bd93f --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_overhead.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2019 Facebook */ +#define _GNU_SOURCE +#include +#include + +#define MAX_CNT 100000 + +static __u64 time_get_ns(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * 1000000000ull + ts.tv_nsec; +} + +static int test_task_rename(const char *prog) +{ + int i, fd, duration = 0, err; + char buf[] = "test\n"; + __u64 start_time; + + fd = open("/proc/self/comm", O_WRONLY|O_TRUNC); + if (CHECK(fd < 0, "open /proc", "err %d", errno)) + return -1; + start_time = time_get_ns(); + for (i = 0; i < MAX_CNT; i++) { + err = write(fd, buf, sizeof(buf)); + if (err < 0) { + CHECK(err < 0, "task rename", "err %d", errno); + close(fd); + return -1; + } + } + printf("task_rename %s\t%lluK events per sec\n", prog, + MAX_CNT * 1000000ll / (time_get_ns() - start_time)); + close(fd); + return 0; +} + +static void test_run(const char *prog) +{ + test_task_rename(prog); +} + +static void setaffinity(void) +{ + cpu_set_t cpuset; + int cpu = 0; + + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); + sched_setaffinity(0, sizeof(cpuset), &cpuset); +} + +void test_test_overhead(void) +{ + const char *kprobe_name = "kprobe/__set_task_comm"; + const char *kretprobe_name = 
"kretprobe/__set_task_comm"; + const char *raw_tp_name = "raw_tp/task_rename"; + const char *fentry_name = "fentry/__set_task_comm"; + const char *fexit_name = "fexit/__set_task_comm"; + const char *kprobe_func = "__set_task_comm"; + struct bpf_program *kprobe_prog, *kretprobe_prog, *raw_tp_prog; + struct bpf_program *fentry_prog, *fexit_prog; + struct bpf_object *obj; + struct bpf_link *link; + int err, duration = 0; + + obj = bpf_object__open_file("./test_overhead.o", NULL); + if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj))) + return; + + kprobe_prog = bpf_object__find_program_by_title(obj, kprobe_name); + if (CHECK(!kprobe_prog, "find_probe", + "prog '%s' not found\n", kprobe_name)) + goto cleanup; + kretprobe_prog = bpf_object__find_program_by_title(obj, kretprobe_name); + if (CHECK(!kretprobe_prog, "find_probe", + "prog '%s' not found\n", kretprobe_name)) + goto cleanup; + raw_tp_prog = bpf_object__find_program_by_title(obj, raw_tp_name); + if (CHECK(!raw_tp_prog, "find_probe", + "prog '%s' not found\n", raw_tp_name)) + goto cleanup; + fentry_prog = bpf_object__find_program_by_title(obj, fentry_name); + if (CHECK(!fentry_prog, "find_probe", + "prog '%s' not found\n", fentry_name)) + goto cleanup; + fexit_prog = bpf_object__find_program_by_title(obj, fexit_name); + if (CHECK(!fexit_prog, "find_probe", + "prog '%s' not found\n", fexit_name)) + goto cleanup; + + err = bpf_object__load(obj); + if (CHECK(err, "obj_load", "err %d\n", err)) + goto cleanup; + + setaffinity(); + + /* base line run */ + test_run("base"); + + /* attach kprobe */ + link = bpf_program__attach_kprobe(kprobe_prog, false /* retprobe */, + kprobe_func); + if (CHECK(IS_ERR(link), "attach_kprobe", "err %ld\n", PTR_ERR(link))) + goto cleanup; + test_run("kprobe"); + bpf_link__destroy(link); + + /* attach kretprobe */ + link = bpf_program__attach_kprobe(kretprobe_prog, true /* retprobe */, + kprobe_func); + if (CHECK(IS_ERR(link), "attach kretprobe", "err %ld\n", PTR_ERR(link))) 
+ goto cleanup; + test_run("kretprobe"); + bpf_link__destroy(link); + + /* attach raw_tp */ + link = bpf_program__attach_raw_tracepoint(raw_tp_prog, "task_rename"); + if (CHECK(IS_ERR(link), "attach fentry", "err %ld\n", PTR_ERR(link))) + goto cleanup; + test_run("raw_tp"); + bpf_link__destroy(link); + + /* attach fentry */ + link = bpf_program__attach_trace(fentry_prog); + if (CHECK(IS_ERR(link), "attach fentry", "err %ld\n", PTR_ERR(link))) + goto cleanup; + test_run("fentry"); + bpf_link__destroy(link); + + /* attach fexit */ + link = bpf_program__attach_trace(fexit_prog); + if (CHECK(IS_ERR(link), "attach fexit", "err %ld\n", PTR_ERR(link))) + goto cleanup; + test_run("fexit"); + bpf_link__destroy(link); +cleanup: + bpf_object__close(obj); +} diff --git a/tools/testing/selftests/bpf/progs/test_overhead.c b/tools/testing/selftests/bpf/progs/test_overhead.c new file mode 100644 index 000000000000..ef06b2693f96 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_overhead.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ +#include +#include "bpf_helpers.h" +#include "bpf_tracing.h" + +SEC("kprobe/__set_task_comm") +int prog1(struct pt_regs *ctx) +{ + return 0; +} + +SEC("kretprobe/__set_task_comm") +int prog2(struct pt_regs *ctx) +{ + return 0; +} + +SEC("raw_tp/task_rename") +int prog3(struct bpf_raw_tracepoint_args *ctx) +{ + return 0; +} + +struct __set_task_comm_args { + struct task_struct *tsk; + const char *buf; + ku8 exec; +}; + +SEC("fentry/__set_task_comm") +int prog4(struct __set_task_comm_args *ctx) +{ + return 0; +} + +SEC("fexit/__set_task_comm") +int prog5(struct __set_task_comm_args *ctx) +{ + return 0; +} + +char _license[] SEC("license") = "GPL"; From 4b3da77b72ad6b3c48c6fe4a395ace7db39a12c5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:54 +0100 Subject: [PATCH 17/27] bpf, x86: Generalize and extend bpf_arch_text_poke for direct jumps Add 
BPF_MOD_{NOP_TO_JUMP,JUMP_TO_JUMP,JUMP_TO_NOP} patching for x86 JIT in order to be able to patch direct jumps or nop them out. We need this facility in order to patch tail call jumps and in later work also BPF static keys. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/aa4784196a8e5e985af4b30a4fe5336bce6e9643.1574452833.git.daniel@iogearbox.net --- arch/x86/net/bpf_jit_comp.c | 64 ++++++++++++++++++++++++++----------- include/linux/bpf.h | 6 ++++ 2 files changed, 52 insertions(+), 18 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 2e586f579945..f438bd3b7689 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -203,8 +203,9 @@ struct jit_context { /* Maximum number of bytes emitted while JITing one eBPF insn */ #define BPF_MAX_INSN_SIZE 128 #define BPF_INSN_SAFETY 64 -/* number of bytes emit_call() needs to generate call instruction */ -#define X86_CALL_SIZE 5 + +/* Number of bytes emit_patch() needs to generate instructions */ +#define X86_PATCH_SIZE 5 #define PROLOGUE_SIZE 25 @@ -215,7 +216,7 @@ struct jit_context { static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) { u8 *prog = *pprog; - int cnt = X86_CALL_SIZE; + int cnt = X86_PATCH_SIZE; /* BPF trampoline can be made to work without these nops, * but let's waste 5 bytes for now and optimize later @@ -480,64 +481,91 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) *pprog = prog; } -static int emit_call(u8 **pprog, void *func, void *ip) +static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode) { u8 *prog = *pprog; int cnt = 0; s64 offset; - offset = func - (ip + X86_CALL_SIZE); + offset = func - (ip + X86_PATCH_SIZE); if (!is_simm32(offset)) { pr_err("Target call %p is out of range\n", func); return -EINVAL; } - EMIT1_off32(0xE8, offset); + EMIT1_off32(opcode, offset); *pprog = prog; return 0; } +static 
int emit_call(u8 **pprog, void *func, void *ip) +{ + return emit_patch(pprog, func, ip, 0xE8); +} + +static int emit_jump(u8 **pprog, void *func, void *ip) +{ + return emit_patch(pprog, func, ip, 0xE9); +} + int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *old_addr, void *new_addr) { - u8 old_insn[X86_CALL_SIZE] = {}; - u8 new_insn[X86_CALL_SIZE] = {}; + int (*emit_patch_fn)(u8 **pprog, void *func, void *ip); + u8 old_insn[X86_PATCH_SIZE] = {}; + u8 new_insn[X86_PATCH_SIZE] = {}; u8 *prog; int ret; if (!is_kernel_text((long)ip) && !is_bpf_text_address((long)ip)) - /* BPF trampoline in modules is not supported */ + /* BPF poking in modules is not supported */ return -EINVAL; + switch (t) { + case BPF_MOD_NOP_TO_CALL ... BPF_MOD_CALL_TO_NOP: + emit_patch_fn = emit_call; + break; + case BPF_MOD_NOP_TO_JUMP ... BPF_MOD_JUMP_TO_NOP: + emit_patch_fn = emit_jump; + break; + default: + return -ENOTSUPP; + } + if (old_addr) { prog = old_insn; - ret = emit_call(&prog, old_addr, (void *)ip); + ret = emit_patch_fn(&prog, old_addr, (void *)ip); if (ret) return ret; } if (new_addr) { prog = new_insn; - ret = emit_call(&prog, new_addr, (void *)ip); + ret = emit_patch_fn(&prog, new_addr, (void *)ip); if (ret) return ret; } + ret = -EBUSY; mutex_lock(&text_mutex); switch (t) { case BPF_MOD_NOP_TO_CALL: - if (memcmp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE)) + case BPF_MOD_NOP_TO_JUMP: + if (memcmp(ip, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE)) goto out; - text_poke_bp(ip, new_insn, X86_CALL_SIZE, NULL); + text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); break; case BPF_MOD_CALL_TO_CALL: - if (memcmp(ip, old_insn, X86_CALL_SIZE)) + case BPF_MOD_JUMP_TO_JUMP: + if (memcmp(ip, old_insn, X86_PATCH_SIZE)) goto out; - text_poke_bp(ip, new_insn, X86_CALL_SIZE, NULL); + text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); break; case BPF_MOD_CALL_TO_NOP: - if (memcmp(ip, old_insn, X86_CALL_SIZE)) + case BPF_MOD_JUMP_TO_NOP: + if (memcmp(ip, old_insn, X86_PATCH_SIZE)) goto 
out; - text_poke_bp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE, NULL); + text_poke_bp(ip, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE, + NULL); break; } ret = 0; @@ -1394,7 +1422,7 @@ int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags /* skip patched call instruction and point orig_call to actual * body of the kernel function. */ - orig_call += X86_CALL_SIZE; + orig_call += X86_PATCH_SIZE; prog = image; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e89e86122233..7978b617caa8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1284,10 +1284,16 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, #endif /* CONFIG_INET */ enum bpf_text_poke_type { + /* All call-related pokes. */ BPF_MOD_NOP_TO_CALL, BPF_MOD_CALL_TO_CALL, BPF_MOD_CALL_TO_NOP, + /* All jump-related pokes. */ + BPF_MOD_NOP_TO_JUMP, + BPF_MOD_JUMP_TO_JUMP, + BPF_MOD_JUMP_TO_NOP, }; + int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); From 6332be04c039a72fca32ed0a4265bac58d606bb6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:55 +0100 Subject: [PATCH 18/27] bpf: Move bpf_free_used_maps into sleepable section We later on are going to need a sleepable context as opposed to plain RCU callback in order to untrack programs we need to poke at runtime and tracking as well as image update is performed under mutex. 
Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/09823b1d5262876e9b83a8e75df04cf0467357a4.1574452833.git.daniel@iogearbox.net --- include/linux/bpf.h | 4 ++++ kernel/bpf/core.c | 23 +++++++++++++++++++++++ kernel/bpf/syscall.c | 20 -------------------- 3 files changed, 27 insertions(+), 20 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7978b617caa8..561b920f0bf7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1031,6 +1031,10 @@ static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, { return -ENOTSUPP; } + +static inline void bpf_map_put(struct bpf_map *map) +{ +} #endif /* CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b5945c3aaa8e..0e825c164f1a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2003,12 +2003,35 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array, : 0; } +static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux) +{ + enum bpf_cgroup_storage_type stype; + + for_each_cgroup_storage_type(stype) { + if (!aux->cgroup_storage[stype]) + continue; + bpf_cgroup_storage_release(aux->prog, + aux->cgroup_storage[stype]); + } +} + +static void bpf_free_used_maps(struct bpf_prog_aux *aux) +{ + int i; + + bpf_free_cgroup_storage(aux); + for (i = 0; i < aux->used_map_cnt; i++) + bpf_map_put(aux->used_maps[i]); + kfree(aux->used_maps); +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; int i; aux = container_of(work, struct bpf_prog_aux, work); + bpf_free_used_maps(aux); if (bpf_prog_is_dev_bound(aux)) bpf_prog_offload_destroy(aux->prog); #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4ae52eb05f41..373778da8489 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1302,25 +1302,6 @@ static int find_prog_type(enum bpf_prog_type 
type, struct bpf_prog *prog) return 0; } -/* drop refcnt on maps used by eBPF program and free auxilary data */ -static void free_used_maps(struct bpf_prog_aux *aux) -{ - enum bpf_cgroup_storage_type stype; - int i; - - for_each_cgroup_storage_type(stype) { - if (!aux->cgroup_storage[stype]) - continue; - bpf_cgroup_storage_release(aux->prog, - aux->cgroup_storage[stype]); - } - - for (i = 0; i < aux->used_map_cnt; i++) - bpf_map_put(aux->used_maps[i]); - - kfree(aux->used_maps); -} - int __bpf_prog_charge(struct user_struct *user, u32 pages) { unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -1415,7 +1396,6 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) kvfree(aux->func_info); kfree(aux->func_info_aux); - free_used_maps(aux); bpf_prog_uncharge_memlock(aux->prog); security_bpf_prog_free(aux); bpf_prog_free(aux->prog); From 2beee5f57441413b64a9c2bd657e17beabb98d1c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:56 +0100 Subject: [PATCH 19/27] bpf: Move owner type, jited info into array auxiliary data We're going to extend this with further information which is only relevant for prog array at this point. Given this info is not used in critical path, move it into its own structure such that the main array map structure can be kept on diet. 
Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/b9ddccdb0f6f7026489ee955f16c96381e1e7238.1574452833.git.daniel@iogearbox.net --- include/linux/bpf.h | 18 +++++++++++------- kernel/bpf/arraymap.c | 32 ++++++++++++++++++++++++++++++-- kernel/bpf/core.c | 11 +++++------ kernel/bpf/map_in_map.c | 5 ++--- kernel/bpf/syscall.c | 16 ++++++---------- 5 files changed, 54 insertions(+), 28 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 561b920f0bf7..c3b29061284e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -560,17 +560,21 @@ struct bpf_prog_aux { }; }; +struct bpf_array_aux { + /* 'Ownership' of prog array is claimed by the first program that + * is going to use this map or by the first program which FD is + * stored in the map to make sure that all callers and callees have + * the same prog type and JITed flag. + */ + enum bpf_prog_type type; + bool jited; +}; + struct bpf_array { struct bpf_map map; u32 elem_size; u32 index_mask; - /* 'ownership' of prog_array is claimed by the first program that - * is going to use this map or by the first program which FD is stored - * in the map to make sure that all callers and callees have the same - * prog_type and JITed flag - */ - enum bpf_prog_type owner_prog_type; - bool owner_jited; + struct bpf_array_aux *aux; union { char value[0] __aligned(8); void *ptrs[0] __aligned(8); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 633c8c701ff6..57da950ee55b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -671,10 +671,38 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) +{ + struct bpf_array_aux *aux; + struct bpf_map *map; + + aux = kzalloc(sizeof(*aux), GFP_KERNEL); + if (!aux) + return ERR_PTR(-ENOMEM); + + map = array_map_alloc(attr); + if (IS_ERR(map)) { + kfree(aux); + 
return map; + } + + container_of(map, struct bpf_array, map)->aux = aux; + return map; +} + +static void prog_array_map_free(struct bpf_map *map) +{ + struct bpf_array_aux *aux; + + aux = container_of(map, struct bpf_array, map)->aux; + kfree(aux); + fd_array_map_free(map); +} + const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, - .map_alloc = array_map_alloc, - .map_free = fd_array_map_free, + .map_alloc = prog_array_map_alloc, + .map_free = prog_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 0e825c164f1a..07af9c1d9cf1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1691,18 +1691,17 @@ bool bpf_prog_array_compatible(struct bpf_array *array, if (fp->kprobe_override) return false; - if (!array->owner_prog_type) { + if (!array->aux->type) { /* There's no owner yet where we could check for * compatibility. */ - array->owner_prog_type = fp->type; - array->owner_jited = fp->jited; - + array->aux->type = fp->type; + array->aux->jited = fp->jited; return true; } - return array->owner_prog_type == fp->type && - array->owner_jited == fp->jited; + return array->aux->type == fp->type && + array->aux->jited == fp->jited; } static int bpf_check_tail_call(const struct bpf_prog *fp) diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 4cbe987be35b..5e9366b33f0f 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -17,9 +17,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) if (IS_ERR(inner_map)) return inner_map; - /* prog_array->owner_prog_type and owner_jited - * is a runtime binding. Doing static check alone - * in the verifier is not enough. + /* prog_array->aux->{type,jited} is a runtime binding. + * Doing static check alone in the verifier is not enough. 
*/ if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 373778da8489..b904d56ec686 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -389,13 +389,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_map *map = filp->private_data; const struct bpf_array *array; - u32 owner_prog_type = 0; - u32 owner_jited = 0; + u32 type = 0, jited = 0; if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { array = container_of(map, struct bpf_array, map); - owner_prog_type = array->owner_prog_type; - owner_jited = array->owner_jited; + type = array->aux->type; + jited = array->aux->jited; } seq_printf(m, @@ -415,12 +414,9 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) map->memory.pages * 1ULL << PAGE_SHIFT, map->id, READ_ONCE(map->frozen)); - - if (owner_prog_type) { - seq_printf(m, "owner_prog_type:\t%u\n", - owner_prog_type); - seq_printf(m, "owner_jited:\t%u\n", - owner_jited); + if (type) { + seq_printf(m, "owner_prog_type:\t%u\n", type); + seq_printf(m, "owner_jited:\t%u\n", jited); } } #endif From a66886fe6c24ebeeb6dc10fbd9b75158029eacf7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:57 +0100 Subject: [PATCH 20/27] bpf: Add initial poke descriptor table for jit images Add initial poke table data structures and management to the BPF prog that can later be used by JITs. Also add an instance of poke specific data for tail call maps; plan for later work is to extend this also for BPF static keys. 
Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1db285ec2ea4207ee0455b3f8e191a4fc58b9ade.1574452833.git.daniel@iogearbox.net --- include/linux/bpf.h | 20 ++++++++++++++++++++ include/linux/filter.h | 10 ++++++++++ kernel/bpf/core.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c3b29061284e..312983bf7faa 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -488,6 +488,24 @@ struct bpf_func_info_aux { bool unreliable; }; +enum bpf_jit_poke_reason { + BPF_POKE_REASON_TAIL_CALL, +}; + +/* Descriptor of pokes pointing /into/ the JITed image. */ +struct bpf_jit_poke_descriptor { + void *ip; + union { + struct { + struct bpf_map *map; + u32 key; + } tail_call; + }; + bool ip_stable; + u8 adj_off; + u16 reason; +}; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; @@ -513,6 +531,8 @@ struct bpf_prog_aux { const char *attach_func_name; struct bpf_prog **func; void *jit_data; /* JIT specific data. 
arch dependent */ + struct bpf_jit_poke_descriptor *poke_tab; + u32 size_poke_tab; struct latch_tree_node ksym_tnode; struct list_head ksym_lnode; const struct bpf_prog_ops *ops; diff --git a/include/linux/filter.h b/include/linux/filter.h index ad80e9c6111c..796b60d8cc6c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -952,6 +952,9 @@ void *bpf_jit_alloc_exec(unsigned long size); void bpf_jit_free_exec(void *addr); void bpf_jit_free(struct bpf_prog *fp); +int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, + struct bpf_jit_poke_descriptor *poke); + int bpf_jit_get_func_addr(const struct bpf_prog *prog, const struct bpf_insn *insn, bool extra_pass, u64 *func_addr, bool *func_addr_fixed); @@ -1055,6 +1058,13 @@ static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) return false; } +static inline int +bpf_jit_add_poke_descriptor(struct bpf_prog *prog, + struct bpf_jit_poke_descriptor *poke) +{ + return -ENOTSUPP; +} + static inline void bpf_jit_free(struct bpf_prog *fp) { bpf_prog_unlock_free(fp); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 07af9c1d9cf1..608b7085e0c9 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -256,6 +256,7 @@ void __bpf_prog_free(struct bpf_prog *fp) { if (fp->aux) { free_percpu(fp->aux->stats); + kfree(fp->aux->poke_tab); kfree(fp->aux); } vfree(fp); @@ -756,6 +757,39 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, return ret; } +int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, + struct bpf_jit_poke_descriptor *poke) +{ + struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; + static const u32 poke_tab_max = 1024; + u32 slot = prog->aux->size_poke_tab; + u32 size = slot + 1; + + if (size > poke_tab_max) + return -ENOSPC; + if (poke->ip || poke->ip_stable || poke->adj_off) + return -EINVAL; + + switch (poke->reason) { + case BPF_POKE_REASON_TAIL_CALL: + if (!poke->tail_call.map) + return -EINVAL; + break; + default: + return -EINVAL; + } + + tab = 
krealloc(tab, size * sizeof(*poke), GFP_KERNEL); + if (!tab) + return -ENOMEM; + + memcpy(&tab[slot], poke, sizeof(*poke)); + prog->aux->size_poke_tab = size; + prog->aux->poke_tab = tab; + + return slot; +} + static atomic_long_t bpf_jit_current; /* Can be overridden by an arch's JIT compiler if it has a custom, From da765a2f599304a81a25e77908d1790414ecdbb6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:58 +0100 Subject: [PATCH 21/27] bpf: Add poke dependency tracking for prog array maps This work adds program tracking to prog array maps. This is needed such that upon prog array updates/deletions we can fix up all programs which make use of this tail call map. We add ops->map_poke_{un,}track() helpers to maps to maintain the list of programs and ops->map_poke_run() for triggering the actual update. bpf_array_aux is extended to contain the list head and poke_mutex in order to serialize program patching during updates/deletions. bpf_free_used_maps() will untrack the program shortly before dropping the reference to the map. For clearing out the prog array once all urefs are dropped we need to use schedule_work() to have a sleepable context. The prog_array_map_poke_run() is triggered during updates/deletions and walks the maintained prog list. It checks in their poke_tabs whether the map and key is matching and runs the actual bpf_arch_text_poke() for patching in the nop or new jmp location. Depending on the type of update, we use one of BPF_MOD_{NOP_TO_JUMP,JUMP_TO_NOP,JUMP_TO_JUMP}. 
Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1fb364bb3c565b3e415d5ea348f036ff379e779d.1574452833.git.daniel@iogearbox.net --- include/linux/bpf.h | 12 +++ kernel/bpf/arraymap.c | 183 +++++++++++++++++++++++++++++++++++++++++- kernel/bpf/core.c | 9 ++- kernel/bpf/syscall.c | 20 +++-- 4 files changed, 212 insertions(+), 12 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 312983bf7faa..c2f07fd410c1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -22,6 +22,7 @@ struct bpf_verifier_env; struct bpf_verifier_log; struct perf_event; struct bpf_prog; +struct bpf_prog_aux; struct bpf_map; struct sock; struct seq_file; @@ -64,6 +65,12 @@ struct bpf_map_ops { const struct btf_type *key_type, const struct btf_type *value_type); + /* Prog poke tracking helpers. */ + int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux); + void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux); + void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old, + struct bpf_prog *new); + /* Direct value access helpers. */ int (*map_direct_value_addr)(const struct bpf_map *map, u64 *imm, u32 off); @@ -588,6 +595,11 @@ struct bpf_array_aux { */ enum bpf_prog_type type; bool jited; + /* Programs with direct jumps into programs part of this array. 
*/ + struct list_head poke_progs; + struct bpf_map *map; + struct mutex poke_mutex; + struct work_struct work; }; struct bpf_array { diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 57da950ee55b..58bdf5fd24cc 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -586,10 +586,17 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, if (IS_ERR(new_ptr)) return PTR_ERR(new_ptr); - old_ptr = xchg(array->ptrs + index, new_ptr); + if (map->ops->map_poke_run) { + mutex_lock(&array->aux->poke_mutex); + old_ptr = xchg(array->ptrs + index, new_ptr); + map->ops->map_poke_run(map, index, old_ptr, new_ptr); + mutex_unlock(&array->aux->poke_mutex); + } else { + old_ptr = xchg(array->ptrs + index, new_ptr); + } + if (old_ptr) map->ops->map_fd_put_ptr(old_ptr); - return 0; } @@ -602,7 +609,15 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key) if (index >= array->map.max_entries) return -E2BIG; - old_ptr = xchg(array->ptrs + index, NULL); + if (map->ops->map_poke_run) { + mutex_lock(&array->aux->poke_mutex); + old_ptr = xchg(array->ptrs + index, NULL); + map->ops->map_poke_run(map, index, old_ptr, NULL); + mutex_unlock(&array->aux->poke_mutex); + } else { + old_ptr = xchg(array->ptrs + index, NULL); + } + if (old_ptr) { map->ops->map_fd_put_ptr(old_ptr); return 0; @@ -671,6 +686,152 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +struct prog_poke_elem { + struct list_head list; + struct bpf_prog_aux *aux; +}; + +static int prog_array_map_poke_track(struct bpf_map *map, + struct bpf_prog_aux *prog_aux) +{ + struct prog_poke_elem *elem; + struct bpf_array_aux *aux; + int ret = 0; + + aux = container_of(map, struct bpf_array, map)->aux; + mutex_lock(&aux->poke_mutex); + list_for_each_entry(elem, &aux->poke_progs, list) { + if (elem->aux == prog_aux) + goto out; + } + + elem = kmalloc(sizeof(*elem), GFP_KERNEL); + if (!elem) { + ret = -ENOMEM; + goto out; + } + + 
INIT_LIST_HEAD(&elem->list); + /* We must track the program's aux info at this point in time + * since the program pointer itself may not be stable yet, see + * also comment in prog_array_map_poke_run(). + */ + elem->aux = prog_aux; + + list_add_tail(&elem->list, &aux->poke_progs); +out: + mutex_unlock(&aux->poke_mutex); + return ret; +} + +static void prog_array_map_poke_untrack(struct bpf_map *map, + struct bpf_prog_aux *prog_aux) +{ + struct prog_poke_elem *elem, *tmp; + struct bpf_array_aux *aux; + + aux = container_of(map, struct bpf_array, map)->aux; + mutex_lock(&aux->poke_mutex); + list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { + if (elem->aux == prog_aux) { + list_del_init(&elem->list); + kfree(elem); + break; + } + } + mutex_unlock(&aux->poke_mutex); +} + +static void prog_array_map_poke_run(struct bpf_map *map, u32 key, + struct bpf_prog *old, + struct bpf_prog *new) +{ + enum bpf_text_poke_type type; + struct prog_poke_elem *elem; + struct bpf_array_aux *aux; + + if (!old && new) + type = BPF_MOD_NOP_TO_JUMP; + else if (old && !new) + type = BPF_MOD_JUMP_TO_NOP; + else if (old && new) + type = BPF_MOD_JUMP_TO_JUMP; + else + return; + + aux = container_of(map, struct bpf_array, map)->aux; + WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex)); + + list_for_each_entry(elem, &aux->poke_progs, list) { + struct bpf_jit_poke_descriptor *poke; + int i, ret; + + for (i = 0; i < elem->aux->size_poke_tab; i++) { + poke = &elem->aux->poke_tab[i]; + + /* Few things to be aware of: + * + * 1) We can only ever access aux in this context, but + * not aux->prog since it might not be stable yet and + * there could be danger of use after free otherwise. + * 2) Initially when we start tracking aux, the program + * is not JITed yet and also does not have a kallsyms + * entry. We skip these as poke->ip_stable is not + * active yet. The JIT will do the final fixup before + * setting it stable. 
The various poke->ip_stable are + * successively activated, so tail call updates can + * arrive from here while JIT is still finishing its + * final fixup for non-activated poke entries. + * 3) On program teardown, the program's kallsym entry gets + * removed out of RCU callback, but we can only untrack + * from sleepable context, therefore bpf_arch_text_poke() + * might not see that this is in BPF text section and + * bails out with -EINVAL. As these are unreachable since + * RCU grace period already passed, we simply skip them. + * 4) Also programs reaching refcount of zero while patching + * is in progress is okay since we're protected under + * poke_mutex and untrack the programs before the JIT + * buffer is freed. When we're still in the middle of + * patching and suddenly kallsyms entry of the program + * gets evicted, we just skip the rest which is fine due + * to point 3). + * 5) Any other error happening below from bpf_arch_text_poke() + * is a unexpected bug. + */ + if (!READ_ONCE(poke->ip_stable)) + continue; + if (poke->reason != BPF_POKE_REASON_TAIL_CALL) + continue; + if (poke->tail_call.map != map || + poke->tail_call.key != key) + continue; + + ret = bpf_arch_text_poke(poke->ip, type, + old ? (u8 *)old->bpf_func + + poke->adj_off : NULL, + new ? 
(u8 *)new->bpf_func + + poke->adj_off : NULL); + BUG_ON(ret < 0 && ret != -EINVAL); + } + } +} + +static void prog_array_map_clear_deferred(struct work_struct *work) +{ + struct bpf_map *map = container_of(work, struct bpf_array_aux, + work)->map; + bpf_fd_array_map_clear(map); + bpf_map_put(map); +} + +static void prog_array_map_clear(struct bpf_map *map) +{ + struct bpf_array_aux *aux = container_of(map, struct bpf_array, + map)->aux; + bpf_map_inc(map); + schedule_work(&aux->work); +} + static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) { struct bpf_array_aux *aux; @@ -680,6 +841,10 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) if (!aux) return ERR_PTR(-ENOMEM); + INIT_WORK(&aux->work, prog_array_map_clear_deferred); + INIT_LIST_HEAD(&aux->poke_progs); + mutex_init(&aux->poke_mutex); + map = array_map_alloc(attr); if (IS_ERR(map)) { kfree(aux); @@ -687,14 +852,21 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) } container_of(map, struct bpf_array, map)->aux = aux; + aux->map = map; + return map; } static void prog_array_map_free(struct bpf_map *map) { + struct prog_poke_elem *elem, *tmp; struct bpf_array_aux *aux; aux = container_of(map, struct bpf_array, map)->aux; + list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { + list_del_init(&elem->list); + kfree(elem); + } kfree(aux); fd_array_map_free(map); } @@ -703,13 +875,16 @@ const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = prog_array_map_alloc, .map_free = prog_array_map_free, + .map_poke_track = prog_array_map_poke_track, + .map_poke_untrack = prog_array_map_poke_untrack, + .map_poke_run = prog_array_map_poke_run, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, - 
.map_release_uref = bpf_fd_array_map_clear, + .map_release_uref = prog_array_map_clear, .map_seq_show_elem = prog_array_map_seq_show_elem, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 608b7085e0c9..49e32acad7d8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2050,11 +2050,16 @@ static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux) static void bpf_free_used_maps(struct bpf_prog_aux *aux) { + struct bpf_map *map; int i; bpf_free_cgroup_storage(aux); - for (i = 0; i < aux->used_map_cnt; i++) - bpf_map_put(aux->used_maps[i]); + for (i = 0; i < aux->used_map_cnt; i++) { + map = aux->used_maps[i]; + if (map->ops->map_poke_untrack) + map->ops->map_poke_untrack(map, aux); + bpf_map_put(map); + } kfree(aux->used_maps); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b904d56ec686..e3461ec59570 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -25,12 +25,13 @@ #include #include -#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ - (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ - (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ - (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) +#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ + (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ + (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) +#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY) #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) -#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map)) +#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \ + IS_FD_HASH(map)) #define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) @@ -877,7 +878,7 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_cgroup_storage_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); - } else if (IS_FD_ARRAY(map)) { + } else if (IS_FD_ARRAY(map) 
|| IS_FD_PROG_ARRAY(map)) { err = bpf_fd_array_map_lookup_elem(map, key, value); } else if (IS_FD_HASH(map)) { err = bpf_fd_htab_map_lookup_elem(map, key, value); @@ -1004,6 +1005,10 @@ static int map_update_elem(union bpf_attr *attr) map->map_type == BPF_MAP_TYPE_SOCKMAP) { err = map->ops->map_update_elem(map, key, value, attr->flags); goto out; + } else if (IS_FD_PROG_ARRAY(map)) { + err = bpf_fd_array_map_update_elem(map, f.file, key, value, + attr->flags); + goto out; } /* must increment bpf_prog_active to avoid kprobe+bpf triggering from @@ -1086,6 +1091,9 @@ static int map_delete_elem(union bpf_attr *attr) if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_delete_elem(map, key); goto out; + } else if (IS_FD_PROG_ARRAY(map)) { + err = map->ops->map_delete_elem(map, key); + goto out; } preempt_disable(); From d2e4c1e6c2947269346054ac8937ccfe9e0bcc6b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:59 +0100 Subject: [PATCH 22/27] bpf: Constant map key tracking for prog array pokes Add tracking of constant keys into tail call maps. The signature of bpf_tail_call_proto is that arg1 is ctx, arg2 map pointer and arg3 is a index key. The direct call approach for tail calls can be enabled if the verifier asserted that for all branches leading to the tail call helper invocation, the map pointer and index key were both constant and the same. Tracking of map pointers we already do from prior work via c93552c443eb ("bpf: properly enforce index mask to prevent out-of-bounds speculation") and 09772d92cd5a ("bpf: avoid retpoline for lookup/update/ delete calls on maps"). Given the tail call map index key is not on stack but directly in the register, we can add similar tracking approach and later in fixup_bpf_calls() add a poke descriptor to the progs poke_tab with the relevant information for the JITing phase. 
We internally reuse insn->imm for the rewritten BPF_JMP | BPF_TAIL_CALL instruction in order to point into the prog's poke_tab, and keep insn->imm as 0 as indicator that current indirect tail call emission must be used. Note that publishing to the tracker must happen at the end of fixup_bpf_calls() since adding elements to the poke_tab reallocates its memory, so we need to wait until its in final state. Future work can generalize and add similar approach to optimize plain array map lookups. Difference there is that we need to look into the key value that sits on stack. For clarity in bpf_insn_aux_data, map_state has been renamed into map_ptr_state, so we get map_{ptr,key}_state as trackers. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/e8db37f6b2ae60402fa40216c96738ee9b316c32.1574452833.git.daniel@iogearbox.net --- include/linux/bpf_verifier.h | 3 +- kernel/bpf/verifier.c | 120 ++++++++++++++++++++++++++++++++--- 2 files changed, 113 insertions(+), 10 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index cdd08bf0ec06..26e40de9ef55 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -293,7 +293,7 @@ struct bpf_verifier_state_list { struct bpf_insn_aux_data { union { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ - unsigned long map_state; /* pointer/poison value for maps */ + unsigned long map_ptr_state; /* pointer/poison value for maps */ s32 call_imm; /* saved imm field of call insn */ u32 alu_limit; /* limit for add/sub register with pointer */ struct { @@ -301,6 +301,7 @@ struct bpf_insn_aux_data { u32 map_off; /* offset from value base address */ }; }; + u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ int sanitize_stack_off; /* stack slot to be cleared */ bool seen; /* this insn was processed by the verifier */ 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fc85714428c7..a0482e1c4a77 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -171,6 +171,9 @@ struct bpf_verifier_stack_elem { #define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192 #define BPF_COMPLEXITY_LIMIT_STATES 64 +#define BPF_MAP_KEY_POISON (1ULL << 63) +#define BPF_MAP_KEY_SEEN (1ULL << 62) + #define BPF_MAP_PTR_UNPRIV 1UL #define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \ POISON_POINTER_DELTA)) @@ -178,12 +181,12 @@ struct bpf_verifier_stack_elem { static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) { - return BPF_MAP_PTR(aux->map_state) == BPF_MAP_PTR_POISON; + return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON; } static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux) { - return aux->map_state & BPF_MAP_PTR_UNPRIV; + return aux->map_ptr_state & BPF_MAP_PTR_UNPRIV; } static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux, @@ -191,8 +194,31 @@ static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux, { BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV); unpriv |= bpf_map_ptr_unpriv(aux); - aux->map_state = (unsigned long)map | - (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL); + aux->map_ptr_state = (unsigned long)map | + (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL); +} + +static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux) +{ + return aux->map_key_state & BPF_MAP_KEY_POISON; +} + +static bool bpf_map_key_unseen(const struct bpf_insn_aux_data *aux) +{ + return !(aux->map_key_state & BPF_MAP_KEY_SEEN); +} + +static u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux) +{ + return aux->map_key_state & ~(BPF_MAP_KEY_SEEN | BPF_MAP_KEY_POISON); +} + +static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state) +{ + bool poisoned = bpf_map_key_poisoned(aux); + + aux->map_key_state = state | BPF_MAP_KEY_SEEN | + (poisoned ? 
BPF_MAP_KEY_POISON : 0ULL); } struct bpf_call_arg_meta { @@ -4090,15 +4116,49 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return -EACCES; } - if (!BPF_MAP_PTR(aux->map_state)) + if (!BPF_MAP_PTR(aux->map_ptr_state)) bpf_map_ptr_store(aux, meta->map_ptr, meta->map_ptr->unpriv_array); - else if (BPF_MAP_PTR(aux->map_state) != meta->map_ptr) + else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr) bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, meta->map_ptr->unpriv_array); return 0; } +static int +record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, + int func_id, int insn_idx) +{ + struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; + struct bpf_reg_state *regs = cur_regs(env), *reg; + struct bpf_map *map = meta->map_ptr; + struct tnum range; + u64 val; + + if (func_id != BPF_FUNC_tail_call) + return 0; + if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) { + verbose(env, "kernel subsystem misconfigured verifier\n"); + return -EINVAL; + } + + range = tnum_range(0, map->max_entries - 1); + reg = &regs[BPF_REG_3]; + + if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) { + bpf_map_key_store(aux, BPF_MAP_KEY_POISON); + return 0; + } + + val = reg->var_off.value; + if (bpf_map_key_unseen(aux)) + bpf_map_key_store(aux, val); + else if (!bpf_map_key_poisoned(aux) && + bpf_map_key_immediate(aux) != val) + bpf_map_key_store(aux, BPF_MAP_KEY_POISON); + return 0; +} + static int check_reference_leak(struct bpf_verifier_env *env) { struct bpf_func_state *state = cur_func(env); @@ -4173,6 +4233,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (err) return err; + err = record_func_key(env, &meta, func_id, insn_idx); + if (err) + return err; + /* Mark slots with STACK_MISC in case of raw mode, stack offset * is inferred from register state.
*/ @@ -9065,6 +9129,7 @@ static int fixup_call_args(struct bpf_verifier_env *env) static int fixup_bpf_calls(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; + bool expect_blinding = bpf_jit_blinding_enabled(prog); struct bpf_insn *insn = prog->insnsi; const struct bpf_func_proto *fn; const int insn_cnt = prog->len; @@ -9073,7 +9138,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) struct bpf_insn insn_buf[16]; struct bpf_prog *new_prog; struct bpf_map *map_ptr; - int i, cnt, delta = 0; + int i, ret, cnt, delta = 0; for (i = 0; i < insn_cnt; i++, insn++) { if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || @@ -9217,6 +9282,26 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->code = BPF_JMP | BPF_TAIL_CALL; aux = &env->insn_aux_data[i + delta]; + if (prog->jit_requested && !expect_blinding && + !bpf_map_key_poisoned(aux) && + !bpf_map_ptr_poisoned(aux) && + !bpf_map_ptr_unpriv(aux)) { + struct bpf_jit_poke_descriptor desc = { + .reason = BPF_POKE_REASON_TAIL_CALL, + .tail_call.map = BPF_MAP_PTR(aux->map_ptr_state), + .tail_call.key = bpf_map_key_immediate(aux), + }; + + ret = bpf_jit_add_poke_descriptor(prog, &desc); + if (ret < 0) { + verbose(env, "adding tail call poke descriptor failed\n"); + return ret; + } + + insn->imm = ret + 1; + continue; + } + if (!bpf_map_ptr_unpriv(aux)) continue; @@ -9231,7 +9316,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) return -EINVAL; } - map_ptr = BPF_MAP_PTR(aux->map_state); + map_ptr = BPF_MAP_PTR(aux->map_ptr_state); insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, map_ptr->max_entries, 2); insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, @@ -9265,7 +9350,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; - map_ptr = BPF_MAP_PTR(aux->map_state); + map_ptr = BPF_MAP_PTR(aux->map_ptr_state); ops = map_ptr->ops; if (insn->imm == BPF_FUNC_map_lookup_elem && ops->map_gen_lookup) { @@ -9345,6 +9430,23 @@ 
patch_call_imm: insn->imm = fn->func - __bpf_call_base; } + /* Since poke tab is now finalized, publish aux to tracker. */ + for (i = 0; i < prog->aux->size_poke_tab; i++) { + map_ptr = prog->aux->poke_tab[i].tail_call.map; + if (!map_ptr->ops->map_poke_track || + !map_ptr->ops->map_poke_untrack || + !map_ptr->ops->map_poke_run) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } + + ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux); + if (ret < 0) { + verbose(env, "tracking tail call prog failed\n"); + return ret; + } + } + return 0; } From 428d5df1fa4f28daf622c48dd19da35585c9053c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:08:00 +0100 Subject: [PATCH 23/27] bpf, x86: Emit patchable direct jump as tail call Add initial code emission for *direct* jumps for tail call maps in order to avoid the retpoline overhead from a493a87f38cf ("bpf, x64: implement retpoline for tail call") for situations that allow for it, meaning, for known constant keys at verification time which are used as index into the tail call map. In case of Cilium which makes heavy use of tail calls, constant keys are used in the vast majority, only for a single occurrence we use a dynamic key. High level outline is that if the target prog is NULL in the map, we emit a 5-byte nop for the fall-through case and if not, we emit a 5-byte direct relative jmp to the target bpf_func + skipped prologue offset. Later during runtime, we patch these 5-byte nop/jmps upon tail call map update or deletions dynamically. Note that on x86-64 the direct jmp works as we reuse the same stack frame and skip prologue (as opposed to some other JIT implementations). One of the issues is that the tail call map slots can change at any given time even during JITing. Therefore, we have two passes: i) emit nops for all patchable locations during main JITing phase until we declare prog->jited = 1 eventually. 
At this point the image is stable, not public yet and with all jmps disabled. While JITing, we collect additional info like poke->ip in order to remember the patch location for later modifications. In ii) bpf_tail_call_direct_fixup() walks over the prog's poke_tab, locks the tail call map's poke_mutex to prevent parallel updates and patches in the right locations via __bpf_arch_text_poke(). Note, the main bpf_arch_text_poke() cannot be used at this point since we're not yet exposed to kallsyms. For the update we use plain memcpy() since the image is not public and still in read-write mode. After patching, we activate that poke entry through poke->ip_stable. Meaning, from this point on, tail call map updates/deletions no longer ignore that poke entry. Then, bpf_arch_text_poke() might still occur on the read-write image until we finally lock it as read-only. Both modifications on the given image are under text_mutex to avoid interference with each other when update requests come in in parallel for different tail call maps (the current one we have locked in the JIT and a different one where poke->ip_stable was already set).
Example prog: # ./bpftool p d x i 1655 0: (b7) r3 = 0 1: (18) r2 = map[id:526] 3: (85) call bpf_tail_call#12 4: (b7) r0 = 1 5: (95) exit Before: # ./bpftool p d j i 1655 0xffffffffc076e55c: 0: nopl 0x0(%rax,%rax,1) 5: push %rbp 6: mov %rsp,%rbp 9: sub $0x200,%rsp 10: push %rbx 11: push %r13 13: push %r14 15: push %r15 17: pushq $0x0 _ 19: xor %edx,%edx |_ index (arg 3) 1b: movabs $0xffff88d95cc82600,%rsi |_ map (arg 2) 25: mov %edx,%edx | index >= array->map.max_entries 27: cmp %edx,0x24(%rsi) | 2a: jbe 0x0000000000000066 |_ 2c: mov -0x224(%rbp),%eax | tail call limit check 32: cmp $0x20,%eax | 35: ja 0x0000000000000066 | 37: add $0x1,%eax | 3a: mov %eax,-0x224(%rbp) |_ 40: mov 0xd0(%rsi,%rdx,8),%rax |_ prog = array->ptrs[index] 48: test %rax,%rax | prog == NULL check 4b: je 0x0000000000000066 |_ 4d: mov 0x30(%rax),%rax | goto *(prog->bpf_func + prologue_size) 51: add $0x19,%rax | 55: callq 0x0000000000000061 | retpoline for indirect jump 5a: pause | 5c: lfence | 5f: jmp 0x000000000000005a | 61: mov %rax,(%rsp) | 65: retq |_ 66: mov $0x1,%eax 6b: pop %rbx 6c: pop %r15 6e: pop %r14 70: pop %r13 72: pop %rbx 73: leaveq 74: retq After; state after JIT: # ./bpftool p d j i 1655 0xffffffffc08e8930: 0: nopl 0x0(%rax,%rax,1) 5: push %rbp 6: mov %rsp,%rbp 9: sub $0x200,%rsp 10: push %rbx 11: push %r13 13: push %r14 15: push %r15 17: pushq $0x0 _ 19: xor %edx,%edx |_ index (arg 3) 1b: movabs $0xffff9d8afd74c000,%rsi |_ map (arg 2) 25: mov -0x224(%rbp),%eax | tail call limit check 2b: cmp $0x20,%eax | 2e: ja 0x000000000000003e | 30: add $0x1,%eax | 33: mov %eax,-0x224(%rbp) |_ 39: jmpq 0xfffffffffffd1785 |_ [direct] goto *(prog->bpf_func + prologue_size) 3e: mov $0x1,%eax 43: pop %rbx 44: pop %r15 46: pop %r14 48: pop %r13 4a: pop %rbx 4b: leaveq 4c: retq After; state after map update (target prog): # ./bpftool p d j i 1655 0xffffffffc08e8930: 0: nopl 0x0(%rax,%rax,1) 5: push %rbp 6: mov %rsp,%rbp 9: sub $0x200,%rsp 10: push %rbx 11: push %r13 13: push %r14 15: push %r15 17: 
pushq $0x0 19: xor %edx,%edx 1b: movabs $0xffff9d8afd74c000,%rsi 25: mov -0x224(%rbp),%eax 2b: cmp $0x20,%eax . 2e: ja 0x000000000000003e . 30: add $0x1,%eax . 33: mov %eax,-0x224(%rbp) |_ 39: jmpq 0xffffffffffb09f55 |_ goto *(prog->bpf_func + prologue_size) 3e: mov $0x1,%eax 43: pop %rbx 44: pop %r15 46: pop %r14 48: pop %r13 4a: pop %rbx 4b: leaveq 4c: retq After; state after map update (no prog): # ./bpftool p d j i 1655 0xffffffffc08e8930: 0: nopl 0x0(%rax,%rax,1) 5: push %rbp 6: mov %rsp,%rbp 9: sub $0x200,%rsp 10: push %rbx 11: push %r13 13: push %r14 15: push %r15 17: pushq $0x0 19: xor %edx,%edx 1b: movabs $0xffff9d8afd74c000,%rsi 25: mov -0x224(%rbp),%eax 2b: cmp $0x20,%eax . 2e: ja 0x000000000000003e . 30: add $0x1,%eax . 33: mov %eax,-0x224(%rbp) |_ 39: nopl 0x0(%rax,%rax,1) |_ fall-through nop 3e: mov $0x1,%eax 43: pop %rbx 44: pop %r15 46: pop %r14 48: pop %r13 4a: pop %rbx 4b: leaveq 4c: retq Nice bonus is that this also shrinks the code emission quite a bit for every tail call invocation. 
Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/6ada4c1c9d35eeb5f4ecfab94593dafa6b5c4b09.1574452833.git.daniel@iogearbox.net --- arch/x86/net/bpf_jit_comp.c | 282 ++++++++++++++++++++++++------------ 1 file changed, 187 insertions(+), 95 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index f438bd3b7689..15615c94804f 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -239,6 +239,123 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) *pprog = prog; } +static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode) +{ + u8 *prog = *pprog; + int cnt = 0; + s64 offset; + + offset = func - (ip + X86_PATCH_SIZE); + if (!is_simm32(offset)) { + pr_err("Target call %p is out of range\n", func); + return -ERANGE; + } + EMIT1_off32(opcode, offset); + *pprog = prog; + return 0; +} + +static int emit_call(u8 **pprog, void *func, void *ip) +{ + return emit_patch(pprog, func, ip, 0xE8); +} + +static int emit_jump(u8 **pprog, void *func, void *ip) +{ + return emit_patch(pprog, func, ip, 0xE9); +} + +static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, + void *old_addr, void *new_addr, + const bool text_live) +{ + int (*emit_patch_fn)(u8 **pprog, void *func, void *ip); + const u8 *nop_insn = ideal_nops[NOP_ATOMIC5]; + u8 old_insn[X86_PATCH_SIZE] = {}; + u8 new_insn[X86_PATCH_SIZE] = {}; + u8 *prog; + int ret; + + switch (t) { + case BPF_MOD_NOP_TO_CALL ... BPF_MOD_CALL_TO_NOP: + emit_patch_fn = emit_call; + break; + case BPF_MOD_NOP_TO_JUMP ... 
BPF_MOD_JUMP_TO_NOP: + emit_patch_fn = emit_jump; + break; + default: + return -ENOTSUPP; + } + + switch (t) { + case BPF_MOD_NOP_TO_CALL: + case BPF_MOD_NOP_TO_JUMP: + if (!old_addr && new_addr) { + memcpy(old_insn, nop_insn, X86_PATCH_SIZE); + + prog = new_insn; + ret = emit_patch_fn(&prog, new_addr, ip); + if (ret) + return ret; + break; + } + return -ENXIO; + case BPF_MOD_CALL_TO_CALL: + case BPF_MOD_JUMP_TO_JUMP: + if (old_addr && new_addr) { + prog = old_insn; + ret = emit_patch_fn(&prog, old_addr, ip); + if (ret) + return ret; + + prog = new_insn; + ret = emit_patch_fn(&prog, new_addr, ip); + if (ret) + return ret; + break; + } + return -ENXIO; + case BPF_MOD_CALL_TO_NOP: + case BPF_MOD_JUMP_TO_NOP: + if (old_addr && !new_addr) { + memcpy(new_insn, nop_insn, X86_PATCH_SIZE); + + prog = old_insn; + ret = emit_patch_fn(&prog, old_addr, ip); + if (ret) + return ret; + break; + } + return -ENXIO; + default: + return -ENOTSUPP; + } + + ret = -EBUSY; + mutex_lock(&text_mutex); + if (memcmp(ip, old_insn, X86_PATCH_SIZE)) + goto out; + if (text_live) + text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); + else + memcpy(ip, new_insn, X86_PATCH_SIZE); + ret = 0; +out: + mutex_unlock(&text_mutex); + return ret; +} + +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, + void *old_addr, void *new_addr) +{ + if (!is_kernel_text((long)ip) && + !is_bpf_text_address((long)ip)) + /* BPF poking in modules is not supported */ + return -EINVAL; + + return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true); +} + /* * Generate the following code: * @@ -253,7 +370,7 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) * goto *(prog->bpf_func + prologue_size); * out: */ -static void emit_bpf_tail_call(u8 **pprog) +static void emit_bpf_tail_call_indirect(u8 **pprog) { u8 *prog = *pprog; int label1, label2, label3; @@ -320,6 +437,69 @@ static void emit_bpf_tail_call(u8 **pprog) *pprog = prog; } +static void emit_bpf_tail_call_direct(struct 
bpf_jit_poke_descriptor *poke, + u8 **pprog, int addr, u8 *image) +{ + u8 *prog = *pprog; + int cnt = 0; + + /* + * if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * goto out; + */ + EMIT2_off32(0x8B, 0x85, -36 - MAX_BPF_STACK); /* mov eax, dword ptr [rbp - 548] */ + EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ + EMIT2(X86_JA, 14); /* ja out */ + EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ + EMIT2_off32(0x89, 0x85, -36 - MAX_BPF_STACK); /* mov dword ptr [rbp -548], eax */ + + poke->ip = image + (addr - X86_PATCH_SIZE); + poke->adj_off = PROLOGUE_SIZE; + + memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE); + prog += X86_PATCH_SIZE; + /* out: */ + + *pprog = prog; +} + +static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) +{ + static const enum bpf_text_poke_type type = BPF_MOD_NOP_TO_JUMP; + struct bpf_jit_poke_descriptor *poke; + struct bpf_array *array; + struct bpf_prog *target; + int i, ret; + + for (i = 0; i < prog->aux->size_poke_tab; i++) { + poke = &prog->aux->poke_tab[i]; + WARN_ON_ONCE(READ_ONCE(poke->ip_stable)); + + if (poke->reason != BPF_POKE_REASON_TAIL_CALL) + continue; + + array = container_of(poke->tail_call.map, struct bpf_array, map); + mutex_lock(&array->aux->poke_mutex); + target = array->ptrs[poke->tail_call.key]; + if (target) { + /* Plain memcpy is used when image is not live yet + * and still not locked as read-only. Once poke + * location is active (poke->ip_stable), any parallel + * bpf_arch_text_poke() might occur still on the + * read-write image until we finally locked it as + * read-only. Both modifications on the given image + * are under text_mutex to avoid interference. 
+ */ + ret = __bpf_arch_text_poke(poke->ip, type, NULL, + (u8 *)target->bpf_func + + poke->adj_off, false); + BUG_ON(ret < 0); + } + WRITE_ONCE(poke->ip_stable, true); + mutex_unlock(&array->aux->poke_mutex); + } +} + static void emit_mov_imm32(u8 **pprog, bool sign_propagate, u32 dst_reg, const u32 imm32) { @@ -481,99 +661,6 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) *pprog = prog; } -static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode) -{ - u8 *prog = *pprog; - int cnt = 0; - s64 offset; - - offset = func - (ip + X86_PATCH_SIZE); - if (!is_simm32(offset)) { - pr_err("Target call %p is out of range\n", func); - return -EINVAL; - } - EMIT1_off32(opcode, offset); - *pprog = prog; - return 0; -} - -static int emit_call(u8 **pprog, void *func, void *ip) -{ - return emit_patch(pprog, func, ip, 0xE8); -} - -static int emit_jump(u8 **pprog, void *func, void *ip) -{ - return emit_patch(pprog, func, ip, 0xE9); -} - -int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, - void *old_addr, void *new_addr) -{ - int (*emit_patch_fn)(u8 **pprog, void *func, void *ip); - u8 old_insn[X86_PATCH_SIZE] = {}; - u8 new_insn[X86_PATCH_SIZE] = {}; - u8 *prog; - int ret; - - if (!is_kernel_text((long)ip) && - !is_bpf_text_address((long)ip)) - /* BPF poking in modules is not supported */ - return -EINVAL; - - switch (t) { - case BPF_MOD_NOP_TO_CALL ... BPF_MOD_CALL_TO_NOP: - emit_patch_fn = emit_call; - break; - case BPF_MOD_NOP_TO_JUMP ... 
BPF_MOD_JUMP_TO_NOP: - emit_patch_fn = emit_jump; - break; - default: - return -ENOTSUPP; - } - - if (old_addr) { - prog = old_insn; - ret = emit_patch_fn(&prog, old_addr, (void *)ip); - if (ret) - return ret; - } - if (new_addr) { - prog = new_insn; - ret = emit_patch_fn(&prog, new_addr, (void *)ip); - if (ret) - return ret; - } - - ret = -EBUSY; - mutex_lock(&text_mutex); - switch (t) { - case BPF_MOD_NOP_TO_CALL: - case BPF_MOD_NOP_TO_JUMP: - if (memcmp(ip, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE)) - goto out; - text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); - break; - case BPF_MOD_CALL_TO_CALL: - case BPF_MOD_JUMP_TO_JUMP: - if (memcmp(ip, old_insn, X86_PATCH_SIZE)) - goto out; - text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); - break; - case BPF_MOD_CALL_TO_NOP: - case BPF_MOD_JUMP_TO_NOP: - if (memcmp(ip, old_insn, X86_PATCH_SIZE)) - goto out; - text_poke_bp(ip, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE, - NULL); - break; - } - ret = 0; -out: - mutex_unlock(&text_mutex); - return ret; -} - static bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr) @@ -1041,7 +1128,11 @@ xadd: if (is_imm8(insn->off)) break; case BPF_JMP | BPF_TAIL_CALL: - emit_bpf_tail_call(&prog); + if (imm32) + emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1], + &prog, addrs[i], image); + else + emit_bpf_tail_call_indirect(&prog); break; /* cond jump */ @@ -1599,6 +1690,7 @@ out_image: if (image) { if (!prog->is_func || extra_pass) { + bpf_tail_call_direct_fixup(prog); bpf_jit_binary_lock_ro(header); } else { jit_data->addrs = addrs; From 79d49ba048ecace59a9850e8a04b618d7848b8e7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:08:01 +0100 Subject: [PATCH 24/27] bpf, testing: Add various tail call test cases Add several BPF kselftest cases for tail calls which test the various patch directions, and that multiple locations are patched in same and different 
programs. # ./test_progs -n 45 #45/1 tailcall_1:OK #45/2 tailcall_2:OK #45/3 tailcall_3:OK #45/4 tailcall_4:OK #45/5 tailcall_5:OK #45 tailcalls:OK Summary: 1/5 PASSED, 0 SKIPPED, 0 FAILED I've also verified the JITed dump after each of the rewrite cases that it matches expectations. Also regular test_verifier suite passes fine which contains further tail call tests: # ./test_verifier [...] Summary: 1563 PASSED, 0 SKIPPED, 0 FAILED Checked under JIT, interpreter and JIT + hardening. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/3d6cbecbeb171117dccfe153306e479798fb608d.1574452833.git.daniel@iogearbox.net --- .../selftests/bpf/prog_tests/tailcalls.c | 487 ++++++++++++++++++ tools/testing/selftests/bpf/progs/tailcall1.c | 48 ++ tools/testing/selftests/bpf/progs/tailcall2.c | 59 +++ tools/testing/selftests/bpf/progs/tailcall3.c | 31 ++ tools/testing/selftests/bpf/progs/tailcall4.c | 33 ++ tools/testing/selftests/bpf/progs/tailcall5.c | 40 ++ 6 files changed, 698 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/tailcalls.c create mode 100644 tools/testing/selftests/bpf/progs/tailcall1.c create mode 100644 tools/testing/selftests/bpf/progs/tailcall2.c create mode 100644 tools/testing/selftests/bpf/progs/tailcall3.c create mode 100644 tools/testing/selftests/bpf/progs/tailcall4.c create mode 100644 tools/testing/selftests/bpf/progs/tailcall5.c diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c new file mode 100644 index 000000000000..bb8fe646dd9f --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c @@ -0,0 +1,487 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> + +/* test_tailcall_1 checks basic functionality by patching multiple locations + * in a single program for a single tail call slot with nop->jmp, jmp->nop + * and jmp->jmp rewrites. Also checks for nop->nop.
+ */ +static void test_tailcall_1(void) +{ + int err, map_fd, prog_fd, main_fd, i, j; + struct bpf_map *prog_array; + struct bpf_program *prog; + struct bpf_object *obj; + __u32 retval, duration; + char prog_name[32]; + char buff[128] = {}; + + err = bpf_prog_load("tailcall1.o", BPF_PROG_TYPE_SCHED_CLS, &obj, + &prog_fd); + if (CHECK_FAIL(err)) + return; + + prog = bpf_object__find_program_by_title(obj, "classifier"); + if (CHECK_FAIL(!prog)) + goto out; + + main_fd = bpf_program__fd(prog); + if (CHECK_FAIL(main_fd < 0)) + goto out; + + prog_array = bpf_object__find_map_by_name(obj, "jmp_table"); + if (CHECK_FAIL(!prog_array)) + goto out; + + map_fd = bpf_map__fd(prog_array); + if (CHECK_FAIL(map_fd < 0)) + goto out; + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + snprintf(prog_name, sizeof(prog_name), "classifier/%i", i); + + prog = bpf_object__find_program_by_title(obj, prog_name); + if (CHECK_FAIL(!prog)) + goto out; + + prog_fd = bpf_program__fd(prog); + if (CHECK_FAIL(prog_fd < 0)) + goto out; + + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + } + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != i, "tailcall", + "err %d errno %d retval %d\n", err, errno, retval); + + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err)) + goto out; + } + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != 3, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + snprintf(prog_name, sizeof(prog_name), "classifier/%i", i); + + prog = bpf_object__find_program_by_title(obj, prog_name); + if (CHECK_FAIL(!prog)) + goto out; + + prog_fd = bpf_program__fd(prog); + if (CHECK_FAIL(prog_fd < 0)) + goto out; + + err = bpf_map_update_elem(map_fd, 
&i, &prog_fd, BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + } + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != 0, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + j = bpf_map__def(prog_array)->max_entries - 1 - i; + snprintf(prog_name, sizeof(prog_name), "classifier/%i", j); + + prog = bpf_object__find_program_by_title(obj, prog_name); + if (CHECK_FAIL(!prog)) + goto out; + + prog_fd = bpf_program__fd(prog); + if (CHECK_FAIL(prog_fd < 0)) + goto out; + + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + } + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + j = bpf_map__def(prog_array)->max_entries - 1 - i; + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != j, "tailcall", + "err %d errno %d retval %d\n", err, errno, retval); + + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err)) + goto out; + } + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != 3, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err >= 0 || errno != ENOENT)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != 3, "tailcall", + "err %d errno %d retval %d\n", err, errno, retval); + } + +out: + bpf_object__close(obj); +} + +/* test_tailcall_2 checks that patching multiple programs for a single + * tail call slot works. It also jumps through several programs and tests + * the tail call limit counter. 
+ */ +static void test_tailcall_2(void) +{ + int err, map_fd, prog_fd, main_fd, i; + struct bpf_map *prog_array; + struct bpf_program *prog; + struct bpf_object *obj; + __u32 retval, duration; + char prog_name[32]; + char buff[128] = {}; + + err = bpf_prog_load("tailcall2.o", BPF_PROG_TYPE_SCHED_CLS, &obj, + &prog_fd); + if (CHECK_FAIL(err)) + return; + + prog = bpf_object__find_program_by_title(obj, "classifier"); + if (CHECK_FAIL(!prog)) + goto out; + + main_fd = bpf_program__fd(prog); + if (CHECK_FAIL(main_fd < 0)) + goto out; + + prog_array = bpf_object__find_map_by_name(obj, "jmp_table"); + if (CHECK_FAIL(!prog_array)) + goto out; + + map_fd = bpf_map__fd(prog_array); + if (CHECK_FAIL(map_fd < 0)) + goto out; + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + snprintf(prog_name, sizeof(prog_name), "classifier/%i", i); + + prog = bpf_object__find_program_by_title(obj, prog_name); + if (CHECK_FAIL(!prog)) + goto out; + + prog_fd = bpf_program__fd(prog); + if (CHECK_FAIL(prog_fd < 0)) + goto out; + + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + } + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != 2, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); + + i = 2; + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != 1, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); + + i = 0; + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != 3, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); +out: + bpf_object__close(obj); +} + +/* test_tailcall_3 checks that the count value of the tail call limit + * enforcement matches 
with expectations.
+ */
+static void test_tailcall_3(void)
+{
+	int err, map_fd, prog_fd, main_fd, data_fd, i, val;
+	struct bpf_map *prog_array, *data_map;
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	__u32 retval, duration;
+	char buff[128] = {};
+
+	err = bpf_prog_load("tailcall3.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
+			    &prog_fd);
+	if (CHECK_FAIL(err))
+		return;
+
+	prog = bpf_object__find_program_by_title(obj, "classifier");
+	if (CHECK_FAIL(!prog))
+		goto out;
+
+	main_fd = bpf_program__fd(prog);
+	if (CHECK_FAIL(main_fd < 0))
+		goto out;
+
+	prog_array = bpf_object__find_map_by_name(obj, "jmp_table");
+	if (CHECK_FAIL(!prog_array))
+		goto out;
+
+	map_fd = bpf_map__fd(prog_array);
+	if (CHECK_FAIL(map_fd < 0))
+		goto out;
+
+	prog = bpf_object__find_program_by_title(obj, "classifier/0");
+	if (CHECK_FAIL(!prog))
+		goto out;
+
+	prog_fd = bpf_program__fd(prog);
+	if (CHECK_FAIL(prog_fd < 0))
+		goto out;
+
+	i = 0;
+	err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY);
+	if (CHECK_FAIL(err))
+		goto out;
+
+	err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
+				&duration, &retval, NULL);
+	CHECK(err || retval != 1, "tailcall", "err %d errno %d retval %d\n",
+	      err, errno, retval);
+
+	data_map = bpf_object__find_map_by_name(obj, "tailcall.bss");
+	if (CHECK_FAIL(!data_map || !bpf_map__is_internal(data_map)))
+		goto out;
+
+	data_fd = bpf_map__fd(data_map);
+	if (CHECK_FAIL(data_fd < 0))
+		goto out;
+
+	i = 0;
+	err = bpf_map_lookup_elem(data_fd, &i, &val);
+	CHECK(err || val != 33, "tailcall count", "err %d errno %d count %d\n",
+	      err, errno, val);
+
+	i = 0;
+	err = bpf_map_delete_elem(map_fd, &i);
+	if (CHECK_FAIL(err))
+		goto out;
+
+	err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
+				&duration, &retval, NULL);
+	CHECK(err || retval != 0, "tailcall", "err %d errno %d retval %d\n",
+	      err, errno, retval);
+out:
+	bpf_object__close(obj);
+}
+
+/* test_tailcall_4 checks that the kernel properly selects indirect jump
+ * for the case where 
the key is not known. Latter is passed via global
+ * data to select different targets we can compare return value of.
+ */
+static void test_tailcall_4(void)
+{
+	int err, map_fd, prog_fd, main_fd, data_fd, i;
+	struct bpf_map *prog_array, *data_map;
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	__u32 retval, duration;
+	static const int zero = 0;
+	char buff[128] = {};
+	char prog_name[32];
+
+	err = bpf_prog_load("tailcall4.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
+			    &prog_fd);
+	if (CHECK_FAIL(err))
+		return;
+
+	prog = bpf_object__find_program_by_title(obj, "classifier");
+	if (CHECK_FAIL(!prog))
+		goto out;
+
+	main_fd = bpf_program__fd(prog);
+	if (CHECK_FAIL(main_fd < 0))
+		goto out;
+
+	prog_array = bpf_object__find_map_by_name(obj, "jmp_table");
+	if (CHECK_FAIL(!prog_array))
+		goto out;
+
+	map_fd = bpf_map__fd(prog_array);
+	if (CHECK_FAIL(map_fd < 0))
+		goto out;
+
+	data_map = bpf_object__find_map_by_name(obj, "tailcall.bss");
+	if (CHECK_FAIL(!data_map || !bpf_map__is_internal(data_map)))
+		goto out;
+
+	data_fd = bpf_map__fd(data_map);
+	if (CHECK_FAIL(data_fd < 0))
+		goto out;
+
+	for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+		snprintf(prog_name, sizeof(prog_name), "classifier/%i", i);
+
+		prog = bpf_object__find_program_by_title(obj, prog_name);
+		if (CHECK_FAIL(!prog))
+			goto out;
+
+		prog_fd = bpf_program__fd(prog);
+		if (CHECK_FAIL(prog_fd < 0))
+			goto out;
+
+		err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY);
+		if (CHECK_FAIL(err))
+			goto out;
+	}
+
+	for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+		err = bpf_map_update_elem(data_fd, &zero, &i, BPF_ANY);
+		if (CHECK_FAIL(err))
+			goto out;
+
+		err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
+					&duration, &retval, NULL);
+		CHECK(err || retval != i, "tailcall",
+		      "err %d errno %d retval %d\n", err, errno, retval);
+	}
+
+	for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+		err = bpf_map_update_elem(data_fd, &zero, &i, BPF_ANY);
+		if 
(CHECK_FAIL(err))
+			goto out;
+
+		err = bpf_map_delete_elem(map_fd, &i);
+		if (CHECK_FAIL(err))
+			goto out;
+
+		err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
+					&duration, &retval, NULL);
+		CHECK(err || retval != 3, "tailcall",
+		      "err %d errno %d retval %d\n", err, errno, retval);
+	}
+out:
+	bpf_object__close(obj);
+}
+
+/* test_tailcall_5 probes similarly to test_tailcall_4 that the kernel generates
+ * an indirect jump when the keys are const but different from different branches.
+ */
+static void test_tailcall_5(void)
+{
+	int err, map_fd, prog_fd, main_fd, data_fd, i, key[] = { 1111, 1234, 5678 };
+	struct bpf_map *prog_array, *data_map;
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	__u32 retval, duration;
+	static const int zero = 0;
+	char buff[128] = {};
+	char prog_name[32];
+
+	err = bpf_prog_load("tailcall5.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
+			    &prog_fd);
+	if (CHECK_FAIL(err))
+		return;
+
+	prog = bpf_object__find_program_by_title(obj, "classifier");
+	if (CHECK_FAIL(!prog))
+		goto out;
+
+	main_fd = bpf_program__fd(prog);
+	if (CHECK_FAIL(main_fd < 0))
+		goto out;
+
+	prog_array = bpf_object__find_map_by_name(obj, "jmp_table");
+	if (CHECK_FAIL(!prog_array))
+		goto out;
+
+	map_fd = bpf_map__fd(prog_array);
+	if (CHECK_FAIL(map_fd < 0))
+		goto out;
+
+	data_map = bpf_object__find_map_by_name(obj, "tailcall.bss");
+	if (CHECK_FAIL(!data_map || !bpf_map__is_internal(data_map)))
+		goto out;
+
+	data_fd = bpf_map__fd(data_map);
+	if (CHECK_FAIL(data_fd < 0))
+		goto out;
+
+	for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+		snprintf(prog_name, sizeof(prog_name), "classifier/%i", i);
+
+		prog = bpf_object__find_program_by_title(obj, prog_name);
+		if (CHECK_FAIL(!prog))
+			goto out;
+
+		prog_fd = bpf_program__fd(prog);
+		if (CHECK_FAIL(prog_fd < 0))
+			goto out;
+
+		err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY);
+		if (CHECK_FAIL(err))
+			goto out;
+	}
+
+	for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+		
err = bpf_map_update_elem(data_fd, &zero, &key[i], BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != i, "tailcall", + "err %d errno %d retval %d\n", err, errno, retval); + } + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + err = bpf_map_update_elem(data_fd, &zero, &key[i], BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != 3, "tailcall", + "err %d errno %d retval %d\n", err, errno, retval); + } +out: + bpf_object__close(obj); +} + +void test_tailcalls(void) +{ + if (test__start_subtest("tailcall_1")) + test_tailcall_1(); + if (test__start_subtest("tailcall_2")) + test_tailcall_2(); + if (test__start_subtest("tailcall_3")) + test_tailcall_3(); + if (test__start_subtest("tailcall_4")) + test_tailcall_4(); + if (test__start_subtest("tailcall_5")) + test_tailcall_5(); +} diff --git a/tools/testing/selftests/bpf/progs/tailcall1.c b/tools/testing/selftests/bpf/progs/tailcall1.c new file mode 100644 index 000000000000..63531e1a9fa4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall1.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include "bpf_helpers.h" + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 3); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +#define TAIL_FUNC(x) \ + SEC("classifier/" #x) \ + int bpf_func_##x(struct __sk_buff *skb) \ + { \ + return x; \ + } +TAIL_FUNC(0) +TAIL_FUNC(1) +TAIL_FUNC(2) + +SEC("classifier") +int entry(struct __sk_buff *skb) +{ + /* Multiple locations to make sure we patch + * all of them. 
+ */ + bpf_tail_call(skb, &jmp_table, 0); + bpf_tail_call(skb, &jmp_table, 0); + bpf_tail_call(skb, &jmp_table, 0); + bpf_tail_call(skb, &jmp_table, 0); + + bpf_tail_call(skb, &jmp_table, 1); + bpf_tail_call(skb, &jmp_table, 1); + bpf_tail_call(skb, &jmp_table, 1); + bpf_tail_call(skb, &jmp_table, 1); + + bpf_tail_call(skb, &jmp_table, 2); + bpf_tail_call(skb, &jmp_table, 2); + bpf_tail_call(skb, &jmp_table, 2); + bpf_tail_call(skb, &jmp_table, 2); + + return 3; +} + +char __license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/tailcall2.c b/tools/testing/selftests/bpf/progs/tailcall2.c new file mode 100644 index 000000000000..21c85c477210 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall2.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include "bpf_helpers.h" + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 5); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +SEC("classifier/0") +int bpf_func_0(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 1); + return 0; +} + +SEC("classifier/1") +int bpf_func_1(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 2); + return 1; +} + +SEC("classifier/2") +int bpf_func_2(struct __sk_buff *skb) +{ + return 2; +} + +SEC("classifier/3") +int bpf_func_3(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 4); + return 3; +} + +SEC("classifier/4") +int bpf_func_4(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 3); + return 4; +} + +SEC("classifier") +int entry(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 0); + /* Check multi-prog update. */ + bpf_tail_call(skb, &jmp_table, 2); + /* Check tail call limit. 
*/ + bpf_tail_call(skb, &jmp_table, 3); + return 3; +} + +char __license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/tailcall3.c b/tools/testing/selftests/bpf/progs/tailcall3.c new file mode 100644 index 000000000000..1ecae198b8c1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall3.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include "bpf_helpers.h" + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +static volatile int count; + +SEC("classifier/0") +int bpf_func_0(struct __sk_buff *skb) +{ + count++; + bpf_tail_call(skb, &jmp_table, 0); + return 1; +} + +SEC("classifier") +int entry(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 0); + return 0; +} + +char __license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/tailcall4.c b/tools/testing/selftests/bpf/progs/tailcall4.c new file mode 100644 index 000000000000..499388758119 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall4.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include "bpf_helpers.h" + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 3); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +static volatile int selector; + +#define TAIL_FUNC(x) \ + SEC("classifier/" #x) \ + int bpf_func_##x(struct __sk_buff *skb) \ + { \ + return x; \ + } +TAIL_FUNC(0) +TAIL_FUNC(1) +TAIL_FUNC(2) + +SEC("classifier") +int entry(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, selector); + return 3; +} + +char __license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/tailcall5.c b/tools/testing/selftests/bpf/progs/tailcall5.c new file mode 100644 index 
000000000000..49c64eb53f19 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall5.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include "bpf_helpers.h" + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 3); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +static volatile int selector; + +#define TAIL_FUNC(x) \ + SEC("classifier/" #x) \ + int bpf_func_##x(struct __sk_buff *skb) \ + { \ + return x; \ + } +TAIL_FUNC(0) +TAIL_FUNC(1) +TAIL_FUNC(2) + +SEC("classifier") +int entry(struct __sk_buff *skb) +{ + int idx = 0; + + if (selector == 1234) + idx = 1; + else if (selector == 5678) + idx = 2; + + bpf_tail_call(skb, &jmp_table, idx); + return 3; +} + +char __license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; From b8cd76ca4ae34731d47cd6a876d912a08efcc240 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 23 Nov 2019 21:37:31 +0100 Subject: [PATCH 25/27] bpf: Add bpf_jit_blinding_enabled for !CONFIG_BPF_JIT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a definition of bpf_jit_blinding_enabled() when CONFIG_BPF_JIT is not set in order to fix a recent build regression: [...] CC kernel/bpf/verifier.o CC kernel/bpf/inode.o kernel/bpf/verifier.c: In function ‘fixup_bpf_calls’: kernel/bpf/verifier.c:9132:25: error: implicit declaration of function ‘bpf_jit_blinding_enabled’; did you mean ‘bpf_jit_kallsyms_enabled’? [-Werror=implicit-function-declaration] 9132 | bool expect_blinding = bpf_jit_blinding_enabled(prog); | ^~~~~~~~~~~~~~~~~~~~~~~~ | bpf_jit_kallsyms_enabled CC kernel/bpf/helpers.o CC kernel/bpf/hashtab.o [...] 
Fixes: d2e4c1e6c294 ("bpf: Constant map key tracking for prog array pokes") Reported-by: Jakub Sitnicki Reported-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/40baf8f3507cac4851a310578edfb98ce73b5605.1574541375.git.daniel@iogearbox.net --- include/linux/filter.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/filter.h b/include/linux/filter.h index 796b60d8cc6c..1b1e8b8f88da 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1053,6 +1053,11 @@ static inline bool ebpf_jit_enabled(void) return false; } +static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) +{ + return false; +} + static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) { return false; From f9a7cf6eb17cd0110c8c47d9e7969fc2716e5772 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 23 Nov 2019 12:25:04 -0800 Subject: [PATCH 26/27] bpf: Introduce BPF_TRACE_x helper for the tracing tests For BPF_PROG_TYPE_TRACING, the bpf_prog's ctx is an array of u64. This patch borrows the idea from BPF_CALL_x in filter.h to convert a u64 to the arg type of the traced function. The new BPF_TRACE_x has an arg to specify the return type of a bpf_prog. It will be used in the future TCP-ops bpf_prog that may return "void". The new macros are defined in the new header file "bpf_trace_helpers.h". It is under selftests/bpf/ for now. It could be moved to libbpf later after seeing more upcoming non-tracing use cases. The tests are changed to use these new macros also. Hence, the k[s]u8/16/32/64 are no longer needed and they are removed from the bpf_helpers.h. 
Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191123202504.1502696-1-kafai@fb.com --- tools/lib/bpf/bpf_helpers.h | 13 --- .../testing/selftests/bpf/bpf_trace_helpers.h | 58 +++++++++++++ .../testing/selftests/bpf/progs/fentry_test.c | 72 ++++------------ .../selftests/bpf/progs/fexit_bpf2bpf.c | 27 ++---- .../testing/selftests/bpf/progs/fexit_test.c | 83 +++++-------------- tools/testing/selftests/bpf/progs/kfree_skb.c | 43 +++------- .../selftests/bpf/progs/test_overhead.c | 16 ++-- 7 files changed, 125 insertions(+), 187 deletions(-) create mode 100644 tools/testing/selftests/bpf/bpf_trace_helpers.h diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index c63ab1add126..0c7d28292898 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -44,17 +44,4 @@ enum libbpf_pin_type { LIBBPF_PIN_BY_NAME, }; -/* The following types should be used by BPF_PROG_TYPE_TRACING program to - * access kernel function arguments. BPF trampoline and raw tracepoints - * typecast arguments to 'unsigned long long'. - */ -typedef int __attribute__((aligned(8))) ks32; -typedef char __attribute__((aligned(8))) ks8; -typedef short __attribute__((aligned(8))) ks16; -typedef long long __attribute__((aligned(8))) ks64; -typedef unsigned int __attribute__((aligned(8))) ku32; -typedef unsigned char __attribute__((aligned(8))) ku8; -typedef unsigned short __attribute__((aligned(8))) ku16; -typedef unsigned long long __attribute__((aligned(8))) ku64; - #endif diff --git a/tools/testing/selftests/bpf/bpf_trace_helpers.h b/tools/testing/selftests/bpf/bpf_trace_helpers.h new file mode 100644 index 000000000000..c76a214a53b0 --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_trace_helpers.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __BPF_TRACE_HELPERS_H +#define __BPF_TRACE_HELPERS_H + +#include "bpf_helpers.h" + +#define __BPF_MAP_0(i, m, v, ...) 
v +#define __BPF_MAP_1(i, m, v, t, a, ...) m(t, a, ctx[i]) +#define __BPF_MAP_2(i, m, v, t, a, ...) m(t, a, ctx[i]), __BPF_MAP_1(i+1, m, v, __VA_ARGS__) +#define __BPF_MAP_3(i, m, v, t, a, ...) m(t, a, ctx[i]), __BPF_MAP_2(i+1, m, v, __VA_ARGS__) +#define __BPF_MAP_4(i, m, v, t, a, ...) m(t, a, ctx[i]), __BPF_MAP_3(i+1, m, v, __VA_ARGS__) +#define __BPF_MAP_5(i, m, v, t, a, ...) m(t, a, ctx[i]), __BPF_MAP_4(i+1, m, v, __VA_ARGS__) +#define __BPF_MAP_6(i, m, v, t, a, ...) m(t, a, ctx[i]), __BPF_MAP_5(i+1, m, v, __VA_ARGS__) +#define __BPF_MAP_7(i, m, v, t, a, ...) m(t, a, ctx[i]), __BPF_MAP_6(i+1, m, v, __VA_ARGS__) +#define __BPF_MAP_8(i, m, v, t, a, ...) m(t, a, ctx[i]), __BPF_MAP_7(i+1, m, v, __VA_ARGS__) +#define __BPF_MAP_9(i, m, v, t, a, ...) m(t, a, ctx[i]), __BPF_MAP_8(i+1, m, v, __VA_ARGS__) +#define __BPF_MAP_10(i, m, v, t, a, ...) m(t, a, ctx[i]), __BPF_MAP_9(i+1, m, v, __VA_ARGS__) +#define __BPF_MAP_11(i, m, v, t, a, ...) m(t, a, ctx[i]), __BPF_MAP_10(i+1, m, v, __VA_ARGS__) +#define __BPF_MAP_12(i, m, v, t, a, ...) m(t, a, ctx[i]), __BPF_MAP_11(i+1, m, v, __VA_ARGS__) +#define __BPF_MAP(n, ...) __BPF_MAP_##n(0, __VA_ARGS__) + +/* BPF sizeof(void *) is always 8, so no need to cast to long first + * for ptr to avoid compiler warning. + */ +#define __BPF_CAST(t, a, ctx) (t) ctx +#define __BPF_V void +#define __BPF_N + +#define __BPF_DECL_ARGS(t, a, ctx) t a + +#define BPF_TRACE_x(x, sec_name, fname, ret_type, ...) \ +static __always_inline ret_type \ +____##fname(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ + \ +SEC(sec_name) \ +ret_type fname(__u64 *ctx) \ +{ \ + return ____##fname(__BPF_MAP(x, __BPF_CAST, __BPF_N, __VA_ARGS__));\ +} \ + \ +static __always_inline \ +ret_type ____##fname(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) + +#define BPF_TRACE_0(sec, fname, ...) BPF_TRACE_x(0, sec, fname, int, __VA_ARGS__) +#define BPF_TRACE_1(sec, fname, ...) BPF_TRACE_x(1, sec, fname, int, __VA_ARGS__) +#define BPF_TRACE_2(sec, fname, ...) 
BPF_TRACE_x(2, sec, fname, int, __VA_ARGS__) +#define BPF_TRACE_3(sec, fname, ...) BPF_TRACE_x(3, sec, fname, int, __VA_ARGS__) +#define BPF_TRACE_4(sec, fname, ...) BPF_TRACE_x(4, sec, fname, int, __VA_ARGS__) +#define BPF_TRACE_5(sec, fname, ...) BPF_TRACE_x(5, sec, fname, int, __VA_ARGS__) +#define BPF_TRACE_6(sec, fname, ...) BPF_TRACE_x(6, sec, fname, int, __VA_ARGS__) +#define BPF_TRACE_7(sec, fname, ...) BPF_TRACE_x(7, sec, fname, int, __VA_ARGS__) +#define BPF_TRACE_8(sec, fname, ...) BPF_TRACE_x(8, sec, fname, int, __VA_ARGS__) +#define BPF_TRACE_9(sec, fname, ...) BPF_TRACE_x(9, sec, fname, int, __VA_ARGS__) +#define BPF_TRACE_10(sec, fname, ...) BPF_TRACE_x(10, sec, fname, int, __VA_ARGS__) +#define BPF_TRACE_11(sec, fname, ...) BPF_TRACE_x(11, sec, fname, int, __VA_ARGS__) +#define BPF_TRACE_12(sec, fname, ...) BPF_TRACE_x(12, sec, fname, int, __VA_ARGS__) + +#endif diff --git a/tools/testing/selftests/bpf/progs/fentry_test.c b/tools/testing/selftests/bpf/progs/fentry_test.c index 545788bf8d50..d2af9f039df5 100644 --- a/tools/testing/selftests/bpf/progs/fentry_test.c +++ b/tools/testing/selftests/bpf/progs/fentry_test.c @@ -2,89 +2,53 @@ /* Copyright (c) 2019 Facebook */ #include #include "bpf_helpers.h" +#include "bpf_trace_helpers.h" char _license[] SEC("license") = "GPL"; -struct test1 { - ks32 a; -}; static volatile __u64 test1_result; -SEC("fentry/bpf_fentry_test1") -int test1(struct test1 *ctx) +BPF_TRACE_1("fentry/bpf_fentry_test1", test1, int, a) { - test1_result = ctx->a == 1; + test1_result = a == 1; return 0; } -struct test2 { - ks32 a; - ku64 b; -}; static volatile __u64 test2_result; -SEC("fentry/bpf_fentry_test2") -int test2(struct test2 *ctx) +BPF_TRACE_2("fentry/bpf_fentry_test2", test2, int, a, __u64, b) { - test2_result = ctx->a == 2 && ctx->b == 3; + test2_result = a == 2 && b == 3; return 0; } -struct test3 { - ks8 a; - ks32 b; - ku64 c; -}; static volatile __u64 test3_result; -SEC("fentry/bpf_fentry_test3") -int test3(struct test3 
*ctx) +BPF_TRACE_3("fentry/bpf_fentry_test3", test3, char, a, int, b, __u64, c) { - test3_result = ctx->a == 4 && ctx->b == 5 && ctx->c == 6; + test3_result = a == 4 && b == 5 && c == 6; return 0; } -struct test4 { - void *a; - ks8 b; - ks32 c; - ku64 d; -}; static volatile __u64 test4_result; -SEC("fentry/bpf_fentry_test4") -int test4(struct test4 *ctx) +BPF_TRACE_4("fentry/bpf_fentry_test4", test4, + void *, a, char, b, int, c, __u64, d) { - test4_result = ctx->a == (void *)7 && ctx->b == 8 && ctx->c == 9 && - ctx->d == 10; + test4_result = a == (void *)7 && b == 8 && c == 9 && d == 10; return 0; } -struct test5 { - ku64 a; - void *b; - ks16 c; - ks32 d; - ku64 e; -}; static volatile __u64 test5_result; -SEC("fentry/bpf_fentry_test5") -int test5(struct test5 *ctx) +BPF_TRACE_5("fentry/bpf_fentry_test5", test5, + __u64, a, void *, b, short, c, int, d, __u64, e) { - test5_result = ctx->a == 11 && ctx->b == (void *)12 && ctx->c == 13 && - ctx->d == 14 && ctx->e == 15; + test5_result = a == 11 && b == (void *)12 && c == 13 && d == 14 && + e == 15; return 0; } -struct test6 { - ku64 a; - void *b; - ks16 c; - ks32 d; - void *e; - ks64 f; -}; static volatile __u64 test6_result; -SEC("fentry/bpf_fentry_test6") -int test6(struct test6 *ctx) +BPF_TRACE_6("fentry/bpf_fentry_test6", test6, + __u64, a, void *, b, short, c, int, d, void *, e, __u64, f) { - test6_result = ctx->a == 16 && ctx->b == (void *)17 && ctx->c == 18 && - ctx->d == 19 && ctx->e == (void *)20 && ctx->f == 21; + test6_result = a == 16 && b == (void *)17 && c == 18 && d == 19 && + e == (void *)20 && f == 21; return 0; } diff --git a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c index 981f0474da5a..525d47d7b589 100644 --- a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c +++ b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c @@ -2,46 +2,37 @@ /* Copyright (c) 2019 Facebook */ #include #include "bpf_helpers.h" +#include "bpf_trace_helpers.h" struct 
sk_buff { unsigned int len; }; -struct args { - struct sk_buff *skb; - ks32 ret; -}; static volatile __u64 test_result; -SEC("fexit/test_pkt_access") -int test_main(struct args *ctx) +BPF_TRACE_2("fexit/test_pkt_access", test_main, + struct sk_buff *, skb, int, ret) { - struct sk_buff *skb = ctx->skb; int len; __builtin_preserve_access_index(({ len = skb->len; })); - if (len != 74 || ctx->ret != 0) + if (len != 74 || ret != 0) return 0; test_result = 1; return 0; } -struct args_subprog1 { - struct sk_buff *skb; - ks32 ret; -}; static volatile __u64 test_result_subprog1; -SEC("fexit/test_pkt_access_subprog1") -int test_subprog1(struct args_subprog1 *ctx) +BPF_TRACE_2("fexit/test_pkt_access_subprog1", test_subprog1, + struct sk_buff *, skb, int, ret) { - struct sk_buff *skb = ctx->skb; int len; __builtin_preserve_access_index(({ len = skb->len; })); - if (len != 74 || ctx->ret != 148) + if (len != 74 || ret != 148) return 0; test_result_subprog1 = 1; return 0; @@ -62,8 +53,8 @@ int test_subprog1(struct args_subprog1 *ctx) * instead of accurate types. 
*/ struct args_subprog2 { - ku64 args[5]; - ku64 ret; + __u64 args[5]; + __u64 ret; }; static volatile __u64 test_result_subprog2; SEC("fexit/test_pkt_access_subprog2") diff --git a/tools/testing/selftests/bpf/progs/fexit_test.c b/tools/testing/selftests/bpf/progs/fexit_test.c index 8b98b1a51784..2487e98edb34 100644 --- a/tools/testing/selftests/bpf/progs/fexit_test.c +++ b/tools/testing/selftests/bpf/progs/fexit_test.c @@ -2,97 +2,56 @@ /* Copyright (c) 2019 Facebook */ #include #include "bpf_helpers.h" +#include "bpf_trace_helpers.h" char _license[] SEC("license") = "GPL"; -struct test1 { - ks32 a; - ks32 ret; -}; static volatile __u64 test1_result; -SEC("fexit/bpf_fentry_test1") -int test1(struct test1 *ctx) +BPF_TRACE_2("fexit/bpf_fentry_test1", test1, int, a, int, ret) { - test1_result = ctx->a == 1 && ctx->ret == 2; + test1_result = a == 1 && ret == 2; return 0; } -struct test2 { - ks32 a; - ku64 b; - ks32 ret; -}; static volatile __u64 test2_result; -SEC("fexit/bpf_fentry_test2") -int test2(struct test2 *ctx) +BPF_TRACE_3("fexit/bpf_fentry_test2", test2, int, a, __u64, b, int, ret) { - test2_result = ctx->a == 2 && ctx->b == 3 && ctx->ret == 5; + test2_result = a == 2 && b == 3 && ret == 5; return 0; } -struct test3 { - ks8 a; - ks32 b; - ku64 c; - ks32 ret; -}; static volatile __u64 test3_result; -SEC("fexit/bpf_fentry_test3") -int test3(struct test3 *ctx) +BPF_TRACE_4("fexit/bpf_fentry_test3", test3, char, a, int, b, __u64, c, int, ret) { - test3_result = ctx->a == 4 && ctx->b == 5 && ctx->c == 6 && - ctx->ret == 15; + test3_result = a == 4 && b == 5 && c == 6 && ret == 15; return 0; } -struct test4 { - void *a; - ks8 b; - ks32 c; - ku64 d; - ks32 ret; -}; static volatile __u64 test4_result; -SEC("fexit/bpf_fentry_test4") -int test4(struct test4 *ctx) +BPF_TRACE_5("fexit/bpf_fentry_test4", test4, + void *, a, char, b, int, c, __u64, d, int, ret) { - test4_result = ctx->a == (void *)7 && ctx->b == 8 && ctx->c == 9 && - ctx->d == 10 && ctx->ret == 34; + + 
test4_result = a == (void *)7 && b == 8 && c == 9 && d == 10 && + ret == 34; return 0; } -struct test5 { - ku64 a; - void *b; - ks16 c; - ks32 d; - ku64 e; - ks32 ret; -}; static volatile __u64 test5_result; -SEC("fexit/bpf_fentry_test5") -int test5(struct test5 *ctx) +BPF_TRACE_6("fexit/bpf_fentry_test5", test5, + __u64, a, void *, b, short, c, int, d, __u64, e, int, ret) { - test5_result = ctx->a == 11 && ctx->b == (void *)12 && ctx->c == 13 && - ctx->d == 14 && ctx->e == 15 && ctx->ret == 65; + test5_result = a == 11 && b == (void *)12 && c == 13 && d == 14 && + e == 15 && ret == 65; return 0; } -struct test6 { - ku64 a; - void *b; - ks16 c; - ks32 d; - void *e; - ks64 f; - ks32 ret; -}; static volatile __u64 test6_result; -SEC("fexit/bpf_fentry_test6") -int test6(struct test6 *ctx) +BPF_TRACE_7("fexit/bpf_fentry_test6", test6, + __u64, a, void *, b, short, c, int, d, void *, e, __u64, f, + int, ret) { - test6_result = ctx->a == 16 && ctx->b == (void *)17 && ctx->c == 18 && - ctx->d == 19 && ctx->e == (void *)20 && ctx->f == 21 && - ctx->ret == 111; + test6_result = a == 16 && b == (void *)17 && c == 18 && d == 19 && + e == (void *)20 && f == 21 && ret == 111; return 0; } diff --git a/tools/testing/selftests/bpf/progs/kfree_skb.c b/tools/testing/selftests/bpf/progs/kfree_skb.c index dcc9feac8338..974d6f3bb319 100644 --- a/tools/testing/selftests/bpf/progs/kfree_skb.c +++ b/tools/testing/selftests/bpf/progs/kfree_skb.c @@ -4,6 +4,7 @@ #include #include "bpf_helpers.h" #include "bpf_endian.h" +#include "bpf_trace_helpers.h" char _license[] SEC("license") = "GPL"; struct { @@ -47,28 +48,18 @@ struct sk_buff { char cb[48]; }; -/* copy arguments from - * include/trace/events/skb.h: - * TRACE_EVENT(kfree_skb, - * TP_PROTO(struct sk_buff *skb, void *location), - * - * into struct below: - */ -struct trace_kfree_skb { - struct sk_buff *skb; - void *location; -}; - struct meta { int ifindex; __u32 cb32_0; __u8 cb8_0; }; -SEC("tp_btf/kfree_skb") -int 
trace_kfree_skb(struct trace_kfree_skb *ctx) +/* TRACE_EVENT(kfree_skb, + * TP_PROTO(struct sk_buff *skb, void *location), + */ +BPF_TRACE_2("tp_btf/kfree_skb", trace_kfree_skb, + struct sk_buff *, skb, void *, location) { - struct sk_buff *skb = ctx->skb; struct net_device *dev; struct callback_head *ptr; void *func; @@ -123,17 +114,10 @@ static volatile struct { bool fexit_test_ok; } result; -struct eth_type_trans_args { - struct sk_buff *skb; - struct net_device *dev; - unsigned short protocol; /* return value available to fexit progs */ -}; - -SEC("fentry/eth_type_trans") -int fentry_eth_type_trans(struct eth_type_trans_args *ctx) +BPF_TRACE_3("fentry/eth_type_trans", fentry_eth_type_trans, + struct sk_buff *, skb, struct net_device *, dev, + unsigned short, protocol) { - struct sk_buff *skb = ctx->skb; - struct net_device *dev = ctx->dev; int len, ifindex; __builtin_preserve_access_index(({ @@ -148,11 +132,10 @@ int fentry_eth_type_trans(struct eth_type_trans_args *ctx) return 0; } -SEC("fexit/eth_type_trans") -int fexit_eth_type_trans(struct eth_type_trans_args *ctx) +BPF_TRACE_3("fexit/eth_type_trans", fexit_eth_type_trans, + struct sk_buff *, skb, struct net_device *, dev, + unsigned short, protocol) { - struct sk_buff *skb = ctx->skb; - struct net_device *dev = ctx->dev; int len, ifindex; __builtin_preserve_access_index(({ @@ -163,7 +146,7 @@ int fexit_eth_type_trans(struct eth_type_trans_args *ctx) /* fexit sees packet without L2 header that eth_type_trans should have * consumed. 
*/ - if (len != 60 || ctx->protocol != bpf_htons(0x86dd) || ifindex != 1) + if (len != 60 || protocol != bpf_htons(0x86dd) || ifindex != 1) return 0; result.fexit_test_ok = true; return 0; diff --git a/tools/testing/selftests/bpf/progs/test_overhead.c b/tools/testing/selftests/bpf/progs/test_overhead.c index ef06b2693f96..96c0124a04ba 100644 --- a/tools/testing/selftests/bpf/progs/test_overhead.c +++ b/tools/testing/selftests/bpf/progs/test_overhead.c @@ -3,6 +3,7 @@ #include #include "bpf_helpers.h" #include "bpf_tracing.h" +#include "bpf_trace_helpers.h" SEC("kprobe/__set_task_comm") int prog1(struct pt_regs *ctx) @@ -22,20 +23,15 @@ int prog3(struct bpf_raw_tracepoint_args *ctx) return 0; } -struct __set_task_comm_args { - struct task_struct *tsk; - const char *buf; - ku8 exec; -}; - -SEC("fentry/__set_task_comm") -int prog4(struct __set_task_comm_args *ctx) +struct task_struct; +BPF_TRACE_3("fentry/__set_task_comm", prog4, + struct task_struct *, tsk, const char *, buf, __u8, exec) { return 0; } -SEC("fexit/__set_task_comm") -int prog5(struct __set_task_comm_args *ctx) +BPF_TRACE_3("fexit/__set_task_comm", prog5, + struct task_struct *, tsk, const char *, buf, __u8, exec) { return 0; } From b553a6ec570044fc1ae300c6fb24f9ce204c5894 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 24 Nov 2019 01:39:42 +0100 Subject: [PATCH 27/27] bpf: Simplify __bpf_arch_text_poke poke type handling Given that we have BPF_MOD_NOP_TO_{CALL,JUMP}, BPF_MOD_{CALL,JUMP}_TO_NOP and BPF_MOD_{CALL,JUMP}_TO_{CALL,JUMP} poke types and that we also pass in old_addr as well as new_addr, it's a bit redundant and unnecessarily complicates __bpf_arch_text_poke() itself since we can derive the same from the *_addr that were passed in. Hence simplify and use BPF_MOD_{CALL,JUMP} as types which also allows to clean up call-sites. In addition to that, __bpf_arch_text_poke() currently verifies that text matches expected old_insn before we invoke text_poke_bp(). 
Also add a check on new_insn and skip rewrite if it already matches. Reason why this is rather useful is that it avoids making any special casing in prog_array_map_poke_run() when old and new prog were NULL and has the benefit that also for this case we perform a check on text whether it really matches our expectations. Suggested-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/fcb00a2b0b288d6c73de4ef58116a821c8fe8f2f.1574555798.git.daniel@iogearbox.net --- arch/x86/net/bpf_jit_comp.c | 85 +++++++++++-------------------------- include/linux/bpf.h | 10 +---- kernel/bpf/arraymap.c | 12 +----- kernel/bpf/trampoline.c | 8 ++-- 4 files changed, 32 insertions(+), 83 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 15615c94804f..b8be18427277 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -269,76 +269,42 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *old_addr, void *new_addr, const bool text_live) { - int (*emit_patch_fn)(u8 **pprog, void *func, void *ip); const u8 *nop_insn = ideal_nops[NOP_ATOMIC5]; - u8 old_insn[X86_PATCH_SIZE] = {}; - u8 new_insn[X86_PATCH_SIZE] = {}; + u8 old_insn[X86_PATCH_SIZE]; + u8 new_insn[X86_PATCH_SIZE]; u8 *prog; int ret; - switch (t) { - case BPF_MOD_NOP_TO_CALL ... BPF_MOD_CALL_TO_NOP: - emit_patch_fn = emit_call; - break; - case BPF_MOD_NOP_TO_JUMP ... BPF_MOD_JUMP_TO_NOP: - emit_patch_fn = emit_jump; - break; - default: - return -ENOTSUPP; + memcpy(old_insn, nop_insn, X86_PATCH_SIZE); + if (old_addr) { + prog = old_insn; + ret = t == BPF_MOD_CALL ? 
+ emit_call(&prog, old_addr, ip) : + emit_jump(&prog, old_addr, ip); + if (ret) + return ret; } - switch (t) { - case BPF_MOD_NOP_TO_CALL: - case BPF_MOD_NOP_TO_JUMP: - if (!old_addr && new_addr) { - memcpy(old_insn, nop_insn, X86_PATCH_SIZE); - - prog = new_insn; - ret = emit_patch_fn(&prog, new_addr, ip); - if (ret) - return ret; - break; - } - return -ENXIO; - case BPF_MOD_CALL_TO_CALL: - case BPF_MOD_JUMP_TO_JUMP: - if (old_addr && new_addr) { - prog = old_insn; - ret = emit_patch_fn(&prog, old_addr, ip); - if (ret) - return ret; - - prog = new_insn; - ret = emit_patch_fn(&prog, new_addr, ip); - if (ret) - return ret; - break; - } - return -ENXIO; - case BPF_MOD_CALL_TO_NOP: - case BPF_MOD_JUMP_TO_NOP: - if (old_addr && !new_addr) { - memcpy(new_insn, nop_insn, X86_PATCH_SIZE); - - prog = old_insn; - ret = emit_patch_fn(&prog, old_addr, ip); - if (ret) - return ret; - break; - } - return -ENXIO; - default: - return -ENOTSUPP; + memcpy(new_insn, nop_insn, X86_PATCH_SIZE); + if (new_addr) { + prog = new_insn; + ret = t == BPF_MOD_CALL ? + emit_call(&prog, new_addr, ip) : + emit_jump(&prog, new_addr, ip); + if (ret) + return ret; } ret = -EBUSY; mutex_lock(&text_mutex); if (memcmp(ip, old_insn, X86_PATCH_SIZE)) goto out; - if (text_live) - text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); - else - memcpy(ip, new_insn, X86_PATCH_SIZE); + if (memcmp(ip, new_insn, X86_PATCH_SIZE)) { + if (text_live) + text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); + else + memcpy(ip, new_insn, X86_PATCH_SIZE); + } ret = 0; out: mutex_unlock(&text_mutex); @@ -465,7 +431,6 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) { - static const enum bpf_text_poke_type type = BPF_MOD_NOP_TO_JUMP; struct bpf_jit_poke_descriptor *poke; struct bpf_array *array; struct bpf_prog *target; @@ -490,7 +455,7 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) * read-only. 
Both modifications on the given image * are under text_mutex to avoid interference. */ - ret = __bpf_arch_text_poke(poke->ip, type, NULL, + ret = __bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP, NULL, (u8 *)target->bpf_func + poke->adj_off, false); BUG_ON(ret < 0); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c2f07fd410c1..35903f148be5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1324,14 +1324,8 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, #endif /* CONFIG_INET */ enum bpf_text_poke_type { - /* All call-related pokes. */ - BPF_MOD_NOP_TO_CALL, - BPF_MOD_CALL_TO_CALL, - BPF_MOD_CALL_TO_NOP, - /* All jump-related pokes. */ - BPF_MOD_NOP_TO_JUMP, - BPF_MOD_JUMP_TO_JUMP, - BPF_MOD_JUMP_TO_NOP, + BPF_MOD_CALL, + BPF_MOD_JUMP, }; int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 58bdf5fd24cc..f0d19bbb9211 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -746,19 +746,9 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, struct bpf_prog *old, struct bpf_prog *new) { - enum bpf_text_poke_type type; struct prog_poke_elem *elem; struct bpf_array_aux *aux; - if (!old && new) - type = BPF_MOD_NOP_TO_JUMP; - else if (old && !new) - type = BPF_MOD_JUMP_TO_NOP; - else if (old && new) - type = BPF_MOD_JUMP_TO_JUMP; - else - return; - aux = container_of(map, struct bpf_array, map)->aux; WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex)); @@ -806,7 +796,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, poke->tail_call.key != key) continue; - ret = bpf_arch_text_poke(poke->ip, type, + ret = bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP, old ? (u8 *)old->bpf_func + poke->adj_off : NULL, new ? 
(u8 *)new->bpf_func + diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 10ae59d65f13..7e89f1f49d77 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -77,7 +77,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) int err; if (fentry_cnt + fexit_cnt == 0) { - err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL_TO_NOP, + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL, old_image, NULL); tr->selector = 0; goto out; @@ -105,12 +105,12 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) if (tr->selector) /* progs already running at this address */ - err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL_TO_CALL, + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL, old_image, new_image); else /* first time registering */ - err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP_TO_CALL, - NULL, new_image); + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL, NULL, + new_image); if (err) goto out; tr->selector++;