From b428336676dbca363262cc134b6218205df4f530 Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Tue, 4 Aug 2020 17:44:09 -0400 Subject: [PATCH 01/75] netfilter: nf_tables: nft_exthdr: the presence return value should be little-endian On big-endian machine, the returned register data when the exthdr is present is not being compared correctly because little-endian is assumed. The function nft_cmp_fast_mask(), called by nft_cmp_fast_eval() and nft_cmp_fast_init(), calls cpu_to_le32(). The following dump also shows that little endian is assumed: $ nft --debug=netlink add rule ip recordroute forward ip option rr exists counter ip [ exthdr load ipv4 1b @ 7 + 0 present => reg 1 ] [ cmp eq reg 1 0x01000000 ] [ counter pkts 0 bytes 0 ] Lastly, debug print in nft_cmp_fast_init() and nft_cmp_fast_eval() when RR option exists in the packet shows that the comparison fails because the assumption: nft_cmp_fast_init:189 priv->sreg=4 desc.len=8 mask=0xff000000 data.data[0]=0x10003e0 nft_cmp_fast_eval:57 regs->data[priv->sreg=4]=0x1 mask=0xff000000 priv->data=0x1000000 v2: use nft_reg_store8() instead (Florian Westphal). Also to avoid the warnings reported by kernel test robot. Fixes: dbb5281a1f84 ("netfilter: nf_tables: add support for matching IPv4 options") Fixes: c078ca3b0c5b ("netfilter: nft_exthdr: Add support for existence check") Signed-off-by: Stephen Suryaputra Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_exthdr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 07782836fad6..3c48cdc8935d 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -44,7 +44,7 @@ static void nft_exthdr_ipv6_eval(const struct nft_expr *expr, err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL); if (priv->flags & NFT_EXTHDR_F_PRESENT) { - *dest = (err >= 0); + nft_reg_store8(dest, err >= 0); return; } else if (err < 0) { goto err; @@ -141,7 +141,7 @@ static void nft_exthdr_ipv4_eval(const struct nft_expr *expr, err = ipv4_find_option(nft_net(pkt), skb, &offset, priv->type); if (priv->flags & NFT_EXTHDR_F_PRESENT) { - *dest = (err >= 0); + nft_reg_store8(dest, err >= 0); return; } else if (err < 0) { goto err; From 2f941622fd88328ca75806c45c9e9709286a0609 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 9 Aug 2020 20:28:01 +0200 Subject: [PATCH 02/75] netfilter: nft_compat: remove flush counter optimization WARNING: CPU: 1 PID: 16059 at lib/refcount.c:31 refcount_warn_saturate+0xdf/0xf [..] __nft_mt_tg_destroy+0x42/0x50 [nft_compat] nft_target_destroy+0x63/0x80 [nft_compat] nf_tables_expr_destroy+0x1b/0x30 [nf_tables] nf_tables_rule_destroy+0x3a/0x70 [nf_tables] nf_tables_exit_net+0x186/0x3d0 [nf_tables] Happens when a compat expr is destoyed from abort path. There is no functional impact; after this work queue is flushed unconditionally if its pending. This removes the waitcount optimization. Test of repeated iptables-restore of a ~60k kubernetes ruleset doesn't indicate a slowdown. In case the counter is needed after all for some workloads we can revert this and increment the refcount for the != NFT_PREPARE_TRANS case to avoid the increment/decrement imbalance. While at it, also flush for match case, this was an oversight in the original patch. Fixes: ffe8923f109b7e ("netfilter: nft_compat: make sure xtables destructors have run") Reported-by: kernel test robot Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 6428856ccbec..8e56f353ff35 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -27,8 +27,6 @@ struct nft_xt_match_priv { void *info; }; -static refcount_t nft_compat_pending_destroy = REFCOUNT_INIT(1); - static int nft_compat_chain_validate_dependency(const struct nft_ctx *ctx, const char *tablename) { @@ -215,6 +213,17 @@ static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv) return 0; } +static void nft_compat_wait_for_destructors(void) +{ + /* xtables matches or targets can have side effects, e.g. + * creation/destruction of /proc files. + * The xt ->destroy functions are run asynchronously from + * work queue. If we have pending invocations we thus + * need to wait for those to finish. + */ + nf_tables_trans_destroy_flush_work(); +} + static int nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -238,14 +247,7 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv); - /* xtables matches or targets can have side effects, e.g. - * creation/destruction of /proc files. - * The xt ->destroy functions are run asynchronously from - * work queue. If we have pending invocations we thus - * need to wait for those to finish. - */ - if (refcount_read(&nft_compat_pending_destroy) > 1) - nf_tables_trans_destroy_flush_work(); + nft_compat_wait_for_destructors(); ret = xt_check_target(&par, size, proto, inv); if (ret < 0) @@ -260,7 +262,6 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, static void __nft_mt_tg_destroy(struct module *me, const struct nft_expr *expr) { - refcount_dec(&nft_compat_pending_destroy); module_put(me); kfree(expr->ops); } @@ -468,6 +469,8 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv); + nft_compat_wait_for_destructors(); + return xt_check_match(&par, size, proto, inv); } @@ -716,14 +719,6 @@ static const struct nfnetlink_subsystem nfnl_compat_subsys = { static struct nft_expr_type nft_match_type; -static void nft_mt_tg_deactivate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - enum nft_trans_phase phase) -{ - if (phase == NFT_TRANS_COMMIT) - refcount_inc(&nft_compat_pending_destroy); -} - static const struct nft_expr_ops * nft_match_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -762,7 +757,6 @@ nft_match_select_ops(const struct nft_ctx *ctx, ops->type = &nft_match_type; ops->eval = nft_match_eval; ops->init = nft_match_init; - ops->deactivate = nft_mt_tg_deactivate, ops->destroy = nft_match_destroy; ops->dump = nft_match_dump; ops->validate = nft_match_validate; @@ -853,7 +847,6 @@ nft_target_select_ops(const struct nft_ctx *ctx, ops->size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize)); ops->init = nft_target_init; ops->destroy = nft_target_destroy; - ops->deactivate = nft_mt_tg_deactivate, ops->dump = nft_target_dump; ops->validate = nft_target_validate; ops->data = target; @@ -917,8 +910,6 @@ static void __exit nft_compat_module_exit(void) nfnetlink_subsys_unregister(&nfnl_compat_subsys); nft_unregister_expr(&nft_target_type); nft_unregister_expr(&nft_match_type); - - WARN_ON_ONCE(refcount_read(&nft_compat_pending_destroy) != 1); } MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFT_COMPAT); From 63fe3fd393dc4e7ea3948e79947362ffbb0fd616 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 10 Aug 2020 20:08:52 -0700 Subject: [PATCH 03/75] libbpf: Do not use __builtin_offsetof for offsetof Commit 5fbc220862fc ("tools/libpf: Add offsetof/container_of macro in bpf_helpers.h") added a macro offsetof() to get the offset of a structure member: #define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) In certain use cases, size_t type may not be available so Commit da7a35062bcc ("libbpf bpf_helpers: Use __builtin_offsetof for offsetof") changed to use __builtin_offsetof which removed the dependency on type size_t, which I suggested. But using __builtin_offsetof will prevent CO-RE relocation generation in case that, e.g., TYPE is annotated with "preserve_access_info" where a relocation is desirable in case the member offset is changed in a different kernel version. So this patch reverted back to the original macro but using "unsigned long" instead of "site_t". Fixes: da7a35062bcc ("libbpf bpf_helpers: Use __builtin_offsetof for offsetof") Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Ian Rogers Link: https://lore.kernel.org/bpf/20200811030852.3396929-1-yhs@fb.com --- tools/lib/bpf/bpf_helpers.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index bc14db706b88..e9a4ecddb7a5 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -40,7 +40,7 @@ * Helper macro to manipulate data structures */ #ifndef offsetof -#define offsetof(TYPE, MEMBER) __builtin_offsetof(TYPE, MEMBER) +#define offsetof(TYPE, MEMBER) ((unsigned long)&((TYPE *)0)->MEMBER) #endif #ifndef container_of #define container_of(ptr, type, member) \ From 0390c429dbed4068bd2cd8dded937d9a5ec24cd2 Mon Sep 17 00:00:00 2001 From: Jianlin Lv Date: Mon, 10 Aug 2020 23:39:40 +0800 Subject: [PATCH 04/75] selftests/bpf: Fix segmentation fault in test_progs test_progs reports the segmentation fault as below: $ sudo ./test_progs -t mmap --verbose test_mmap:PASS:skel_open_and_load 0 nsec [...] test_mmap:PASS:adv_mmap1 0 nsec test_mmap:PASS:adv_mmap2 0 nsec test_mmap:PASS:adv_mmap3 0 nsec test_mmap:PASS:adv_mmap4 0 nsec Segmentation fault This issue was triggered because mmap() and munmap() used inconsistent length parameters; mmap() creates a new mapping of 3 * page_size, but the length parameter set in the subsequent re-map and munmap() functions is 4 * page_size; this leads to the destruction of the process space. To fix this issue, first create 4 pages of anonymous mapping, then do all the mmap() with MAP_FIXED. Another issue is that when unmap the second page fails, the length parameter to delete tmp1 mappings should be 4 * page_size. Signed-off-by: Jianlin Lv Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200810153940.125508-1-Jianlin.Lv@arm.com --- tools/testing/selftests/bpf/prog_tests/mmap.c | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/mmap.c b/tools/testing/selftests/bpf/prog_tests/mmap.c index 43d0b5578f46..9c3c5c0f068f 100644 --- a/tools/testing/selftests/bpf/prog_tests/mmap.c +++ b/tools/testing/selftests/bpf/prog_tests/mmap.c @@ -21,7 +21,7 @@ void test_mmap(void) const long page_size = sysconf(_SC_PAGE_SIZE); int err, duration = 0, i, data_map_fd, data_map_id, tmp_fd, rdmap_fd; struct bpf_map *data_map, *bss_map; - void *bss_mmaped = NULL, *map_mmaped = NULL, *tmp1, *tmp2; + void *bss_mmaped = NULL, *map_mmaped = NULL, *tmp0, *tmp1, *tmp2; struct test_mmap__bss *bss_data; struct bpf_map_info map_info; __u32 map_info_sz = sizeof(map_info); @@ -183,16 +183,23 @@ void test_mmap(void) /* check some more advanced mmap() manipulations */ - /* map all but last page: pages 1-3 mapped */ - tmp1 = mmap(NULL, 3 * page_size, PROT_READ, MAP_SHARED, - data_map_fd, 0); - if (CHECK(tmp1 == MAP_FAILED, "adv_mmap1", "errno %d\n", errno)) + tmp0 = mmap(NULL, 4 * page_size, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, + -1, 0); + if (CHECK(tmp0 == MAP_FAILED, "adv_mmap0", "errno %d\n", errno)) goto cleanup; + /* map all but last page: pages 1-3 mapped */ + tmp1 = mmap(tmp0, 3 * page_size, PROT_READ, MAP_SHARED | MAP_FIXED, + data_map_fd, 0); + if (CHECK(tmp0 != tmp1, "adv_mmap1", "tmp0: %p, tmp1: %p\n", tmp0, tmp1)) { + munmap(tmp0, 4 * page_size); + goto cleanup; + } + /* unmap second page: pages 1, 3 mapped */ err = munmap(tmp1 + page_size, page_size); if (CHECK(err, "adv_mmap2", "errno %d\n", errno)) { - munmap(tmp1, map_sz); + munmap(tmp1, 4 * page_size); goto cleanup; } @@ -201,7 +208,7 @@ void test_mmap(void) MAP_SHARED | MAP_FIXED, data_map_fd, 0); if (CHECK(tmp2 == MAP_FAILED, "adv_mmap3", "errno %d\n", errno)) { munmap(tmp1, page_size); - munmap(tmp1 + 2*page_size, page_size); + munmap(tmp1 + 2*page_size, 2 * page_size); goto cleanup; } CHECK(tmp1 + page_size != tmp2, "adv_mmap4", @@ -211,7 +218,7 @@ void test_mmap(void) tmp2 = mmap(tmp1, 4 * page_size, PROT_READ, MAP_SHARED | MAP_FIXED, data_map_fd, 0); if (CHECK(tmp2 == MAP_FAILED, "adv_mmap5", "errno %d\n", errno)) { - munmap(tmp1, 3 * page_size); /* unmap page 1 */ + munmap(tmp1, 4 * page_size); /* unmap page 1 */ goto cleanup; } CHECK(tmp1 != tmp2, "adv_mmap6", "tmp1: %p, tmp2: %p\n", tmp1, tmp2); From da7bdfdd23b858e6d97a1e4b461548e23d16977f Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 7 Aug 2020 15:38:46 -0700 Subject: [PATCH 05/75] selftests/bpf: Fix v4_to_v6 in sk_lookup I'm getting some garbage in bytes 8 and 9 when doing conversion from sockaddr_in to sockaddr_in6 (leftover from AF_INET?). Let's explicitly clear the higher bytes. Fixes: 0ab5539f8584 ("selftests/bpf: Tests for BPF_SK_LOOKUP attach point") Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20200807223846.4190917-1-sdf@google.com --- tools/testing/selftests/bpf/prog_tests/sk_lookup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c index c571584c00f5..9ff0412e1fd3 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c @@ -309,6 +309,7 @@ static void v4_to_v6(struct sockaddr_storage *ss) v6->sin6_addr.s6_addr[10] = 0xff; v6->sin6_addr.s6_addr[11] = 0xff; memcpy(&v6->sin6_addr.s6_addr[12], &v4.sin_addr.s_addr, 4); + memset(&v6->sin6_addr.s6_addr[0], 0, 10); } static int udp_recv_send(int server_fd) From 068d9d1eba72423e99162aad3586727180715c2a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 11 Aug 2020 19:29:23 -0700 Subject: [PATCH 06/75] bpf: Fix XDP FD-based attach/detach logic around XDP_FLAGS_UPDATE_IF_NOEXIST MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enforce XDP_FLAGS_UPDATE_IF_NOEXIST only if new BPF program to be attached is non-NULL (i.e., we are not detaching a BPF program). Fixes: d4baa9368a5e ("bpf, xdp: Extract common XDP program attachment logic") Reported-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Tested-by: Stanislav Fomichev Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200812022923.1217922-1-andriin@fb.com --- net/core/dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 7df6c9617321..b5d1129d8310 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8913,10 +8913,6 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack NL_SET_ERR_MSG(extack, "Active program does not match expected"); return -EEXIST; } - if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) { - NL_SET_ERR_MSG(extack, "XDP program already attached"); - return -EBUSY; - } /* put effective new program into new_prog */ if (link) @@ -8927,6 +8923,10 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB ? XDP_MODE_DRV : XDP_MODE_SKB; + if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) { + NL_SET_ERR_MSG(extack, "XDP program already attached"); + return -EBUSY; + } if (!offload && dev_xdp_prog(dev, other_mode)) { NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); return -EEXIST; From 8faf7fc597d59b142af41ddd4a2d59485f75f88a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 11 Aug 2020 19:59:07 -0700 Subject: [PATCH 07/75] tools/bpftool: Make skeleton code C++17-friendly by dropping typeof() Seems like C++17 standard mode doesn't recognize typeof() anymore. This can be tested by compiling test_cpp test with -std=c++17 or -std=c++1z options. The use of typeof in skeleton generated code is unnecessary, all types are well-known at the time of code generation, so remove all typeof()'s to make skeleton code more future-proof when interacting with C++ compilers. Fixes: 985ead416df3 ("bpftool: Add skeleton codegen command") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200812025907.1371956-1-andriin@fb.com --- tools/bpf/bpftool/gen.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 8a4c2b3b0cd6..db80e836816e 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -397,7 +397,7 @@ static int do_skeleton(int argc, char **argv) { \n\ struct %1$s *obj; \n\ \n\ - obj = (typeof(obj))calloc(1, sizeof(*obj)); \n\ + obj = (struct %1$s *)calloc(1, sizeof(*obj)); \n\ if (!obj) \n\ return NULL; \n\ if (%1$s__create_skeleton(obj)) \n\ @@ -461,7 +461,7 @@ static int do_skeleton(int argc, char **argv) { \n\ struct bpf_object_skeleton *s; \n\ \n\ - s = (typeof(s))calloc(1, sizeof(*s)); \n\ + s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s));\n\ if (!s) \n\ return -1; \n\ obj->skeleton = s; \n\ @@ -479,7 +479,7 @@ static int do_skeleton(int argc, char **argv) /* maps */ \n\ s->map_cnt = %zu; \n\ s->map_skel_sz = sizeof(*s->maps); \n\ - s->maps = (typeof(s->maps))calloc(s->map_cnt, s->map_skel_sz);\n\ + s->maps = (struct bpf_map_skeleton *)calloc(s->map_cnt, s->map_skel_sz);\n\ if (!s->maps) \n\ goto err; \n\ ", @@ -515,7 +515,7 @@ static int do_skeleton(int argc, char **argv) /* programs */ \n\ s->prog_cnt = %zu; \n\ s->prog_skel_sz = sizeof(*s->progs); \n\ - s->progs = (typeof(s->progs))calloc(s->prog_cnt, s->prog_skel_sz);\n\ + s->progs = (struct bpf_prog_skeleton *)calloc(s->prog_cnt, s->prog_skel_sz);\n\ if (!s->progs) \n\ goto err; \n\ ", From 702eddc77a905782083b14ccd05b23840675fd18 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 12 Aug 2020 16:39:10 +0200 Subject: [PATCH 08/75] libbpf: Handle GCC built-in types for Arm NEON When building Arm NEON (SIMD) code from lib/raid6/neon.uc, GCC emits DWARF information using a base type "__Poly8_t", which is internal to GCC and not recognized by Clang. This causes build failures when building with Clang a vmlinux.h generated from an arm64 kernel that was built with GCC. vmlinux.h:47284:9: error: unknown type name '__Poly8_t' typedef __Poly8_t poly8x16_t[16]; ^~~~~~~~~ The polyX_t types are defined as unsigned integers in the "Arm C Language Extension" document (101028_Q220_00_en). Emit typedefs based on standard integer types for the GCC internal types, similar to those emitted by Clang. Including linux/kernel.h to use ARRAY_SIZE() incidentally redefined max(), causing a build bug due to different types, hence the seemingly unrelated change. Reported-by: Jakov Petrina Signed-off-by: Jean-Philippe Brucker Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200812143909.3293280-1-jean-philippe@linaro.org --- tools/lib/bpf/btf_dump.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index cf711168d34a..ac81f3f8957a 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "btf.h" #include "hashmap.h" #include "libbpf.h" @@ -549,6 +550,9 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr) } } +static void btf_dump_emit_missing_aliases(struct btf_dump *d, __u32 id, + const struct btf_type *t); + static void btf_dump_emit_struct_fwd(struct btf_dump *d, __u32 id, const struct btf_type *t); static void btf_dump_emit_struct_def(struct btf_dump *d, __u32 id, @@ -671,6 +675,9 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id) switch (kind) { case BTF_KIND_INT: + /* Emit type alias definitions if necessary */ + btf_dump_emit_missing_aliases(d, id, t); + tstate->emit_state = EMITTED; break; case BTF_KIND_ENUM: @@ -870,7 +877,7 @@ static void btf_dump_emit_struct_def(struct btf_dump *d, btf_dump_printf(d, ": %d", m_sz); off = m_off + m_sz; } else { - m_sz = max(0, btf__resolve_size(d->btf, m->type)); + m_sz = max(0LL, btf__resolve_size(d->btf, m->type)); off = m_off + m_sz * 8; } btf_dump_printf(d, ";"); @@ -890,6 +897,32 @@ static void btf_dump_emit_struct_def(struct btf_dump *d, btf_dump_printf(d, " __attribute__((packed))"); } +static const char *missing_base_types[][2] = { + /* + * GCC emits typedefs to its internal __PolyX_t types when compiling Arm + * SIMD intrinsics. Alias them to standard base types. + */ + { "__Poly8_t", "unsigned char" }, + { "__Poly16_t", "unsigned short" }, + { "__Poly64_t", "unsigned long long" }, + { "__Poly128_t", "unsigned __int128" }, +}; + +static void btf_dump_emit_missing_aliases(struct btf_dump *d, __u32 id, + const struct btf_type *t) +{ + const char *name = btf_dump_type_name(d, id); + int i; + + for (i = 0; i < ARRAY_SIZE(missing_base_types); i++) { + if (strcmp(name, missing_base_types[i][0]) == 0) { + btf_dump_printf(d, "typedef %s %s;\n\n", + missing_base_types[i][1], name); + break; + } + } +} + static void btf_dump_emit_enum_fwd(struct btf_dump *d, __u32 id, const struct btf_type *t) { From b33164f2bd1cedb094c32cb466287116164457ae Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 12 Aug 2020 14:31:02 +0200 Subject: [PATCH 09/75] bpf: Iterate through all PT_NOTE sections when looking for build id Currently when we look for build id within bpf_get_stackid helper call, we check the first NOTE section and we fail if build id is not there. However on some system (Fedora) there can be multiple NOTE sections in binaries and build id data is not always the first one, like: $ readelf -a /usr/bin/ls ... [ 2] .note.gnu.propert NOTE 0000000000000338 00000338 0000000000000020 0000000000000000 A 0 0 8358 [ 3] .note.gnu.build-i NOTE 0000000000000358 00000358 0000000000000024 0000000000000000 A 0 0 437c [ 4] .note.ABI-tag NOTE 000000000000037c 0000037c ... So the stack_map_get_build_id function will fail on build id retrieval and fallback to BPF_STACK_BUILD_ID_IP. This patch is changing the stack_map_get_build_id code to iterate through all the NOTE sections and try to get build id data from each of them. When tracing on sched_switch tracepoint that does bpf_get_stackid helper call kernel build, I can see about 60% increase of successful build id retrieval. The rest seems fails on -EFAULT. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200812123102.20032-1-jolsa@kernel.org --- kernel/bpf/stackmap.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 4fd830a62be2..cfed0ac44d38 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -213,11 +213,13 @@ static int stack_map_get_build_id_32(void *page_addr, phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); - for (i = 0; i < ehdr->e_phnum; ++i) - if (phdr[i].p_type == PT_NOTE) - return stack_map_parse_build_id(page_addr, build_id, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz); + for (i = 0; i < ehdr->e_phnum; ++i) { + if (phdr[i].p_type == PT_NOTE && + !stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz)) + return 0; + } return -EINVAL; } @@ -236,11 +238,13 @@ static int stack_map_get_build_id_64(void *page_addr, phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); - for (i = 0; i < ehdr->e_phnum; ++i) - if (phdr[i].p_type == PT_NOTE) - return stack_map_parse_build_id(page_addr, build_id, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz); + for (i = 0; i < ehdr->e_phnum; ++i) { + if (phdr[i].p_type == PT_NOTE && + !stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz)) + return 0; + } return -EINVAL; } From 2404b73c3f1a5f15726c6ecd226b56f6f992767f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 10 Aug 2020 13:52:15 +0200 Subject: [PATCH 10/75] netfilter: avoid ipv6 -> nf_defrag_ipv6 module dependency nf_ct_frag6_gather is part of nf_defrag_ipv6.ko, not ipv6 core. The current use of the netfilter ipv6 stub indirections causes a module dependency between ipv6 and nf_defrag_ipv6. This prevents nf_defrag_ipv6 module from being removed because ipv6 can't be unloaded. Remove the indirection and always use a direct call. This creates a depency from nf_conntrack_bridge to nf_defrag_ipv6 instead: modinfo nf_conntrack depends: nf_conntrack,nf_defrag_ipv6,bridge .. and nf_conntrack already depends on nf_defrag_ipv6 anyway. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv6.h | 18 ------------------ net/bridge/netfilter/nf_conntrack_bridge.c | 8 ++++++-- net/ipv6/netfilter.c | 3 --- 3 files changed, 6 insertions(+), 23 deletions(-) diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index aac42c28fe62..9b67394471e1 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -58,7 +58,6 @@ struct nf_ipv6_ops { int (*output)(struct net *, struct sock *, struct sk_buff *)); int (*reroute)(struct sk_buff *skb, const struct nf_queue_entry *entry); #if IS_MODULE(CONFIG_IPV6) - int (*br_defrag)(struct net *net, struct sk_buff *skb, u32 user); int (*br_fragment)(struct net *net, struct sock *sk, struct sk_buff *skb, struct nf_bridge_frag_data *data, @@ -117,23 +116,6 @@ static inline int nf_ip6_route(struct net *net, struct dst_entry **dst, #include -static inline int nf_ipv6_br_defrag(struct net *net, struct sk_buff *skb, - u32 user) -{ -#if IS_MODULE(CONFIG_IPV6) - const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); - - if (!v6_ops) - return 1; - - return v6_ops->br_defrag(net, skb, user); -#elif IS_BUILTIN(CONFIG_IPV6) - return nf_ct_frag6_gather(net, skb, user); -#else - return 1; -#endif -} - int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct nf_bridge_frag_data *data, int (*output)(struct net *, struct sock *sk, diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 809673222382..8d033a75a766 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -168,6 +168,7 @@ static unsigned int nf_ct_br_defrag4(struct sk_buff *skb, static unsigned int nf_ct_br_defrag6(struct sk_buff *skb, const struct nf_hook_state *state) { +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) u16 zone_id = NF_CT_DEFAULT_ZONE_ID; enum ip_conntrack_info ctinfo; struct br_input_skb_cb cb; @@ -180,14 +181,17 @@ static unsigned int nf_ct_br_defrag6(struct sk_buff *skb, br_skb_cb_save(skb, &cb, sizeof(struct inet6_skb_parm)); - err = nf_ipv6_br_defrag(state->net, skb, - IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id); + err = nf_ct_frag6_gather(state->net, skb, + IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id); /* queued */ if (err == -EINPROGRESS) return NF_STOLEN; br_skb_cb_restore(skb, &cb, IP6CB(skb)->frag_max_size); return err == 0 ? NF_ACCEPT : NF_DROP; +#else + return NF_ACCEPT; +#endif } static int nf_ct_br_ip_check(const struct sk_buff *skb) diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 409e79b84a83..6d0e942d082d 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -245,9 +245,6 @@ static const struct nf_ipv6_ops ipv6ops = { .route_input = ip6_route_input, .fragment = ip6_fragment, .reroute = nf_ip6_reroute, -#if IS_MODULE(CONFIG_IPV6) && IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) - .br_defrag = nf_ct_frag6_gather, -#endif #if IS_MODULE(CONFIG_IPV6) .br_fragment = br_ip6_fragment, #endif From 59136aa3b2649796a3a1fd90158675f1f640ce0e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 11 Aug 2020 19:39:09 +0200 Subject: [PATCH 11/75] netfilter: nf_tables: free chain context when BINDING flag is missing syzbot found a memory leak in nf_tables_addchain() because the chain object is not free'd correctly on error. Fixes: d0e2c7de92c7 ("netfilter: nf_tables: add NFT_CHAIN_BINDING") Reported-by: syzbot+c99868fde67014f7e9f5@syzkaller.appspotmail.com Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d878e34e3354..fd814e514f94 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2018,8 +2018,10 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (nla[NFTA_CHAIN_NAME]) { chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL); } else { - if (!(flags & NFT_CHAIN_BINDING)) - return -EINVAL; + if (!(flags & NFT_CHAIN_BINDING)) { + err = -EINVAL; + goto err1; + } snprintf(name, sizeof(name), "__chain%llu", ++chain_id); chain->name = kstrdup(name, GFP_KERNEL); From 6d006a4e38d5a52ef27a8204ac358b96da8148ec Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 7 Aug 2020 21:31:11 +0200 Subject: [PATCH 12/75] selftests: netfilter: add checktool function avoid repeating the same test for different toolcheck Signed-off-by: Fabian Frederick Signed-off-by: Pablo Neira Ayuso --- .../selftests/netfilter/nft_flowtable.sh | 33 +++++++------------ 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index d3e0809ab368..68a183753c6c 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -21,29 +21,18 @@ ns2out="" log_netns=$(sysctl -n net.netfilter.nf_log_all_netns) -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi +checktool (){ + $1 > /dev/null 2>&1 + if [ $? -ne 0 ];then + echo "SKIP: Could not $2" + exit $ksft_skip + fi +} -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -which nc > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nc (netcat)" - exit $ksft_skip -fi - -ip netns add nsr1 -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace" - exit $ksft_skip -fi +checktool "nft --version" "run test without nft tool" +checktool "ip -Version" "run test without ip tool" +checktool "which nc" "run test without nc (netcat)" +checktool "ip netns add nsr1" "create net namespace" ip netns add ns1 ip netns add ns2 From dd08734d8aca31f8991b26cd69d6bb99b617e451 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 7 Aug 2020 21:31:50 +0200 Subject: [PATCH 13/75] selftests: netfilter: add MTU arguments to flowtables Add some documentation, default values defined in original script and Originator/Link/Responder arguments using getopts like in tools/power/cpupower/bench/cpufreq-bench_plot.sh Signed-off-by: Fabian Frederick Signed-off-by: Pablo Neira Ayuso --- .../selftests/netfilter/nft_flowtable.sh | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index 68a183753c6c..e98cac6f8bfd 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -2,13 +2,18 @@ # SPDX-License-Identifier: GPL-2.0 # # This tests basic flowtable functionality. -# Creates following topology: +# Creates following default topology: # # Originator (MTU 9000) <-Router1-> MTU 1500 <-Router2-> Responder (MTU 2000) # Router1 is the one doing flow offloading, Router2 has no special # purpose other than having a link that is smaller than either Originator # and responder, i.e. TCPMSS announced values are too large and will still # result in fragmentation and/or PMTU discovery. +# +# You can check with different Orgininator/Link/Responder MTU eg: +# sh nft_flowtable.sh -o1000 -l500 -r100 +# + # Kselftest framework requirement - SKIP code is 4. ksft_skip=4 @@ -78,11 +83,24 @@ ip -net nsr2 addr add dead:2::1/64 dev veth1 # ns2 is going via nsr2 with a smaller mtu, so that TCPMSS announced by both peers # is NOT the lowest link mtu. -ip -net nsr1 link set veth0 mtu 9000 -ip -net ns1 link set eth0 mtu 9000 +omtu=9000 +lmtu=1500 +rmtu=2000 -ip -net nsr2 link set veth1 mtu 2000 -ip -net ns2 link set eth0 mtu 2000 +while getopts "o:l:r:" o +do + case $o in + o) omtu=$OPTARG;; + l) lmtu=$OPTARG;; + r) rmtu=$OPTARG;; + esac +done + +ip -net nsr1 link set veth0 mtu $omtu +ip -net ns1 link set eth0 mtu $omtu + +ip -net nsr2 link set veth1 mtu $rmtu +ip -net ns2 link set eth0 mtu $rmtu # transfer-net between nsr1 and nsr2. # these addresses are not used for connections. @@ -136,7 +154,7 @@ table inet filter { # as PMTUd is off. # This rule is deleted for the last test, when we expect PMTUd # to kick in and ensure all packets meet mtu requirements. - meta length gt 1500 accept comment something-to-grep-for + meta length gt $lmtu accept comment something-to-grep-for # next line blocks connection w.o. working offload. # we only do this for reverse dir, because we expect packets to From d8bb9abe21071c64d077f9db3b403823a389464f Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 7 Aug 2020 21:32:20 +0200 Subject: [PATCH 14/75] selftests: netfilter: kill running process only Avoid noise like the following: nft_flowtable.sh: line 250: kill: (4691) - No such process Signed-off-by: Fabian Frederick Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/nft_flowtable.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index e98cac6f8bfd..a47d1d832210 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -250,8 +250,14 @@ test_tcp_forwarding_ip() sleep 3 - kill $lpid - kill $cpid + if ps -p $lpid > /dev/null;then + kill $lpid + fi + + if ps -p $cpid > /dev/null;then + kill $cpid + fi + wait check_transfer "$ns1in" "$ns2out" "ns1 -> ns2" From 23ab656be263813acc3c20623757d3cd1496d9e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 13 Aug 2020 16:29:05 +0200 Subject: [PATCH 15/75] libbpf: Prevent overriding errno when logging errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Turns out there were a few more instances where libbpf didn't save the errno before writing an error message, causing errno to be overridden by the printf() return and the error disappearing if logging is enabled. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200813142905.160381-1-toke@redhat.com --- tools/lib/bpf/libbpf.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 0a06124f7999..0d48c18d5030 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -3478,10 +3478,11 @@ bpf_object__probe_global_data(struct bpf_object *obj) map = bpf_create_map_xattr(&map_attr); if (map < 0) { - cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); + ret = -errno; + cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); pr_warn("Error in %s():%s(%d). Couldn't create simple array map.\n", - __func__, cp, errno); - return -errno; + __func__, cp, -ret); + return ret; } insns[0].imm = map; @@ -6012,9 +6013,10 @@ int bpf_program__pin_instance(struct bpf_program *prog, const char *path, } if (bpf_obj_pin(prog->instances.fds[instance], path)) { - cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); + err = -errno; + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); pr_warn("failed to pin program: %s\n", cp); - return -errno; + return err; } pr_debug("pinned program '%s'\n", path); From fd09af010788a884de1c39537c288830c3d305db Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 11 Aug 2020 15:04:37 -0700 Subject: [PATCH 16/75] bpf: sock_ops ctx access may stomp registers in corner case I had a sockmap program that after doing some refactoring started spewing this splat at me: [18610.807284] BUG: unable to handle kernel NULL pointer dereference at 0000000000000001 [...] [18610.807359] Call Trace: [18610.807370] ? 0xffffffffc114d0d5 [18610.807382] __cgroup_bpf_run_filter_sock_ops+0x7d/0xb0 [18610.807391] tcp_connect+0x895/0xd50 [18610.807400] tcp_v4_connect+0x465/0x4e0 [18610.807407] __inet_stream_connect+0xd6/0x3a0 [18610.807412] ? __inet_stream_connect+0x5/0x3a0 [18610.807417] inet_stream_connect+0x3b/0x60 [18610.807425] __sys_connect+0xed/0x120 After some debugging I was able to build this simple reproducer, __section("sockops/reproducer_bad") int bpf_reproducer_bad(struct bpf_sock_ops *skops) { volatile __maybe_unused __u32 i = skops->snd_ssthresh; return 0; } And along the way noticed that below program ran without splat, __section("sockops/reproducer_good") int bpf_reproducer_good(struct bpf_sock_ops *skops) { volatile __maybe_unused __u32 i = skops->snd_ssthresh; volatile __maybe_unused __u32 family; compiler_barrier(); family = skops->family; return 0; } So I decided to check out the code we generate for the above two programs and noticed each generates the BPF code you would expect, 0000000000000000 : ; volatile __maybe_unused __u32 i = skops->snd_ssthresh; 0: r1 = *(u32 *)(r1 + 96) 1: *(u32 *)(r10 - 4) = r1 ; return 0; 2: r0 = 0 3: exit 0000000000000000 : ; volatile __maybe_unused __u32 i = skops->snd_ssthresh; 0: r2 = *(u32 *)(r1 + 96) 1: *(u32 *)(r10 - 4) = r2 ; family = skops->family; 2: r1 = *(u32 *)(r1 + 20) 3: *(u32 *)(r10 - 8) = r1 ; return 0; 4: r0 = 0 5: exit So we get reasonable assembly, but still something was causing the null pointer dereference. So, we load the programs and dump the xlated version observing that line 0 above 'r* = *(u32 *)(r1 +96)' is going to be translated by the skops access helpers. int bpf_reproducer_bad(struct bpf_sock_ops * skops): ; volatile __maybe_unused __u32 i = skops->snd_ssthresh; 0: (61) r1 = *(u32 *)(r1 +28) 1: (15) if r1 == 0x0 goto pc+2 2: (79) r1 = *(u64 *)(r1 +0) 3: (61) r1 = *(u32 *)(r1 +2340) ; volatile __maybe_unused __u32 i = skops->snd_ssthresh; 4: (63) *(u32 *)(r10 -4) = r1 ; return 0; 5: (b7) r0 = 0 6: (95) exit int bpf_reproducer_good(struct bpf_sock_ops * skops): ; volatile __maybe_unused __u32 i = skops->snd_ssthresh; 0: (61) r2 = *(u32 *)(r1 +28) 1: (15) if r2 == 0x0 goto pc+2 2: (79) r2 = *(u64 *)(r1 +0) 3: (61) r2 = *(u32 *)(r2 +2340) ; volatile __maybe_unused __u32 i = skops->snd_ssthresh; 4: (63) *(u32 *)(r10 -4) = r2 ; family = skops->family; 5: (79) r1 = *(u64 *)(r1 +0) 6: (69) r1 = *(u16 *)(r1 +16) ; family = skops->family; 7: (63) *(u32 *)(r10 -8) = r1 ; return 0; 8: (b7) r0 = 0 9: (95) exit Then we look at lines 0 and 2 above. In the good case we do the zero check in r2 and then load 'r1 + 0' at line 2. Do a quick cross-check into the bpf_sock_ops check and we can confirm that is the 'struct sock *sk' pointer field. But, in the bad case, 0: (61) r1 = *(u32 *)(r1 +28) 1: (15) if r1 == 0x0 goto pc+2 2: (79) r1 = *(u64 *)(r1 +0) Oh no, we read 'r1 +28' into r1, this is skops->fullsock and then in line 2 we read the 'r1 +0' as a pointer. Now jumping back to our spat, [18610.807284] BUG: unable to handle kernel NULL pointer dereference at 0000000000000001 The 0x01 makes sense because that is exactly the fullsock value. And its not a valid dereference so we splat. To fix we need to guard the case when a program is doing a sock_ops field access with src_reg == dst_reg. This is already handled in the load case where the ctx_access handler uses a tmp register being careful to store the old value and restore it. To fix the get case test if src_reg == dst_reg and in this case do the is_fullsock test in the temporary register. Remembering to restore the temporary register before writing to either dst_reg or src_reg to avoid smashing the pointer into the struct holding the tmp variable. Adding this inline code to test_tcpbpf_kern will now be generated correctly from, 9: r2 = *(u32 *)(r2 + 96) to xlated code, 12: (7b) *(u64 *)(r2 +32) = r9 13: (61) r9 = *(u32 *)(r2 +28) 14: (15) if r9 == 0x0 goto pc+4 15: (79) r9 = *(u64 *)(r2 +32) 16: (79) r2 = *(u64 *)(r2 +0) 17: (61) r2 = *(u32 *)(r2 +2348) 18: (05) goto pc+1 19: (79) r9 = *(u64 *)(r2 +32) And in the normal case we keep the original code, because really this is an edge case. From this, 9: r2 = *(u32 *)(r6 + 96) to xlated code, 22: (61) r2 = *(u32 *)(r6 +28) 23: (15) if r2 == 0x0 goto pc+2 24: (79) r2 = *(u64 *)(r6 +0) 25: (61) r2 = *(u32 *)(r2 +2348) So three additional instructions if dst == src register, but I scanned my current code base and did not see this pattern anywhere so should not be a big deal. Further, it seems no one else has hit this or at least reported it so it must a fairly rare pattern. Fixes: 9b1f3d6e5af29 ("bpf: Refactor sock_ops_convert_ctx_access") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/159718347772.4728.2781381670567919577.stgit@john-Precision-5820-Tower --- net/core/filter.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 7124f0fe6974..1baeeff2fcd2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8317,15 +8317,31 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, /* Helper macro for adding read access to tcp_sock or sock fields. */ #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ + int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2; \ BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \ sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == si->src_reg) { \ + *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + fullsock_reg = reg; \ + jmp += 2; \ + } \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ - si->dst_reg, si->src_reg, \ + fullsock_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ is_fullsock)); \ - *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ + if (si->dst_reg == si->src_reg) \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ si->dst_reg, si->src_reg, \ @@ -8334,6 +8350,12 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, OBJ_FIELD), \ si->dst_reg, si->dst_reg, \ offsetof(OBJ, OBJ_FIELD)); \ + if (si->dst_reg == si->src_reg) { \ + *insn++ = BPF_JMP_A(1); \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + } \ } while (0) #define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \ From 84f44df664e9f0e261157e16ee1acd77cc1bb78d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 11 Aug 2020 15:04:56 -0700 Subject: [PATCH 17/75] bpf: sock_ops sk access may stomp registers when dst_reg = src_reg Similar to patch ("bpf: sock_ops ctx access may stomp registers") if the src_reg = dst_reg when reading the sk field of a sock_ops struct we generate xlated code, 53: (61) r9 = *(u32 *)(r9 +28) 54: (15) if r9 == 0x0 goto pc+3 56: (79) r9 = *(u64 *)(r9 +0) This stomps on the r9 reg to do the sk_fullsock check and then when reading the skops->sk field instead of the sk pointer we get the sk_fullsock. To fix use similar pattern noted in the previous fix and use the temp field to save/restore a register used to do sk_fullsock check. After the fix the generated xlated code reads, 52: (7b) *(u64 *)(r9 +32) = r8 53: (61) r8 = *(u32 *)(r9 +28) 54: (15) if r9 == 0x0 goto pc+3 55: (79) r8 = *(u64 *)(r9 +32) 56: (79) r9 = *(u64 *)(r9 +0) 57: (05) goto pc+1 58: (79) r8 = *(u64 *)(r9 +32) Here r9 register was in-use so r8 is chosen as the temporary register. In line 52 r8 is saved in temp variable and at line 54 restored in case fullsock != 0. Finally we handle fullsock == 0 case by restoring at line 58. This adds a new macro SOCK_OPS_GET_SK it is almost possible to merge this with SOCK_OPS_GET_FIELD, but I found the extra branch logic a bit more confusing than just adding a new macro despite a bit of duplicating code. Fixes: 1314ef561102e ("bpf: export bpf_sock for BPF_PROG_TYPE_SOCK_OPS prog type") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/159718349653.4728.6559437186853473612.stgit@john-Precision-5820-Tower --- net/core/filter.c | 49 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 1baeeff2fcd2..b2df52086445 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8358,6 +8358,43 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, } \ } while (0) +#define SOCK_OPS_GET_SK() \ + do { \ + int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 1; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == si->src_reg) { \ + *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + fullsock_reg = reg; \ + jmp += 2; \ + } \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + fullsock_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ + if (si->dst_reg == si->src_reg) \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + if (si->dst_reg == si->src_reg) { \ + *insn++ = BPF_JMP_A(1); \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + } \ + } while (0) + #define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \ SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock) @@ -8642,17 +8679,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked); break; case offsetof(struct bpf_sock_ops, sk): - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct bpf_sock_ops_kern, - is_fullsock), - si->dst_reg, si->src_reg, - offsetof(struct bpf_sock_ops_kern, - is_fullsock)); - *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct bpf_sock_ops_kern, sk), - si->dst_reg, si->src_reg, - offsetof(struct bpf_sock_ops_kern, sk)); + SOCK_OPS_GET_SK(); break; } return insn - insn_buf; From 86ed4be68fdee23df4843a59f91c1ac7fc05e860 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 11 Aug 2020 15:05:14 -0700 Subject: [PATCH 18/75] bpf, selftests: Add tests for ctx access in sock_ops with single register To verify fix ("bpf: sock_ops ctx access may stomp registers in corner case") we want to force compiler to generate the following code when accessing a field with BPF_TCP_SOCK_GET_COMMON, r1 = *(u32 *)(r1 + 96) // r1 is skops ptr Rather than depend on clang to do this we add the test with inline asm to the tcpbpf test. This saves us from having to create another runner and ensures that if we break this again test_tcpbpf will crash. With above code we get the xlated code, 11: (7b) *(u64 *)(r1 +32) = r9 12: (61) r9 = *(u32 *)(r1 +28) 13: (15) if r9 == 0x0 goto pc+4 14: (79) r9 = *(u64 *)(r1 +32) 15: (79) r1 = *(u64 *)(r1 +0) 16: (61) r1 = *(u32 *)(r1 +2348) 17: (05) goto pc+1 18: (79) r9 = *(u64 *)(r1 +32) We also add the normal case where src_reg != dst_reg so we can compare code generation easily from llvm-objdump and ensure that case continues to work correctly. The normal code is xlated to, 20: (b7) r1 = 0 21: (61) r1 = *(u32 *)(r3 +28) 22: (15) if r1 == 0x0 goto pc+2 23: (79) r1 = *(u64 *)(r3 +0) 24: (61) r1 = *(u32 *)(r1 +2348) Where the temp variable is not used. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/159718351457.4728.3295119261717842496.stgit@john-Precision-5820-Tower --- .../testing/selftests/bpf/progs/test_tcpbpf_kern.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c index 1f1966e86e9f..f8b136827fcc 100644 --- a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c @@ -54,6 +54,7 @@ SEC("sockops") int bpf_testcb(struct bpf_sock_ops *skops) { char header[sizeof(struct ipv6hdr) + sizeof(struct tcphdr)]; + struct bpf_sock_ops *reuse = skops; struct tcphdr *thdr; int good_call_rv = 0; int bad_call_rv = 0; @@ -62,6 +63,18 @@ int bpf_testcb(struct bpf_sock_ops *skops) int v = 0; int op; + /* Test reading fields in bpf_sock_ops using single register */ + asm volatile ( + "%[reuse] = *(u32 *)(%[reuse] +96)" + : [reuse] "+r"(reuse) + :); + + asm volatile ( + "%[op] = *(u32 *)(%[skops] +96)" + : [op] "+r"(op) + : [skops] "r"(skops) + :); + op = (int) skops->op; update_event_map(op); From 8e0c1517565f06027b68caf2875620ddf6914eee Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 11 Aug 2020 15:05:33 -0700 Subject: [PATCH 19/75] bpf, selftests: Add tests for sock_ops load with r9, r8.r7 registers Loads in sock_ops case when using high registers requires extra logic to ensure the correct temporary value is used. We need to ensure the temp register does not use either the src_reg or dst_reg. Lets add an asm test to force the logic is triggered. The xlated code is here, 30: (7b) *(u64 *)(r9 +32) = r7 31: (61) r7 = *(u32 *)(r9 +28) 32: (15) if r7 == 0x0 goto pc+2 33: (79) r7 = *(u64 *)(r9 +0) 34: (63) *(u32 *)(r7 +916) = r8 35: (79) r7 = *(u64 *)(r9 +32) Notice r9 and r8 are not used for temp registers and r7 is chosen. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/159718353345.4728.8805043614257933227.stgit@john-Precision-5820-Tower --- tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c index f8b136827fcc..6420b61fbbc8 100644 --- a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c @@ -75,6 +75,13 @@ int bpf_testcb(struct bpf_sock_ops *skops) : [skops] "r"(skops) :); + asm volatile ( + "r9 = %[skops];\n" + "r8 = *(u32 *)(r9 +164);\n" + "*(u32 *)(r9 +164) = r8;\n" + :: [skops] "r"(skops) + : "r9", "r8"); + op = (int) skops->op; update_event_map(op); From 9efa9e499799f939968aff1123cc7e8184960e48 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 11 Aug 2020 15:05:53 -0700 Subject: [PATCH 20/75] bpf, selftests: Add tests to sock_ops for loading sk Add tests to directly accesse sock_ops sk field. Then use it to ensure a bad pointer access will fault if something goes wrong. We do three tests: The first test ensures when we read sock_ops sk pointer into the same register that we don't fault as described earlier. Here r9 is chosen as the temp register. The xlated code is, 36: (7b) *(u64 *)(r1 +32) = r9 37: (61) r9 = *(u32 *)(r1 +28) 38: (15) if r9 == 0x0 goto pc+3 39: (79) r9 = *(u64 *)(r1 +32) 40: (79) r1 = *(u64 *)(r1 +0) 41: (05) goto pc+1 42: (79) r9 = *(u64 *)(r1 +32) The second test ensures the temp register selection does not collide with in-use register r9. Shown here r8 is chosen because r9 is the sock_ops pointer. The xlated code is as follows, 46: (7b) *(u64 *)(r9 +32) = r8 47: (61) r8 = *(u32 *)(r9 +28) 48: (15) if r8 == 0x0 goto pc+3 49: (79) r8 = *(u64 *)(r9 +32) 50: (79) r9 = *(u64 *)(r9 +0) 51: (05) goto pc+1 52: (79) r8 = *(u64 *)(r9 +32) And finally, ensure we didn't break the base case where dst_reg does not equal the source register, 56: (61) r2 = *(u32 *)(r1 +28) 57: (15) if r2 == 0x0 goto pc+1 58: (79) r2 = *(u64 *)(r1 +0) Notice it takes us an extra four instructions when src reg is the same as dst reg. One to save the reg, two to restore depending on the branch taken and a goto to jump over the second restore. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/159718355325.4728.4163036953345999636.stgit@john-Precision-5820-Tower --- .../selftests/bpf/progs/test_tcpbpf_kern.c | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c index 6420b61fbbc8..3e6912e4df3d 100644 --- a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c @@ -82,6 +82,27 @@ int bpf_testcb(struct bpf_sock_ops *skops) :: [skops] "r"(skops) : "r9", "r8"); + asm volatile ( + "r1 = %[skops];\n" + "r1 = *(u64 *)(r1 +184);\n" + "if r1 == 0 goto +1;\n" + "r1 = *(u32 *)(r1 +4);\n" + :: [skops] "r"(skops):"r1"); + + asm volatile ( + "r9 = %[skops];\n" + "r9 = *(u64 *)(r9 +184);\n" + "if r9 == 0 goto +1;\n" + "r9 = *(u32 *)(r9 +4);\n" + :: [skops] "r"(skops):"r9"); + + asm volatile ( + "r1 = %[skops];\n" + "r2 = *(u64 *)(r1 +184);\n" + "if r2 == 0 goto +1;\n" + "r2 = *(u32 *)(r2 +4);\n" + :: [skops] "r"(skops):"r1", "r2"); + op = (int) skops->op; update_event_map(op); From a62f68c172c3954a78c9dbf7e68496c3e9c22aaf Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Thu, 13 Aug 2020 11:08:07 -0700 Subject: [PATCH 21/75] doc: Add link to bpf helpers man page The bpf-helpers(7) man pages provide an invaluable description of the functions that an eBPF program can call at runtime. Link them here. Signed-off-by: Joe Stringer Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200813180807.2821735-1-joe@wand.net.nz --- Documentation/bpf/index.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst index d46429be334e..7df2465fd108 100644 --- a/Documentation/bpf/index.rst +++ b/Documentation/bpf/index.rst @@ -36,6 +36,12 @@ Two sets of Questions and Answers (Q&A) are maintained. bpf_devel_QA +Helper functions +================ + +* `bpf-helpers(7)`_ maintains a list of helpers available to eBPF programs. + + Program types ============= @@ -79,4 +85,5 @@ Other .. _networking-filter: ../networking/filter.rst .. _man-pages: https://www.kernel.org/doc/man-pages/ .. _bpf(2): https://man7.org/linux/man-pages/man2/bpf.2.html +.. _bpf-helpers(7): https://man7.org/linux/man-pages/man7/bpf-helpers.7.html .. _BPF and XDP Reference Guide: https://docs.cilium.io/en/latest/bpf/ From 09f44b753a7d120becc80213c3459183c8acd26b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 13 Aug 2020 13:49:37 -0700 Subject: [PATCH 22/75] tools/bpftool: Fix compilation warnings in 32-bit mode Fix few compilation warnings in bpftool when compiling in 32-bit mode. Abstract away u64 to pointer conversion into a helper function. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200813204945.1020225-2-andriin@fb.com --- tools/bpf/bpftool/btf_dumper.c | 2 +- tools/bpf/bpftool/link.c | 4 ++-- tools/bpf/bpftool/main.h | 10 +++++++++- tools/bpf/bpftool/prog.c | 16 ++++++++-------- 4 files changed, 20 insertions(+), 12 deletions(-) diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index ede162f83eea..0e9310727281 100644 --- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -67,7 +67,7 @@ static int dump_prog_id_as_func_ptr(const struct btf_dumper *d, if (!info->btf_id || !info->nr_func_info || btf__get_from_id(info->btf_id, &prog_btf)) goto print; - finfo = (struct bpf_func_info *)info->func_info; + finfo = u64_to_ptr(info->func_info); func_type = btf__type_by_id(prog_btf, finfo->type_id); if (!func_type || !btf_is_func(func_type)) goto print; diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index 1b793759170e..a89f09e3c848 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -106,7 +106,7 @@ static int show_link_close_json(int fd, struct bpf_link_info *info) switch (info->type) { case BPF_LINK_TYPE_RAW_TRACEPOINT: jsonw_string_field(json_wtr, "tp_name", - (const char *)info->raw_tracepoint.tp_name); + u64_to_ptr(info->raw_tracepoint.tp_name)); break; case BPF_LINK_TYPE_TRACING: err = get_prog_info(info->prog_id, &prog_info); @@ -185,7 +185,7 @@ static int show_link_close_plain(int fd, struct bpf_link_info *info) switch (info->type) { case BPF_LINK_TYPE_RAW_TRACEPOINT: printf("\n\ttp '%s' ", - (const char *)info->raw_tracepoint.tp_name); + (const char *)u64_to_ptr(info->raw_tracepoint.tp_name)); break; case BPF_LINK_TYPE_TRACING: err = get_prog_info(info->prog_id, &prog_info); diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index e3a79b5a9960..c46e52137b87 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -21,7 +21,15 @@ /* Make sure we do not use kernel-only integer typedefs */ #pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 -#define ptr_to_u64(ptr) ((__u64)(unsigned long)(ptr)) +static inline __u64 ptr_to_u64(const void *ptr) +{ + return (__u64)(unsigned long)ptr; +} + +static inline void *u64_to_ptr(__u64 ptr) +{ + return (void *)(unsigned long)ptr; +} #define NEXT_ARG() ({ argc--; argv++; if (argc < 0) usage(); }) #define NEXT_ARGP() ({ (*argc)--; (*argv)++; if (*argc < 0) usage(); }) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 158995d853b0..d393eb8263a6 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -428,14 +428,14 @@ prog_dump(struct bpf_prog_info *info, enum dump_mode mode, p_info("no instructions returned"); return -1; } - buf = (unsigned char *)(info->jited_prog_insns); + buf = u64_to_ptr(info->jited_prog_insns); member_len = info->jited_prog_len; } else { /* DUMP_XLATED */ if (info->xlated_prog_len == 0 || !info->xlated_prog_insns) { p_err("error retrieving insn dump: kernel.kptr_restrict set?"); return -1; } - buf = (unsigned char *)info->xlated_prog_insns; + buf = u64_to_ptr(info->xlated_prog_insns); member_len = info->xlated_prog_len; } @@ -444,7 +444,7 @@ prog_dump(struct bpf_prog_info *info, enum dump_mode mode, return -1; } - func_info = (void *)info->func_info; + func_info = u64_to_ptr(info->func_info); if (info->nr_line_info) { prog_linfo = bpf_prog_linfo__new(info); @@ -462,7 +462,7 @@ prog_dump(struct bpf_prog_info *info, enum dump_mode mode, n = write(fd, buf, member_len); close(fd); - if (n != member_len) { + if (n != (ssize_t)member_len) { p_err("error writing output file: %s", n < 0 ? strerror(errno) : "short write"); return -1; @@ -492,13 +492,13 @@ prog_dump(struct bpf_prog_info *info, enum dump_mode mode, __u32 i; if (info->nr_jited_ksyms) { kernel_syms_load(&dd); - ksyms = (__u64 *) info->jited_ksyms; + ksyms = u64_to_ptr(info->jited_ksyms); } if (json_output) jsonw_start_array(json_wtr); - lens = (__u32 *) info->jited_func_lens; + lens = u64_to_ptr(info->jited_func_lens); for (i = 0; i < info->nr_jited_func_lens; i++) { if (ksyms) { sym = kernel_syms_search(&dd, ksyms[i]); @@ -559,7 +559,7 @@ prog_dump(struct bpf_prog_info *info, enum dump_mode mode, } else { kernel_syms_load(&dd); dd.nr_jited_ksyms = info->nr_jited_ksyms; - dd.jited_ksyms = (__u64 *) info->jited_ksyms; + dd.jited_ksyms = u64_to_ptr(info->jited_ksyms); dd.btf = btf; dd.func_info = func_info; dd.finfo_rec_size = info->func_info_rec_size; @@ -1681,7 +1681,7 @@ static char *profile_target_name(int tgt_fd) goto out; } - func_info = (struct bpf_func_info *)(info_linear->info.func_info); + func_info = u64_to_ptr(info_linear->info.func_info); t = btf__type_by_id(btf, func_info[0].type_id); if (!t) { p_err("btf %d doesn't have type %d", From 9028bbcc3e12510cac13a9554f1a1e39667a4387 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 13 Aug 2020 13:49:38 -0700 Subject: [PATCH 23/75] selftest/bpf: Fix compilation warnings in 32-bit mode Fix compilation warnings emitted when compiling selftests for 32-bit platform (x86 in my case). Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200813204945.1020225-3-andriin@fb.com --- tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c | 8 ++++---- tools/testing/selftests/bpf/prog_tests/core_extern.c | 4 ++-- tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c | 6 +++--- tools/testing/selftests/bpf/prog_tests/flow_dissector.c | 2 +- tools/testing/selftests/bpf/prog_tests/global_data.c | 6 +++--- tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c | 2 +- tools/testing/selftests/bpf/prog_tests/skb_ctx.c | 2 +- tools/testing/selftests/bpf/test_btf.c | 8 ++++---- tools/testing/selftests/bpf/test_progs.h | 5 +++++ 9 files changed, 24 insertions(+), 19 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c index 7afa4160416f..284d5921c345 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c @@ -159,15 +159,15 @@ void test_bpf_obj_id(void) /* Check getting link info */ info_len = sizeof(struct bpf_link_info) * 2; bzero(&link_infos[i], info_len); - link_infos[i].raw_tracepoint.tp_name = (__u64)&tp_name; + link_infos[i].raw_tracepoint.tp_name = ptr_to_u64(&tp_name); link_infos[i].raw_tracepoint.tp_name_len = sizeof(tp_name); err = bpf_obj_get_info_by_fd(bpf_link__fd(links[i]), &link_infos[i], &info_len); if (CHECK(err || link_infos[i].type != BPF_LINK_TYPE_RAW_TRACEPOINT || link_infos[i].prog_id != prog_infos[i].id || - link_infos[i].raw_tracepoint.tp_name != (__u64)&tp_name || - strcmp((char *)link_infos[i].raw_tracepoint.tp_name, + link_infos[i].raw_tracepoint.tp_name != ptr_to_u64(&tp_name) || + strcmp(u64_to_ptr(link_infos[i].raw_tracepoint.tp_name), "sys_enter") || info_len != sizeof(struct bpf_link_info), "get-link-info(fd)", @@ -178,7 +178,7 @@ void test_bpf_obj_id(void) link_infos[i].type, BPF_LINK_TYPE_RAW_TRACEPOINT, link_infos[i].id, link_infos[i].prog_id, prog_infos[i].id, - (char *)link_infos[i].raw_tracepoint.tp_name, + (const char *)u64_to_ptr(link_infos[i].raw_tracepoint.tp_name), "sys_enter")) goto done; diff --git a/tools/testing/selftests/bpf/prog_tests/core_extern.c b/tools/testing/selftests/bpf/prog_tests/core_extern.c index b093787e9448..1931a158510e 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_extern.c +++ b/tools/testing/selftests/bpf/prog_tests/core_extern.c @@ -159,8 +159,8 @@ void test_core_extern(void) exp = (uint64_t *)&t->data; for (j = 0; j < n; j++) { CHECK(got[j] != exp[j], "check_res", - "result #%d: expected %lx, but got %lx\n", - j, exp[j], got[j]); + "result #%d: expected %llx, but got %llx\n", + j, (__u64)exp[j], (__u64)got[j]); } cleanup: test_core_extern__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c index a895bfed55db..197d0d217b56 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c @@ -16,7 +16,7 @@ static void test_fexit_bpf2bpf_common(const char *obj_file, __u32 duration = 0, retval; struct bpf_map *data_map; const int zero = 0; - u64 *result = NULL; + __u64 *result = NULL; err = bpf_prog_load(target_obj_file, BPF_PROG_TYPE_UNSPEC, &pkt_obj, &pkt_fd); @@ -29,7 +29,7 @@ static void test_fexit_bpf2bpf_common(const char *obj_file, link = calloc(sizeof(struct bpf_link *), prog_cnt); prog = calloc(sizeof(struct bpf_program *), prog_cnt); - result = malloc((prog_cnt + 32 /* spare */) * sizeof(u64)); + result = malloc((prog_cnt + 32 /* spare */) * sizeof(__u64)); if (CHECK(!link || !prog || !result, "alloc_memory", "failed to alloc memory")) goto close_prog; @@ -72,7 +72,7 @@ static void test_fexit_bpf2bpf_common(const char *obj_file, goto close_prog; for (i = 0; i < prog_cnt; i++) - if (CHECK(result[i] != 1, "result", "fexit_bpf2bpf failed err %ld\n", + if (CHECK(result[i] != 1, "result", "fexit_bpf2bpf failed err %llu\n", result[i])) goto close_prog; diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index f11f187990e9..cd6dc80edf18 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -591,7 +591,7 @@ void test_flow_dissector(void) CHECK_ATTR(tattr.data_size_out != sizeof(flow_keys) || err || tattr.retval != 1, tests[i].name, - "err %d errno %d retval %d duration %d size %u/%lu\n", + "err %d errno %d retval %d duration %d size %u/%zu\n", err, errno, tattr.retval, tattr.duration, tattr.data_size_out, sizeof(flow_keys)); CHECK_FLOW_KEYS(tests[i].name, flow_keys, tests[i].keys); diff --git a/tools/testing/selftests/bpf/prog_tests/global_data.c b/tools/testing/selftests/bpf/prog_tests/global_data.c index e3cb62b0a110..9efa7e50eab2 100644 --- a/tools/testing/selftests/bpf/prog_tests/global_data.c +++ b/tools/testing/selftests/bpf/prog_tests/global_data.c @@ -5,7 +5,7 @@ static void test_global_data_number(struct bpf_object *obj, __u32 duration) { int i, err, map_fd; - uint64_t num; + __u64 num; map_fd = bpf_find_map(__func__, obj, "result_number"); if (CHECK_FAIL(map_fd < 0)) @@ -14,7 +14,7 @@ static void test_global_data_number(struct bpf_object *obj, __u32 duration) struct { char *name; uint32_t key; - uint64_t num; + __u64 num; } tests[] = { { "relocate .bss reference", 0, 0 }, { "relocate .data reference", 1, 42 }, @@ -32,7 +32,7 @@ static void test_global_data_number(struct bpf_object *obj, __u32 duration) for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) { err = bpf_map_lookup_elem(map_fd, &tests[i].key, &num); CHECK(err || num != tests[i].num, tests[i].name, - "err %d result %lx expected %lx\n", + "err %d result %llx expected %llx\n", err, num, tests[i].num); } } diff --git a/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c b/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c index dde2b7ae7bc9..935a294f049a 100644 --- a/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c +++ b/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c @@ -28,7 +28,7 @@ void test_prog_run_xattr(void) "err %d errno %d retval %d\n", err, errno, tattr.retval); CHECK_ATTR(tattr.data_size_out != sizeof(pkt_v4), "data_size_out", - "incorrect output size, want %lu have %u\n", + "incorrect output size, want %zu have %u\n", sizeof(pkt_v4), tattr.data_size_out); CHECK_ATTR(buf[5] != 0, "overflow", diff --git a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c index 25de86af2d03..fafeddaad6a9 100644 --- a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c +++ b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c @@ -81,7 +81,7 @@ void test_skb_ctx(void) CHECK_ATTR(tattr.ctx_size_out != sizeof(skb), "ctx_size_out", - "incorrect output size, want %lu have %u\n", + "incorrect output size, want %zu have %u\n", sizeof(skb), tattr.ctx_size_out); for (i = 0; i < 5; i++) diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index 305fae8f80a9..c75fc6447186 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -3883,7 +3883,7 @@ static int test_big_btf_info(unsigned int test_num) info_garbage.garbage = 0; err = bpf_obj_get_info_by_fd(btf_fd, info, &info_len); if (CHECK(err || info_len != sizeof(*info), - "err:%d errno:%d info_len:%u sizeof(*info):%lu", + "err:%d errno:%d info_len:%u sizeof(*info):%zu", err, errno, info_len, sizeof(*info))) { err = -1; goto done; @@ -4094,7 +4094,7 @@ static int do_test_get_info(unsigned int test_num) if (CHECK(err || !info.id || info_len != sizeof(info) || info.btf_size != raw_btf_size || (ret = memcmp(raw_btf, user_btf, expected_nbytes)), - "err:%d errno:%d info.id:%u info_len:%u sizeof(info):%lu raw_btf_size:%u info.btf_size:%u expected_nbytes:%u memcmp:%d", + "err:%d errno:%d info.id:%u info_len:%u sizeof(info):%zu raw_btf_size:%u info.btf_size:%u expected_nbytes:%u memcmp:%d", err, errno, info.id, info_len, sizeof(info), raw_btf_size, info.btf_size, expected_nbytes, ret)) { err = -1; @@ -4730,7 +4730,7 @@ ssize_t get_pprint_expected_line(enum pprint_mapv_kind_t mapv_kind, nexpected_line = snprintf(expected_line, line_size, "%s%u: {%u,0,%d,0x%x,0x%x,0x%x," - "{%lu|[%u,%u,%u,%u,%u,%u,%u,%u]},%s," + "{%llu|[%u,%u,%u,%u,%u,%u,%u,%u]},%s," "%u,0x%x,[[%d,%d],[%d,%d]]}\n", percpu_map ? "\tcpu" : "", percpu_map ? cpu : next_key, @@ -4738,7 +4738,7 @@ ssize_t get_pprint_expected_line(enum pprint_mapv_kind_t mapv_kind, v->unused_bits2a, v->bits28, v->unused_bits2b, - v->ui64, + (__u64)v->ui64, v->ui8a[0], v->ui8a[1], v->ui8a[2], v->ui8a[3], v->ui8a[4], v->ui8a[5], diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 6e09bf738473..dbb820dde138 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -135,6 +135,11 @@ static inline __u64 ptr_to_u64(const void *ptr) return (__u64) (unsigned long) ptr; } +static inline void *u64_to_ptr(__u64 ptr) +{ + return (void *) (unsigned long) ptr; +} + int bpf_find_map(const char *test, struct bpf_object *obj, const char *name); int compare_map_keys(int map1_fd, int map2_fd); int compare_stack_ips(int smap_fd, int amap_fd, int stack_trace_len); From 15728ad3e71c120278105f20fa65b3735e715e0f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 13 Aug 2020 13:49:39 -0700 Subject: [PATCH 24/75] libbpf: Fix BTF-defined map-in-map initialization on 32-bit host arches Libbpf built in 32-bit mode should be careful about not conflating 64-bit BPF pointers in BPF ELF file and host architecture pointers. This patch fixes issue of incorrect initializating of map-in-map inner map slots due to such difference. Fixes: 646f02ffdd49 ("libbpf: Add BTF-defined map-in-map support") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200813204945.1020225-4-andriin@fb.com --- tools/lib/bpf/libbpf.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 0d48c18d5030..6accddaaedab 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5195,7 +5195,8 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, static int bpf_object__collect_map_relos(struct bpf_object *obj, GElf_Shdr *shdr, Elf_Data *data) { - int i, j, nrels, new_sz, ptr_sz = sizeof(void *); + const int bpf_ptr_sz = 8, host_ptr_sz = sizeof(void *); + int i, j, nrels, new_sz; const struct btf_var_secinfo *vi = NULL; const struct btf_type *sec, *var, *def; const struct btf_member *member; @@ -5244,7 +5245,7 @@ static int bpf_object__collect_map_relos(struct bpf_object *obj, vi = btf_var_secinfos(sec) + map->btf_var_idx; if (vi->offset <= rel.r_offset && - rel.r_offset + sizeof(void *) <= vi->offset + vi->size) + rel.r_offset + bpf_ptr_sz <= vi->offset + vi->size) break; } if (j == obj->nr_maps) { @@ -5280,17 +5281,20 @@ static int bpf_object__collect_map_relos(struct bpf_object *obj, return -EINVAL; moff = rel.r_offset - vi->offset - moff; - if (moff % ptr_sz) + /* here we use BPF pointer size, which is always 64 bit, as we + * are parsing ELF that was built for BPF target + */ + if (moff % bpf_ptr_sz) return -EINVAL; - moff /= ptr_sz; + moff /= bpf_ptr_sz; if (moff >= map->init_slots_sz) { new_sz = moff + 1; - tmp = realloc(map->init_slots, new_sz * ptr_sz); + tmp = realloc(map->init_slots, new_sz * host_ptr_sz); if (!tmp) return -ENOMEM; map->init_slots = tmp; memset(map->init_slots + map->init_slots_sz, 0, - (new_sz - map->init_slots_sz) * ptr_sz); + (new_sz - map->init_slots_sz) * host_ptr_sz); map->init_slots_sz = new_sz; } map->init_slots[moff] = targ_map; From 44ad23dfbccbcd26d6ca504eba1ac55755864969 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 13 Aug 2020 13:49:40 -0700 Subject: [PATCH 25/75] libbpf: Handle BTF pointer sizes more carefully With libbpf and BTF it is pretty common to have libbpf built for one architecture, while BTF information was generated for a different architecture (typically, but not always, BPF). In such case, the size of a pointer might differ betweem architectures. libbpf previously was always making an assumption that pointer size for BTF is the same as native architecture pointer size, but that breaks for cases where libbpf is built as 32-bit library, while BTF is for 64-bit architecture. To solve this, add heuristic to determine pointer size by searching for `long` or `unsigned long` integer type and using its size as a pointer size. Also, allow to override the pointer size with a new API btf__set_pointer_size(), for cases where application knows which pointer size should be used. User application can check what libbpf "guessed" by looking at the result of btf__pointer_size(). If it's not 0, then libbpf successfully determined a pointer size, otherwise native arch pointer size will be used. For cases where BTF is parsed from ELF file, use ELF's class (32-bit or 64-bit) to determine pointer size. Fixes: 8a138aed4a80 ("bpf: btf: Add BTF support to libbpf") Fixes: 351131b51c7a ("libbpf: add btf_dump API for BTF-to-C conversion") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200813204945.1020225-5-andriin@fb.com --- tools/lib/bpf/btf.c | 83 ++++++++++++++++++++++++++++++++++++++-- tools/lib/bpf/btf.h | 2 + tools/lib/bpf/btf_dump.c | 4 +- tools/lib/bpf/libbpf.map | 2 + 4 files changed, 87 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 4843e44916f7..7dfca7016aaa 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -41,6 +41,7 @@ struct btf { __u32 types_size; __u32 data_size; int fd; + int ptr_sz; }; static inline __u64 ptr_to_u64(const void *ptr) @@ -221,6 +222,70 @@ const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 type_id) return btf->types[type_id]; } +static int determine_ptr_size(const struct btf *btf) +{ + const struct btf_type *t; + const char *name; + int i; + + for (i = 1; i <= btf->nr_types; i++) { + t = btf__type_by_id(btf, i); + if (!btf_is_int(t)) + continue; + + name = btf__name_by_offset(btf, t->name_off); + if (!name) + continue; + + if (strcmp(name, "long int") == 0 || + strcmp(name, "long unsigned int") == 0) { + if (t->size != 4 && t->size != 8) + continue; + return t->size; + } + } + + return -1; +} + +static size_t btf_ptr_sz(const struct btf *btf) +{ + if (!btf->ptr_sz) + ((struct btf *)btf)->ptr_sz = determine_ptr_size(btf); + return btf->ptr_sz < 0 ? sizeof(void *) : btf->ptr_sz; +} + +/* Return pointer size this BTF instance assumes. The size is heuristically + * determined by looking for 'long' or 'unsigned long' integer type and + * recording its size in bytes. If BTF type information doesn't have any such + * type, this function returns 0. In the latter case, native architecture's + * pointer size is assumed, so will be either 4 or 8, depending on + * architecture that libbpf was compiled for. It's possible to override + * guessed value by using btf__set_pointer_size() API. + */ +size_t btf__pointer_size(const struct btf *btf) +{ + if (!btf->ptr_sz) + ((struct btf *)btf)->ptr_sz = determine_ptr_size(btf); + + if (btf->ptr_sz < 0) + /* not enough BTF type info to guess */ + return 0; + + return btf->ptr_sz; +} + +/* Override or set pointer size in bytes. Only values of 4 and 8 are + * supported. + */ +int btf__set_pointer_size(struct btf *btf, size_t ptr_sz) +{ + if (ptr_sz != 4 && ptr_sz != 8) + return -EINVAL; + btf->ptr_sz = ptr_sz; + return 0; +} + static bool btf_type_is_void(const struct btf_type *t) { return t == &btf_void || btf_is_fwd(t); @@ -253,7 +318,7 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 type_id) size = t->size; goto done; case BTF_KIND_PTR: - size = sizeof(void *); + size = btf_ptr_sz(btf); goto done; case BTF_KIND_TYPEDEF: case BTF_KIND_VOLATILE: @@ -293,9 +358,9 @@ int btf__align_of(const struct btf *btf, __u32 id) switch (kind) { case BTF_KIND_INT: case BTF_KIND_ENUM: - return min(sizeof(void *), (size_t)t->size); + return min(btf_ptr_sz(btf), (size_t)t->size); case BTF_KIND_PTR: - return sizeof(void *); + return btf_ptr_sz(btf); case BTF_KIND_TYPEDEF: case BTF_KIND_VOLATILE: case BTF_KIND_CONST: @@ -533,6 +598,18 @@ struct btf *btf__parse_elf(const char *path, struct btf_ext **btf_ext) if (IS_ERR(btf)) goto done; + switch (gelf_getclass(elf)) { + case ELFCLASS32: + btf__set_pointer_size(btf, 4); + break; + case ELFCLASS64: + btf__set_pointer_size(btf, 8); + break; + default: + pr_warn("failed to get ELF class (bitness) for %s\n", path); + break; + } + if (btf_ext && btf_ext_data) { *btf_ext = btf_ext__new(btf_ext_data->d_buf, btf_ext_data->d_size); diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index f4a1a1d2b9a3..1ca14448df4c 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -76,6 +76,8 @@ LIBBPF_API __s32 btf__find_by_name_kind(const struct btf *btf, LIBBPF_API __u32 btf__get_nr_types(const struct btf *btf); LIBBPF_API const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 id); +LIBBPF_API size_t btf__pointer_size(const struct btf *btf); +LIBBPF_API int btf__set_pointer_size(struct btf *btf, size_t ptr_sz); LIBBPF_API __s64 btf__resolve_size(const struct btf *btf, __u32 type_id); LIBBPF_API int btf__resolve_type(const struct btf *btf, __u32 type_id); LIBBPF_API int btf__align_of(const struct btf *btf, __u32 id); diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index ac81f3f8957a..fe39bd774697 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -61,6 +61,7 @@ struct btf_dump { const struct btf_ext *btf_ext; btf_dump_printf_fn_t printf_fn; struct btf_dump_opts opts; + int ptr_sz; bool strip_mods; /* per-type auxiliary state */ @@ -139,6 +140,7 @@ struct btf_dump *btf_dump__new(const struct btf *btf, d->btf_ext = btf_ext; d->printf_fn = printf_fn; d->opts.ctx = opts ? opts->ctx : NULL; + d->ptr_sz = btf__pointer_size(btf) ? : sizeof(void *); d->type_names = hashmap__new(str_hash_fn, str_equal_fn, NULL); if (IS_ERR(d->type_names)) { @@ -804,7 +806,7 @@ static void btf_dump_emit_bit_padding(const struct btf_dump *d, int align, int lvl) { int off_diff = m_off - cur_off; - int ptr_bits = sizeof(void *) * 8; + int ptr_bits = d->ptr_sz * 8; if (off_diff <= 0) /* no gap */ diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 0c4722bfdd0a..e35bd6cdbdbf 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -295,5 +295,7 @@ LIBBPF_0.1.0 { bpf_program__set_sk_lookup; btf__parse; btf__parse_raw; + btf__pointer_size; btf__set_fd; + btf__set_pointer_size; } LIBBPF_0.0.9; From eed7818adf03e874994b966aa33bc00204dd275a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 13 Aug 2020 13:49:41 -0700 Subject: [PATCH 26/75] selftests/bpf: Fix btf_dump test cases on 32-bit arches Fix btf_dump test cases by hard-coding BPF's pointer size of 8 bytes for cases where it's impossible to deterimne the pointer size (no long type in BTF). In cases where it's known, validate libbpf correctly determines it as 8. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200813204945.1020225-6-andriin@fb.com --- .../selftests/bpf/prog_tests/btf_dump.c | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index cb33a7ee4e04..39fb81d9daeb 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -12,15 +12,16 @@ void btf_dump_printf(void *ctx, const char *fmt, va_list args) static struct btf_dump_test_case { const char *name; const char *file; + bool known_ptr_sz; struct btf_dump_opts opts; } btf_dump_test_cases[] = { - {"btf_dump: syntax", "btf_dump_test_case_syntax", {}}, - {"btf_dump: ordering", "btf_dump_test_case_ordering", {}}, - {"btf_dump: padding", "btf_dump_test_case_padding", {}}, - {"btf_dump: packing", "btf_dump_test_case_packing", {}}, - {"btf_dump: bitfields", "btf_dump_test_case_bitfields", {}}, - {"btf_dump: multidim", "btf_dump_test_case_multidim", {}}, - {"btf_dump: namespacing", "btf_dump_test_case_namespacing", {}}, + {"btf_dump: syntax", "btf_dump_test_case_syntax", true, {}}, + {"btf_dump: ordering", "btf_dump_test_case_ordering", false, {}}, + {"btf_dump: padding", "btf_dump_test_case_padding", true, {}}, + {"btf_dump: packing", "btf_dump_test_case_packing", true, {}}, + {"btf_dump: bitfields", "btf_dump_test_case_bitfields", true, {}}, + {"btf_dump: multidim", "btf_dump_test_case_multidim", false, {}}, + {"btf_dump: namespacing", "btf_dump_test_case_namespacing", false, {}}, }; static int btf_dump_all_types(const struct btf *btf, @@ -62,6 +63,18 @@ static int test_btf_dump_case(int n, struct btf_dump_test_case *t) goto done; } + /* tests with t->known_ptr_sz have no "long" or "unsigned long" type, + * so it's impossible to determine correct pointer size; but if they + * do, it should be 8 regardless of host architecture, becaues BPF + * target is always 64-bit + */ + if (!t->known_ptr_sz) { + btf__set_pointer_size(btf, 8); + } else { + CHECK(btf__pointer_size(btf) != 8, "ptr_sz", "exp %d, got %zu\n", + 8, btf__pointer_size(btf)); + } + snprintf(out_file, sizeof(out_file), "/tmp/%s.output.XXXXXX", t->file); fd = mkstemp(out_file); if (CHECK(fd < 0, "create_tmp", "failed to create file: %d\n", fd)) { From 4c01925f583eaa7d9d003dc87a4b75b8140b4ff6 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 13 Aug 2020 13:49:42 -0700 Subject: [PATCH 27/75] libbpf: Enforce 64-bitness of BTF for BPF object files BPF object files are always targeting 64-bit BPF target architecture, so enforce that at BTF level as well. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200813204945.1020225-7-andriin@fb.com --- tools/lib/bpf/libbpf.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 6accddaaedab..5d20b2da4427 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2434,6 +2434,8 @@ static int bpf_object__init_btf(struct bpf_object *obj, BTF_ELF_SEC, err); goto out; } + /* enforce 8-byte pointers for BPF-targeted BTFs */ + btf__set_pointer_size(obj->btf, 8); err = 0; } if (btf_ext_data) { @@ -2542,6 +2544,8 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) if (IS_ERR(kern_btf)) return PTR_ERR(kern_btf); + /* enforce 8-byte pointers for BPF-targeted BTFs */ + btf__set_pointer_size(obj->btf, 8); bpf_object__sanitize_btf(obj, kern_btf); } From 5705d705832f74395c5465ce93192688f543006a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 13 Aug 2020 13:49:43 -0700 Subject: [PATCH 28/75] selftests/bpf: Correct various core_reloc 64-bit assumptions Ensure that types are memory layout- and field alignment-compatible regardless of 32/64-bitness mix of libbpf and BPF architecture. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200813204945.1020225-8-andriin@fb.com --- .../selftests/bpf/prog_tests/core_reloc.c | 20 +++--- .../selftests/bpf/progs/core_reloc_types.h | 69 ++++++++++--------- 2 files changed, 47 insertions(+), 42 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 084ed26a7d78..a54eafc5e4b3 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -237,8 +237,8 @@ .union_sz = sizeof(((type *)0)->union_field), \ .arr_sz = sizeof(((type *)0)->arr_field), \ .arr_elem_sz = sizeof(((type *)0)->arr_field[0]), \ - .ptr_sz = sizeof(((type *)0)->ptr_field), \ - .enum_sz = sizeof(((type *)0)->enum_field), \ + .ptr_sz = 8, /* always 8-byte pointer for BPF */ \ + .enum_sz = sizeof(((type *)0)->enum_field), \ } #define SIZE_CASE(name) { \ @@ -432,20 +432,20 @@ static struct core_reloc_test_case test_cases[] = { .sb4 = -1, .sb20 = -0x17654321, .u32 = 0xBEEF, - .s32 = -0x3FEDCBA987654321, + .s32 = -0x3FEDCBA987654321LL, }), BITFIELDS_CASE(bitfields___bitfield_vs_int, { - .ub1 = 0xFEDCBA9876543210, + .ub1 = 0xFEDCBA9876543210LL, .ub2 = 0xA6, - .ub7 = -0x7EDCBA987654321, - .sb4 = -0x6123456789ABCDE, - .sb20 = 0xD00D, + .ub7 = -0x7EDCBA987654321LL, + .sb4 = -0x6123456789ABCDELL, + .sb20 = 0xD00DLL, .u32 = -0x76543, - .s32 = 0x0ADEADBEEFBADB0B, + .s32 = 0x0ADEADBEEFBADB0BLL, }), BITFIELDS_CASE(bitfields___just_big_enough, { - .ub1 = 0xF, - .ub2 = 0x0812345678FEDCBA, + .ub1 = 0xFLL, + .ub2 = 0x0812345678FEDCBALL, }), BITFIELDS_ERR_CASE(bitfields___err_too_big_bitfield), diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index 34d84717c946..69139ed66216 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -1,5 +1,10 @@ #include #include + +void preserce_ptr_sz_fn(long x) {} + +#define __bpf_aligned __attribute__((aligned(8))) + /* * KERNEL */ @@ -444,51 +449,51 @@ struct core_reloc_primitives { char a; int b; enum core_reloc_primitives_enum c; - void *d; - int (*f)(const char *); + void *d __bpf_aligned; + int (*f)(const char *) __bpf_aligned; }; struct core_reloc_primitives___diff_enum_def { char a; int b; - void *d; - int (*f)(const char *); + void *d __bpf_aligned; + int (*f)(const char *) __bpf_aligned; enum { X = 100, Y = 200, - } c; /* inline enum def with differing set of values */ + } c __bpf_aligned; /* inline enum def with differing set of values */ }; struct core_reloc_primitives___diff_func_proto { - void (*f)(int); /* incompatible function prototype */ - void *d; - enum core_reloc_primitives_enum c; + void (*f)(int) __bpf_aligned; /* incompatible function prototype */ + void *d __bpf_aligned; + enum core_reloc_primitives_enum c __bpf_aligned; int b; char a; }; struct core_reloc_primitives___diff_ptr_type { - const char * const d; /* different pointee type + modifiers */ - char a; + const char * const d __bpf_aligned; /* different pointee type + modifiers */ + char a __bpf_aligned; int b; enum core_reloc_primitives_enum c; - int (*f)(const char *); + int (*f)(const char *) __bpf_aligned; }; struct core_reloc_primitives___err_non_enum { char a[1]; int b; int c; /* int instead of enum */ - void *d; - int (*f)(const char *); + void *d __bpf_aligned; + int (*f)(const char *) __bpf_aligned; }; struct core_reloc_primitives___err_non_int { char a[1]; - int *b; /* ptr instead of int */ - enum core_reloc_primitives_enum c; - void *d; - int (*f)(const char *); + int *b __bpf_aligned; /* ptr instead of int */ + enum core_reloc_primitives_enum c __bpf_aligned; + void *d __bpf_aligned; + int (*f)(const char *) __bpf_aligned; }; struct core_reloc_primitives___err_non_ptr { @@ -496,7 +501,7 @@ struct core_reloc_primitives___err_non_ptr { int b; enum core_reloc_primitives_enum c; int d; /* int instead of ptr */ - int (*f)(const char *); + int (*f)(const char *) __bpf_aligned; }; /* @@ -507,7 +512,7 @@ struct core_reloc_mods_output { }; typedef const int int_t; -typedef const char *char_ptr_t; +typedef const char *char_ptr_t __bpf_aligned; typedef const int arr_t[7]; struct core_reloc_mods_substruct { @@ -523,9 +528,9 @@ typedef struct { struct core_reloc_mods { int a; int_t b; - char *c; + char *c __bpf_aligned; char_ptr_t d; - int e[3]; + int e[3] __bpf_aligned; arr_t f; struct core_reloc_mods_substruct g; core_reloc_mods_substruct_t h; @@ -535,9 +540,9 @@ struct core_reloc_mods { struct core_reloc_mods___mod_swap { int b; int_t a; - char *d; + char *d __bpf_aligned; char_ptr_t c; - int f[3]; + int f[3] __bpf_aligned; arr_t e; struct { int y; @@ -555,7 +560,7 @@ typedef arr1_t arr2_t; typedef arr2_t arr3_t; typedef arr3_t arr4_t; -typedef const char * const volatile fancy_char_ptr_t; +typedef const char * const volatile fancy_char_ptr_t __bpf_aligned; typedef core_reloc_mods_substruct_t core_reloc_mods_substruct_tt; @@ -567,7 +572,7 @@ struct core_reloc_mods___typedefs { arr4_t e; fancy_char_ptr_t d; fancy_char_ptr_t c; - int3_t b; + int3_t b __bpf_aligned; int3_t a; }; @@ -739,19 +744,19 @@ struct core_reloc_bitfields___bit_sz_change { int8_t sb4: 1; /* 4 -> 1 */ int32_t sb20: 30; /* 20 -> 30 */ /* non-bitfields */ - uint16_t u32; /* 32 -> 16 */ - int64_t s32; /* 32 -> 64 */ + uint16_t u32; /* 32 -> 16 */ + int64_t s32 __bpf_aligned; /* 32 -> 64 */ }; /* turn bitfield into non-bitfield and vice versa */ struct core_reloc_bitfields___bitfield_vs_int { uint64_t ub1; /* 3 -> 64 non-bitfield */ uint8_t ub2; /* 20 -> 8 non-bitfield */ - int64_t ub7; /* 7 -> 64 non-bitfield signed */ - int64_t sb4; /* 4 -> 64 non-bitfield signed */ - uint64_t sb20; /* 20 -> 16 non-bitfield unsigned */ - int32_t u32: 20; /* 32 non-bitfield -> 20 bitfield */ - uint64_t s32: 60; /* 32 non-bitfield -> 60 bitfield */ + int64_t ub7 __bpf_aligned; /* 7 -> 64 non-bitfield signed */ + int64_t sb4 __bpf_aligned; /* 4 -> 64 non-bitfield signed */ + uint64_t sb20 __bpf_aligned; /* 20 -> 16 non-bitfield unsigned */ + int32_t u32: 20; /* 32 non-bitfield -> 20 bitfield */ + uint64_t s32: 60 __bpf_aligned; /* 32 non-bitfield -> 60 bitfield */ }; struct core_reloc_bitfields___just_big_enough { From 0f993845d723c87656552837b412994d6086f086 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 13 Aug 2020 13:49:44 -0700 Subject: [PATCH 29/75] tools/bpftool: Generate data section struct with conservative alignment The comment in the code describes this in good details. Generate such a memory layout that would work both on 32-bit and 64-bit architectures for user-space. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200813204945.1020225-9-andriin@fb.com --- tools/bpf/bpftool/gen.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index db80e836816e..f61184653633 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -143,6 +143,20 @@ static int codegen_datasec_def(struct bpf_object *obj, var_name, align); return -EINVAL; } + /* Assume 32-bit architectures when generating data section + * struct memory layout. Given bpftool can't know which target + * host architecture it's emitting skeleton for, we need to be + * conservative and assume 32-bit one to ensure enough padding + * bytes are generated for pointer and long types. This will + * still work correctly for 64-bit architectures, because in + * the worst case we'll generate unnecessary padding field, + * which on 64-bit architectures is not strictly necessary and + * would be handled by natural 8-byte alignment. But it still + * will be a correct memory layout, based on recorded offsets + * in BTF. + */ + if (align > 4) + align = 4; align_off = (off + align - 1) / align * align; if (align_off != need_off) { From 4fccd2ff74fbad222c69c7604307e0773a37ab8d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 13 Aug 2020 13:49:45 -0700 Subject: [PATCH 30/75] selftests/bpf: Make test_varlen work with 32-bit user-space arch Despite bpftool generating data section memory layout that will work for 32-bit architectures on user-space side, BPF programs should be careful to not use ambiguous types like `long`, which have different size in 32-bit and 64-bit environments. Fix that in test by using __u64 explicitly, which is a recommended approach anyway. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200813204945.1020225-10-andriin@fb.com --- tools/testing/selftests/bpf/prog_tests/varlen.c | 8 ++++---- tools/testing/selftests/bpf/progs/test_varlen.c | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/varlen.c b/tools/testing/selftests/bpf/prog_tests/varlen.c index c75525eab02c..dd324b4933db 100644 --- a/tools/testing/selftests/bpf/prog_tests/varlen.c +++ b/tools/testing/selftests/bpf/prog_tests/varlen.c @@ -44,25 +44,25 @@ void test_varlen(void) CHECK_VAL(bss->payload1_len2, size2); CHECK_VAL(bss->total1, size1 + size2); CHECK(memcmp(bss->payload1, exp_str, size1 + size2), "content_check", - "doesn't match!"); + "doesn't match!\n"); CHECK_VAL(data->payload2_len1, size1); CHECK_VAL(data->payload2_len2, size2); CHECK_VAL(data->total2, size1 + size2); CHECK(memcmp(data->payload2, exp_str, size1 + size2), "content_check", - "doesn't match!"); + "doesn't match!\n"); CHECK_VAL(data->payload3_len1, size1); CHECK_VAL(data->payload3_len2, size2); CHECK_VAL(data->total3, size1 + size2); CHECK(memcmp(data->payload3, exp_str, size1 + size2), "content_check", - "doesn't match!"); + "doesn't match!\n"); CHECK_VAL(data->payload4_len1, size1); CHECK_VAL(data->payload4_len2, size2); CHECK_VAL(data->total4, size1 + size2); CHECK(memcmp(data->payload4, exp_str, size1 + size2), "content_check", - "doesn't match!"); + "doesn't match!\n"); cleanup: test_varlen__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/test_varlen.c b/tools/testing/selftests/bpf/progs/test_varlen.c index cd4b72c55dfe..913acdffd90f 100644 --- a/tools/testing/selftests/bpf/progs/test_varlen.c +++ b/tools/testing/selftests/bpf/progs/test_varlen.c @@ -15,9 +15,9 @@ int test_pid = 0; bool capture = false; /* .bss */ -long payload1_len1 = 0; -long payload1_len2 = 0; -long total1 = 0; +__u64 payload1_len1 = 0; +__u64 payload1_len2 = 0; +__u64 total1 = 0; char payload1[MAX_LEN + MAX_LEN] = {}; /* .data */ From 5c04da55c754c44937b3d19c6522f9023fd5c5d5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 13 Aug 2020 09:46:11 +0200 Subject: [PATCH 31/75] netfilter: ebtables: reject bogus getopt len value syzkaller reports splat: ------------[ cut here ]------------ Buffer overflow detected (80 < 137)! Call Trace: do_ebt_get_ctl+0x2b4/0x790 net/bridge/netfilter/ebtables.c:2317 nf_getsockopt+0x72/0xd0 net/netfilter/nf_sockopt.c:116 ip_getsockopt net/ipv4/ip_sockglue.c:1778 [inline] caused by a copy-to-user with a too-large "*len" value. This adds a argument check on *len just like in the non-compat version of the handler. Before the "Fixes" commit, the reproducer fails with -EINVAL as expected: 1. core calls the "compat" getsockopt version 2. compat getsockopt version detects the *len value is possibly in 64-bit layout (*len != compat_len) 3. compat getsockopt version delegates everything to native getsockopt version 4. native getsockopt rejects invalid *len -> compat handler only sees len == sizeof(compat_struct) for GET_ENTRIES. After the refactor, event sequence is: 1. getsockopt calls "compat" version (len != native_len) 2. compat version attempts to copy *len bytes, where *len is random value from userspace Fixes: fc66de8e16ec ("netfilter/ebtables: clean up compat {get, set}sockopt handling") Reported-by: syzbot+5accb5c62faa1d346480@syzkaller.appspotmail.com Signed-off-by: Florian Westphal Reviewed-by: Christoph Hellwig Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 1641f414d1ba..ebe33b60efd6 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -2238,6 +2238,10 @@ static int compat_do_ebt_get_ctl(struct sock *sk, int cmd, struct ebt_table *t; struct net *net = sock_net(sk); + if ((cmd == EBT_SO_GET_INFO || cmd == EBT_SO_GET_INIT_INFO) && + *len != sizeof(struct compat_ebt_replace)) + return -EINVAL; + if (copy_from_user(&tmp, user, sizeof(tmp))) return -EFAULT; From 38ba8b9241f5848a49b80fddac9ab5f4692e434e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 13 Aug 2020 09:18:34 -0700 Subject: [PATCH 32/75] can: j1939: fix kernel-infoleak in j1939_sk_sock2sockaddr_can() syzbot found that at least 2 bytes of kernel information were leaked during getsockname() on AF_CAN CAN_J1939 socket. Since struct sockaddr_can has in fact two holes, simply clear the whole area before filling it with useful data. BUG: KMSAN: kernel-infoleak in kmsan_copy_to_user+0x81/0x90 mm/kmsan/kmsan_hooks.c:253 CPU: 0 PID: 8466 Comm: syz-executor511 Not tainted 5.8.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x21c/0x280 lib/dump_stack.c:118 kmsan_report+0xf7/0x1e0 mm/kmsan/kmsan_report.c:121 kmsan_internal_check_memory+0x238/0x3d0 mm/kmsan/kmsan.c:423 kmsan_copy_to_user+0x81/0x90 mm/kmsan/kmsan_hooks.c:253 instrument_copy_to_user include/linux/instrumented.h:91 [inline] _copy_to_user+0x18e/0x260 lib/usercopy.c:39 copy_to_user include/linux/uaccess.h:186 [inline] move_addr_to_user+0x3de/0x670 net/socket.c:237 __sys_getsockname+0x407/0x5e0 net/socket.c:1909 __do_sys_getsockname net/socket.c:1920 [inline] __se_sys_getsockname+0x91/0xb0 net/socket.c:1917 __x64_sys_getsockname+0x4a/0x70 net/socket.c:1917 do_syscall_64+0xad/0x160 arch/x86/entry/common.c:386 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x440219 Code: Bad RIP value. RSP: 002b:00007ffe5ee150c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000033 RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 0000000000440219 RDX: 0000000020000240 RSI: 0000000020000100 RDI: 0000000000000003 RBP: 00000000006ca018 R08: 0000000000000000 R09: 00000000004002c8 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000401a20 R13: 0000000000401ab0 R14: 0000000000000000 R15: 0000000000000000 Local variable ----address@__sys_getsockname created at: __sys_getsockname+0x91/0x5e0 net/socket.c:1894 __sys_getsockname+0x91/0x5e0 net/socket.c:1894 Bytes 2-3 of 24 are uninitialized Memory access of size 24 starts at ffff8880ba2c7de8 Data copied to user address 0000000020000100 Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Robin van der Gracht Cc: Oleksij Rempel Cc: Pengutronix Kernel Team Cc: linux-can@vger.kernel.org Acked-by: Oleksij Rempel Link: https://lore.kernel.org/r/20200813161834.4021638-1-edumazet@google.com Signed-off-by: Marc Kleine-Budde --- net/can/j1939/socket.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 78ff9b3f1d40..b634b680177f 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -553,6 +553,11 @@ static int j1939_sk_connect(struct socket *sock, struct sockaddr *uaddr, static void j1939_sk_sock2sockaddr_can(struct sockaddr_can *addr, const struct j1939_sock *jsk, int peer) { + /* There are two holes (2 bytes and 3 bytes) to clear to avoid + * leaking kernel information to user space. + */ + memset(addr, 0, J1939_MIN_NAMELEN); + addr->can_family = AF_CAN; addr->can_ifindex = jsk->ifindex; addr->can_addr.j1939.pgn = jsk->addr.pgn; From b43e3a82bc432c1caaed8950e7662c143470c54c Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 7 Aug 2020 12:51:56 +0200 Subject: [PATCH 33/75] can: j1939: transport: j1939_simple_recv(): ignore local J1939 messages send not by J1939 stack In current J1939 stack implementation, we process all locally send messages as own messages. Even if it was send by CAN_RAW socket. To reproduce it use following commands: testj1939 -P -r can0:0x80 & cansend can0 18238040#0123 This step will trigger false positive not critical warning: j1939_simple_recv: Received already invalidated message With this patch we add additional check to make sure, related skb is own echo message. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20200807105200.26441-2-o.rempel@pengutronix.de Signed-off-by: Marc Kleine-Budde --- net/can/j1939/socket.c | 1 + net/can/j1939/transport.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index b634b680177f..ad973370de12 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -398,6 +398,7 @@ static int j1939_sk_init(struct sock *sk) spin_lock_init(&jsk->sk_session_queue_lock); INIT_LIST_HEAD(&jsk->sk_session_queue); sk->sk_destruct = j1939_sk_sock_destruct; + sk->sk_protocol = CAN_J1939; return 0; } diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 9f99af5b0b11..b135c5e2a86e 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -2017,6 +2017,10 @@ void j1939_simple_recv(struct j1939_priv *priv, struct sk_buff *skb) if (!skb->sk) return; + if (skb->sk->sk_family != AF_CAN || + skb->sk->sk_protocol != CAN_J1939) + return; + j1939_session_list_lock(priv); session = j1939_session_get_simple(priv, skb); j1939_session_list_unlock(priv); From cd3b3636c99fcac52c598b64061f3fe4413c6a12 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 7 Aug 2020 12:51:57 +0200 Subject: [PATCH 34/75] can: j1939: transport: j1939_session_tx_dat(): fix use-after-free read in j1939_tp_txtimer() The current stack implementation do not support ECTS requests of not aligned TP sized blocks. If ECTS will request a block with size and offset spanning two TP blocks, this will cause memcpy() to read beyond the queued skb (which does only contain one TP sized block). Sometimes KASAN will detect this read if the memory region beyond the skb was previously allocated and freed. In other situations it will stay undetected. The ETP transfer in any case will be corrupted. This patch adds a sanity check to avoid this kind of read and abort the session with error J1939_XTP_ABORT_ECTS_TOO_BIG. Reported-by: syzbot+5322482fe520b02aea30@syzkaller.appspotmail.com Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Cc: linux-stable # >= v5.4 Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20200807105200.26441-3-o.rempel@pengutronix.de Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index b135c5e2a86e..30957c9a8eb7 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -787,6 +787,18 @@ static int j1939_session_tx_dat(struct j1939_session *session) if (len > 7) len = 7; + if (offset + len > se_skb->len) { + netdev_err_once(priv->ndev, + "%s: 0x%p: requested data outside of queued buffer: offset %i, len %i, pkt.tx: %i\n", + __func__, session, skcb->offset, se_skb->len , session->pkt.tx); + return -EOVERFLOW; + } + + if (!len) { + ret = -ENOBUFS; + break; + } + memcpy(&dat[1], &tpdat[offset], len); ret = j1939_tp_tx_dat(session, dat, len + 1); if (ret < 0) { @@ -1120,6 +1132,9 @@ static enum hrtimer_restart j1939_tp_txtimer(struct hrtimer *hrtimer) * cleanup including propagation of the error to user space. */ break; + case -EOVERFLOW: + j1939_session_cancel(session, J1939_XTP_ABORT_ECTS_TOO_BIG); + break; case 0: session->tx_retry = 0; break; From af804b7826350d5af728dca4715e473338fbd7e5 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 7 Aug 2020 12:51:58 +0200 Subject: [PATCH 35/75] can: j1939: socket: j1939_sk_bind(): make sure ml_priv is allocated This patch adds check to ensure that the struct net_device::ml_priv is allocated, as it is used later by the j1939 stack. The allocation is done by all mainline CAN network drivers, but when using bond or team devices this is not the case. Bail out if no ml_priv is allocated. Reported-by: syzbot+f03d384f3455d28833eb@syzkaller.appspotmail.com Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Cc: linux-stable # >= v5.4 Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20200807105200.26441-4-o.rempel@pengutronix.de Signed-off-by: Marc Kleine-Budde --- net/can/j1939/socket.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index ad973370de12..b93876c57fc4 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -467,6 +467,14 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) goto out_release_sock; } + if (!ndev->ml_priv) { + netdev_warn_once(ndev, + "No CAN mid layer private allocated, please fix your driver and use alloc_candev()!\n"); + dev_put(ndev); + ret = -ENODEV; + goto out_release_sock; + } + priv = j1939_netdev_start(ndev); dev_put(ndev); if (IS_ERR(priv)) { From 840835c9281215341d84966a8855f267a971e6a3 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 7 Aug 2020 12:51:59 +0200 Subject: [PATCH 36/75] can: j1939: transport: add j1939_session_skb_find_by_offset() function Sometimes it makes no sense to search the skb by pkt.dpo, since we need next the skb within the transaction block. This may happen if we have an ETP session with CTS set to less than 255 packets. After this patch, we will be able to work with ETP sessions where the block size (ETP.CM_CTS byte 2) is less than 255 packets. Reported-by: Henrique Figueira Reported-by: https://github.com/linux-can/can-utils/issues/228 Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20200807105200.26441-5-o.rempel@pengutronix.de Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 30957c9a8eb7..90a2baac8a4a 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -352,17 +352,16 @@ void j1939_session_skb_queue(struct j1939_session *session, skb_queue_tail(&session->skb_queue, skb); } -static struct sk_buff *j1939_session_skb_find(struct j1939_session *session) +static struct +sk_buff *j1939_session_skb_find_by_offset(struct j1939_session *session, + unsigned int offset_start) { struct j1939_priv *priv = session->priv; + struct j1939_sk_buff_cb *do_skcb; struct sk_buff *skb = NULL; struct sk_buff *do_skb; - struct j1939_sk_buff_cb *do_skcb; - unsigned int offset_start; unsigned long flags; - offset_start = session->pkt.dpo * 7; - spin_lock_irqsave(&session->skb_queue.lock, flags); skb_queue_walk(&session->skb_queue, do_skb) { do_skcb = j1939_skb_to_cb(do_skb); @@ -382,6 +381,14 @@ static struct sk_buff *j1939_session_skb_find(struct j1939_session *session) return skb; } +static struct sk_buff *j1939_session_skb_find(struct j1939_session *session) +{ + unsigned int offset_start; + + offset_start = session->pkt.dpo * 7; + return j1939_session_skb_find_by_offset(session, offset_start); +} + /* see if we are receiver * returns 0 for broadcasts, although we will receive them */ @@ -766,7 +773,7 @@ static int j1939_session_tx_dat(struct j1939_session *session) int ret = 0; u8 dat[8]; - se_skb = j1939_session_skb_find(session); + se_skb = j1939_session_skb_find_by_offset(session, session->pkt.tx * 7); if (!se_skb) return -ENOBUFS; @@ -1765,7 +1772,8 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, __func__, session); goto out_session_cancel; } - se_skb = j1939_session_skb_find(session); + + se_skb = j1939_session_skb_find_by_offset(session, packet * 7); if (!se_skb) { netdev_warn(priv->ndev, "%s: 0x%p: no skb found\n", __func__, session); From e052d0540298bfe0f6cbbecdc7e2ea9b859575b2 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 7 Aug 2020 12:52:00 +0200 Subject: [PATCH 37/75] can: j1939: transport: j1939_xtp_rx_dat_one(): compare own packets to detect corruptions Since the stack relays on receiving own packets, it was overwriting own transmit buffer from received packets. At least theoretically, the received echo buffer can be corrupt or changed and the session partner can request to resend previous data. In this case we will re-send bad data. With this patch we will stop to overwrite own TX buffer and use it for sanity checking. Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20200807105200.26441-6-o.rempel@pengutronix.de Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 90a2baac8a4a..5cf107cb447c 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1792,7 +1792,20 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, } tpdat = se_skb->data; - memcpy(&tpdat[offset], &dat[1], nbytes); + if (!session->transmission) { + memcpy(&tpdat[offset], &dat[1], nbytes); + } else { + int err; + + err = memcmp(&tpdat[offset], &dat[1], nbytes); + if (err) + netdev_err_once(priv->ndev, + "%s: 0x%p: Data of RX-looped back packet (%*ph) doesn't match TX data (%*ph)!\n", + __func__, session, + nbytes, &dat[1], + nbytes, &tpdat[offset]); + } + if (packet == session->pkt.rx) session->pkt.rx++; From 3cda505a679ced78d69c889cfb418d1728bb2707 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 3 Aug 2020 11:32:07 -0700 Subject: [PATCH 38/75] igc: Fix PTP initialization Right now, igc_ptp_reset() is called from igc_reset(), which is called from igc_probe() before igc_ptp_init() has a chance to run. It is detected as an attempt to use an spinlock without registering its key first. See log below. To avoid this problem, simplify the initialization: igc_ptp_init() is only called from igc_probe(), and igc_ptp_reset() is only called from igc_reset(). [ 2.736332] INFO: trying to register non-static key. [ 2.736902] input: HDA Intel PCH Front Headphone as /devices/pci0000:00/0000:00:1f.3/sound/card0/input10 [ 2.737513] the code is fine but needs lockdep annotation. [ 2.737513] turning off the locking correctness validator. [ 2.737515] CPU: 8 PID: 239 Comm: systemd-udevd Tainted: G E 5.8.0-rc7+ #13 [ 2.737515] Hardware name: Gigabyte Technology Co., Ltd. Z390 AORUS ULTRA/Z390 AORUS ULTRA-CF, BIOS F7 03/14/2019 [ 2.737516] Call Trace: [ 2.737521] dump_stack+0x78/0xa0 [ 2.737524] register_lock_class+0x6b1/0x6f0 [ 2.737526] ? lockdep_hardirqs_on_prepare+0xca/0x160 [ 2.739177] ? _raw_spin_unlock_irq+0x24/0x50 [ 2.739179] ? trace_hardirqs_on+0x1c/0xf0 [ 2.740820] __lock_acquire+0x56/0x1ff0 [ 2.740823] ? __schedule+0x30c/0x970 [ 2.740825] lock_acquire+0x97/0x3e0 [ 2.740830] ? igc_ptp_reset+0x35/0xf0 [igc] [ 2.740833] ? schedule_hrtimeout_range_clock+0xb7/0x120 [ 2.742507] _raw_spin_lock_irqsave+0x3a/0x50 [ 2.742512] ? igc_ptp_reset+0x35/0xf0 [igc] [ 2.742515] igc_ptp_reset+0x35/0xf0 [igc] [ 2.742519] igc_reset+0x96/0xd0 [igc] [ 2.744148] igc_probe+0x68f/0x7d0 [igc] [ 2.745796] local_pci_probe+0x3d/0x70 [ 2.745799] pci_device_probe+0xd1/0x190 [ 2.745802] really_probe+0x15a/0x3f0 [ 2.759936] driver_probe_device+0xe1/0x150 [ 2.759937] device_driver_attach+0xa8/0xb0 [ 2.761786] __driver_attach+0x89/0x150 [ 2.761786] ? device_driver_attach+0xb0/0xb0 [ 2.761787] ? device_driver_attach+0xb0/0xb0 [ 2.761788] bus_for_each_dev+0x66/0x90 [ 2.765012] bus_add_driver+0x12e/0x1f0 [ 2.765716] driver_register+0x8b/0xe0 [ 2.766418] ? 0xffffffffc0230000 [ 2.767119] do_one_initcall+0x5a/0x310 [ 2.767826] ? kmem_cache_alloc_trace+0xe9/0x200 [ 2.768528] do_init_module+0x5c/0x260 [ 2.769206] __do_sys_finit_module+0x93/0xe0 [ 2.770048] do_syscall_64+0x46/0xa0 [ 2.770716] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 2.771396] RIP: 0033:0x7f83534589e0 [ 2.772073] Code: 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 2e 2e 2e 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 80 24 0d 00 f7 d8 64 89 01 48 [ 2.772074] RSP: 002b:00007ffd31d0ed18 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 2.774854] RAX: ffffffffffffffda RBX: 000055d52816aba0 RCX: 00007f83534589e0 [ 2.774855] RDX: 0000000000000000 RSI: 00007f83535b982f RDI: 0000000000000006 [ 2.774855] RBP: 00007ffd31d0ed60 R08: 0000000000000000 R09: 00007ffd31d0ed30 [ 2.774856] R10: 0000000000000006 R11: 0000000000000246 R12: 0000000000000000 [ 2.774856] R13: 0000000000020000 R14: 00007f83535b982f R15: 000055d527f5e120 Fixes: 5f2958052c58 ("igc: Add basic skeleton for PTP") Signed-off-by: Vinicius Costa Gomes Reviewed-by: Andre Guedes Tested-by: Aaron Brown Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 5 ++--- drivers/net/ethernet/intel/igc/igc_ptp.c | 2 -- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 7a6f2a0d413f..9593aa4eea36 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -5142,6 +5142,8 @@ static int igc_probe(struct pci_dev *pdev, device_set_wakeup_enable(&adapter->pdev->dev, adapter->flags & IGC_FLAG_WOL_SUPPORTED); + igc_ptp_init(adapter); + /* reset the hardware with the new settings */ igc_reset(adapter); @@ -5158,9 +5160,6 @@ static int igc_probe(struct pci_dev *pdev, /* carrier off reporting is important to ethtool even BEFORE open */ netif_carrier_off(netdev); - /* do hw tstamp init after resetting */ - igc_ptp_init(adapter); - /* Check if Media Autosense is enabled */ adapter->ei = *ei; diff --git a/drivers/net/ethernet/intel/igc/igc_ptp.c b/drivers/net/ethernet/intel/igc/igc_ptp.c index e67d4655b47e..36c999250fcc 100644 --- a/drivers/net/ethernet/intel/igc/igc_ptp.c +++ b/drivers/net/ethernet/intel/igc/igc_ptp.c @@ -496,8 +496,6 @@ void igc_ptp_init(struct igc_adapter *adapter) adapter->tstamp_config.rx_filter = HWTSTAMP_FILTER_NONE; adapter->tstamp_config.tx_type = HWTSTAMP_TX_OFF; - igc_ptp_reset(adapter); - adapter->ptp_clock = ptp_clock_register(&adapter->ptp_caps, &adapter->pdev->dev); if (IS_ERR(adapter->ptp_clock)) { From 068885434ccb20542e0d759aebbefe7a6724d85f Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Fri, 14 Aug 2020 13:26:22 +0100 Subject: [PATCH 39/75] sfc: check hash is valid before using it On EF100, the RX hash field in the packet prefix may not be valid (e.g. if the header parse failed), and this is indicated by a one-bit flag elsewhere in the packet prefix. Only call skb_set_hash() if the RSS_HASH_VALID bit is set. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/ef100_rx.c | 5 +++++ drivers/net/ethernet/sfc/ef100_rx.h | 1 + drivers/net/ethernet/sfc/efx.h | 8 ++++++++ drivers/net/ethernet/sfc/net_driver.h | 2 ++ drivers/net/ethernet/sfc/rx_common.c | 3 ++- 5 files changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/ef100_rx.c b/drivers/net/ethernet/sfc/ef100_rx.c index 13ba1a4f66fc..012925e878f4 100644 --- a/drivers/net/ethernet/sfc/ef100_rx.c +++ b/drivers/net/ethernet/sfc/ef100_rx.c @@ -31,6 +31,11 @@ #define ESF_GZ_RX_PREFIX_NT_OR_INNER_L3_CLASS_WIDTH \ ESF_GZ_RX_PREFIX_HCLASS_NT_OR_INNER_L3_CLASS_WIDTH +bool ef100_rx_buf_hash_valid(const u8 *prefix) +{ + return PREFIX_FIELD(prefix, RSS_HASH_VALID); +} + static bool check_fcs(struct efx_channel *channel, u32 *prefix) { u16 rxclass; diff --git a/drivers/net/ethernet/sfc/ef100_rx.h b/drivers/net/ethernet/sfc/ef100_rx.h index f2f266863966..fe45b36458d1 100644 --- a/drivers/net/ethernet/sfc/ef100_rx.h +++ b/drivers/net/ethernet/sfc/ef100_rx.h @@ -14,6 +14,7 @@ #include "net_driver.h" +bool ef100_rx_buf_hash_valid(const u8 *prefix); void efx_ef100_ev_rx(struct efx_channel *channel, const efx_qword_t *p_event); void ef100_rx_write(struct efx_rx_queue *rx_queue); void __ef100_rx_packet(struct efx_channel *channel); diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h index a9808e86068d..daf0c00c1242 100644 --- a/drivers/net/ethernet/sfc/efx.h +++ b/drivers/net/ethernet/sfc/efx.h @@ -45,6 +45,14 @@ static inline void efx_rx_flush_packet(struct efx_channel *channel) __ef100_rx_packet, __efx_rx_packet, channel); } +static inline bool efx_rx_buf_hash_valid(struct efx_nic *efx, const u8 *prefix) +{ + if (efx->type->rx_buf_hash_valid) + return INDIRECT_CALL_1(efx->type->rx_buf_hash_valid, + ef100_rx_buf_hash_valid, + prefix); + return true; +} /* Maximum number of TCP segments we support for soft-TSO */ #define EFX_TSO_MAX_SEGS 100 diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h index 7bb7ecb480ae..dcb741d8bd11 100644 --- a/drivers/net/ethernet/sfc/net_driver.h +++ b/drivers/net/ethernet/sfc/net_driver.h @@ -1265,6 +1265,7 @@ struct efx_udp_tunnel { * @rx_write: Write RX descriptors and doorbell * @rx_defer_refill: Generate a refill reminder event * @rx_packet: Receive the queued RX buffer on a channel + * @rx_buf_hash_valid: Determine whether the RX prefix contains a valid hash * @ev_probe: Allocate resources for event queue * @ev_init: Initialise event queue on the NIC * @ev_fini: Deinitialise event queue on the NIC @@ -1409,6 +1410,7 @@ struct efx_nic_type { void (*rx_write)(struct efx_rx_queue *rx_queue); void (*rx_defer_refill)(struct efx_rx_queue *rx_queue); void (*rx_packet)(struct efx_channel *channel); + bool (*rx_buf_hash_valid)(const u8 *prefix); int (*ev_probe)(struct efx_channel *channel); int (*ev_init)(struct efx_channel *channel); void (*ev_fini)(struct efx_channel *channel); diff --git a/drivers/net/ethernet/sfc/rx_common.c b/drivers/net/ethernet/sfc/rx_common.c index fb77c7bbe4af..ef9bca92b0b7 100644 --- a/drivers/net/ethernet/sfc/rx_common.c +++ b/drivers/net/ethernet/sfc/rx_common.c @@ -525,7 +525,8 @@ efx_rx_packet_gro(struct efx_channel *channel, struct efx_rx_buffer *rx_buf, return; } - if (efx->net_dev->features & NETIF_F_RXHASH) + if (efx->net_dev->features & NETIF_F_RXHASH && + efx_rx_buf_hash_valid(efx, eh)) skb_set_hash(skb, efx_rx_buf_hash(efx, eh), PKT_HASH_TYPE_L3); if (csum) { From 35759383133f64d90eba120a0d3efe8f71241650 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 14 Aug 2020 15:56:34 +0200 Subject: [PATCH 40/75] mptcp: sendmsg: reset iter on error Once we've copied data from the iterator we need to revert in case we end up not sending any data. This bug doesn't trigger with normal 'poll' based tests, because we only feed a small chunk of data to kernel after poll indicated POLLOUT. With blocking IO and large writes this triggers. Receiver ends up with less data than it should get. Fixes: 72511aab95c94d ("mptcp: avoid blocking in tcp_sendpages") Signed-off-by: Florian Westphal Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 8c1d1a595701..c84b4051c2a4 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -725,8 +725,10 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, if (!psize) return -EINVAL; - if (!sk_wmem_schedule(sk, psize + dfrag->overhead)) + if (!sk_wmem_schedule(sk, psize + dfrag->overhead)) { + iov_iter_revert(&msg->msg_iter, psize); return -ENOMEM; + } } else { offset = dfrag->offset; psize = min_t(size_t, dfrag->data_len, avail_size); @@ -737,8 +739,10 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, */ ret = do_tcp_sendpages(ssk, page, offset, psize, msg->msg_flags | MSG_SENDPAGE_NOTLAST | MSG_DONTWAIT); - if (ret <= 0) + if (ret <= 0) { + iov_iter_revert(&msg->msg_iter, psize); return ret; + } frag_truesize += ret; if (!retransmission) { From 4bd5e02a2ed1575c2f65bd3c557a077dd399f0e8 Mon Sep 17 00:00:00 2001 From: Przemyslaw Patynowski Date: Thu, 6 Aug 2020 13:40:59 +0000 Subject: [PATCH 41/75] i40e: Set RX_ONLY mode for unicast promiscuous on VLAN Trusted VF with unicast promiscuous mode set, could listen to TX traffic of other VFs. Set unicast promiscuous mode to RX traffic, if VSI has port VLAN configured. Rename misleading I40E_AQC_SET_VSI_PROMISC_TX bit to I40E_AQC_SET_VSI_PROMISC_RX_ONLY. Aligned unicast promiscuous with VLAN to the one without VLAN. Fixes: 6c41a7606967 ("i40e: Add promiscuous on VLAN support") Fixes: 3b1200891b7f ("i40e: When in promisc mode apply promisc mode to Tx Traffic as well") Signed-off-by: Przemyslaw Patynowski Signed-off-by: Aleksandr Loktionov Signed-off-by: Arkadiusz Kubalewski Tested-by: Andrew Bowers Signed-off-by: Tony Nguyen --- .../net/ethernet/intel/i40e/i40e_adminq_cmd.h | 2 +- drivers/net/ethernet/intel/i40e/i40e_common.c | 35 ++++++++++++++----- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h index a62ddd626929..c0c8efe42fce 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h @@ -981,7 +981,7 @@ struct i40e_aqc_set_vsi_promiscuous_modes { #define I40E_AQC_SET_VSI_PROMISC_BROADCAST 0x04 #define I40E_AQC_SET_VSI_DEFAULT 0x08 #define I40E_AQC_SET_VSI_PROMISC_VLAN 0x10 -#define I40E_AQC_SET_VSI_PROMISC_TX 0x8000 +#define I40E_AQC_SET_VSI_PROMISC_RX_ONLY 0x8000 __le16 seid; __le16 vlan_tag; #define I40E_AQC_SET_VSI_VLAN_VALID 0x8000 diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index afad5e9f80e0..6ab52cbd697a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -1966,6 +1966,21 @@ i40e_status i40e_aq_set_phy_debug(struct i40e_hw *hw, u8 cmd_flags, return status; } +/** + * i40e_is_aq_api_ver_ge + * @aq: pointer to AdminQ info containing HW API version to compare + * @maj: API major value + * @min: API minor value + * + * Assert whether current HW API version is greater/equal than provided. + **/ +static bool i40e_is_aq_api_ver_ge(struct i40e_adminq_info *aq, u16 maj, + u16 min) +{ + return (aq->api_maj_ver > maj || + (aq->api_maj_ver == maj && aq->api_min_ver >= min)); +} + /** * i40e_aq_add_vsi * @hw: pointer to the hw struct @@ -2091,18 +2106,16 @@ i40e_status i40e_aq_set_vsi_unicast_promiscuous(struct i40e_hw *hw, if (set) { flags |= I40E_AQC_SET_VSI_PROMISC_UNICAST; - if (rx_only_promisc && - (((hw->aq.api_maj_ver == 1) && (hw->aq.api_min_ver >= 5)) || - (hw->aq.api_maj_ver > 1))) - flags |= I40E_AQC_SET_VSI_PROMISC_TX; + if (rx_only_promisc && i40e_is_aq_api_ver_ge(&hw->aq, 1, 5)) + flags |= I40E_AQC_SET_VSI_PROMISC_RX_ONLY; } cmd->promiscuous_flags = cpu_to_le16(flags); cmd->valid_flags = cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_UNICAST); - if (((hw->aq.api_maj_ver >= 1) && (hw->aq.api_min_ver >= 5)) || - (hw->aq.api_maj_ver > 1)) - cmd->valid_flags |= cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_TX); + if (i40e_is_aq_api_ver_ge(&hw->aq, 1, 5)) + cmd->valid_flags |= + cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_RX_ONLY); cmd->seid = cpu_to_le16(seid); status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); @@ -2199,11 +2212,17 @@ enum i40e_status_code i40e_aq_set_vsi_uc_promisc_on_vlan(struct i40e_hw *hw, i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_set_vsi_promiscuous_modes); - if (enable) + if (enable) { flags |= I40E_AQC_SET_VSI_PROMISC_UNICAST; + if (i40e_is_aq_api_ver_ge(&hw->aq, 1, 5)) + flags |= I40E_AQC_SET_VSI_PROMISC_RX_ONLY; + } cmd->promiscuous_flags = cpu_to_le16(flags); cmd->valid_flags = cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_UNICAST); + if (i40e_is_aq_api_ver_ge(&hw->aq, 1, 5)) + cmd->valid_flags |= + cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_RX_ONLY); cmd->seid = cpu_to_le16(seid); cmd->vlan_tag = cpu_to_le16(vid | I40E_AQC_SET_VSI_VLAN_VALID); From 5b6d4a7f20b09c47ca598760f6dafd554af8b6d5 Mon Sep 17 00:00:00 2001 From: Grzegorz Szczurek Date: Tue, 11 Aug 2020 10:56:49 +0000 Subject: [PATCH 42/75] i40e: Fix crash during removing i40e driver Fix the reason of crashing system by add waiting time to finish reset recovery process before starting remove driver procedure. Now VSI is releasing if VSI is not in reset recovery mode. Without this fix it was possible to start remove driver if other processing command need reset recovery procedure which resulted in null pointer dereference. VSI used by the ethtool process has been cleared by remove driver process. [ 6731.508665] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 6731.508668] #PF: supervisor read access in kernel mode [ 6731.508670] #PF: error_code(0x0000) - not-present page [ 6731.508671] PGD 0 P4D 0 [ 6731.508674] Oops: 0000 [#1] SMP PTI [ 6731.508679] Hardware name: Intel Corporation S2600WT2R/S2600WT2R, BIOS SE5C610.86B.01.01.0021.032120170601 03/21/2017 [ 6731.508694] RIP: 0010:i40e_down+0x252/0x310 [i40e] [ 6731.508696] Code: c7 78 de fa c0 e8 61 02 3a c1 66 83 bb f6 0c 00 00 00 0f 84 bf 00 00 00 45 31 e4 45 31 ff eb 03 41 89 c7 48 8b 83 98 0c 00 00 <4a> 8b 3c 20 e8 a5 79 02 00 48 83 bb d0 0c 00 00 00 74 10 48 8b 83 [ 6731.508698] RSP: 0018:ffffb75ac7b3faf0 EFLAGS: 00010246 [ 6731.508700] RAX: 0000000000000000 RBX: ffff9c9874bd5000 RCX: 0000000000000007 [ 6731.508701] RDX: 0000000000000000 RSI: 0000000000000096 RDI: ffff9c987f4d9780 [ 6731.508703] RBP: ffffb75ac7b3fb30 R08: 0000000000005b60 R09: 0000000000000004 [ 6731.508704] R10: ffffb75ac64fbd90 R11: 0000000000000001 R12: 0000000000000000 [ 6731.508706] R13: ffff9c97a08e0000 R14: ffff9c97a08e0a68 R15: 0000000000000000 [ 6731.508708] FS: 00007f2617cd2740(0000) GS:ffff9c987f4c0000(0000) knlGS:0000000000000000 [ 6731.508710] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 6731.508711] CR2: 0000000000000000 CR3: 0000001e765c4006 CR4: 00000000003606e0 [ 6731.508713] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 6731.508714] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 6731.508715] Call Trace: [ 6731.508734] i40e_vsi_close+0x84/0x90 [i40e] [ 6731.508742] i40e_quiesce_vsi.part.98+0x3c/0x40 [i40e] [ 6731.508749] i40e_pf_quiesce_all_vsi+0x55/0x60 [i40e] [ 6731.508757] i40e_prep_for_reset+0x59/0x130 [i40e] [ 6731.508765] i40e_reconfig_rss_queues+0x5a/0x120 [i40e] [ 6731.508774] i40e_set_channels+0xda/0x170 [i40e] [ 6731.508778] ethtool_set_channels+0xe9/0x150 [ 6731.508781] dev_ethtool+0x1b94/0x2920 [ 6731.508805] dev_ioctl+0xc2/0x590 [ 6731.508811] sock_do_ioctl+0xae/0x150 [ 6731.508813] sock_ioctl+0x34f/0x3c0 [ 6731.508821] ksys_ioctl+0x98/0xb0 [ 6731.508828] __x64_sys_ioctl+0x1a/0x20 [ 6731.508831] do_syscall_64+0x57/0x1c0 [ 6731.508835] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 4b8164467b85 ("i40e: Add common function for finding VSI by type") Signed-off-by: Grzegorz Szczurek Signed-off-by: Arkadiusz Kubalewski Tested-by: Aaron Brown Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index b5399357a667..2e433fdbf2c3 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -15463,6 +15463,9 @@ static void i40e_remove(struct pci_dev *pdev) i40e_write_rx_ctl(hw, I40E_PFQF_HENA(0), 0); i40e_write_rx_ctl(hw, I40E_PFQF_HENA(1), 0); + while (test_bit(__I40E_RESET_RECOVERY_PENDING, pf->state)) + usleep_range(1000, 2000); + /* no more scheduling of any task */ set_bit(__I40E_SUSPENDED, pf->state); set_bit(__I40E_DOWN, pf->state); From b07e2a86133989cbcb17a9a9a86788f5ec34d846 Mon Sep 17 00:00:00 2001 From: Nivedita Singhvi Date: Thu, 13 Aug 2020 10:40:05 +0530 Subject: [PATCH 43/75] docs: networking: bonding.rst resources section cleanup Removed obsolete resources from bonding.rst doc: - bonding-devel@lists.sourceforge.net hasn't been used since 2008 - admin interface is 404 - Donald Becker's domain/content no longer online Signed-off-by: Nivedita Singhvi Acked-by: Jay Vosburgh Signed-off-by: David S. Miller --- Documentation/networking/bonding.rst | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst index 24168b0d16bd..adc314639085 100644 --- a/Documentation/networking/bonding.rst +++ b/Documentation/networking/bonding.rst @@ -2860,17 +2860,6 @@ version of the linux kernel, found on http://kernel.org The latest version of this document can be found in the latest kernel source (named Documentation/networking/bonding.rst). -Discussions regarding the usage of the bonding driver take place on the -bonding-devel mailing list, hosted at sourceforge.net. If you have questions or -problems, post them to the list. The list address is: - -bonding-devel@lists.sourceforge.net - -The administrative interface (to subscribe or unsubscribe) can -be found at: - -https://lists.sourceforge.net/lists/listinfo/bonding-devel - Discussions regarding the development of the bonding driver take place on the main Linux network mailing list, hosted at vger.kernel.org. The list address is: @@ -2881,10 +2870,3 @@ The administrative interface (to subscribe or unsubscribe) can be found at: http://vger.kernel.org/vger-lists.html#netdev - -Donald Becker's Ethernet Drivers and diag programs may be found at : - - - http://web.archive.org/web/%2E/http://www.scyld.com/network/ - -You will also find a lot of information regarding Ethernet, NWay, MII, -etc. at www.scyld.com. From c6165cf0dbb82ded90163dce3ac183fc7a913dc4 Mon Sep 17 00:00:00 2001 From: Fugang Duan Date: Thu, 13 Aug 2020 15:13:14 +0800 Subject: [PATCH 44/75] net: fec: correct the error path for regulator disable in probe Correct the error path for regulator disable. Fixes: 9269e5560b26 ("net: fec: add phy-reset-gpios PROBE_DEFER check") Signed-off-by: Fugang Duan Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 9934421814b4..fb37816a74db 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -3715,11 +3715,11 @@ failed_mii_init: failed_irq: failed_init: fec_ptp_stop(pdev); - if (fep->reg_phy) - regulator_disable(fep->reg_phy); failed_reset: pm_runtime_put_noidle(&pdev->dev); pm_runtime_disable(&pdev->dev); + if (fep->reg_phy) + regulator_disable(fep->reg_phy); failed_regulator: clk_disable_unprepare(fep->clk_ahb); failed_clk_ahb: From 4ca0d9ac3fd8f9f90b72a15d8da2aca3ffb58418 Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Thu, 13 Aug 2020 10:09:00 -0400 Subject: [PATCH 45/75] bonding: show saner speed for broadcast mode Broadcast mode bonds transmit a copy of all traffic simultaneously out of all interfaces, so the "speed" of the bond isn't really the aggregate of all interfaces, but rather, the speed of the slowest active interface. Also, the type of the speed field is u32, not unsigned long, so adjust that accordingly, as required to make min() function here without complaining about mismatching types. Fixes: bb5b052f751b ("bond: add support to read speed and duplex via ethtool") CC: Jay Vosburgh CC: Veaceslav Falico CC: Andy Gospodarek CC: "David S. Miller" CC: netdev@vger.kernel.org Acked-by: Jay Vosburgh Signed-off-by: Jarod Wilson Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 5ad43aaf76e5..c853ca67058c 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4552,13 +4552,23 @@ static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev) return ret; } +static u32 bond_mode_bcast_speed(struct slave *slave, u32 speed) +{ + if (speed == 0 || speed == SPEED_UNKNOWN) + speed = slave->speed; + else + speed = min(speed, slave->speed); + + return speed; +} + static int bond_ethtool_get_link_ksettings(struct net_device *bond_dev, struct ethtool_link_ksettings *cmd) { struct bonding *bond = netdev_priv(bond_dev); - unsigned long speed = 0; struct list_head *iter; struct slave *slave; + u32 speed = 0; cmd->base.duplex = DUPLEX_UNKNOWN; cmd->base.port = PORT_OTHER; @@ -4570,8 +4580,13 @@ static int bond_ethtool_get_link_ksettings(struct net_device *bond_dev, */ bond_for_each_slave(bond, slave, iter) { if (bond_slave_can_tx(slave)) { - if (slave->speed != SPEED_UNKNOWN) - speed += slave->speed; + if (slave->speed != SPEED_UNKNOWN) { + if (BOND_MODE(bond) == BOND_MODE_BROADCAST) + speed = bond_mode_bcast_speed(slave, + speed); + else + speed += slave->speed; + } if (cmd->base.duplex == DUPLEX_UNKNOWN && slave->duplex != DUPLEX_UNKNOWN) cmd->base.duplex = slave->duplex; From 77b981c82c1df7c7ad32a046f17f007450b46954 Mon Sep 17 00:00:00 2001 From: Xie He Date: Thu, 13 Aug 2020 11:17:04 -0700 Subject: [PATCH 46/75] drivers/net/wan/hdlc_x25: Added needed_headroom and a skb->len check 1. Added a skb->len check This driver expects upper layers to include a pseudo header of 1 byte when passing down a skb for transmission. This driver will read this 1-byte header. This patch added a skb->len check before reading the header to make sure the header exists. 2. Added needed_headroom and set hard_header_len to 0 When this driver transmits data, first this driver will remove a pseudo header of 1 byte, then the lapb module will prepend the LAPB header of 2 or 3 bytes. So the value of needed_headroom in this driver should be 3 - 1. Because this driver has no header_ops, according to the logic of af_packet.c, the value of hard_header_len should be 0. Reason of setting needed_headroom and hard_header_len at this place: This driver is written using the API of the hdlc module, the hdlc module enables this driver (the protocol driver) to run on any hardware that has a driver (the hardware driver) written using the API of the hdlc module. Two other hdlc protocol drivers - hdlc_ppp and hdlc_raw_eth, also set things like hard_header_len at this place. In hdlc_ppp, it sets hard_header_len after attach_hdlc_protocol and before setting dev->type. In hdlc_raw_eth, it sets hard_header_len by calling ether_setup after attach_hdlc_protocol and after memcpy the settings. 3. Reset needed_headroom when detaching protocols (in hdlc.c) When detaching a protocol from a hardware device, the hdlc module will reset various parameters of the device (including hard_header_len) to the default values. We add needed_headroom here so that needed_headroom will also be reset. Cc: Willem de Bruijn Cc: Martin Schiller Cc: Andrew Hendry Signed-off-by: Xie He Signed-off-by: David S. Miller --- drivers/net/wan/hdlc.c | 1 + drivers/net/wan/hdlc_x25.c | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/net/wan/hdlc.c b/drivers/net/wan/hdlc.c index dfc16770458d..386ed2aa31fd 100644 --- a/drivers/net/wan/hdlc.c +++ b/drivers/net/wan/hdlc.c @@ -230,6 +230,7 @@ static void hdlc_setup_dev(struct net_device *dev) dev->max_mtu = HDLC_MAX_MTU; dev->type = ARPHRD_RAWHDLC; dev->hard_header_len = 16; + dev->needed_headroom = 0; dev->addr_len = 0; dev->header_ops = &hdlc_null_ops; } diff --git a/drivers/net/wan/hdlc_x25.c b/drivers/net/wan/hdlc_x25.c index f70336bb6f52..f52b9fed0593 100644 --- a/drivers/net/wan/hdlc_x25.c +++ b/drivers/net/wan/hdlc_x25.c @@ -107,8 +107,14 @@ static netdev_tx_t x25_xmit(struct sk_buff *skb, struct net_device *dev) { int result; + /* There should be a pseudo header of 1 byte added by upper layers. + * Check to make sure it is there before reading it. + */ + if (skb->len < 1) { + kfree_skb(skb); + return NETDEV_TX_OK; + } - /* X.25 to LAPB */ switch (skb->data[0]) { case X25_IFACE_DATA: /* Data to be transmitted */ skb_pull(skb, 1); @@ -294,6 +300,15 @@ static int x25_ioctl(struct net_device *dev, struct ifreq *ifr) return result; memcpy(&state(hdlc)->settings, &new_settings, size); + + /* There's no header_ops so hard_header_len should be 0. */ + dev->hard_header_len = 0; + /* When transmitting data: + * first we'll remove a pseudo header of 1 byte, + * then we'll prepend an LAPB header of at most 3 bytes. + */ + dev->needed_headroom = 3 - 1; + dev->type = ARPHRD_X25; call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, dev); netif_dormant_off(dev); From a35e5478779c2886a68beb233441eabcd2883d6c Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 14 Aug 2020 12:39:04 +0100 Subject: [PATCH 47/75] net: bonding: bond_3ad: Fix a bunch of kerneldoc parameter issues Renames and missing descriptions. Fixes the following W=1 kernel build warning(s): drivers/net/bonding/bond_3ad.c:140: warning: Function parameter or member 'port' not described in '__get_first_agg' drivers/net/bonding/bond_3ad.c:140: warning: Excess function parameter 'bond' description in '__get_first_agg' drivers/net/bonding/bond_3ad.c:1655: warning: Function parameter or member 'agg' not described in 'ad_agg_selection_logic' drivers/net/bonding/bond_3ad.c:1655: warning: Excess function parameter 'aggregator' description in 'ad_agg_selection_logic' drivers/net/bonding/bond_3ad.c:1817: warning: Function parameter or member 'port' not described in 'ad_initialize_port' drivers/net/bonding/bond_3ad.c:1817: warning: Excess function parameter 'aggregator' description in 'ad_initialize_port' drivers/net/bonding/bond_3ad.c:1976: warning: Function parameter or member 'timeout' not described in 'bond_3ad_initiate_agg_selection' drivers/net/bonding/bond_3ad.c:2274: warning: Function parameter or member 'work' not described in 'bond_3ad_state_machine_handler' drivers/net/bonding/bond_3ad.c:2274: warning: Excess function parameter 'bond' description in 'bond_3ad_state_machine_handler' drivers/net/bonding/bond_3ad.c:2508: warning: Function parameter or member 'link' not described in 'bond_3ad_handle_link_change' drivers/net/bonding/bond_3ad.c:2508: warning: Excess function parameter 'status' description in 'bond_3ad_handle_link_change' drivers/net/bonding/bond_3ad.c:2566: warning: Function parameter or member 'bond' not described in 'bond_3ad_set_carrier' drivers/net/bonding/bond_3ad.c:2677: warning: Function parameter or member 'bond' not described in 'bond_3ad_update_lacp_rate' drivers/net/bonding/bond_3ad.c:1655: warning: Function parameter or member 'agg' not described in 'ad_agg_selection_logic' drivers/net/bonding/bond_3ad.c:1655: warning: Excess function parameter 'aggregator' description in 'ad_agg_selection_logic' drivers/net/bonding/bond_3ad.c:1817: warning: Function parameter or member 'port' not described in 'ad_initialize_port' drivers/net/bonding/bond_3ad.c:1817: warning: Excess function parameter 'aggregator' description in 'ad_initialize_port' drivers/net/bonding/bond_3ad.c:1976: warning: Function parameter or member 'timeout' not described in 'bond_3ad_initiate_agg_selection' drivers/net/bonding/bond_3ad.c:2274: warning: Function parameter or member 'work' not described in 'bond_3ad_state_machine_handler' drivers/net/bonding/bond_3ad.c:2274: warning: Excess function parameter 'bond' description in 'bond_3ad_state_machine_handler' drivers/net/bonding/bond_3ad.c:2508: warning: Function parameter or member 'link' not described in 'bond_3ad_handle_link_change' drivers/net/bonding/bond_3ad.c:2508: warning: Excess function parameter 'status' description in 'bond_3ad_handle_link_change' drivers/net/bonding/bond_3ad.c:2566: warning: Function parameter or member 'bond' not described in 'bond_3ad_set_carrier' drivers/net/bonding/bond_3ad.c:2677: warning: Function parameter or member 'bond' not described in 'bond_3ad_update_lacp_rate' Cc: Jay Vosburgh Cc: Veaceslav Falico Cc: Andy Gospodarek Cc: "David S. Miller" Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Signed-off-by: Lee Jones Signed-off-by: David S. Miller --- drivers/net/bonding/bond_3ad.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 31e43a2197a3..cddaa43a9d52 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -130,7 +130,7 @@ static inline struct bonding *__get_bond_by_port(struct port *port) /** * __get_first_agg - get the first aggregator in the bond - * @bond: the bond we're looking at + * @port: the port we're looking at * * Return the aggregator of the first slave in @bond, or %NULL if it can't be * found. @@ -1626,7 +1626,7 @@ static int agg_device_up(const struct aggregator *agg) /** * ad_agg_selection_logic - select an aggregation group for a team - * @aggregator: the aggregator we're looking at + * @agg: the aggregator we're looking at * @update_slave_arr: Does slave array need update? * * It is assumed that only one aggregator may be selected for a team. @@ -1810,7 +1810,7 @@ static void ad_initialize_agg(struct aggregator *aggregator) /** * ad_initialize_port - initialize a given port's parameters - * @aggregator: the aggregator we're looking at + * @port: the port we're looking at * @lacp_fast: boolean. whether fast periodic should be used */ static void ad_initialize_port(struct port *port, int lacp_fast) @@ -1967,6 +1967,7 @@ static void ad_marker_response_received(struct bond_marker *marker, /** * bond_3ad_initiate_agg_selection - initate aggregator selection * @bond: bonding struct + * @timeout: timeout value to set * * Set the aggregation selection timer, to initiate an agg selection in * the very near future. Called during first initialization, and during @@ -2259,7 +2260,7 @@ void bond_3ad_update_ad_actor_settings(struct bonding *bond) /** * bond_3ad_state_machine_handler - handle state machines timeout - * @bond: bonding struct to work on + * @work: work context to fetch bonding struct to work on from * * The state machine handling concept in this module is to check every tick * which state machine should operate any function. The execution order is @@ -2500,7 +2501,7 @@ void bond_3ad_adapter_speed_duplex_changed(struct slave *slave) /** * bond_3ad_handle_link_change - handle a slave's link status change indication * @slave: slave struct to work on - * @status: whether the link is now up or down + * @link: whether the link is now up or down * * Handle reselection of aggregator (if needed) for this port. */ @@ -2551,7 +2552,7 @@ void bond_3ad_handle_link_change(struct slave *slave, char link) /** * bond_3ad_set_carrier - set link state for bonding master - * @bond - bonding structure + * @bond: bonding structure * * if we have an active aggregator, we're up, if not, we're down. * Presumes that we cannot have an active aggregator if there are @@ -2664,7 +2665,7 @@ int bond_3ad_lacpdu_recv(const struct sk_buff *skb, struct bonding *bond, /** * bond_3ad_update_lacp_rate - change the lacp rate - * @bond - bonding struct + * @bond: bonding struct * * When modify lacp_rate parameter via sysfs, * update actor_oper_port_state of each port. From 45a1553bd34abc17c3766d384a6e1500f7551a30 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 14 Aug 2020 12:39:05 +0100 Subject: [PATCH 48/75] net: bonding: bond_main: Document 'proto' and rename 'new_active' parameters Fixes the following W=1 kernel build warning(s): drivers/net/bonding/bond_main.c:329: warning: Function parameter or member 'proto' not described in 'bond_vlan_rx_add_vid' drivers/net/bonding/bond_main.c:362: warning: Function parameter or member 'proto' not described in 'bond_vlan_rx_kill_vid' drivers/net/bonding/bond_main.c:964: warning: Function parameter or member 'new_active' not described in 'bond_change_active_slave' drivers/net/bonding/bond_main.c:964: warning: Excess function parameter 'new' description in 'bond_change_active_slave' Cc: Jay Vosburgh Cc: Veaceslav Falico Cc: Andy Gospodarek Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Thomas Davis Cc: netdev@vger.kernel.org Signed-off-by: Lee Jones Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index c853ca67058c..4239cdcfd798 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -322,6 +322,7 @@ netdev_tx_t bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, /** * bond_vlan_rx_add_vid - Propagates adding an id to slaves * @bond_dev: bonding net device that got called + * @proto: network protocol ID * @vid: vlan id being added */ static int bond_vlan_rx_add_vid(struct net_device *bond_dev, @@ -355,6 +356,7 @@ unwind: /** * bond_vlan_rx_kill_vid - Propagates deleting an id to slaves * @bond_dev: bonding net device that got called + * @proto: network protocol ID * @vid: vlan id being removed */ static int bond_vlan_rx_kill_vid(struct net_device *bond_dev, @@ -948,7 +950,7 @@ static bool bond_should_notify_peers(struct bonding *bond) /** * change_active_interface - change the active slave into the specified one * @bond: our bonding struct - * @new: the new slave to make the active one + * @new_active: the new slave to make the active one * * Set the new slave to the bond's settings and unset them on the old * curr_active_slave. From 2083bebca7c08598ac15ac20a9ba1cc3338fc4d6 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 14 Aug 2020 12:39:06 +0100 Subject: [PATCH 49/75] net: ethernet: 3com: 3c574_cs: Remove set but unused variables 'tx' and 'rx' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the following W=1 kernel build warning(s): drivers/net/ethernet/3com/3c574_cs.c: In function ‘update_stats’: drivers/net/ethernet/3com/3c574_cs.c:954:9: warning: variable ‘tx’ set but not used [-Wunused-but-set-variable] 954 | u8 rx, tx, up; | ^~ drivers/net/ethernet/3com/3c574_cs.c:954:5: warning: variable ‘rx’ set but not used [-Wunused-but-set-variable] 954 | u8 rx, tx, up; | ^~ Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Shannon Nelson Cc: Heiner Kallweit Cc: Martin Habets Cc: "Michael S. Tsirkin" Cc: Donald Becker Cc: David Hinds Cc: netdev@vger.kernel.org Signed-off-by: Lee Jones Signed-off-by: David S. Miller --- drivers/net/ethernet/3com/3c574_cs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/3com/3c574_cs.c b/drivers/net/ethernet/3com/3c574_cs.c index ef1c3151fbb2..bd0ada4e81b0 100644 --- a/drivers/net/ethernet/3com/3c574_cs.c +++ b/drivers/net/ethernet/3com/3c574_cs.c @@ -951,7 +951,7 @@ static struct net_device_stats *el3_get_stats(struct net_device *dev) static void update_stats(struct net_device *dev) { unsigned int ioaddr = dev->base_addr; - u8 rx, tx, up; + u8 up; pr_debug("%s: updating the statistics.\n", dev->name); @@ -972,8 +972,8 @@ static void update_stats(struct net_device *dev) dev->stats.tx_packets += (up&0x30) << 4; /* Rx packets */ inb(ioaddr + 7); /* Tx deferrals */ inb(ioaddr + 8); - rx = inw(ioaddr + 10); - tx = inw(ioaddr + 12); + /* rx */ inw(ioaddr + 10); + /* tx */ inw(ioaddr + 12); EL3WINDOW(4); /* BadSSD */ inb(ioaddr + 12); From f6e81b890b29336ff5c5777b0b24c23b631213b2 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 14 Aug 2020 12:39:07 +0100 Subject: [PATCH 50/75] net: bonding: bond_alb: Describe alb_handle_addr_collision_on_attach()'s 'bond' and 'addr' params Fixes the following W=1 kernel build warning(s): drivers/net/bonding/bond_alb.c:1222: warning: Function parameter or member 'bond' not described in 'alb_set_mac_address' Cc: Jay Vosburgh Cc: Veaceslav Falico Cc: Andy Gospodarek Cc: "David S. Miller" Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Signed-off-by: Lee Jones Signed-off-by: David S. Miller --- drivers/net/bonding/bond_alb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c index 095ea51d1853..4e1b7deb724b 100644 --- a/drivers/net/bonding/bond_alb.c +++ b/drivers/net/bonding/bond_alb.c @@ -1206,8 +1206,8 @@ static int alb_handle_addr_collision_on_attach(struct bonding *bond, struct slav /** * alb_set_mac_address - * @bond: - * @addr: + * @bond: bonding we're working on + * @addr: MAC address to set * * In TLB mode all slaves are configured to the bond's hw address, but set * their dev_addr field to different addresses (based on their permanent hw From fd29aeeec5a1af1a97a42ec2ba307bffdbc563e8 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 14 Aug 2020 12:39:09 +0100 Subject: [PATCH 51/75] net: ethernet: 8390: axnet_cs: Document unused parameter 'txqueue' Fixes the following W=1 kernel build warning(s): drivers/net/ethernet/8390/axnet_cs.c:907: warning: Function parameter or member 'txqueue' not described in 'axnet_tx_timeout' Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Martin Habets Cc: Shannon Nelson Cc: "Michael S. Tsirkin" Cc: William Lee Cc: "A. Hinds --" Cc: reached at Cc: netdev@vger.kernel.org Signed-off-by: Lee Jones Signed-off-by: David S. Miller --- drivers/net/ethernet/8390/axnet_cs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/8390/axnet_cs.c b/drivers/net/ethernet/8390/axnet_cs.c index aeae7966a082..08db4c9da2fa 100644 --- a/drivers/net/ethernet/8390/axnet_cs.c +++ b/drivers/net/ethernet/8390/axnet_cs.c @@ -898,6 +898,7 @@ static int ax_close(struct net_device *dev) /** * axnet_tx_timeout - handle transmit time out condition * @dev: network device which has apparently fallen asleep + * @txqueue: unused * * Called by kernel when device never acknowledges a transmit has * completed (or failed) - i.e. never posted a Tx related interrupt. From 1a2c26681f88c924e15d3bfe0a9fb7a162336777 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 14 Aug 2020 12:39:13 +0100 Subject: [PATCH 52/75] net: wan: dlci: Remove set but not used variable 'err' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the following W=1 kernel build warning(s): drivers/net/wan/dlci.c: In function ‘dlci_close’: drivers/net/wan/dlci.c:298:8: warning: variable ‘err’ set but not used [-Wunused-but-set-variable] Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Mike McLagan Cc: netdev@vger.kernel.org Signed-off-by: Lee Jones Signed-off-by: David S. Miller --- drivers/net/wan/dlci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/wan/dlci.c b/drivers/net/wan/dlci.c index 7bcee41905cf..3ca4daf63389 100644 --- a/drivers/net/wan/dlci.c +++ b/drivers/net/wan/dlci.c @@ -295,14 +295,13 @@ static int dlci_close(struct net_device *dev) { struct dlci_local *dlp; struct frad_local *flp; - int err; netif_stop_queue(dev); dlp = netdev_priv(dev); flp = netdev_priv(dlp->slave); - err = (*flp->deactivate)(dlp->slave, dev); + (*flp->deactivate)(dlp->slave, dev); return 0; } From 0d9b56453b754b1c5ca32ca8a5780b556971f881 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 14 Aug 2020 12:39:21 +0100 Subject: [PATCH 53/75] net: fddi: skfp: hwmtm: Remove seemingly unused variable 'ID_sccs' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This variable is present in many source files and has not been used anywhere (at least internally) since it was introduced. Fixes the following W=1 kernel build warning(s): drivers/net/fddi/skfp/hwmtm.c:14:19: warning: ‘ID_sccs’ defined but not used [-Wunused-const-variable=] Cc: "David S. Miller" Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Signed-off-by: Lee Jones Signed-off-by: David S. Miller --- drivers/net/fddi/skfp/hwmtm.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/fddi/skfp/hwmtm.c b/drivers/net/fddi/skfp/hwmtm.c index 3412e0fb0ac4..107039056511 100644 --- a/drivers/net/fddi/skfp/hwmtm.c +++ b/drivers/net/fddi/skfp/hwmtm.c @@ -10,10 +10,6 @@ * ******************************************************************************/ -#ifndef lint -static char const ID_sccs[] = "@(#)hwmtm.c 1.40 99/05/31 (C) SK" ; -#endif - #define HWMTM #ifndef FDDI From 026ff46b42f4929b25ef28e1e69dd302835c9783 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 14 Aug 2020 12:39:22 +0100 Subject: [PATCH 54/75] net: fddi: skfp: fplustm: Remove seemingly unused variable 'ID_sccs' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This variable is present in many source files and has not been used anywhere (at least internally) since it was introduced. Fixes the following W=1 kernel build warning(s): drivers/net/fddi/skfp/fplustm.c:25:19: warning: ‘ID_sccs’ defined but not used [-Wunused-const-variable=] Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Maxime Coquelin Cc: Alexandre Torgue Cc: netdev@vger.kernel.org Cc: linux-stm32@st-md-mailman.stormreply.com Signed-off-by: Lee Jones Signed-off-by: David S. Miller --- drivers/net/fddi/skfp/fplustm.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/fddi/skfp/fplustm.c b/drivers/net/fddi/skfp/fplustm.c index 02966d141948..4cbb145c74ab 100644 --- a/drivers/net/fddi/skfp/fplustm.c +++ b/drivers/net/fddi/skfp/fplustm.c @@ -21,10 +21,6 @@ #include #include -#ifndef lint -static const char ID_sccs[] = "@(#)fplustm.c 1.32 99/02/23 (C) SK " ; -#endif - #ifndef UNUSED #ifdef lint #define UNUSED(x) (x) = (x) From 327afdd7c0d13695e8fcbaa0995d451b34c3107b Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 14 Aug 2020 12:39:24 +0100 Subject: [PATCH 55/75] net: fddi: skfp: smt: Place definition of 'smt_pdef' under same stipulations as its use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The variable 'smt_pdef' is only used if LITTLE_ENDIAN is set, so only define it if this is the case. Fixes the following W=1 kernel build warning(s): drivers/net/fddi/skfp/smt.c:1572:3: warning: ‘smt_pdef’ defined but not used [-Wunused-const-variable=] Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Tetsuo Handa Cc: netdev@vger.kernel.org Signed-off-by: Lee Jones Signed-off-by: David S. Miller --- drivers/net/fddi/skfp/smt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/fddi/skfp/smt.c b/drivers/net/fddi/skfp/smt.c index b8c59d803ce6..a151d336b904 100644 --- a/drivers/net/fddi/skfp/smt.c +++ b/drivers/net/fddi/skfp/smt.c @@ -1561,7 +1561,7 @@ u_long smt_get_tid(struct s_smc *smc) return tid & 0x3fffffffL; } - +#ifdef LITTLE_ENDIAN /* * table of parameter lengths */ @@ -1641,6 +1641,7 @@ static const struct smt_pdef { } ; #define N_SMT_PLEN ARRAY_SIZE(smt_pdef) +#endif int smt_check_para(struct s_smc *smc, struct smt_header *sm, const u_short list[]) From 7b1af34f024e72c458a330d028ed4e9ad0d981c3 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 14 Aug 2020 12:39:30 +0100 Subject: [PATCH 56/75] net: fddi: skfp: smt: Remove seemingly unused variable 'ID_sccs' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This variable is present in many source files and has not been used anywhere (at least internally) since it was introduced. Fixes the following W=1 kernel build warning(s): drivers/net/fddi/skfp/smt.c:24:19: warning: ‘ID_sccs’ defined but not used [-Wunused-const-variable=] Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Tetsuo Handa Cc: netdev@vger.kernel.org Signed-off-by: Lee Jones Signed-off-by: David S. Miller --- drivers/net/fddi/skfp/smt.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/fddi/skfp/smt.c b/drivers/net/fddi/skfp/smt.c index a151d336b904..774a6e3b0a67 100644 --- a/drivers/net/fddi/skfp/smt.c +++ b/drivers/net/fddi/skfp/smt.c @@ -20,10 +20,6 @@ #define KERNEL #include "h/smtstate.h" -#ifndef lint -static const char ID_sccs[] = "@(#)smt.c 2.43 98/11/23 (C) SK " ; -#endif - /* * FC in SMbuf */ From d1ad06ba7e47a2cb5522d7e81aed1a76abdef41c Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 14 Aug 2020 12:39:31 +0100 Subject: [PATCH 57/75] net: fddi: skfp: cfm: Remove set but unused variable 'oldstate' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While we're at it, remove some code which has never been invoked. Keep the comment though, as it seems potentially half useful. Fixes the following W=1 kernel build warning(s): drivers/net/fddi/skfp/cfm.c: In function ‘cfm’: drivers/net/fddi/skfp/cfm.c:211:6: warning: variable ‘oldstate’ set but not used [-Wunused-but-set-variable] Cc: "David S. Miller" Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Signed-off-by: Lee Jones Signed-off-by: David S. Miller --- drivers/net/fddi/skfp/cfm.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/drivers/net/fddi/skfp/cfm.c b/drivers/net/fddi/skfp/cfm.c index e9bf42996de8..668b1d7be6e2 100644 --- a/drivers/net/fddi/skfp/cfm.c +++ b/drivers/net/fddi/skfp/cfm.c @@ -208,7 +208,6 @@ void cfm(struct s_smc *smc, int event) { int state ; /* remember last state */ int cond ; - int oldstate ; /* We will do the following: */ /* - compute the variable WC_Flag for every port (This is where */ @@ -222,7 +221,6 @@ void cfm(struct s_smc *smc, int event) /* - change the portstates */ cem_priv_state (smc, event); - oldstate = smc->mib.fddiSMTCF_State ; do { DB_CFM("CFM : state %s%s event %s", smc->mib.fddiSMTCF_State & AFLAG ? "ACTIONS " : "", @@ -250,18 +248,11 @@ void cfm(struct s_smc *smc, int event) if (cond != smc->mib.fddiSMTPeerWrapFlag) smt_srf_event(smc,SMT_COND_SMT_PEER_WRAP,0,cond) ; -#if 0 /* - * Don't send ever MAC_PATH_CHANGE events. Our MAC is hard-wired + * Don't ever send MAC_PATH_CHANGE events. Our MAC is hard-wired * to the primary path. */ - /* - * path change - */ - if (smc->mib.fddiSMTCF_State != oldstate) { - smt_srf_event(smc,SMT_EVENT_MAC_PATH_CHANGE,INDEX_MAC,0) ; - } -#endif + #endif /* no SLIM_SMT */ /* From 81dbf2191f60ccf90f65171d6593c420ee05507f Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 14 Aug 2020 12:39:32 +0100 Subject: [PATCH 58/75] net: fddi: skfp: cfm: Remove seemingly unused variable 'ID_sccs' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This variable is present in many source files and has not been used anywhere (at least internally) since it was introduced. Fixes the following W=1 kernel build warning(s): drivers/net/fddi/skfp/cfm.c: In function ‘cfm’: drivers/net/fddi/skfp/cfm.c:211:6: warning: variable ‘oldstate’ set but not used [-Wunused-but-set-variable] drivers/net/fddi/skfp/cfm.c:40:19: warning: ‘ID_sccs’ defined but not used [-Wunused-const-variable=] Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Lee Jones Cc: netdev@vger.kernel.org Signed-off-by: Lee Jones Signed-off-by: David S. Miller --- drivers/net/fddi/skfp/cfm.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/fddi/skfp/cfm.c b/drivers/net/fddi/skfp/cfm.c index 668b1d7be6e2..4eea3408034b 100644 --- a/drivers/net/fddi/skfp/cfm.c +++ b/drivers/net/fddi/skfp/cfm.c @@ -36,10 +36,6 @@ #define KERNEL #include "h/smtstate.h" -#ifndef lint -static const char ID_sccs[] = "@(#)cfm.c 2.18 98/10/06 (C) SK " ; -#endif - /* * FSM Macros */ From f4fd77fd87e9b214c26bb2ebd4f90055eaea5ade Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Wed, 5 Aug 2020 11:50:22 +0800 Subject: [PATCH 59/75] can: j1939: fix support for multipacket broadcast message Currently j1939_tp_im_involved_anydir() in j1939_tp_recv() check the previously set flags J1939_ECU_LOCAL_DST and J1939_ECU_LOCAL_SRC of incoming skb, thus multipacket broadcast message was aborted by receive side because it may come from remote ECUs and have no exact dst address. Similarly, j1939_tp_cmd_recv() and j1939_xtp_rx_dat() didn't process broadcast message. So fix it by checking and process broadcast message in j1939_tp_recv(), j1939_tp_cmd_recv() and j1939_xtp_rx_dat(). Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Signed-off-by: Zhang Changzhong Link: https://lore.kernel.org/r/1596599425-5534-2-git-send-email-zhangchangzhong@huawei.com Acked-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 5cf107cb447c..868ecb2bfa45 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1673,8 +1673,12 @@ static void j1939_xtp_rx_rts(struct j1939_priv *priv, struct sk_buff *skb, return; } session = j1939_xtp_rx_rts_session_new(priv, skb); - if (!session) + if (!session) { + if (cmd == J1939_TP_CMD_BAM && j1939_sk_recv_match(priv, skcb)) + netdev_info(priv->ndev, "%s: failed to create TP BAM session\n", + __func__); return; + } } else { if (j1939_xtp_rx_rts_session_active(session, skb)) { j1939_session_put(session); @@ -1865,6 +1869,13 @@ static void j1939_xtp_rx_dat(struct j1939_priv *priv, struct sk_buff *skb) else j1939_xtp_rx_dat_one(session, skb); } + + if (j1939_cb_is_broadcast(skcb)) { + session = j1939_session_get_by_addr(priv, &skcb->addr, false, + false); + if (session) + j1939_xtp_rx_dat_one(session, skb); + } } /* j1939 main intf */ @@ -1956,7 +1967,7 @@ static void j1939_tp_cmd_recv(struct j1939_priv *priv, struct sk_buff *skb) if (j1939_tp_im_transmitter(skcb)) j1939_xtp_rx_rts(priv, skb, true); - if (j1939_tp_im_receiver(skcb)) + if (j1939_tp_im_receiver(skcb) || j1939_cb_is_broadcast(skcb)) j1939_xtp_rx_rts(priv, skb, false); break; @@ -2020,7 +2031,7 @@ int j1939_tp_recv(struct j1939_priv *priv, struct sk_buff *skb) { struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); - if (!j1939_tp_im_involved_anydir(skcb)) + if (!j1939_tp_im_involved_anydir(skcb) && !j1939_cb_is_broadcast(skcb)) return 0; switch (skcb->addr.pgn) { From e8b17653088f28a87c81845fa41a2d295a3b458c Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Wed, 5 Aug 2020 11:50:23 +0800 Subject: [PATCH 60/75] can: j1939: cancel rxtimer on multipacket broadcast session complete If j1939_xtp_rx_dat_one() receive last frame of multipacket broadcast message, j1939_session_timers_cancel() should be called to cancel rxtimer. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Signed-off-by: Zhang Changzhong Link: https://lore.kernel.org/r/1596599425-5534-3-git-send-email-zhangchangzhong@huawei.com Acked-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 868ecb2bfa45..2f3c3afd5071 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1824,6 +1824,7 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, } if (final) { + j1939_session_timers_cancel(session); j1939_session_completed(session); } else if (do_cts_eoma) { j1939_tp_set_rxtimeout(session, 1250); From 2b8b2e31555cf55ba3680fb28e2b382e168d7ea1 Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Wed, 5 Aug 2020 11:50:24 +0800 Subject: [PATCH 61/75] can: j1939: abort multipacket broadcast session when timeout occurs If timeout occurs, j1939_tp_rxtimer() first calls hrtimer_start() to restart rxtimer, and then calls __j1939_session_cancel() to set session->state = J1939_SESSION_WAITING_ABORT. At next timeout expiration, because of the J1939_SESSION_WAITING_ABORT session state j1939_tp_rxtimer() will call j1939_session_deactivate_activate_next() to deactivate current session, and rxtimer won't be set. But for multipacket broadcast session, __j1939_session_cancel() don't set session->state = J1939_SESSION_WAITING_ABORT, thus current session won't be deactivate and hrtimer_start() is called to start new rxtimer again and again. So fix it by moving session->state = J1939_SESSION_WAITING_ABORT out of if (!j1939_cb_is_broadcast(&session->skcb)) statement. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Signed-off-by: Zhang Changzhong Link: https://lore.kernel.org/r/1596599425-5534-4-git-send-email-zhangchangzhong@huawei.com Acked-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 2f3c3afd5071..047118d5270b 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1074,9 +1074,9 @@ static void __j1939_session_cancel(struct j1939_session *session, lockdep_assert_held(&session->priv->active_session_list_lock); session->err = j1939_xtp_abort_to_errno(priv, err); + session->state = J1939_SESSION_WAITING_ABORT; /* do not send aborts on incoming broadcasts */ if (!j1939_cb_is_broadcast(&session->skcb)) { - session->state = J1939_SESSION_WAITING_ABORT; j1939_xtp_tx_abort(priv, &session->skcb, !session->transmission, err, session->skcb.addr.pgn); From 0ae18a82686f9b9965a8ce0dd81371871b306ffe Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Wed, 5 Aug 2020 11:50:25 +0800 Subject: [PATCH 62/75] can: j1939: add rxtimer for multipacket broadcast session According to SAE J1939/21 (Chapter 5.12.3 and APPENDIX C), for transmit side the required time interval between packets of a multipacket broadcast message is 50 to 200 ms, the responder shall use a timeout of 250ms (provides margin allowing for the maximumm spacing of 200ms). For receive side a timeout will occur when a time of greater than 750 ms elapsed between two message packets when more packets were expected. So this patch fix and add rxtimer for multipacket broadcast session. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Signed-off-by: Zhang Changzhong Link: https://lore.kernel.org/r/1596599425-5534-5-git-send-email-zhangchangzhong@huawei.com Acked-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 047118d5270b..a8dd956b5e8e 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -723,10 +723,12 @@ static int j1939_session_tx_rts(struct j1939_session *session) return ret; session->last_txcmd = dat[0]; - if (dat[0] == J1939_TP_CMD_BAM) + if (dat[0] == J1939_TP_CMD_BAM) { j1939_tp_schedule_txtimer(session, 50); - - j1939_tp_set_rxtimeout(session, 1250); + j1939_tp_set_rxtimeout(session, 250); + } else { + j1939_tp_set_rxtimeout(session, 1250); + } netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); @@ -1687,11 +1689,15 @@ static void j1939_xtp_rx_rts(struct j1939_priv *priv, struct sk_buff *skb, } session->last_cmd = cmd; - j1939_tp_set_rxtimeout(session, 1250); - - if (cmd != J1939_TP_CMD_BAM && !session->transmission) { - j1939_session_txtimer_cancel(session); - j1939_tp_schedule_txtimer(session, 0); + if (cmd == J1939_TP_CMD_BAM) { + if (!session->transmission) + j1939_tp_set_rxtimeout(session, 750); + } else { + if (!session->transmission) { + j1939_session_txtimer_cancel(session); + j1939_tp_schedule_txtimer(session, 0); + } + j1939_tp_set_rxtimeout(session, 1250); } j1939_session_put(session); @@ -1742,6 +1748,7 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, int offset; int nbytes; bool final = false; + bool remain = false; bool do_cts_eoma = false; int packet; @@ -1817,6 +1824,8 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, j1939_cb_is_broadcast(&session->skcb)) { if (session->pkt.rx >= session->pkt.total) final = true; + else + remain = true; } else { /* never final, an EOMA must follow */ if (session->pkt.rx >= session->pkt.last) @@ -1826,6 +1835,9 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, if (final) { j1939_session_timers_cancel(session); j1939_session_completed(session); + } else if (remain) { + if (!session->transmission) + j1939_tp_set_rxtimeout(session, 750); } else if (do_cts_eoma) { j1939_tp_set_rxtimeout(session, 1250); if (!session->transmission) From 832707021666411d04795c564a4adea5d6b94f17 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 14 Aug 2020 20:05:58 -0700 Subject: [PATCH 63/75] bonding: fix a potential double-unregister When we tear down a network namespace, we unregister all the netdevices within it. So we may queue a slave device and a bonding device together in the same unregister queue. If the only slave device is non-ethernet, it would automatically unregister the bonding device as well. Thus, we may end up unregistering the bonding device twice. Workaround this special case by checking reg_state. Fixes: 9b5e383c11b0 ("net: Introduce unregister_netdevice_many()") Reported-by: syzbot+af23e7f3e0a7e10c8b67@syzkaller.appspotmail.com Cc: Eric Dumazet Cc: Andy Gospodarek Cc: Jay Vosburgh Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 4239cdcfd798..415a37e44cae 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2207,7 +2207,8 @@ static int bond_release_and_destroy(struct net_device *bond_dev, int ret; ret = __bond_release_one(bond_dev, slave_dev, false, true); - if (ret == 0 && !bond_has_slaves(bond)) { + if (ret == 0 && !bond_has_slaves(bond) && + bond_dev->reg_state != NETREG_UNREGISTERING) { bond_dev->priv_flags |= IFF_DISABLE_NETPOLL; netdev_info(bond_dev, "Destroying bond\n"); bond_remove_proc_entry(bond); From d0f5c7076e01fef6fcb86988d9508bf3ce258bd4 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Fri, 14 Aug 2020 22:53:24 -0700 Subject: [PATCH 64/75] ipvlan: fix device features Processing NETDEV_FEAT_CHANGE causes IPvlan links to lose NETIF_F_LLTX feature because of the incorrect handling of features in ipvlan_fix_features(). --before-- lpaa10:~# ethtool -k ipvl0 | grep tx-lockless tx-lockless: on [fixed] lpaa10:~# ethtool -K ipvl0 tso off Cannot change tcp-segmentation-offload Actual changes: vlan-challenged: off [fixed] tx-lockless: off [fixed] lpaa10:~# ethtool -k ipvl0 | grep tx-lockless tx-lockless: off [fixed] lpaa10:~# --after-- lpaa10:~# ethtool -k ipvl0 | grep tx-lockless tx-lockless: on [fixed] lpaa10:~# ethtool -K ipvl0 tso off Cannot change tcp-segmentation-offload Could not change any device features lpaa10:~# ethtool -k ipvl0 | grep tx-lockless tx-lockless: on [fixed] lpaa10:~# Fixes: 2ad7bf363841 ("ipvlan: Initial check-in of the IPVLAN driver.") Signed-off-by: Mahesh Bandewar Cc: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ipvlan/ipvlan_main.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 15e87c097b0b..5bca94c99006 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -106,12 +106,21 @@ static void ipvlan_port_destroy(struct net_device *dev) kfree(port); } +#define IPVLAN_ALWAYS_ON_OFLOADS \ + (NETIF_F_SG | NETIF_F_HW_CSUM | \ + NETIF_F_GSO_ROBUST | NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL) + +#define IPVLAN_ALWAYS_ON \ + (IPVLAN_ALWAYS_ON_OFLOADS | NETIF_F_LLTX | NETIF_F_VLAN_CHALLENGED) + #define IPVLAN_FEATURES \ - (NETIF_F_SG | NETIF_F_CSUM_MASK | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ + (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ NETIF_F_GSO | NETIF_F_ALL_TSO | NETIF_F_GSO_ROBUST | \ NETIF_F_GRO | NETIF_F_RXCSUM | \ NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER) + /* NETIF_F_GSO_ENCAP_ALL NETIF_F_GSO_SOFTWARE Newly added */ + #define IPVLAN_STATE_MASK \ ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT)) @@ -125,7 +134,9 @@ static int ipvlan_init(struct net_device *dev) dev->state = (dev->state & ~IPVLAN_STATE_MASK) | (phy_dev->state & IPVLAN_STATE_MASK); dev->features = phy_dev->features & IPVLAN_FEATURES; - dev->features |= NETIF_F_LLTX | NETIF_F_VLAN_CHALLENGED; + dev->features |= IPVLAN_ALWAYS_ON; + dev->vlan_features = phy_dev->vlan_features & IPVLAN_FEATURES; + dev->vlan_features |= IPVLAN_ALWAYS_ON_OFLOADS; dev->hw_enc_features |= dev->features; dev->gso_max_size = phy_dev->gso_max_size; dev->gso_max_segs = phy_dev->gso_max_segs; @@ -227,7 +238,14 @@ static netdev_features_t ipvlan_fix_features(struct net_device *dev, { struct ipvl_dev *ipvlan = netdev_priv(dev); - return features & (ipvlan->sfeatures | ~IPVLAN_FEATURES); + features |= NETIF_F_ALL_FOR_ALL; + features &= (ipvlan->sfeatures | ~IPVLAN_FEATURES); + features = netdev_increment_features(ipvlan->phy_dev->features, + features, features); + features |= IPVLAN_ALWAYS_ON; + features &= (IPVLAN_FEATURES | IPVLAN_ALWAYS_ON); + + return features; } static void ipvlan_change_rx_flags(struct net_device *dev, int change) @@ -734,10 +752,9 @@ static int ipvlan_device_event(struct notifier_block *unused, case NETDEV_FEAT_CHANGE: list_for_each_entry(ipvlan, &port->ipvlans, pnode) { - ipvlan->dev->features = dev->features & IPVLAN_FEATURES; ipvlan->dev->gso_max_size = dev->gso_max_size; ipvlan->dev->gso_max_segs = dev->gso_max_segs; - netdev_features_change(ipvlan->dev); + netdev_update_features(ipvlan->dev); } break; From f8414a8d886b613b90d9fdf7cda6feea313b1069 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sat, 15 Aug 2020 09:29:30 +0200 Subject: [PATCH 65/75] net: xdp: pull ethernet header off packet after computing skb->protocol When an XDP program changes the ethernet header protocol field, eth_type_trans is used to recalculate skb->protocol. In order for eth_type_trans to work correctly, the ethernet header must actually be part of the skb data segment, so the code first pushes that onto the head of the skb. However, it subsequently forgets to pull it back off, making the behavior of the passed-on packet inconsistent between the protocol modifying case and the static protocol case. This patch fixes the issue by simply pulling the ethernet header back off of the skb head. Fixes: 297249569932 ("net: fix generic XDP to handle if eth header was mangled") Cc: Jesper Dangaard Brouer Cc: David S. Miller Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/dev.c b/net/core/dev.c index b5d1129d8310..7592d7dd6109 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4676,6 +4676,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { __skb_push(skb, ETH_HLEN); skb->protocol = eth_type_trans(skb, skb->dev); + __skb_pull(skb, ETH_HLEN); } switch (act) { From 55eff0eb7460c3d50716ed9eccf22257b046ca92 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 15 Aug 2020 04:44:31 -0400 Subject: [PATCH 66/75] net: Fix potential wrong skb->protocol in skb_vlan_untag() We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). So we should pull VLAN_HLEN + sizeof(unsigned short) in skb_vlan_untag() or we may access the wrong data. Fixes: 0d5501c1c828 ("net: Always untag vlan-tagged traffic on input.") Signed-off-by: Miaohe Lin Signed-off-by: David S. Miller --- net/core/skbuff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7e2e502ef519..5c3b906aeef3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5418,8 +5418,8 @@ struct sk_buff *skb_vlan_untag(struct sk_buff *skb) skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) goto err_free; - - if (unlikely(!pskb_may_pull(skb, VLAN_HLEN))) + /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */ + if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short)))) goto err_free; vhdr = (struct vlan_hdr *)skb->data; From 47733f9daf4fe4f7e0eb9e273f21ad3a19130487 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 15 Aug 2020 16:29:15 -0700 Subject: [PATCH 67/75] tipc: fix uninit skb->data in tipc_nl_compat_dumpit() __tipc_nl_compat_dumpit() has two callers, and it expects them to pass a valid nlmsghdr via arg->data. This header is artificial and crafted just for __tipc_nl_compat_dumpit(). tipc_nl_compat_publ_dump() does so by putting a genlmsghdr as well as some nested attribute, TIPC_NLA_SOCK. But the other caller tipc_nl_compat_dumpit() does not, this leaves arg->data uninitialized on this call path. Fix this by just adding a similar nlmsghdr without any payload in tipc_nl_compat_dumpit(). This bug exists since day 1, but the recent commit 6ea67769ff33 ("net: tipc: prepare attrs in __tipc_nl_compat_dumpit()") makes it easier to appear. Reported-and-tested-by: syzbot+0e7181deafa7e0b79923@syzkaller.appspotmail.com Fixes: d0796d1ef63d ("tipc: convert legacy nl bearer dump to nl compat") Cc: Jon Maloy Cc: Ying Xue Cc: Richard Alpe Signed-off-by: Cong Wang Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/netlink_compat.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 217516357ef2..90e3c70a91ad 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -275,8 +275,9 @@ err_out: static int tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, struct tipc_nl_compat_msg *msg) { - int err; + struct nlmsghdr *nlh; struct sk_buff *arg; + int err; if (msg->req_type && (!msg->req_size || !TLV_CHECK_TYPE(msg->req, msg->req_type))) @@ -305,6 +306,15 @@ static int tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, return -ENOMEM; } + nlh = nlmsg_put(arg, 0, 0, tipc_genl_family.id, 0, NLM_F_MULTI); + if (!nlh) { + kfree_skb(arg); + kfree_skb(msg->rep); + msg->rep = NULL; + return -EMSGSIZE; + } + nlmsg_end(arg, nlh); + err = __tipc_nl_compat_dumpit(cmd, msg, arg); if (err) { kfree_skb(msg->rep); From c530189905efe91b6a464db4ec1b56b4c069609f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 16 Aug 2020 17:32:03 +0800 Subject: [PATCH 68/75] tipc: not enable tipc when ipv6 works as a module When using ipv6_dev_find() in one module, it requires ipv6 not to work as a module. Otherwise, this error occurs in build: undefined reference to `ipv6_dev_find'. So fix it by adding "depends on IPV6 || IPV6=n" to tipc/Kconfig, as it does in sctp/Kconfig. Fixes: 5a6f6f579178 ("tipc: set ub->ifindex for local ipv6 address") Reported-by: kernel test robot Acked-by: Randy Dunlap Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/tipc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/net/tipc/Kconfig b/net/tipc/Kconfig index 9dd780215eef..be1c4003d67d 100644 --- a/net/tipc/Kconfig +++ b/net/tipc/Kconfig @@ -6,6 +6,7 @@ menuconfig TIPC tristate "The TIPC Protocol" depends on INET + depends on IPV6 || IPV6=n help The Transparent Inter Process Communication (TIPC) protocol is specially designed for intra cluster communication. This protocol From bd71ea60673180eccf17b1a1dda3504a04783789 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 16 Aug 2020 21:26:38 +0200 Subject: [PATCH 69/75] net: devlink: Remove overzealous WARN_ON with snapshots It is possible to trigger this WARN_ON from user space by triggering a devlink snapshot with an ID which already exists. We don't need both -EEXISTS being reported and spamming the kernel log. Signed-off-by: Andrew Lunn Tested-by: Chris Healy Signed-off-by: David S. Miller --- net/core/devlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/devlink.c b/net/core/devlink.c index e674f0f46dc2..e5feb87beca7 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4063,7 +4063,7 @@ static int __devlink_snapshot_id_insert(struct devlink *devlink, u32 id) { lockdep_assert_held(&devlink->lock); - if (WARN_ON(xa_load(&devlink->snapshot_ids, id))) + if (xa_load(&devlink->snapshot_ids, id)) return -EEXIST; return xa_err(xa_store(&devlink->snapshot_ids, id, xa_mk_value(0), From b3b2854dcf704c1db05d897072f98e8b79398af1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 16 Aug 2020 23:14:20 +0200 Subject: [PATCH 70/75] mptcp: sendmsg: reset iter on error redux This fix wasn't correct: When this function is invoked from the retransmission worker, the iterator contains garbage and resetting it causes a crash. As the work queue should not be performance critical also zero the msghdr struct. Fixes: 35759383133f64d "(mptcp: sendmsg: reset iter on error)" Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index c84b4051c2a4..1aad411a0e46 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -740,7 +740,8 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, ret = do_tcp_sendpages(ssk, page, offset, psize, msg->msg_flags | MSG_SENDPAGE_NOTLAST | MSG_DONTWAIT); if (ret <= 0) { - iov_iter_revert(&msg->msg_iter, psize); + if (!retransmission) + iov_iter_revert(&msg->msg_iter, psize); return ret; } @@ -1392,7 +1393,9 @@ static void mptcp_worker(struct work_struct *work) struct mptcp_data_frag *dfrag; u64 orig_write_seq; size_t copied = 0; - struct msghdr msg; + struct msghdr msg = { + .msg_flags = MSG_DONTWAIT, + }; long timeo = 0; lock_sock(sk); @@ -1425,7 +1428,6 @@ static void mptcp_worker(struct work_struct *work) lock_sock(ssk); - msg.msg_flags = MSG_DONTWAIT; orig_len = dfrag->data_len; orig_offset = dfrag->offset; orig_write_seq = dfrag->data_seq; From 0b76e642f9ad2471e899e2dd71b9543b7e85e9f6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 16 Aug 2020 15:25:49 -0700 Subject: [PATCH 71/75] phylink: : fix function prototype kernel-doc warning Fix a kernel-doc warning for the pcs_config() function prototype: ../include/linux/phylink.h:406: warning: Excess function parameter 'permit_pause_to_mac' description in 'pcs_config' Fixes: 7137e18f6f88 ("net: phylink: add struct phylink_pcs") Signed-off-by: Randy Dunlap Cc: Russell King Cc: David S. Miller Cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- include/linux/phylink.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/phylink.h b/include/linux/phylink.h index a8e876317e25..c36fb41a7d90 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -402,7 +402,8 @@ void pcs_get_state(struct phylink_pcs *pcs, * For most 10GBASE-R, there is no advertisement. */ int pcs_config(struct phylink_pcs *pcs, unsigned int mode, - phy_interface_t interface, const unsigned long *advertising); + phy_interface_t interface, const unsigned long *advertising, + bool permit_pause_to_mac); /** * pcs_an_restart() - restart 802.3z BaseX autonegotiation From 7f9bf6e82461b97ce43a912cb4a959c5a41367ac Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 17 Aug 2020 11:48:05 -0700 Subject: [PATCH 72/75] Revert "net: xdp: pull ethernet header off packet after computing skb->protocol" This reverts commit f8414a8d886b613b90d9fdf7cda6feea313b1069. eth_type_trans() does the necessary pull on the skb. Signed-off-by: David S. Miller --- net/core/dev.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 7592d7dd6109..b5d1129d8310 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4676,7 +4676,6 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { __skb_push(skb, ETH_HLEN); skb->protocol = eth_type_trans(skb, skb->dev); - __skb_pull(skb, ETH_HLEN); } switch (act) { From bcf7ddb0186d366f761f86196b480ea6dd2dc18c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 17 Aug 2020 09:43:33 -0600 Subject: [PATCH 73/75] selftests: disable rp_filter for icmp_redirect.sh h1 is initially configured to reach h2 via r1 rather than the more direct path through r2. If rp_filter is set and inherited for r2, forwarding fails since the source address of h1 is reachable from eth0 vs the packet coming to it via r1 and eth1. Since rp_filter setting affects the test, explicitly reset it. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/icmp_redirect.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/net/icmp_redirect.sh b/tools/testing/selftests/net/icmp_redirect.sh index 18c5de53558a..bf361f30d6ef 100755 --- a/tools/testing/selftests/net/icmp_redirect.sh +++ b/tools/testing/selftests/net/icmp_redirect.sh @@ -180,6 +180,8 @@ setup() ;; r[12]) ip netns exec $ns sysctl -q -w net.ipv4.ip_forward=1 ip netns exec $ns sysctl -q -w net.ipv4.conf.all.send_redirects=1 + ip netns exec $ns sysctl -q -w net.ipv4.conf.default.rp_filter=0 + ip netns exec $ns sysctl -q -w net.ipv4.conf.all.rp_filter=0 ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=1 ip netns exec $ns sysctl -q -w net.ipv6.route.mtu_expires=10 From 8dfddfb79653df7c38a9c8c4c034f242a36acee9 Mon Sep 17 00:00:00 2001 From: Necip Fazil Yildiran Date: Mon, 17 Aug 2020 15:54:48 +0000 Subject: [PATCH 74/75] net: qrtr: fix usage of idr in port assignment to socket Passing large uint32 sockaddr_qrtr.port numbers for port allocation triggers a warning within idr_alloc() since the port number is cast to int, and thus interpreted as a negative number. This leads to the rejection of such valid port numbers in qrtr_port_assign() as idr_alloc() fails. To avoid the problem, switch to idr_alloc_u32() instead. Fixes: bdabad3e363d ("net: Add Qualcomm IPC router") Reported-by: syzbot+f31428628ef672716ea8@syzkaller.appspotmail.com Signed-off-by: Necip Fazil Yildiran Reviewed-by: Dmitry Vyukov Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index b4c0db0b7d31..90c558f89d46 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -692,23 +692,25 @@ static void qrtr_port_remove(struct qrtr_sock *ipc) */ static int qrtr_port_assign(struct qrtr_sock *ipc, int *port) { + u32 min_port; int rc; mutex_lock(&qrtr_port_lock); if (!*port) { - rc = idr_alloc(&qrtr_ports, ipc, - QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET + 1, - GFP_ATOMIC); - if (rc >= 0) - *port = rc; + min_port = QRTR_MIN_EPH_SOCKET; + rc = idr_alloc_u32(&qrtr_ports, ipc, &min_port, QRTR_MAX_EPH_SOCKET, GFP_ATOMIC); + if (!rc) + *port = min_port; } else if (*port < QRTR_MIN_EPH_SOCKET && !capable(CAP_NET_ADMIN)) { rc = -EACCES; } else if (*port == QRTR_PORT_CTRL) { - rc = idr_alloc(&qrtr_ports, ipc, 0, 1, GFP_ATOMIC); + min_port = 0; + rc = idr_alloc_u32(&qrtr_ports, ipc, &min_port, 0, GFP_ATOMIC); } else { - rc = idr_alloc(&qrtr_ports, ipc, *port, *port + 1, GFP_ATOMIC); - if (rc >= 0) - *port = rc; + min_port = *port; + rc = idr_alloc_u32(&qrtr_ports, ipc, &min_port, *port, GFP_ATOMIC); + if (!rc) + *port = min_port; } mutex_unlock(&qrtr_port_lock); From bf2bcd6f1a8822ea45465f86d705951725883ee8 Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Mon, 17 Aug 2020 02:04:13 +0000 Subject: [PATCH 75/75] otx2_common: Use devm_kcalloc() in otx2_config_npa() A multiplication for the size determination of a memory allocation indicated that an array data structure should be processed. Thus use the corresponding function "devm_kcalloc". Signed-off-by: Xu Wang Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index 5975521a4c86..93c4cf7fedbf 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -1226,8 +1226,8 @@ int otx2_config_npa(struct otx2_nic *pfvf) if (!hw->pool_cnt) return -EINVAL; - qset->pool = devm_kzalloc(pfvf->dev, sizeof(struct otx2_pool) * - hw->pool_cnt, GFP_KERNEL); + qset->pool = devm_kcalloc(pfvf->dev, hw->pool_cnt, + sizeof(struct otx2_pool), GFP_KERNEL); if (!qset->pool) return -ENOMEM;