From 48e5d98a0eb1e4cec308ed63444a505a7e7dd9e3 Mon Sep 17 00:00:00 2001 From: Adrian Ratiu Date: Wed, 20 Mar 2019 12:10:54 +0200 Subject: [PATCH 01/29] selftests/bpf: Add arm target register definitions eBPF "restricted C" code can be compiled with LLVM/clang using target triplets like armv7l-unknown-linux-gnueabihf and loaded/run with small cross-compiled gobpf/elf [1] programs without requiring a full BCC port which is also undesirable on small embedded systems due to its size footprint. The only missing pieces are these helper macros which otherwise have to be redefined by each eBPF arm program. [1] https://github.com/iovisor/gobpf/tree/master/elf Signed-off-by: Adrian Ratiu Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_helpers.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index c81fc350f7ad..41f8a4b676a4 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -274,6 +274,9 @@ static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, #elif defined(__TARGET_ARCH_s930x) #define bpf_target_s930x #define bpf_target_defined +#elif defined(__TARGET_ARCH_arm) + #define bpf_target_arm + #define bpf_target_defined #elif defined(__TARGET_ARCH_arm64) #define bpf_target_arm64 #define bpf_target_defined @@ -296,6 +299,8 @@ static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, #define bpf_target_x86 #elif defined(__s390x__) #define bpf_target_s930x +#elif defined(__arm__) + #define bpf_target_arm #elif defined(__aarch64__) #define bpf_target_arm64 #elif defined(__mips__) @@ -333,6 +338,19 @@ static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, #define PT_REGS_SP(x) ((x)->gprs[15]) #define PT_REGS_IP(x) ((x)->psw.addr) +#elif defined(bpf_target_arm) + +#define PT_REGS_PARM1(x) ((x)->uregs[0]) +#define PT_REGS_PARM2(x) ((x)->uregs[1]) +#define PT_REGS_PARM3(x) ((x)->uregs[2]) +#define PT_REGS_PARM4(x) ((x)->uregs[3]) +#define PT_REGS_PARM5(x) ((x)->uregs[4]) +#define PT_REGS_RET(x) ((x)->uregs[14]) +#define PT_REGS_FP(x) ((x)->uregs[11]) /* Works only with CONFIG_FRAME_POINTER */ +#define PT_REGS_RC(x) ((x)->uregs[0]) +#define PT_REGS_SP(x) ((x)->uregs[13]) +#define PT_REGS_IP(x) ((x)->uregs[12]) + #elif defined(bpf_target_arm64) #define PT_REGS_PARM1(x) ((x)->regs[0]) From 0f3adc288df8ba2ac2fea0a8e4890f9fb4cd075d Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:53:59 +0800 Subject: [PATCH 02/29] bpf: track references based on is_acquire_func So far, the verifier only acquires reference tracking state for RET_PTR_TO_SOCKET_OR_NULL. Instead of extending this for every new return type which desires these semantics, acquire reference tracking state iff the called helper is an acquire function. 
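For illustration, the contract being tracked, in restricted C terms: a pointer returned by an acquire function must be released on every path before program exit. A minimal sketch, assuming the existing bpf_sk_lookup_tcp/bpf_sk_release helpers and an skb context:

    struct bpf_sock_tuple tuple = {};
    struct bpf_sock *sk;

    /* acquire: the verifier records a reference for the returned pointer */
    sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
                           BPF_F_CURRENT_NETNS, 0);
    if (sk)
            /* release: omitting this is rejected as "Unreleased reference" */
            bpf_sk_release(sk);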
Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 86f9cd5d1c4e..868a82ad5597 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3147,19 +3147,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - if (is_acquire_function(func_id)) { - int id = acquire_reference_state(env, insn_idx); - - if (id < 0) - return id; - /* For mark_ptr_or_null_reg() */ - regs[BPF_REG_0].id = id; - /* For release_reference() */ - regs[BPF_REG_0].ref_obj_id = id; - } else { - /* For mark_ptr_or_null_reg() */ - regs[BPF_REG_0].id = ++env->id_gen; - } + regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; @@ -3170,9 +3158,19 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } - if (is_ptr_cast_function(func_id)) + if (is_ptr_cast_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + } else if (is_acquire_function(func_id)) { + int id = acquire_reference_state(env, insn_idx); + + if (id < 0) + return id; + /* For mark_ptr_or_null_reg() */ + regs[BPF_REG_0].id = id; + /* For release_reference() */ + regs[BPF_REG_0].ref_obj_id = id; + } do_refine_retval_range(regs, fn->ret_type, func_id, &meta); From 85a51f8c28b9812642d76db6889f3f39dc3fbab3 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:54:00 +0800 Subject: [PATCH 03/29] bpf: allow helpers to return PTR_TO_SOCK_COMMON It's currently not possible to access timewait or request sockets from eBPF, since there is no way to return a PTR_TO_SOCK_COMMON from a helper. Introduce RET_PTR_TO_SOCK_COMMON to enable this behaviour. 
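A helper opts in to the new semantics through its proto. For reference, the first user added in the next patch declares it like so:

    static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
            .func           = bpf_skc_lookup_tcp,
            .gpl_only       = false,
            .pkt_access     = true,
            .ret_type       = RET_PTR_TO_SOCK_COMMON_OR_NULL,
            .arg1_type      = ARG_PTR_TO_CTX,
            .arg2_type      = ARG_PTR_TO_MEM,
            .arg3_type      = ARG_CONST_SIZE,
            .arg4_type      = ARG_ANYTHING,
            .arg5_type      = ARG_ANYTHING,
    };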
Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f02367faa58d..f62897198844 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -205,6 +205,7 @@ enum bpf_return_type { RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ + RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 868a82ad5597..a476e13201d6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3148,6 +3148,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; + } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; From edbf8c01de5a104a71ed6df2bf6421ceb2836a8e Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:54:01 +0800 Subject: [PATCH 04/29] bpf: add skc_lookup_tcp helper Allow looking up a sock_common. This gives eBPF programs access to timewait and request sockets. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 20 +++++- kernel/bpf/verifier.c | 3 +- net/core/filter.c | 146 +++++++++++++++++++++++++++++++++------ 3 files changed, 144 insertions(+), 25 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 929c8e537a14..fab05317f5e7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2431,6 +2431,23 @@ union bpf_attr { * Return * A **struct bpf_sock** pointer on success, or **NULL** in * case of failure. + * + * struct bpf_sock *bpf_skc_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * This function is identical to bpf_sk_lookup_tcp, except that it + * also returns timewait or request sockets. Use bpf_sk_fullsock + * or bpf_tcp_sock to access the full structure. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from **reuse->socks**\ [] using the hash of the tuple.
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2531,7 +2548,8 @@ union bpf_attr { FN(sk_fullsock), \ FN(tcp_sock), \ FN(skb_ecn_set_ce), \ - FN(get_listener_sock), + FN(get_listener_sock), \ + FN(skc_lookup_tcp), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a476e13201d6..dffeec3706ce 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -369,7 +369,8 @@ static bool is_release_function(enum bpf_func_id func_id) static bool is_acquire_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_lookup_tcp || - func_id == BPF_FUNC_sk_lookup_udp; + func_id == BPF_FUNC_sk_lookup_udp || + func_id == BPF_FUNC_skc_lookup_tcp; } static bool is_ptr_cast_function(enum bpf_func_id func_id) diff --git a/net/core/filter.c b/net/core/filter.c index 647c63a7b25b..b6d83ba97621 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5156,15 +5156,15 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, return sk; } -/* bpf_sk_lookup performs the core lookup for different types of sockets, +/* bpf_skc_lookup performs the core lookup for different types of sockets, * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE. * Returns the socket as an 'unsigned long' to simplify the casting in the * callers to satisfy BPF_CALL declarations. */ -static unsigned long -__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, - struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, - u64 flags) +static struct sock * +__bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, + u64 flags) { struct sock *sk = NULL; u8 family = AF_UNSPEC; @@ -5192,15 +5192,27 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, put_net(net); } - if (sk) - sk = sk_to_full_sk(sk); out: - return (unsigned long) sk; + return sk; } -static unsigned long -bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, - u8 proto, u64 netns_id, u64 flags) +static struct sock * +__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, + u64 flags) +{ + struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net, + ifindex, proto, netns_id, flags); + + if (sk) + sk = sk_to_full_sk(sk); + + return sk; +} + +static struct sock * +bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) { struct net *caller_net; int ifindex; @@ -5213,14 +5225,47 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, ifindex = 0; } - return __bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, - proto, netns_id, flags); + return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto, + netns_id, flags); } +static struct sock * +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) +{ + struct sock *sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id, + flags); + + if (sk) + sk = sk_to_full_sk(sk); + + return sk; +} + +BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return (unsigned long)bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP, + netns_id, flags); +} + +static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = 
{ + .func = bpf_skc_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { - return bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags); + return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, + netns_id, flags); } static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { @@ -5238,7 +5283,8 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { - return bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags); + return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, + netns_id, flags); } static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { @@ -5273,8 +5319,9 @@ BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, struct net *caller_net = dev_net(ctx->rxq->dev); int ifindex = ctx->rxq->dev->ifindex; - return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, - IPPROTO_UDP, netns_id, flags); + return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, + ifindex, IPPROTO_UDP, netns_id, + flags); } static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { @@ -5289,14 +5336,38 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { .arg5_type = ARG_ANYTHING, }; +BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net *caller_net = dev_net(ctx->rxq->dev); + int ifindex = ctx->rxq->dev->ifindex; + + return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net, + ifindex, IPPROTO_TCP, netns_id, + flags); +} + +static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { + .func = bpf_xdp_skc_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { struct net *caller_net = dev_net(ctx->rxq->dev); int ifindex = ctx->rxq->dev->ifindex; - return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, - IPPROTO_TCP, netns_id, flags); + return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, + ifindex, IPPROTO_TCP, netns_id, + flags); } static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { @@ -5311,11 +5382,31 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { .arg5_type = ARG_ANYTHING, }; +BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, + sock_net(ctx->sk), 0, + IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { + .func = bpf_sock_addr_skc_lookup_tcp, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + 
BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { - return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, - IPPROTO_TCP, netns_id, flags); + return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, + sock_net(ctx->sk), 0, IPPROTO_TCP, + netns_id, flags); } static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { @@ -5332,8 +5423,9 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { - return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, - IPPROTO_UDP, netns_id, flags); + return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, + sock_net(ctx->sk), 0, IPPROTO_UDP, + netns_id, flags); } static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { @@ -5586,6 +5678,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_addr_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_sock_addr_skc_lookup_tcp_proto; #endif /* CONFIG_INET */ default: return bpf_base_func_proto(func_id); @@ -5719,6 +5813,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_sock_proto; case BPF_FUNC_get_listener_sock: return &bpf_get_listener_sock_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_skc_lookup_tcp_proto; #endif default: return bpf_base_func_proto(func_id); @@ -5754,6 +5850,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_sk_lookup_tcp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_xdp_skc_lookup_tcp_proto; #endif default: return bpf_base_func_proto(func_id); @@ -5846,6 +5944,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_skc_lookup_tcp_proto; #endif default: return bpf_base_func_proto(func_id); From 399040847084a69f345e0a52fd62f04654e0fce3 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:54:02 +0800 Subject: [PATCH 05/29] bpf: add helper to check for a valid SYN cookie Using bpf_skc_lookup_tcp it's possible to ascertain whether a packet belongs to a known connection. However, there is one corner case: no sockets are created if SYN cookies are active. This means that the final ACK in the 3WHS is misclassified. Using the helper, we can look up the listening socket via bpf_skc_lookup_tcp and then check whether a packet is a valid SYN cookie ACK. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 18 +++++++++- net/core/filter.c | 72 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fab05317f5e7..3c04410137d9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2448,6 +2448,21 @@ union bpf_attr { * Pointer to **struct bpf_sock**, or **NULL** in case of failure. * For sockets with reuseport option, the **struct bpf_sock** * result is from **reuse->socks**\ [] using the hash of the tuple. 
+ * + * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * Description + * Check whether iph and th contain a valid SYN cookie ACK for + * the listening socket in sk. + * + * iph points to the start of the IPv4 or IPv6 header, while + * iph_len contains sizeof(struct iphdr) or sizeof(struct ipv6hdr). + * + * th points to the start of the TCP header, while th_len contains + * sizeof(struct tcphdr). + * + * Return + * 0 if iph and th are a valid SYN cookie ACK, or a negative error + * otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2549,7 +2564,8 @@ union bpf_attr { FN(tcp_sock), \ FN(skb_ecn_set_ce), \ FN(get_listener_sock), \ - FN(skc_lookup_tcp), + FN(skc_lookup_tcp), \ + FN(tcp_check_syncookie), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index b6d83ba97621..d2511fe46db3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5553,6 +5553,74 @@ static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; + +BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len, + struct tcphdr *, th, u32, th_len) +{ +#ifdef CONFIG_SYN_COOKIES + u32 cookie; + int ret; + + if (unlikely(th_len < sizeof(*th))) + return -EINVAL; + + /* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */ + if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) + return -EINVAL; + + if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies) + return -EINVAL; + + if (!th->ack || th->rst || th->syn) + return -ENOENT; + + if (tcp_synq_no_recent_overflow(sk)) + return -ENOENT; + + cookie = ntohl(th->ack_seq) - 1; + + switch (sk->sk_family) { + case AF_INET: + if (unlikely(iph_len < sizeof(struct iphdr))) + return -EINVAL; + + ret = __cookie_v4_check((struct iphdr *)iph, th, cookie); + break; + +#if IS_BUILTIN(CONFIG_IPV6) + case AF_INET6: + if (unlikely(iph_len < sizeof(struct ipv6hdr))) + return -EINVAL; + + ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie); + break; +#endif /* CONFIG_IPV6 */ + + default: + return -EPROTONOSUPPORT; + } + + if (ret > 0) + return 0; + + return -ENOENT; +#else + return -ENOTSUPP; +#endif +} + +static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { + .func = bpf_tcp_check_syncookie, + .gpl_only = true, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -5815,6 +5883,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_listener_sock_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_skc_lookup_tcp_proto; + case BPF_FUNC_tcp_check_syncookie: + return &bpf_tcp_check_syncookie_proto; #endif default: return bpf_base_func_proto(func_id); @@ -5852,6 +5922,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_xdp_skc_lookup_tcp_proto; + case BPF_FUNC_tcp_check_syncookie: + return &bpf_tcp_check_syncookie_proto; #endif default: return bpf_base_func_proto(func_id); From 253c8dde3cf60879d37b5fa47bde4a62c295c72b Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:54:03 +0800 Subject: [PATCH 06/29] tools:
update include/uapi/linux/bpf.h Pull definitions for bpf_skc_lookup_tcp and bpf_tcp_check_syncookie. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 36 +++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 929c8e537a14..3c04410137d9 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2431,6 +2431,38 @@ union bpf_attr { * Return * A **struct bpf_sock** pointer on success, or **NULL** in * case of failure. + * + * struct bpf_sock *bpf_skc_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * This function is identical to bpf_sk_lookup_tcp, except that it + * also returns timewait or request sockets. Use bpf_sk_fullsock + * or bpf_tcp_sock to access the full structure. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from **reuse->socks**\ [] using the hash of the tuple. + * + * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * Description + * Check whether iph and th contain a valid SYN cookie ACK for + * the listening socket in sk. + * + * iph points to the start of the IPv4 or IPv6 header, while + * iph_len contains sizeof(struct iphdr) or sizeof(struct ipv6hdr). + * + * th points to the start of the TCP header, while th_len contains + * sizeof(struct tcphdr). + * + * Return + * 0 if iph and th are a valid SYN cookie ACK, or a negative error + * otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2531,7 +2563,9 @@ union bpf_attr { FN(sk_fullsock), \ FN(tcp_sock), \ FN(skb_ecn_set_ce), \ - FN(get_listener_sock), + FN(get_listener_sock), \ + FN(skc_lookup_tcp), \ + FN(tcp_check_syncookie), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call From dbaf2877e9ad0ac77c463d1bf87b2eb7efc46160 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:54:04 +0800 Subject: [PATCH 07/29] selftests/bpf: allow specifying helper for BPF_SK_LOOKUP Make the BPF_SK_LOOKUP macro take a helper function, to ease writing tests for new helpers.
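Test insn arrays then name the helper at the call site, for example:

    BPF_SK_LOOKUP(sk_lookup_tcp),   /* previously spelled BPF_SK_LOOKUP, */
    BPF_SK_LOOKUP(skc_lookup_tcp),  /* new helpers plug in without a new macro */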
Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_verifier.c | 6 +- .../selftests/bpf/verifier/ref_tracking.c | 78 +++++++++---------- tools/testing/selftests/bpf/verifier/unpriv.c | 8 +- 3 files changed, 46 insertions(+), 46 deletions(-) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 477a9dcf9fff..19b5d03acc2a 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -198,7 +198,7 @@ static void bpf_fill_rand_ld_dw(struct bpf_test *self) } /* BPF_SK_LOOKUP contains 13 instructions, if you need to fix up maps */ -#define BPF_SK_LOOKUP \ +#define BPF_SK_LOOKUP(func) \ /* struct bpf_sock_tuple tuple = {} */ \ BPF_MOV64_IMM(BPF_REG_2, 0), \ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8), \ @@ -207,13 +207,13 @@ static void bpf_fill_rand_ld_dw(struct bpf_test *self) BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -32), \ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -40), \ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -48), \ - /* sk = sk_lookup_tcp(ctx, &tuple, sizeof tuple, 0, 0) */ \ + /* sk = func(ctx, &tuple, sizeof tuple, 0, 0) */ \ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), \ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -48), \ BPF_MOV64_IMM(BPF_REG_3, sizeof(struct bpf_sock_tuple)), \ BPF_MOV64_IMM(BPF_REG_4, 0), \ BPF_MOV64_IMM(BPF_REG_5, 0), \ - BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp) + BPF_EMIT_CALL(BPF_FUNC_ ## func) /* BPF_DIRECT_PKT_R2 contains 7 instructions, it initializes default return * value into 0 and does necessary preparation for direct packet access diff --git a/tools/testing/selftests/bpf/verifier/ref_tracking.c b/tools/testing/selftests/bpf/verifier/ref_tracking.c index 923f2110072d..a6905e5017dc 100644 --- a/tools/testing/selftests/bpf/verifier/ref_tracking.c +++ b/tools/testing/selftests/bpf/verifier/ref_tracking.c @@ -1,7 +1,7 @@ { "reference tracking: leak potential reference", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* leak reference */ BPF_EXIT_INSN(), }, @@ -12,7 +12,7 @@ { "reference tracking: leak potential reference on stack", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_0, 0), @@ -26,7 +26,7 @@ { "reference tracking: leak potential reference on stack 2", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_0, 0), @@ -41,7 +41,7 @@ { "reference tracking: zero potential reference", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_IMM(BPF_REG_0, 0), /* leak reference */ BPF_EXIT_INSN(), }, @@ -52,7 +52,7 @@ { "reference tracking: copy and zero potential references", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_MOV64_IMM(BPF_REG_7, 0), /* leak reference */ @@ -65,7 +65,7 @@ { "reference tracking: release reference without check", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), /* reference in r0 may be NULL */ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_2, 0), @@ -79,7 +79,7 @@ { "reference tracking: release reference", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), BPF_EMIT_CALL(BPF_FUNC_sk_release), @@ 
-91,7 +91,7 @@ { "reference tracking: release reference 2", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), BPF_EXIT_INSN(), @@ -104,7 +104,7 @@ { "reference tracking: release reference twice", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), @@ -120,7 +120,7 @@ { "reference tracking: release reference twice inside branch", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* goto end */ @@ -147,7 +147,7 @@ BPF_EXIT_INSN(), BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_2, offsetof(struct __sk_buff, mark)), - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 1), /* mark == 0? */ /* Leak reference in R0 */ BPF_EXIT_INSN(), @@ -175,7 +175,7 @@ BPF_EXIT_INSN(), BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_2, offsetof(struct __sk_buff, mark)), - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 4), /* mark == 0? */ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* sk NULL? */ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), @@ -193,7 +193,7 @@ { "reference tracking in call: free reference in subprog", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* unchecked reference */ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), BPF_MOV64_IMM(BPF_REG_0, 0), @@ -211,7 +211,7 @@ { "reference tracking in call: free reference in subprog and outside", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* unchecked reference */ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), @@ -241,7 +241,7 @@ /* subprog 1 */ BPF_MOV64_REG(BPF_REG_6, BPF_REG_4), - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), /* spill unchecked sk_ptr into stack of caller */ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0), BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), @@ -262,7 +262,7 @@ BPF_EXIT_INSN(), /* subprog 1 */ - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_EXIT_INSN(), /* return sk */ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, @@ -291,7 +291,7 @@ BPF_EXIT_INSN(), /* subprog 2 */ - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, @@ -324,7 +324,7 @@ BPF_EXIT_INSN(), /* subprog 2 */ - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, @@ -334,7 +334,7 @@ "reference tracking: allow LD_ABS", .insns = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), BPF_EMIT_CALL(BPF_FUNC_sk_release), @@ -350,7 +350,7 @@ "reference tracking: forbid LD_ABS while holding reference", .insns = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_LD_ABS(BPF_B, 0), BPF_LD_ABS(BPF_H, 0), BPF_LD_ABS(BPF_W, 0), @@ -367,7 +367,7 @@ "reference tracking: allow LD_IND", .insns = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), BPF_EMIT_CALL(BPF_FUNC_sk_release), @@ -384,7 +384,7 @@ "reference tracking: forbid LD_IND while holding reference", .insns = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), - BPF_SK_LOOKUP, + 
BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_7, 1), BPF_LD_IND(BPF_W, BPF_REG_7, -0x200000), @@ -402,7 +402,7 @@ "reference tracking: check reference or tail call", .insns = { BPF_MOV64_REG(BPF_REG_7, BPF_REG_1), - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), /* if (sk) bpf_sk_release() */ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 7), @@ -424,7 +424,7 @@ "reference tracking: release reference then tail call", .insns = { BPF_MOV64_REG(BPF_REG_7, BPF_REG_1), - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), /* if (sk) bpf_sk_release() */ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1), @@ -446,7 +446,7 @@ .insns = { BPF_MOV64_REG(BPF_REG_7, BPF_REG_1), /* Look up socket and store in REG_6 */ - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), /* bpf_tail_call() */ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_3, 2), @@ -470,7 +470,7 @@ .insns = { BPF_MOV64_REG(BPF_REG_7, BPF_REG_1), /* Look up socket and store in REG_6 */ - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* if (!sk) goto end */ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), @@ -492,7 +492,7 @@ { "reference tracking: mangle and release sock_or_null", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 5), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), @@ -506,7 +506,7 @@ { "reference tracking: mangle and release sock", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 5), @@ -520,7 +520,7 @@ { "reference tracking: access member", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_0, 4), @@ -534,7 +534,7 @@ { "reference tracking: write to member", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), @@ -553,7 +553,7 @@ { "reference tracking: invalid 64-bit access of member", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0), @@ -568,7 +568,7 @@ { "reference tracking: access after release", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_EMIT_CALL(BPF_FUNC_sk_release), @@ -608,7 +608,7 @@ { "reference tracking: use ptr from bpf_tcp_sock() after release", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), BPF_EXIT_INSN(), BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), @@ -631,7 +631,7 @@ { "reference tracking: use ptr from bpf_sk_fullsock() after release", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), BPF_EXIT_INSN(), BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), @@ -654,7 +654,7 @@ { "reference tracking: use ptr from bpf_sk_fullsock(tp) after release", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), BPF_EXIT_INSN(), BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), @@ -681,7 +681,7 @@ { "reference tracking: use sk after bpf_sk_release(tp)", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_JMP_IMM(BPF_JNE, 
BPF_REG_0, 0, 1), BPF_EXIT_INSN(), BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), @@ -703,7 +703,7 @@ { "reference tracking: use ptr from bpf_get_listener_sock() after bpf_sk_release(sk)", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), BPF_EXIT_INSN(), BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), @@ -725,7 +725,7 @@ { "reference tracking: bpf_sk_release(listen_sk)", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), BPF_EXIT_INSN(), BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), @@ -750,7 +750,7 @@ /* !bpf_sk_fullsock(sk) is checked but !bpf_tcp_sock(sk) is not checked */ "reference tracking: tp->snd_cwnd after bpf_sk_fullsock(sk) and bpf_tcp_sock(sk)", .insns = { - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), BPF_EXIT_INSN(), BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), diff --git a/tools/testing/selftests/bpf/verifier/unpriv.c b/tools/testing/selftests/bpf/verifier/unpriv.c index dbaf5be947b2..91bb77c24a2e 100644 --- a/tools/testing/selftests/bpf/verifier/unpriv.c +++ b/tools/testing/selftests/bpf/verifier/unpriv.c @@ -242,7 +242,7 @@ .insns = { BPF_MOV64_REG(BPF_REG_8, BPF_REG_1), /* struct bpf_sock *sock = bpf_sock_lookup(...); */ - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), /* u64 foo; */ /* void *target = &foo; */ @@ -276,7 +276,7 @@ .insns = { BPF_MOV64_REG(BPF_REG_8, BPF_REG_1), /* struct bpf_sock *sock = bpf_sock_lookup(...); */ - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), /* u64 foo; */ /* void *target = &foo; */ @@ -307,7 +307,7 @@ .insns = { BPF_MOV64_REG(BPF_REG_8, BPF_REG_1), /* struct bpf_sock *sock = bpf_sock_lookup(...); */ - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), /* u64 foo; */ /* void *target = &foo; */ @@ -339,7 +339,7 @@ .insns = { BPF_MOV64_REG(BPF_REG_8, BPF_REG_1), /* struct bpf_sock *sock = bpf_sock_lookup(...); */ - BPF_SK_LOOKUP, + BPF_SK_LOOKUP(sk_lookup_tcp), BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), /* u64 foo; */ /* void *target = &foo; */ From 5792d52df1e77110abf0d11b1131992a8c0c8d17 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:54:05 +0800 Subject: [PATCH 08/29] selftests/bpf: test references to sock_common Make sure that returning a struct sock_common * reference invokes the reference tracking machinery in the verifier. 
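In restricted C terms, the accepted pattern exercised by the new tests is (sketch; tuple setup omitted, netns_id and flags zero as in the BPF_SK_LOOKUP macro):

    struct bpf_sock *sk;

    sk = bpf_skc_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4), 0, 0);
    if (sk)
            bpf_sk_release(sk);     /* required even for a sock_common reference */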
Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/verifier/ref_tracking.c | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/ref_tracking.c b/tools/testing/selftests/bpf/verifier/ref_tracking.c index a6905e5017dc..ebcbf154c460 100644 --- a/tools/testing/selftests/bpf/verifier/ref_tracking.c +++ b/tools/testing/selftests/bpf/verifier/ref_tracking.c @@ -9,6 +9,17 @@ .errstr = "Unreleased reference", .result = REJECT, }, +{ + "reference tracking: leak potential reference to sock_common", + .insns = { + BPF_SK_LOOKUP(skc_lookup_tcp), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* leak reference */ + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .errstr = "Unreleased reference", + .result = REJECT, +}, { "reference tracking: leak potential reference on stack", .insns = { @@ -49,6 +60,17 @@ .errstr = "Unreleased reference", .result = REJECT, }, +{ + "reference tracking: zero potential reference to sock_common", + .insns = { + BPF_SK_LOOKUP(skc_lookup_tcp), + BPF_MOV64_IMM(BPF_REG_0, 0), /* leak reference */ + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .errstr = "Unreleased reference", + .result = REJECT, +}, { "reference tracking: copy and zero potential references", .insns = { @@ -76,6 +98,20 @@ .errstr = "type=sock_or_null expected=sock", .result = REJECT, }, +{ + "reference tracking: release reference to sock_common without check", + .insns = { + BPF_SK_LOOKUP(skc_lookup_tcp), + /* reference in r0 may be NULL */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_EMIT_CALL(BPF_FUNC_sk_release), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .errstr = "type=sock_common_or_null expected=sock", + .result = REJECT, +}, { "reference tracking: release reference", .insns = { @@ -88,6 +124,18 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = ACCEPT, }, +{ + "reference tracking: release reference to sock_common", + .insns = { + BPF_SK_LOOKUP(skc_lookup_tcp), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_EMIT_CALL(BPF_FUNC_sk_release), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, +}, { "reference tracking: release reference 2", .insns = { From bafc0ba8261e36e36b0b1e851749fd3712a2a6f4 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:54:06 +0800 Subject: [PATCH 09/29] selftests/bpf: add tests for bpf_tcp_check_syncookie and bpf_skc_lookup_tcp Add tests which verify that the new helpers work for both IPv4 and IPv6, by forcing SYN cookies to always on. Use a new network namespace to avoid clobbering the global SYN cookie settings. 
Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/Makefile | 5 +- tools/testing/selftests/bpf/bpf_helpers.h | 8 + .../bpf/progs/test_tcp_check_syncookie_kern.c | 129 +++++++++++ .../selftests/bpf/test_tcp_check_syncookie.sh | 81 +++++++ .../bpf/test_tcp_check_syncookie_user.c | 212 ++++++++++++++++++ 6 files changed, 434 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/test_tcp_check_syncookie_kern.c create mode 100755 tools/testing/selftests/bpf/test_tcp_check_syncookie.sh create mode 100644 tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 3b74d23fffab..41e8a689aa77 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -30,4 +30,5 @@ test_netcnt test_section_names test_tcpnotify_user test_libbpf +test_tcp_check_syncookie_user alu32 diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 2aed37ea61a4..25d3939eb840 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -51,7 +51,8 @@ TEST_PROGS := test_kmod.sh \ test_skb_cgroup_id.sh \ test_flow_dissector.sh \ test_xdp_vlan.sh \ - test_lwt_ip_encap.sh + test_lwt_ip_encap.sh \ + test_tcp_check_syncookie.sh TEST_PROGS_EXTENDED := with_addr.sh \ with_tunnels.sh \ @@ -60,7 +61,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ # Compile but not part of 'make run_tests' TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user \ - flow_dissector_load test_flow_dissector + flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user include ../lib.mk diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 41f8a4b676a4..97d140961438 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -159,6 +159,11 @@ static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx, int size, unsigned long long netns_id, unsigned long long flags) = (void *) BPF_FUNC_sk_lookup_tcp; +static struct bpf_sock *(*bpf_skc_lookup_tcp)(void *ctx, + struct bpf_sock_tuple *tuple, + int size, unsigned long long netns_id, + unsigned long long flags) = + (void *) BPF_FUNC_skc_lookup_tcp; static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx, struct bpf_sock_tuple *tuple, int size, unsigned long long netns_id, @@ -184,6 +189,9 @@ static struct bpf_sock *(*bpf_get_listener_sock)(struct bpf_sock *sk) = (void *) BPF_FUNC_get_listener_sock; static int (*bpf_skb_ecn_set_ce)(void *ctx) = (void *) BPF_FUNC_skb_ecn_set_ce; +static int (*bpf_tcp_check_syncookie)(struct bpf_sock *sk, + void *ip, int ip_len, void *tcp, int tcp_len) = + (void *) BPF_FUNC_tcp_check_syncookie; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions diff --git a/tools/testing/selftests/bpf/progs/test_tcp_check_syncookie_kern.c b/tools/testing/selftests/bpf/progs/test_tcp_check_syncookie_kern.c new file mode 100644 index 000000000000..1ab095bcacd8 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_tcp_check_syncookie_kern.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Facebook +// Copyright (c) 2019 Cloudflare + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bpf_helpers.h" +#include "bpf_endian.h" + 
+struct bpf_map_def SEC("maps") results = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(__u64), + .max_entries = 1, +}; + +static __always_inline void check_syncookie(void *ctx, void *data, + void *data_end) +{ + struct bpf_sock_tuple tup; + struct bpf_sock *sk; + struct ethhdr *ethh; + struct iphdr *ipv4h; + struct ipv6hdr *ipv6h; + struct tcphdr *tcph; + int ret; + __u32 key = 0; + __u64 value = 1; + + ethh = data; + if (ethh + 1 > data_end) + return; + + switch (bpf_ntohs(ethh->h_proto)) { + case ETH_P_IP: + ipv4h = data + sizeof(struct ethhdr); + if (ipv4h + 1 > data_end) + return; + + if (ipv4h->ihl != 5) + return; + + tcph = data + sizeof(struct ethhdr) + sizeof(struct iphdr); + if (tcph + 1 > data_end) + return; + + tup.ipv4.saddr = ipv4h->saddr; + tup.ipv4.daddr = ipv4h->daddr; + tup.ipv4.sport = tcph->source; + tup.ipv4.dport = tcph->dest; + + sk = bpf_skc_lookup_tcp(ctx, &tup, sizeof(tup.ipv4), + BPF_F_CURRENT_NETNS, 0); + if (!sk) + return; + + if (sk->state != BPF_TCP_LISTEN) + goto release; + + ret = bpf_tcp_check_syncookie(sk, ipv4h, sizeof(*ipv4h), + tcph, sizeof(*tcph)); + break; + + case ETH_P_IPV6: + ipv6h = data + sizeof(struct ethhdr); + if (ipv6h + 1 > data_end) + return; + + if (ipv6h->nexthdr != IPPROTO_TCP) + return; + + tcph = data + sizeof(struct ethhdr) + sizeof(struct ipv6hdr); + if (tcph + 1 > data_end) + return; + + memcpy(tup.ipv6.saddr, &ipv6h->saddr, sizeof(tup.ipv6.saddr)); + memcpy(tup.ipv6.daddr, &ipv6h->daddr, sizeof(tup.ipv6.daddr)); + tup.ipv6.sport = tcph->source; + tup.ipv6.dport = tcph->dest; + + sk = bpf_skc_lookup_tcp(ctx, &tup, sizeof(tup.ipv6), + BPF_F_CURRENT_NETNS, 0); + if (!sk) + return; + + if (sk->state != BPF_TCP_LISTEN) + goto release; + + ret = bpf_tcp_check_syncookie(sk, ipv6h, sizeof(*ipv6h), + tcph, sizeof(*tcph)); + break; + + default: + return; + } + + if (ret == 0) + bpf_map_update_elem(&results, &key, &value, 0); + +release: + bpf_sk_release(sk); +} + +SEC("clsact/check_syncookie") +int check_syncookie_clsact(struct __sk_buff *skb) +{ + check_syncookie(skb, (void *)(long)skb->data, + (void *)(long)skb->data_end); + return TC_ACT_OK; +} + +SEC("xdp/check_syncookie") +int check_syncookie_xdp(struct xdp_md *ctx) +{ + check_syncookie(ctx, (void *)(long)ctx->data, + (void *)(long)ctx->data_end); + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_tcp_check_syncookie.sh b/tools/testing/selftests/bpf/test_tcp_check_syncookie.sh new file mode 100755 index 000000000000..d48e51716d19 --- /dev/null +++ b/tools/testing/selftests/bpf/test_tcp_check_syncookie.sh @@ -0,0 +1,81 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2018 Facebook +# Copyright (c) 2019 Cloudflare + +set -eu + +wait_for_ip() +{ + local _i + printf "Wait for IP %s to become available " "$1" + for _i in $(seq ${MAX_PING_TRIES}); do + printf "." + if ns1_exec ping -c 1 -W 1 "$1" >/dev/null 2>&1; then + echo " OK" + return + fi + sleep 1 + done + echo 1>&2 "ERROR: Timeout waiting for test IP to become available." + exit 1 +} + +get_prog_id() +{ + awk '/ id / {sub(/.* id /, "", $0); print($1)}' +} + +ns1_exec() +{ + ip netns exec ns1 "$@" +} + +setup() +{ + ip netns add ns1 + ns1_exec ip link set lo up + + ns1_exec sysctl -w net.ipv4.tcp_syncookies=2 + + wait_for_ip 127.0.0.1 + wait_for_ip ::1 +} + +cleanup() +{ + ip netns del ns1 2>/dev/null || : +} + +main() +{ + trap cleanup EXIT 2 3 6 15 + setup + + printf "Testing clsact..." 
+ ns1_exec tc qdisc add dev "${TEST_IF}" clsact + ns1_exec tc filter add dev "${TEST_IF}" ingress \ + bpf obj "${BPF_PROG_OBJ}" sec "${CLSACT_SECTION}" da + + BPF_PROG_ID=$(ns1_exec tc filter show dev "${TEST_IF}" ingress | \ + get_prog_id) + ns1_exec "${PROG}" "${BPF_PROG_ID}" + ns1_exec tc qdisc del dev "${TEST_IF}" clsact + + printf "Testing XDP..." + ns1_exec ip link set "${TEST_IF}" xdp \ + object "${BPF_PROG_OBJ}" section "${XDP_SECTION}" + BPF_PROG_ID=$(ns1_exec ip link show "${TEST_IF}" | get_prog_id) + ns1_exec "${PROG}" "${BPF_PROG_ID}" +} + +DIR=$(dirname $0) +TEST_IF=lo +MAX_PING_TRIES=5 +BPF_PROG_OBJ="${DIR}/test_tcp_check_syncookie_kern.o" +CLSACT_SECTION="clsact/check_syncookie" +XDP_SECTION="xdp/check_syncookie" +BPF_PROG_ID=0 +PROG="${DIR}/test_tcp_check_syncookie_user" + +main diff --git a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c new file mode 100644 index 000000000000..87829c86c746 --- /dev/null +++ b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c @@ -0,0 +1,212 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Facebook +// Copyright (c) 2019 Cloudflare + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include "bpf_rlimit.h" +#include "cgroup_helpers.h" + +static int start_server(const struct sockaddr *addr, socklen_t len) +{ + int fd; + + fd = socket(addr->sa_family, SOCK_STREAM, 0); + if (fd == -1) { + log_err("Failed to create server socket"); + goto out; + } + + if (bind(fd, addr, len) == -1) { + log_err("Failed to bind server socket"); + goto close_out; + } + + if (listen(fd, 128) == -1) { + log_err("Failed to listen on server socket"); + goto close_out; + } + + goto out; + +close_out: + close(fd); + fd = -1; +out: + return fd; +} + +static int connect_to_server(int server_fd) +{ + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + int fd = -1; + + if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) { + log_err("Failed to get server addr"); + goto out; + } + + fd = socket(addr.ss_family, SOCK_STREAM, 0); + if (fd == -1) { + log_err("Failed to create client socket"); + goto out; + } + + if (connect(fd, (const struct sockaddr *)&addr, len) == -1) { + log_err("Fail to connect to server"); + goto close_out; + } + + goto out; + +close_out: + close(fd); + fd = -1; +out: + return fd; +} + +static int get_map_fd_by_prog_id(int prog_id) +{ + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + __u32 map_ids[1]; + int prog_fd = -1; + int map_fd = -1; + + prog_fd = bpf_prog_get_fd_by_id(prog_id); + if (prog_fd < 0) { + log_err("Failed to get fd by prog id %d", prog_id); + goto err; + } + + info.nr_map_ids = 1; + info.map_ids = (__u64)(unsigned long)map_ids; + + if (bpf_obj_get_info_by_fd(prog_fd, &info, &info_len)) { + log_err("Failed to get info by prog fd %d", prog_fd); + goto err; + } + + if (!info.nr_map_ids) { + log_err("No maps found for prog fd %d", prog_fd); + goto err; + } + + map_fd = bpf_map_get_fd_by_id(map_ids[0]); + if (map_fd < 0) + log_err("Failed to get fd by map id %d", map_ids[0]); +err: + if (prog_fd >= 0) + close(prog_fd); + return map_fd; +} + +static int run_test(int server_fd, int results_fd) +{ + int client = -1, srv_client = -1; + int ret = 0; + __u32 key = 0; + __u64 value = 0; + + if (bpf_map_update_elem(results_fd, &key, &value, 0) < 0) { + log_err("Can't clear results"); + goto err; + } + + client = connect_to_server(server_fd); + if (client == -1) + goto 
err; + + srv_client = accept(server_fd, NULL, 0); + if (srv_client == -1) { + log_err("Can't accept connection"); + goto err; + } + + if (bpf_map_lookup_elem(results_fd, &key, &value) < 0) { + log_err("Can't lookup result"); + goto err; + } + + if (value != 1) { + log_err("Didn't match syncookie: %llu", value); + goto err; + } + + goto out; + +err: + ret = 1; +out: + close(client); + close(srv_client); + return ret; +} + +int main(int argc, char **argv) +{ + struct sockaddr_in addr4; + struct sockaddr_in6 addr6; + int server = -1; + int server_v6 = -1; + int results = -1; + int err = 0; + + if (argc < 2) { + fprintf(stderr, "Usage: %s prog_id\n", argv[0]); + exit(1); + } + + results = get_map_fd_by_prog_id(atoi(argv[1])); + if (results < 0) { + log_err("Can't get map"); + goto err; + } + + memset(&addr4, 0, sizeof(addr4)); + addr4.sin_family = AF_INET; + addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + addr4.sin_port = 0; + + memset(&addr6, 0, sizeof(addr6)); + addr6.sin6_family = AF_INET6; + addr6.sin6_addr = in6addr_loopback; + addr6.sin6_port = 0; + + server = start_server((const struct sockaddr *)&addr4, sizeof(addr4)); + if (server == -1) + goto err; + + server_v6 = start_server((const struct sockaddr *)&addr6, + sizeof(addr6)); + if (server_v6 == -1) + goto err; + + if (run_test(server, results)) + goto err; + + if (run_test(server_v6, results)) + goto err; + + printf("ok\n"); + goto out; +err: + err = 1; +out: + close(server); + close(server_v6); + close(results); + return err; +} From ab99e7a8f7fe190f8669e32d1b8732d18f42e249 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Wed, 20 Mar 2019 13:17:47 +0900 Subject: [PATCH 10/29] samples: bpf: add xdp_sample_pkts to .gitignore This commit adds xdp_sample_pkts to .gitignore which is currently ommited from the ignore file. Signed-off-by: Daniel T. Lee Signed-off-by: Alexei Starovoitov --- samples/bpf/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore index dbb817dbacfc..59e40998e249 100644 --- a/samples/bpf/.gitignore +++ b/samples/bpf/.gitignore @@ -44,5 +44,6 @@ xdp_redirect_cpu xdp_redirect_map xdp_router_ipv4 xdp_rxq_info +xdp_sample_pkts xdp_tx_iptunnel xdpsock From f6827526279d75f0b1c1605b1bf560024bd7696f Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Fri, 15 Mar 2019 21:04:14 +0100 Subject: [PATCH 11/29] selftests: bpf: modify urandom_read and link it non-statically After some experiences I found that urandom_read does not need to be linked statically. When the 'read' syscall call is moved to separate non-inlined function then bpf_get_stackid() is able to find the executable in stack trace and extract its build_id from it. 
Signed-off-by: Ivan Vecera Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 2 +- tools/testing/selftests/bpf/urandom_read.c | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 25d3939eb840..edd59707cb1f 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -70,7 +70,7 @@ TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read all: $(TEST_CUSTOM_PROGS) $(OUTPUT)/urandom_read: $(OUTPUT)/%: %.c - $(CC) -o $@ -static $< -Wl,--build-id + $(CC) -o $@ $< -Wl,--build-id BPFOBJ := $(OUTPUT)/libbpf.a diff --git a/tools/testing/selftests/bpf/urandom_read.c b/tools/testing/selftests/bpf/urandom_read.c index 9de8b7cb4e6d..db781052758d 100644 --- a/tools/testing/selftests/bpf/urandom_read.c +++ b/tools/testing/selftests/bpf/urandom_read.c @@ -7,11 +7,19 @@ #define BUF_SIZE 256 +static __attribute__((noinline)) +void urandom_read(int fd, int count) +{ + char buf[BUF_SIZE]; + int i; + + for (i = 0; i < count; ++i) + read(fd, buf, BUF_SIZE); +} + int main(int argc, char *argv[]) { int fd = open("/dev/urandom", O_RDONLY); - int i; - char buf[BUF_SIZE]; int count = 4; if (fd < 0) @@ -20,8 +28,7 @@ int main(int argc, char *argv[]) if (argc == 2) count = atoi(argv[1]); - for (i = 0; i < count; ++i) - read(fd, buf, BUF_SIZE); + urandom_read(fd, count); close(fd); return 0; From 908adce6465394ea4a09c144507a40848e1d7db5 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:48 -0400 Subject: [PATCH 12/29] bpf: in bpf_skb_adjust_room avoid copy in tx fast path bpf_skb_adjust_room calls skb_cow on grow. This expensive operation can be avoided in the fast path when the only other clone has released the header. This is the common case for TCP, where one headerless clone is kept on the retransmit queue. It is safe to do so even when touching the gso fields in skb_shinfo. Regular tunnel encap with iptunnel_handle_offloads takes the same optimization. The tcp stack unclones in the unlikely case that it accesses these fields through headerless clone packets on the retransmit queue (see __tcp_retransmit_skb). If any other clones are present, e.g., from packet sockets, skb_cow_head returns the same value as skb_cow(). Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index d2511fe46db3..d21e1acdde29 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2971,7 +2971,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) return -ENOTSUPP; - ret = skb_cow(skb, len_diff); + ret = skb_cow_head(skb, len_diff); if (unlikely(ret < 0)) return ret; From 98cdabcd0798bd9991821493120b928ed0dfab73 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:49 -0400 Subject: [PATCH 13/29] selftests/bpf: bpf tunnel encap test Validate basic tunnel encapsulation using ipip. Set up two namespaces connected by veth. Connect a client and server. Do this with and without bpf encap.
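The encap program's two key operations are growing room between the MAC and network headers, then writing the prepared outer header into it (excerpted from test_tc_tunnel.c below):

    /* add room between mac and network header */
    if (bpf_skb_adjust_room(skb, sizeof(iph_outer), BPF_ADJ_ROOM_NET, 0))
            return TC_ACT_SHOT;

    /* store new outer network header */
    if (bpf_skb_store_bytes(skb, ETH_HLEN, &iph_outer, sizeof(iph_outer),
                            BPF_F_INVALIDATE_HASH) < 0)
            return TC_ACT_SHOT;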
Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 3 +- .../selftests/bpf/progs/test_tc_tunnel.c | 83 +++++++++++++++++++ tools/testing/selftests/bpf/test_tc_tunnel.sh | 75 +++++++++++++++++ 3 files changed, 160 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/test_tc_tunnel.c create mode 100755 tools/testing/selftests/bpf/test_tc_tunnel.sh diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index edd59707cb1f..cdcc54ddf4b9 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -52,7 +52,8 @@ TEST_PROGS := test_kmod.sh \ test_flow_dissector.sh \ test_xdp_vlan.sh \ test_lwt_ip_encap.sh \ - test_tcp_check_syncookie.sh + test_tcp_check_syncookie.sh \ + test_tc_tunnel.sh TEST_PROGS_EXTENDED := with_addr.sh \ with_tunnels.sh \ diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c new file mode 100644 index 000000000000..8223e4347be8 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* In-place tunneling */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bpf_endian.h" +#include "bpf_helpers.h" + +static const int cfg_port = 8000; + +static __always_inline void set_ipv4_csum(struct iphdr *iph) +{ + __u16 *iph16 = (__u16 *)iph; + __u32 csum; + int i; + + iph->check = 0; + +#pragma clang loop unroll(full) + for (i = 0, csum = 0; i < sizeof(*iph) >> 1; i++) + csum += *iph16++; + + iph->check = ~((csum & 0xffff) + (csum >> 16)); +} + +SEC("encap") +int encap_f(struct __sk_buff *skb) +{ + struct iphdr iph_outer, iph_inner; + struct tcphdr tcph; + + if (skb->protocol != __bpf_constant_htons(ETH_P_IP)) + return TC_ACT_OK; + + if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner, + sizeof(iph_inner)) < 0) + return TC_ACT_OK; + + /* filter only packets we want */ + if (iph_inner.ihl != 5 || iph_inner.protocol != IPPROTO_TCP) + return TC_ACT_OK; + + if (bpf_skb_load_bytes(skb, ETH_HLEN + sizeof(iph_inner), + &tcph, sizeof(tcph)) < 0) + return TC_ACT_OK; + + if (tcph.dest != __bpf_constant_htons(cfg_port)) + return TC_ACT_OK; + + /* add room between mac and network header */ + if (bpf_skb_adjust_room(skb, sizeof(iph_outer), BPF_ADJ_ROOM_NET, 0)) + return TC_ACT_SHOT; + + /* prepare new outer network header */ + iph_outer = iph_inner; + iph_outer.protocol = IPPROTO_IPIP; + iph_outer.tot_len = bpf_htons(sizeof(iph_outer) + + bpf_htons(iph_outer.tot_len)); + set_ipv4_csum(&iph_outer); + + /* store new outer network header */ + if (bpf_skb_store_bytes(skb, ETH_HLEN, &iph_outer, sizeof(iph_outer), + BPF_F_INVALIDATE_HASH) < 0) + return TC_ACT_SHOT; + + /* bpf_skb_adjust_room has moved header to start of room: restore */ + if (bpf_skb_store_bytes(skb, ETH_HLEN + sizeof(iph_outer), + &iph_inner, sizeof(iph_inner), + BPF_F_INVALIDATE_HASH) < 0) + return TC_ACT_SHOT; + + return TC_ACT_OK; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh new file mode 100755 index 000000000000..6ebb288a3afc --- /dev/null +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -0,0 +1,75 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# In-place tunneling + +# must match the port that the bpf program filters on +readonly port=8000 + +readonly ns_prefix="ns-$$-" +readonly 
ns1="${ns_prefix}1" +readonly ns2="${ns_prefix}2" + +readonly ns1_v4=192.168.1.1 +readonly ns2_v4=192.168.1.2 + +setup() { + ip netns add "${ns1}" + ip netns add "${ns2}" + + ip link add dev veth1 mtu 1500 netns "${ns1}" type veth \ + peer name veth2 mtu 1500 netns "${ns2}" + + ip -netns "${ns1}" link set veth1 up + ip -netns "${ns2}" link set veth2 up + + ip -netns "${ns1}" -4 addr add "${ns1_v4}/24" dev veth1 + ip -netns "${ns2}" -4 addr add "${ns2_v4}/24" dev veth2 + + sleep 1 +} + +cleanup() { + ip netns del "${ns2}" + ip netns del "${ns1}" +} + +server_listen() { + ip netns exec "${ns2}" nc -l -p "${port}" & + sleep 0.2 +} + +client_connect() { + ip netns exec "${ns1}" nc -z -w 1 "${ns2_v4}" "${port}" + echo $? +} + +set -e +trap cleanup EXIT + +setup + +# basic communication works +echo "test basic connectivity" +server_listen +client_connect + +# clientside, insert bpf program to encap all TCP to port ${port} +# client can no longer connect +ip netns exec "${ns1}" tc qdisc add dev veth1 clsact +ip netns exec "${ns1}" tc filter add dev veth1 egress \ + bpf direct-action object-file ./test_tc_tunnel.o section encap +echo "test bpf encap without decap (expect failure)" +server_listen +! client_connect + +# serverside, insert decap module +# server is still running +# client can connect again +ip netns exec "${ns2}" ip link add dev testtun0 type ipip \ + remote "${ns1_v4}" local "${ns2_v4}" +ip netns exec "${ns2}" ip link set dev testtun0 up +echo "test bpf encap with tunnel device decap" +client_connect + +echo OK From ccd34cd3577dd6e244269bb8ccfab228360aa53d Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:50 -0400 Subject: [PATCH 14/29] selftests/bpf: expand bpf tunnel test with decap The bpf tunnel test encapsulates using bpf, then decapsulates using a standard tunnel device to verify correctness. Once encap is verified, also test decap, by replacing the tunnel device on decap with another bpf program. 
Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/test_tc_tunnel.c | 31 +++++++++++++++++++ tools/testing/selftests/bpf/test_tc_tunnel.sh | 9 ++++++ 2 files changed, 40 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 8223e4347be8..25db148635ab 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -80,4 +80,35 @@ int encap_f(struct __sk_buff *skb) return TC_ACT_OK; } +SEC("decap") +int decap_f(struct __sk_buff *skb) +{ + struct iphdr iph_outer, iph_inner; + + if (skb->protocol != __bpf_constant_htons(ETH_P_IP)) + return TC_ACT_OK; + + if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_outer, + sizeof(iph_outer)) < 0) + return TC_ACT_OK; + + if (iph_outer.ihl != 5 || iph_outer.protocol != IPPROTO_IPIP) + return TC_ACT_OK; + + if (bpf_skb_load_bytes(skb, ETH_HLEN + sizeof(iph_outer), + &iph_inner, sizeof(iph_inner)) < 0) + return TC_ACT_OK; + + if (bpf_skb_adjust_room(skb, -(int)sizeof(iph_outer), + BPF_ADJ_ROOM_NET, 0)) + return TC_ACT_SHOT; + + /* bpf_skb_adjust_room has moved outer over inner header: restore */ + if (bpf_skb_store_bytes(skb, ETH_HLEN, &iph_inner, sizeof(iph_inner), + BPF_F_INVALIDATE_HASH) < 0) + return TC_ACT_SHOT; + + return TC_ACT_OK; +} + char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index 6ebb288a3afc..91151d91e5a1 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -72,4 +72,13 @@ ip netns exec "${ns2}" ip link set dev testtun0 up echo "test bpf encap with tunnel device decap" client_connect +# serverside, use BPF for decap +ip netns exec "${ns2}" ip link del dev testtun0 +ip netns exec "${ns2}" tc qdisc add dev veth2 clsact +ip netns exec "${ns2}" tc filter add dev veth2 ingress \ + bpf direct-action object-file ./test_tc_tunnel.o section decap +server_listen +echo "test bpf encap with bpf decap" +client_connect + echo OK From ef81bd054942e2bd8289c91a3528e6fc0ca26c1c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:51 -0400 Subject: [PATCH 15/29] selftests/bpf: expand bpf tunnel test to ipv6 The test only uses ipv4 so far; expand it to ipv6. This is mostly boilerplate, a near copy of the ipv4 path.
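A detail to keep in mind when mirroring the ipv4 path: IPv4's tot_len counts the IPv4 header itself, while IPv6's payload_len does not count the IPv6 header, so for ip6-in-ip6 the whole inner packet (header included) becomes the outer payload. Both cases reduce to adding the pushed header size, as in this hedged sketch (variable names are illustrative):

/* IPv4: outer tot_len = inner tot_len (full inner packet)
 *                       + sizeof(outer iphdr)
 * IPv6: outer payload_len = inner ipv6hdr + inner payload_len
 *                         = inner payload_len + sizeof(outer ipv6hdr)
 */
outer4.tot_len = bpf_htons(bpf_ntohs(inner4.tot_len) +
			   sizeof(struct iphdr));
outer6.payload_len = bpf_htons(bpf_ntohs(inner6.payload_len) +
			       sizeof(struct ipv6hdr));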
Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/config | 2 + .../selftests/bpf/progs/test_tc_tunnel.c | 118 +++++++++++++++--- tools/testing/selftests/bpf/test_tc_tunnel.sh | 53 +++++++- 3 files changed, 150 insertions(+), 23 deletions(-) diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 37f947ec44ed..a42f4fc4dc11 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -23,3 +23,5 @@ CONFIG_LWTUNNEL=y CONFIG_BPF_STREAM_PARSER=y CONFIG_XDP_SOCKETS=y CONFIG_FTRACE_SYSCALLS=y +CONFIG_IPV6_TUNNEL=y +CONFIG_IPV6_GRE=y diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 25db148635ab..591f540ce513 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -31,15 +32,11 @@ static __always_inline void set_ipv4_csum(struct iphdr *iph) iph->check = ~((csum & 0xffff) + (csum >> 16)); } -SEC("encap") -int encap_f(struct __sk_buff *skb) +static int encap_ipv4(struct __sk_buff *skb) { struct iphdr iph_outer, iph_inner; struct tcphdr tcph; - if (skb->protocol != __bpf_constant_htons(ETH_P_IP)) - return TC_ACT_OK; - if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner, sizeof(iph_inner)) < 0) return TC_ACT_OK; @@ -80,14 +77,82 @@ int encap_f(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("decap") -int decap_f(struct __sk_buff *skb) +static int encap_ipv6(struct __sk_buff *skb) { - struct iphdr iph_outer, iph_inner; + struct ipv6hdr iph_outer, iph_inner; + struct tcphdr tcph; - if (skb->protocol != __bpf_constant_htons(ETH_P_IP)) + if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner, + sizeof(iph_inner)) < 0) return TC_ACT_OK; + /* filter only packets we want */ + if (bpf_skb_load_bytes(skb, ETH_HLEN + sizeof(iph_inner), + &tcph, sizeof(tcph)) < 0) + return TC_ACT_OK; + + if (tcph.dest != __bpf_constant_htons(cfg_port)) + return TC_ACT_OK; + + /* add room between mac and network header */ + if (bpf_skb_adjust_room(skb, sizeof(iph_outer), BPF_ADJ_ROOM_NET, 0)) + return TC_ACT_SHOT; + + /* prepare new outer network header */ + iph_outer = iph_inner; + iph_outer.nexthdr = IPPROTO_IPV6; + iph_outer.payload_len = bpf_htons(sizeof(iph_outer) + + bpf_ntohs(iph_outer.payload_len)); + + /* store new outer network header */ + if (bpf_skb_store_bytes(skb, ETH_HLEN, &iph_outer, sizeof(iph_outer), + BPF_F_INVALIDATE_HASH) < 0) + return TC_ACT_SHOT; + + /* bpf_skb_adjust_room has moved header to start of room: restore */ + if (bpf_skb_store_bytes(skb, ETH_HLEN + sizeof(iph_outer), + &iph_inner, sizeof(iph_inner), + BPF_F_INVALIDATE_HASH) < 0) + return TC_ACT_SHOT; + + return TC_ACT_OK; +} + +SEC("encap") +int encap_f(struct __sk_buff *skb) +{ + switch (skb->protocol) { + case __bpf_constant_htons(ETH_P_IP): + return encap_ipv4(skb); + case __bpf_constant_htons(ETH_P_IPV6): + return encap_ipv6(skb); + default: + /* does not match, ignore */ + return TC_ACT_OK; + } +} + +static int decap_internal(struct __sk_buff *skb, int off, int len) +{ + char buf[sizeof(struct ipv6hdr)]; + + if (bpf_skb_load_bytes(skb, off + len, &buf, len) < 0) + return TC_ACT_OK; + + if (bpf_skb_adjust_room(skb, -len, BPF_ADJ_ROOM_NET, 0)) + return TC_ACT_SHOT; + + /* bpf_skb_adjust_room has moved outer over inner header: restore */ + if (bpf_skb_store_bytes(skb, off, buf, len, BPF_F_INVALIDATE_HASH) < 0) + return 
TC_ACT_SHOT; + + return TC_ACT_OK; +} + +static int decap_ipv4(struct __sk_buff *skb) +{ + struct iphdr iph_outer; + if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_outer, sizeof(iph_outer)) < 0) return TC_ACT_OK; @@ -95,20 +160,35 @@ int decap_f(struct __sk_buff *skb) if (iph_outer.ihl != 5 || iph_outer.protocol != IPPROTO_IPIP) return TC_ACT_OK; - if (bpf_skb_load_bytes(skb, ETH_HLEN + sizeof(iph_outer), - &iph_inner, sizeof(iph_inner)) < 0) + return decap_internal(skb, ETH_HLEN, sizeof(iph_outer)); +} + +static int decap_ipv6(struct __sk_buff *skb) +{ + struct ipv6hdr iph_outer; + + if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_outer, + sizeof(iph_outer)) < 0) return TC_ACT_OK; - if (bpf_skb_adjust_room(skb, -(int)sizeof(iph_outer), - BPF_ADJ_ROOM_NET, 0)) - return TC_ACT_SHOT; + if (iph_outer.nexthdr != IPPROTO_IPV6) + return TC_ACT_OK; - /* bpf_skb_adjust_room has moved outer over inner header: restore */ - if (bpf_skb_store_bytes(skb, ETH_HLEN, &iph_inner, sizeof(iph_inner), - BPF_F_INVALIDATE_HASH) < 0) - return TC_ACT_SHOT; + return decap_internal(skb, ETH_HLEN, sizeof(iph_outer)); +} - return TC_ACT_OK; +SEC("decap") +int decap_f(struct __sk_buff *skb) +{ + switch (skb->protocol) { + case __bpf_constant_htons(ETH_P_IP): + return decap_ipv4(skb); + case __bpf_constant_htons(ETH_P_IPV6): + return decap_ipv6(skb); + default: + /* does not match, ignore */ + return TC_ACT_OK; + } } char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index 91151d91e5a1..7b1758f3006b 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -12,6 +12,9 @@ readonly ns2="${ns_prefix}2" readonly ns1_v4=192.168.1.1 readonly ns2_v4=192.168.1.2 +readonly ns1_v6=fd::1 +readonly ns2_v6=fd::2 + setup() { ip netns add "${ns1}" @@ -25,6 +28,8 @@ setup() { ip -netns "${ns1}" -4 addr add "${ns1_v4}/24" dev veth1 ip -netns "${ns2}" -4 addr add "${ns2_v4}/24" dev veth2 + ip -netns "${ns1}" -6 addr add "${ns1_v6}/64" dev veth1 nodad + ip -netns "${ns2}" -6 addr add "${ns2_v6}/64" dev veth2 nodad sleep 1 } @@ -35,16 +40,56 @@ cleanup() { } server_listen() { - ip netns exec "${ns2}" nc -l -p "${port}" & + ip netns exec "${ns2}" nc "${netcat_opt}" -l -p "${port}" & sleep 0.2 } client_connect() { - ip netns exec "${ns1}" nc -z -w 1 "${ns2_v4}" "${port}" + ip netns exec "${ns1}" nc "${netcat_opt}" -z -w 1 "${addr2}" "${port}" echo $? } set -e + +# no arguments: automated test, run all +if [[ "$#" -eq "0" ]]; then + echo "ipip" + $0 ipv4 + + echo "ip6ip6" + $0 ipv6 + + echo "OK. 
All tests passed" + exit 0 +fi + +if [[ "$#" -ne "1" ]]; then + echo "Usage: $0" + echo " or: $0 " + exit 1 +fi + +case "$1" in +"ipv4") + readonly tuntype=ipip + readonly addr1="${ns1_v4}" + readonly addr2="${ns2_v4}" + readonly netcat_opt=-4 + ;; +"ipv6") + readonly tuntype=ip6tnl + readonly addr1="${ns1_v6}" + readonly addr2="${ns2_v6}" + readonly netcat_opt=-6 + ;; +*) + echo "unknown arg: $1" + exit 1 + ;; +esac + +echo "encap ${addr1} to ${addr2}, type ${tuntype}" + trap cleanup EXIT setup @@ -66,8 +111,8 @@ server_listen # serverside, insert decap module # server is still running # client can connect again -ip netns exec "${ns2}" ip link add dev testtun0 type ipip \ - remote "${ns1_v4}" local "${ns2_v4}" +ip netns exec "${ns2}" ip link add dev testtun0 type "${tuntype}" \ + remote "${addr1}" local "${addr2}" ip netns exec "${ns2}" ip link set dev testtun0 up echo "test bpf encap with tunnel device decap" client_connect From 7255fade7b93e7e84e12f27ae5e8af9cf8b93745 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:52 -0400 Subject: [PATCH 16/29] selftests/bpf: extend bpf tunnel test with gre GRE is a commonly used protocol. Add GRE cases for both IPv4 and IPv6. It also inserts different sized headers, which can expose some unexpected edge cases. Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/test_tc_tunnel.c | 148 +++++++++++++----- tools/testing/selftests/bpf/test_tc_tunnel.sh | 21 ++- 2 files changed, 123 insertions(+), 46 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 591f540ce513..900c5653105f 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -2,6 +2,9 @@ /* In-place tunneling */ +#include +#include + #include #include #include @@ -17,6 +20,18 @@ static const int cfg_port = 8000; +struct grev4hdr { + struct iphdr ip; + __be16 flags; + __be16 protocol; +} __attribute__((packed)); + +struct grev6hdr { + struct ipv6hdr ip; + __be16 flags; + __be16 protocol; +} __attribute__((packed)); + static __always_inline void set_ipv4_csum(struct iphdr *iph) { __u16 *iph16 = (__u16 *)iph; @@ -32,10 +47,12 @@ static __always_inline void set_ipv4_csum(struct iphdr *iph) iph->check = ~((csum & 0xffff) + (csum >> 16)); } -static int encap_ipv4(struct __sk_buff *skb) +static __always_inline int encap_ipv4(struct __sk_buff *skb, bool with_gre) { - struct iphdr iph_outer, iph_inner; + struct grev4hdr h_outer; + struct iphdr iph_inner; struct tcphdr tcph; + int olen; if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner, sizeof(iph_inner)) < 0) @@ -52,24 +69,33 @@ static int encap_ipv4(struct __sk_buff *skb) if (tcph.dest != __bpf_constant_htons(cfg_port)) return TC_ACT_OK; + olen = with_gre ? 
sizeof(h_outer) : sizeof(h_outer.ip); + /* add room between mac and network header */ - if (bpf_skb_adjust_room(skb, sizeof(iph_outer), BPF_ADJ_ROOM_NET, 0)) + if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_NET, 0)) return TC_ACT_SHOT; /* prepare new outer network header */ - iph_outer = iph_inner; - iph_outer.protocol = IPPROTO_IPIP; - iph_outer.tot_len = bpf_htons(sizeof(iph_outer) + - bpf_htons(iph_outer.tot_len)); - set_ipv4_csum(&iph_outer); + h_outer.ip = iph_inner; + h_outer.ip.tot_len = bpf_htons(olen + + bpf_htons(h_outer.ip.tot_len)); + if (with_gre) { + h_outer.ip.protocol = IPPROTO_GRE; + h_outer.protocol = bpf_htons(ETH_P_IP); + h_outer.flags = 0; + } else { + h_outer.ip.protocol = IPPROTO_IPIP; + } + + set_ipv4_csum((void *)&h_outer.ip); /* store new outer network header */ - if (bpf_skb_store_bytes(skb, ETH_HLEN, &iph_outer, sizeof(iph_outer), + if (bpf_skb_store_bytes(skb, ETH_HLEN, &h_outer, olen, BPF_F_INVALIDATE_HASH) < 0) return TC_ACT_SHOT; /* bpf_skb_adjust_room has moved header to start of room: restore */ - if (bpf_skb_store_bytes(skb, ETH_HLEN + sizeof(iph_outer), + if (bpf_skb_store_bytes(skb, ETH_HLEN + olen, &iph_inner, sizeof(iph_inner), BPF_F_INVALIDATE_HASH) < 0) return TC_ACT_SHOT; @@ -77,10 +103,12 @@ static int encap_ipv4(struct __sk_buff *skb) return TC_ACT_OK; } -static int encap_ipv6(struct __sk_buff *skb) +static __always_inline int encap_ipv6(struct __sk_buff *skb, bool with_gre) { - struct ipv6hdr iph_outer, iph_inner; + struct ipv6hdr iph_inner; + struct grev6hdr h_outer; struct tcphdr tcph; + int olen; if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner, sizeof(iph_inner)) < 0) @@ -94,23 +122,31 @@ static int encap_ipv6(struct __sk_buff *skb) if (tcph.dest != __bpf_constant_htons(cfg_port)) return TC_ACT_OK; + olen = with_gre ? 
sizeof(h_outer) : sizeof(h_outer.ip); + /* add room between mac and network header */ - if (bpf_skb_adjust_room(skb, sizeof(iph_outer), BPF_ADJ_ROOM_NET, 0)) + if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_NET, 0)) return TC_ACT_SHOT; /* prepare new outer network header */ - iph_outer = iph_inner; - iph_outer.nexthdr = IPPROTO_IPV6; - iph_outer.payload_len = bpf_htons(sizeof(iph_outer) + - bpf_ntohs(iph_outer.payload_len)); + h_outer.ip = iph_inner; + h_outer.ip.payload_len = bpf_htons(olen + + bpf_ntohs(h_outer.ip.payload_len)); + if (with_gre) { + h_outer.ip.nexthdr = IPPROTO_GRE; + h_outer.protocol = bpf_htons(ETH_P_IPV6); + h_outer.flags = 0; + } else { + h_outer.ip.nexthdr = IPPROTO_IPV6; + } /* store new outer network header */ - if (bpf_skb_store_bytes(skb, ETH_HLEN, &iph_outer, sizeof(iph_outer), + if (bpf_skb_store_bytes(skb, ETH_HLEN, &h_outer, olen, BPF_F_INVALIDATE_HASH) < 0) return TC_ACT_SHOT; /* bpf_skb_adjust_room has moved header to start of room: restore */ - if (bpf_skb_store_bytes(skb, ETH_HLEN + sizeof(iph_outer), + if (bpf_skb_store_bytes(skb, ETH_HLEN + olen, &iph_inner, sizeof(iph_inner), BPF_F_INVALIDATE_HASH) < 0) return TC_ACT_SHOT; @@ -118,28 +154,63 @@ static int encap_ipv6(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap") -int encap_f(struct __sk_buff *skb) +SEC("encap_ipip") +int __encap_ipip(struct __sk_buff *skb) { - switch (skb->protocol) { - case __bpf_constant_htons(ETH_P_IP): - return encap_ipv4(skb); - case __bpf_constant_htons(ETH_P_IPV6): - return encap_ipv6(skb); - default: - /* does not match, ignore */ + if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) + return encap_ipv4(skb, false); + else return TC_ACT_OK; - } } -static int decap_internal(struct __sk_buff *skb, int off, int len) +SEC("encap_gre") +int __encap_gre(struct __sk_buff *skb) { - char buf[sizeof(struct ipv6hdr)]; + if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) + return encap_ipv4(skb, true); + else + return TC_ACT_OK; +} - if (bpf_skb_load_bytes(skb, off + len, &buf, len) < 0) +SEC("encap_ip6tnl") +int __encap_ip6tnl(struct __sk_buff *skb) +{ + if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) + return encap_ipv6(skb, false); + else + return TC_ACT_OK; +} + +SEC("encap_ip6gre") +int __encap_ip6gre(struct __sk_buff *skb) +{ + if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) + return encap_ipv6(skb, true); + else + return TC_ACT_OK; +} + +static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) +{ + char buf[sizeof(struct grev6hdr)]; + int olen; + + switch (proto) { + case IPPROTO_IPIP: + case IPPROTO_IPV6: + olen = len; + break; + case IPPROTO_GRE: + olen = len + 4 /* gre hdr */; + break; + default: + return TC_ACT_OK; + } + + if (bpf_skb_load_bytes(skb, off + olen, &buf, olen) < 0) return TC_ACT_OK; - if (bpf_skb_adjust_room(skb, -len, BPF_ADJ_ROOM_NET, 0)) + if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_NET, 0)) return TC_ACT_SHOT; /* bpf_skb_adjust_room has moved outer over inner header: restore */ @@ -157,10 +228,11 @@ static int decap_ipv4(struct __sk_buff *skb) sizeof(iph_outer)) < 0) return TC_ACT_OK; - if (iph_outer.ihl != 5 || iph_outer.protocol != IPPROTO_IPIP) + if (iph_outer.ihl != 5) return TC_ACT_OK; - return decap_internal(skb, ETH_HLEN, sizeof(iph_outer)); + return decap_internal(skb, ETH_HLEN, sizeof(iph_outer), + iph_outer.protocol); } static int decap_ipv6(struct __sk_buff *skb) @@ -171,10 +243,8 @@ static int decap_ipv6(struct __sk_buff *skb) sizeof(iph_outer)) < 0) return TC_ACT_OK; - if (iph_outer.nexthdr != 
IPPROTO_IPV6) - return TC_ACT_OK; - - return decap_internal(skb, ETH_HLEN, sizeof(iph_outer)); + return decap_internal(skb, ETH_HLEN, sizeof(iph_outer), + iph_outer.nexthdr); } SEC("decap") diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index 7b1758f3006b..c78922048610 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -54,30 +54,36 @@ set -e # no arguments: automated test, run all if [[ "$#" -eq "0" ]]; then echo "ipip" - $0 ipv4 + $0 ipv4 ipip echo "ip6ip6" - $0 ipv6 + $0 ipv6 ip6tnl + + echo "ip gre" + $0 ipv4 gre + + echo "ip6 gre" + $0 ipv6 ip6gre echo "OK. All tests passed" exit 0 fi -if [[ "$#" -ne "1" ]]; then +if [[ "$#" -ne "2" ]]; then echo "Usage: $0" - echo " or: $0 <ipv4|ipv6>" + echo " or: $0 <ipv4|ipv6> <tuntype>" exit 1 fi case "$1" in "ipv4") - readonly tuntype=ipip + readonly tuntype=$2 readonly addr1="${ns1_v4}" readonly addr2="${ns2_v4}" readonly netcat_opt=-4 ;; "ipv6") - readonly tuntype=ip6tnl + readonly tuntype=$2 readonly addr1="${ns1_v6}" readonly addr2="${ns2_v6}" readonly netcat_opt=-6 @@ -103,7 +109,8 @@ client_connect # client can no longer connect ip netns exec "${ns1}" tc qdisc add dev veth1 clsact ip netns exec "${ns1}" tc filter add dev veth1 egress \ - bpf direct-action object-file ./test_tc_tunnel.o section encap + bpf direct-action object-file ./test_tc_tunnel.o \ + section "encap_${tuntype}" echo "test bpf encap without decap (expect failure)" server_listen ! client_connect From 8142958954d17a31e0ac9e3a9c91103a1c171179 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:53 -0400 Subject: [PATCH 17/29] selftests/bpf: extend bpf tunnel test with tso Segmentation offload takes a longer path. Verify that the feature works with large packets. The test succeeds if not setting dodgy in bpf_skb_adjust_room, as veth TSO is permissive. If not setting SKB_GSO_DODGY, this enables tunneled TSO offload on supporting NICs. The feature sets SKB_GSO_DODGY because the caller is untrusted. As a result the packets traverse the gso stack at least up to TCP, and fail the gso_type validation, such as the skb->encapsulation check in gre_gso_segment and the gso_type checks introduced in commit 418e897e0716 ("gso: validate gso_type on ipip style tunnel"). This will be addressed in a follow-on feature patch. In the meantime, disable the new gso tests.
Changes v1->v2: - not all netcat versions support flag '-q', use timeout instead Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_tc_tunnel.sh | 60 +++++++++++++++---- 1 file changed, 49 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index c78922048610..9e18754f2354 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -15,6 +15,8 @@ readonly ns2_v4=192.168.1.2 readonly ns1_v6=fd::1 readonly ns2_v6=fd::2 +readonly infile="$(mktemp)" +readonly outfile="$(mktemp)" setup() { ip netns add "${ns1}" @@ -23,6 +25,8 @@ setup() { ip link add dev veth1 mtu 1500 netns "${ns1}" type veth \ peer name veth2 mtu 1500 netns "${ns2}" + ip netns exec "${ns1}" ethtool -K veth1 tso off + ip -netns "${ns1}" link set veth1 up ip -netns "${ns2}" link set veth2 up @@ -32,58 +36,86 @@ setup() { ip -netns "${ns2}" -6 addr add "${ns2_v6}/64" dev veth2 nodad sleep 1 + + dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 status=none } cleanup() { ip netns del "${ns2}" ip netns del "${ns1}" + + if [[ -f "${outfile}" ]]; then + rm "${outfile}" + fi + if [[ -f "${infile}" ]]; then + rm "${infile}" + fi } server_listen() { - ip netns exec "${ns2}" nc "${netcat_opt}" -l -p "${port}" & + ip netns exec "${ns2}" nc "${netcat_opt}" -l -p "${port}" > "${outfile}" & + server_pid=$! sleep 0.2 } client_connect() { - ip netns exec "${ns1}" nc "${netcat_opt}" -z -w 1 "${addr2}" "${port}" + ip netns exec "${ns1}" timeout 2 nc "${netcat_opt}" -w 1 "${addr2}" "${port}" < "${infile}" echo $? } +verify_data() { + wait "${server_pid}" + # sha1sum returns two fields [sha1] [filepath] + # convert to bash array and access first elem + insum=($(sha1sum ${infile})) + outsum=($(sha1sum ${outfile})) + if [[ "${insum[0]}" != "${outsum[0]}" ]]; then + echo "data mismatch" + exit 1 + fi +} + set -e # no arguments: automated test, run all if [[ "$#" -eq "0" ]]; then echo "ipip" - $0 ipv4 ipip + $0 ipv4 ipip 100 echo "ip6ip6" - $0 ipv6 ip6tnl + $0 ipv6 ip6tnl 100 echo "ip gre" - $0 ipv4 gre + $0 ipv4 gre 100 echo "ip6 gre" - $0 ipv6 ip6gre + $0 ipv6 ip6gre 100 + + # disabled until passes SKB_GSO_DODGY checks + # echo "ip gre gso" + # $0 ipv4 gre 2000 + + # disabled until passes SKB_GSO_DODGY checks + # echo "ip6 gre gso" + # $0 ipv6 ip6gre 2000 echo "OK. 
All tests passed" exit 0 fi -if [[ "$#" -ne "2" ]]; then +if [[ "$#" -ne "3" ]]; then echo "Usage: $0" - echo " or: $0 " + echo " or: $0 " exit 1 fi case "$1" in "ipv4") - readonly tuntype=$2 readonly addr1="${ns1_v4}" readonly addr2="${ns2_v4}" readonly netcat_opt=-4 ;; "ipv6") - readonly tuntype=$2 readonly addr1="${ns1_v6}" readonly addr2="${ns2_v6}" readonly netcat_opt=-6 @@ -94,7 +126,10 @@ case "$1" in ;; esac -echo "encap ${addr1} to ${addr2}, type ${tuntype}" +readonly tuntype=$2 +readonly datalen=$3 + +echo "encap ${addr1} to ${addr2}, type ${tuntype}, len ${datalen}" trap cleanup EXIT @@ -104,6 +139,7 @@ setup echo "test basic connectivity" server_listen client_connect +verify_data # clientside, insert bpf program to encap all TCP to port ${port} # client can no longer connect @@ -123,6 +159,7 @@ ip netns exec "${ns2}" ip link add dev testtun0 type "${tuntype}" \ ip netns exec "${ns2}" ip link set dev testtun0 up echo "test bpf encap with tunnel device decap" client_connect +verify_data # serverside, use BPF for decap ip netns exec "${ns2}" ip link del dev testtun0 @@ -132,5 +169,6 @@ ip netns exec "${ns2}" tc filter add dev veth2 ingress \ server_listen echo "test bpf encap with bpf decap" client_connect +verify_data echo OK From 14aa31929b724b70fb63a9b0e7877da325b25cfe Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:54 -0400 Subject: [PATCH 18/29] bpf: add bpf_skb_adjust_room mode BPF_ADJ_ROOM_MAC bpf_skb_adjust_room net allows inserting room in an skb. Existing mode BPF_ADJ_ROOM_NET inserts room after the network header by pulling the skb, moving the network header forward and zeroing the new space. Add new mode BPF_ADJUST_ROOM_MAC that inserts room after the mac header. This allows inserting tunnel headers in front of the network header without having to recreate the network header in the original space, avoiding two copies. Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 6 +++++- net/core/filter.c | 38 ++++++++++++++++++++------------------ 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3c04410137d9..7c8fd0647070 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1478,7 +1478,10 @@ union bpf_attr { * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. * - * There is a single supported mode at this time: + * There are two supported modes at this time: + * + * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer + * (room space is added or removed below the layer 2 header). * * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer * (room space is added or removed below the layer 3 header). @@ -2627,6 +2630,7 @@ enum bpf_func_id { /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, + BPF_ADJ_ROOM_MAC, }; /* Mode for BPF_FUNC_skb_load_bytes_relative helper. 
*/ diff --git a/net/core/filter.c b/net/core/filter.c index d21e1acdde29..e7b7720b18e9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2963,9 +2963,8 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) } } -static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) +static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff) { - u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) @@ -2992,9 +2991,8 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) return 0; } -static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) +static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff) { - u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) @@ -3027,7 +3025,8 @@ static u32 __bpf_skb_max_len(const struct sk_buff *skb) SKB_MAX_ALLOC; } -static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) +BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, + u32, mode, u64, flags) { bool trans_same = skb->transport_header == skb->network_header; u32 len_cur, len_diff_abs = abs(len_diff); @@ -3035,14 +3034,28 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) u32 len_max = __bpf_skb_max_len(skb); __be16 proto = skb->protocol; bool shrink = len_diff < 0; + u32 off; int ret; + if (unlikely(flags)) + return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; if (unlikely(proto != htons(ETH_P_IP) && proto != htons(ETH_P_IPV6))) return -ENOTSUPP; + off = skb_mac_header_len(skb); + switch (mode) { + case BPF_ADJ_ROOM_NET: + off += bpf_skb_net_base_len(skb); + break; + case BPF_ADJ_ROOM_MAC: + break; + default: + return -ENOTSUPP; + } + len_cur = skb->len - skb_network_offset(skb); if (skb_transport_header_was_set(skb) && !trans_same) len_cur = skb_network_header_len(skb); @@ -3052,24 +3065,13 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) !skb_is_gso(skb)))) return -ENOTSUPP; - ret = shrink ? bpf_skb_net_shrink(skb, len_diff_abs) : - bpf_skb_net_grow(skb, len_diff_abs); + ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs) : + bpf_skb_net_grow(skb, off, len_diff_abs); bpf_compute_data_pointers(skb); return ret; } -BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, - u32, mode, u64, flags) -{ - if (unlikely(flags)) - return -EINVAL; - if (likely(mode == BPF_ADJ_ROOM_NET)) - return bpf_skb_adjust_net(skb, len_diff); - - return -ENOTSUPP; -} - static const struct bpf_func_proto bpf_skb_adjust_room_proto = { .func = bpf_skb_adjust_room, .gpl_only = false, From 2278f6cc151a8bef6ba0b3fe3009d14dc3c51c4a Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:55 -0400 Subject: [PATCH 19/29] bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_FIXED_GSO bpf_skb_adjust_room adjusts gso_size of gso packets to account for the pushed or popped header room. This is not allowed with UDP, where gso_size delineates datagrams. Add an option to avoid these updates and allow this call for datagrams. It can also be used with TCP, when MSS is known to allow headroom, e.g., through MSS clamping or route MTU. Changes v1->v2: - document flag BPF_F_ADJ_ROOM_FIXED_GSO - do not expose BPF_F_ADJ_ROOM_MASK through uapi, as it may change. 
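A hedged usage sketch of the new flag: growing room on a UDP GSO (SKB_GSO_UDP_L4) skb becomes legal when the caller promises not to touch gso_size:

/* Without BPF_F_ADJ_ROOM_FIXED_GSO this call returns -ENOTSUPP
 * for any GSO skb that is not TCP.
 */
if (bpf_skb_adjust_room(skb, sizeof(struct iphdr), BPF_ADJ_ROOM_NET,
			BPF_F_ADJ_ROOM_FIXED_GSO))
	return TC_ACT_SHOT;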
Link: https://patchwork.ozlabs.org/patch/1052497/ Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 9 +++++++-- net/core/filter.c | 38 +++++++++++++++++++++++++++----------- 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7c8fd0647070..4f157d0ec571 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1486,8 +1486,10 @@ union bpf_attr { * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer * (room space is added or removed below the layer 3 header). * - * All values for *flags* are reserved for future usage, and must - * be left at zero. + * There is one supported flag at this time: + * + * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. + * Adjusting mss in this way is not allowed for datagrams. * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers @@ -2627,6 +2629,9 @@ enum bpf_func_id { /* Current network namespace */ #define BPF_F_CURRENT_NETNS (-1L) +/* BPF_FUNC_skb_adjust_room flags. */ +#define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/net/core/filter.c b/net/core/filter.c index e7b7720b18e9..d3240a0a0eeb 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2963,12 +2963,19 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) } } -static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff) +#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO) + +static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, + u64 flags) { int ret; - if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) - return -ENOTSUPP; + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { + /* udp gso_size delineates datagrams, only allow if fixed */ + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || + !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + return -ENOTSUPP; + } ret = skb_cow_head(skb, len_diff); if (unlikely(ret < 0)) @@ -2982,7 +2989,9 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff) struct skb_shared_info *shinfo = skb_shinfo(skb); /* Due to header grow, MSS needs to be downgraded. */ - skb_decrease_gso_size(shinfo, len_diff); + if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + skb_decrease_gso_size(shinfo, len_diff); + /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; @@ -2991,12 +3000,17 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff) return 0; } -static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff) +static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, + u64 flags) { int ret; - if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) - return -ENOTSUPP; + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { + /* udp gso_size delineates datagrams, only allow if fixed */ + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || + !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + return -ENOTSUPP; + } ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) @@ -3010,7 +3024,9 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff) struct skb_shared_info *shinfo = skb_shinfo(skb); /* Due to header shrink, MSS can be upgraded. */ - skb_increase_gso_size(shinfo, len_diff); + if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + skb_increase_gso_size(shinfo, len_diff); + /* Header must be checked, and gso_segs recomputed. 
*/ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; @@ -3037,7 +3053,7 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32 off; int ret; - if (unlikely(flags)) + if (unlikely(flags & ~BPF_F_ADJ_ROOM_MASK)) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; @@ -3065,8 +3081,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, !skb_is_gso(skb)))) return -ENOTSUPP; - ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs) : - bpf_skb_net_grow(skb, off, len_diff_abs); + ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) : + bpf_skb_net_grow(skb, off, len_diff_abs, flags); bpf_compute_data_pointers(skb); return ret; From 868d523535c2d00b696753ece606e641a816e91e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:56 -0400 Subject: [PATCH 20/29] bpf: add bpf_skb_adjust_room encap flags When pushing tunnel headers, annotate skbs in the same way as tunnel devices. For GSO packets, the network stack requires certain fields set to segment packets with tunnel headers. gre_gso_segment depends on transport and inner mac header, for instance. Add an option to pass this information. Remove the restriction on len_diff to network header length, which is too short, e.g., for GRE protocols. Changes v1->v2: - document new flags - BPF_F_ADJ_ROOM_MASK moved v2->v3: - BPF_F_ADJ_ROOM_ENCAP_L3_MASK moved Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 16 +++++++++- net/core/filter.c | 66 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 76 insertions(+), 6 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4f157d0ec571..837024512baf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1486,11 +1486,20 @@ union bpf_attr { * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer * (room space is added or removed below the layer 3 header). * - * There is one supported flag at this time: + * The following flags are supported at this time: * * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. * Adjusting mss in this way is not allowed for datagrams. * + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 **: + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 **: + * Any new space is reserved to hold a tunnel header. + * Configure skb offsets and other fields accordingly. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE **: + * * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **: + * Use with ENCAP_L3 flags to further specify the tunnel type. + * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -2632,6 +2641,11 @@ enum bpf_func_id { /* BPF_FUNC_skb_adjust_room flags. */ #define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) +#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) +#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) + /* Mode for BPF_FUNC_skb_adjust_room helper.
*/ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/net/core/filter.c b/net/core/filter.c index d3240a0a0eeb..c1d19b074d6c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2963,11 +2963,20 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) } } -#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO) +#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \ + BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) + +#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ + BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ + BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ + BPF_F_ADJ_ROOM_ENCAP_L4_UDP) static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) { + bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK; + unsigned int gso_type = SKB_GSO_DODGY; + u16 mac_len, inner_net, inner_trans; int ret; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { @@ -2981,10 +2990,60 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, if (unlikely(ret < 0)) return ret; + if (encap) { + if (skb->protocol != htons(ETH_P_IP) && + skb->protocol != htons(ETH_P_IPV6)) + return -ENOTSUPP; + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 && + flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) + return -EINVAL; + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE && + flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) + return -EINVAL; + + if (skb->encapsulation) + return -EALREADY; + + mac_len = skb->network_header - skb->mac_header; + inner_net = skb->network_header; + inner_trans = skb->transport_header; + } + ret = bpf_skb_net_hdr_push(skb, off, len_diff); if (unlikely(ret < 0)) return ret; + if (encap) { + /* inner mac == inner_net on l3 encap */ + skb->inner_mac_header = inner_net; + skb->inner_network_header = inner_net; + skb->inner_transport_header = inner_trans; + skb_set_inner_protocol(skb, skb->protocol); + + skb->encapsulation = 1; + skb_set_network_header(skb, mac_len); + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) + gso_type |= SKB_GSO_UDP_TUNNEL; + else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE) + gso_type |= SKB_GSO_GRE; + else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) + gso_type |= SKB_GSO_IPXIP6; + else + gso_type |= SKB_GSO_IPXIP4; + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE || + flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) { + int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ? + sizeof(struct ipv6hdr) : + sizeof(struct iphdr); + + skb_set_transport_header(skb, mac_len + nh_len); + } + } + if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); @@ -2993,7 +3052,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, skb_decrease_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. 
*/ - shinfo->gso_type |= SKB_GSO_DODGY; + shinfo->gso_type |= gso_type; shinfo->gso_segs = 0; } @@ -3044,7 +3103,6 @@ static u32 __bpf_skb_max_len(const struct sk_buff *skb) BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32, mode, u64, flags) { - bool trans_same = skb->transport_header == skb->network_header; u32 len_cur, len_diff_abs = abs(len_diff); u32 len_min = bpf_skb_net_base_len(skb); u32 len_max = __bpf_skb_max_len(skb); @@ -3073,8 +3131,6 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, } len_cur = skb->len - skb_network_offset(skb); - if (skb_transport_header_was_set(skb) && !trans_same) - len_cur = skb_network_header_len(skb); if ((shrink && (len_diff_abs >= len_cur || len_cur - len_diff_abs < len_min)) || (!shrink && (skb->len + len_diff_abs > len_max && From 6c408decbdc8a12a12a93c4d763cbc2a264f9332 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:57 -0400 Subject: [PATCH 21/29] bpf: Sync bpf.h to tools Sync include/uapi/linux/bpf.h with tools/ Changes v1->v2: - BPF_F_ADJ_ROOM_MASK moved, no longer in this commit v2->v3: - BPF_F_ADJ_ROOM_ENCAP_L3_MASK moved, no longer in this commit Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3c04410137d9..837024512baf 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1478,13 +1478,27 @@ union bpf_attr { * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. * - * There is a single supported mode at this time: + * There are two supported modes at this time: + * + * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer + * (room space is added or removed below the layer 2 header). * * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer * (room space is added or removed below the layer 3 header). * - * All values for *flags* are reserved for future usage, and must - * be left at zero. + * The following flags are supported at this time: + * + * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. + * Adjusting mss in this way is not allowed for datagrams. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 **: + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 **: + * Any new space is reserved to hold a tunnel header. + * Configure skb offsets and other fields accordingly. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE **: + * * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **: + * Use with ENCAP_L3 flags to further specify the tunnel type. * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers @@ -2624,9 +2638,18 @@ enum bpf_func_id { /* Current network namespace */ #define BPF_F_CURRENT_NETNS (-1L) +/* BPF_FUNC_skb_adjust_room flags. */ +#define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) + +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) +#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) +#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, + BPF_ADJ_ROOM_MAC, }; /* Mode for BPF_FUNC_skb_load_bytes_relative helper. 
*/ From 005edd16562b78f416e2f576a64789c90d96882f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:58 -0400 Subject: [PATCH 22/29] selftests/bpf: convert bpf tunnel test to BPF_ADJ_ROOM_MAC Avoid moving the network layer header when prefixing tunnel headers. This avoids an explicit call to bpf_skb_store_bytes and an implicit move of the network header bytes in bpf_skb_adjust_room. Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/test_tc_tunnel.c | 25 +++---------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 900c5653105f..f6a16fd23dbd 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -72,7 +72,7 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, bool with_gre) olen = with_gre ? sizeof(h_outer) : sizeof(h_outer.ip); /* add room between mac and network header */ - if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_NET, 0)) + if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, 0)) return TC_ACT_SHOT; /* prepare new outer network header */ @@ -94,12 +94,6 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, bool with_gre) BPF_F_INVALIDATE_HASH) < 0) return TC_ACT_SHOT; - /* bpf_skb_adjust_room has moved header to start of room: restore */ - if (bpf_skb_store_bytes(skb, ETH_HLEN + olen, - &iph_inner, sizeof(iph_inner), - BPF_F_INVALIDATE_HASH) < 0) - return TC_ACT_SHOT; - return TC_ACT_OK; } @@ -125,7 +119,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, bool with_gre) olen = with_gre ? sizeof(h_outer) : sizeof(h_outer.ip); /* add room between mac and network header */ - if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_NET, 0)) + if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, 0)) return TC_ACT_SHOT; /* prepare new outer network header */ @@ -145,12 +139,6 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, bool with_gre) BPF_F_INVALIDATE_HASH) < 0) return TC_ACT_SHOT; - /* bpf_skb_adjust_room has moved header to start of room: restore */ - if (bpf_skb_store_bytes(skb, ETH_HLEN + olen, - &iph_inner, sizeof(iph_inner), - BPF_F_INVALIDATE_HASH) < 0) - return TC_ACT_SHOT; - return TC_ACT_OK; } @@ -207,14 +195,7 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) return TC_ACT_OK; } - if (bpf_skb_load_bytes(skb, off + olen, &buf, olen) < 0) - return TC_ACT_OK; - - if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_NET, 0)) - return TC_ACT_SHOT; - - /* bpf_skb_adjust_room has moved outer over inner header: restore */ - if (bpf_skb_store_bytes(skb, off, buf, len, BPF_F_INVALIDATE_HASH) < 0) + if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_MAC, 0)) return TC_ACT_SHOT; return TC_ACT_OK; From 94f16813e1b297d31f8fe6217cd9be19e080f998 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:59 -0400 Subject: [PATCH 23/29] selftests/bpf: convert bpf tunnel test to BPF_F_ADJ_ROOM_FIXED_GSO Lower route MTU to ensure packets fit in device MTU after encap, then skip the gso_size changes. 
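For reference, the clamped route MTUs in the script below appear to be sized for the largest encapsulation each family can gain: 1500 - (20-byte IPv4 header + 4-byte GRE header) = 1476 for the ipv4 paths, and 1500 - (40-byte IPv6 header + 4-byte GRE header) = 1456 for ipv6, so even GRE-encapsulated segments still fit the 1500-byte veth MTU without any gso_size rescaling.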
Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_tc_tunnel.c | 11 ++++++++--- tools/testing/selftests/bpf/test_tc_tunnel.sh | 6 ++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index f6a16fd23dbd..3b79dffb8103 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -52,6 +52,7 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, bool with_gre) struct grev4hdr h_outer; struct iphdr iph_inner; struct tcphdr tcph; + __u64 flags; int olen; if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner, @@ -69,10 +70,11 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, bool with_gre) if (tcph.dest != __bpf_constant_htons(cfg_port)) return TC_ACT_OK; + flags = BPF_F_ADJ_ROOM_FIXED_GSO; olen = with_gre ? sizeof(h_outer) : sizeof(h_outer.ip); /* add room between mac and network header */ - if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, 0)) + if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags)) return TC_ACT_SHOT; /* prepare new outer network header */ @@ -102,6 +104,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, bool with_gre) struct ipv6hdr iph_inner; struct grev6hdr h_outer; struct tcphdr tcph; + __u64 flags; int olen; if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner, @@ -116,10 +119,11 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, bool with_gre) if (tcph.dest != __bpf_constant_htons(cfg_port)) return TC_ACT_OK; + flags = BPF_F_ADJ_ROOM_FIXED_GSO; olen = with_gre ? sizeof(h_outer) : sizeof(h_outer.ip); /* add room between mac and network header */ - if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, 0)) + if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags)) return TC_ACT_SHOT; /* prepare new outer network header */ @@ -195,7 +199,8 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) return TC_ACT_OK; } - if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_MAC, 0)) + if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_MAC, + BPF_F_ADJ_ROOM_FIXED_GSO)) return TC_ACT_SHOT; return TC_ACT_OK; diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index 9e18754f2354..cda5317790d2 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -35,6 +35,12 @@ setup() { ip -netns "${ns1}" -6 addr add "${ns1_v6}/64" dev veth1 nodad ip -netns "${ns2}" -6 addr add "${ns2_v6}/64" dev veth2 nodad + # clamp route to reserve room for tunnel headers + ip -netns "${ns1}" -4 route flush table main + ip -netns "${ns1}" -6 route flush table main + ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1476 dev veth1 + ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1456 dev veth1 + sleep 1 dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 status=none From 75a1a9fa2e20de6319a19161ce4e2e1817d70e28 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:33:00 -0400 Subject: [PATCH 24/29] selftests/bpf: convert bpf tunnel test to encap modes Make the tests correctly annotate skbs with tunnel metadata. This makes the gso tests succeed. Enable them. 
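The crux of the conversion, as a hedged sketch: the program now tells the helper what kind of header it is about to write, so the kernel can set the same skb annotations a tunnel device would:

/* IPv4 + GRE encap, mirroring the flags composed in the patch below;
 * grev4hdr is the outer header struct defined by the test program
 */
__u64 flags = BPF_F_ADJ_ROOM_FIXED_GSO |
	      BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 |
	      BPF_F_ADJ_ROOM_ENCAP_L4_GRE;

if (bpf_skb_adjust_room(skb, sizeof(struct grev4hdr), BPF_ADJ_ROOM_MAC,
			flags))
	return TC_ACT_SHOT;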
Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/test_tc_tunnel.c | 19 +++++++++++++++---- tools/testing/selftests/bpf/test_tc_tunnel.sh | 10 ++++------ 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 3b79dffb8103..f541c2de947d 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -70,8 +70,13 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, bool with_gre) if (tcph.dest != __bpf_constant_htons(cfg_port)) return TC_ACT_OK; - flags = BPF_F_ADJ_ROOM_FIXED_GSO; - olen = with_gre ? sizeof(h_outer) : sizeof(h_outer.ip); + flags = BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_ENCAP_L3_IPV4; + if (with_gre) { + flags |= BPF_F_ADJ_ROOM_ENCAP_L4_GRE; + olen = sizeof(h_outer); + } else { + olen = sizeof(h_outer.ip); + } /* add room between mac and network header */ if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags)) @@ -119,8 +124,14 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, bool with_gre) if (tcph.dest != __bpf_constant_htons(cfg_port)) return TC_ACT_OK; - flags = BPF_F_ADJ_ROOM_FIXED_GSO; - olen = with_gre ? sizeof(h_outer) : sizeof(h_outer.ip); + flags = BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_ENCAP_L3_IPV6; + if (with_gre) { + flags |= BPF_F_ADJ_ROOM_ENCAP_L4_GRE; + olen = sizeof(h_outer); + } else { + olen = sizeof(h_outer.ip); + } + /* add room between mac and network header */ if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags)) diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index cda5317790d2..dcf320626931 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -97,13 +97,11 @@ if [[ "$#" -eq "0" ]]; then echo "ip6 gre" $0 ipv6 ip6gre 100 - # disabled until passes SKB_GSO_DODGY checks - # echo "ip gre gso" - # $0 ipv4 gre 2000 + echo "ip gre gso" + $0 ipv4 gre 2000 - # disabled until passes SKB_GSO_DODGY checks - # echo "ip6 gre gso" - # $0 ipv6 ip6gre 2000 + echo "ip6 gre gso" + $0 ipv6 ip6gre 2000 echo "OK. All tests passed" exit 0 From 315a202987dd2b2e0adebd13c83ceef44836e66f Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Fri, 22 Mar 2019 16:40:18 -0700 Subject: [PATCH 25/29] bpf: make bpf_skb_ecn_set_ce callable from BPF_PROG_TYPE_SCHED_ACT This helper is useful if a bpf tc filter sets skb->tstamp. Signed-off-by: Peter Oskolkov Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index c1d19b074d6c..0a972fbf60df 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5959,6 +5959,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_lookup_tcp_proto; case BPF_FUNC_tcp_check_syncookie: return &bpf_tcp_check_syncookie_proto; + case BPF_FUNC_skb_ecn_set_ce: + return &bpf_skb_ecn_set_ce_proto; #endif default: return bpf_base_func_proto(func_id); From 7df5e3db8f6354125540dfa50affd2c02b7d2832 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Fri, 22 Mar 2019 16:40:19 -0700 Subject: [PATCH 26/29] selftests: bpf: tc-bpf flow shaping with EDT Add a small test that shows how to shape a TCP flow in tc-bpf with EDT and ECN. 
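The pacing rule at the heart of the test: each packet is assigned a departure time one transmit interval after its predecessor, and the fq qdisc releases it at skb->tstamp. As a worked example, at the test's 5 MB/s throttle rate a 1500-byte packet advances the clock by 1500 * 10^9 / 5,000,000 = 300,000 ns (300 us). A hedged sketch of the core, with names as in the full program below:

/* earliest departure time: space packets delay_ns apart */
uint64_t delay_ns = (uint64_t)skb->len * NS_PER_SEC / THROTTLE_RATE_BPS;
uint64_t next_tstamp = last_tstamp + delay_ns;

skb->tstamp = next_tstamp;	/* fq holds the skb until this time */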
Signed-off-by: Peter Oskolkov Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 3 +- .../testing/selftests/bpf/progs/test_tc_edt.c | 109 ++++++++++++++++++ tools/testing/selftests/bpf/test_tc_edt.sh | 99 ++++++++++++++++ 3 files changed, 210 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/test_tc_edt.c create mode 100755 tools/testing/selftests/bpf/test_tc_edt.sh diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index cdcc54ddf4b9..77b73b892136 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -53,7 +53,8 @@ TEST_PROGS := test_kmod.sh \ test_xdp_vlan.sh \ test_lwt_ip_encap.sh \ test_tcp_check_syncookie.sh \ - test_tc_tunnel.sh + test_tc_tunnel.sh \ + test_tc_edt.sh TEST_PROGS_EXTENDED := with_addr.sh \ with_tunnels.sh \ diff --git a/tools/testing/selftests/bpf/progs/test_tc_edt.c b/tools/testing/selftests/bpf/progs/test_tc_edt.c new file mode 100644 index 000000000000..3af64c470d64 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_tc_edt.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include "bpf_helpers.h" +#include "bpf_endian.h" + +/* the maximum delay we are willing to add (drop packets beyond that) */ +#define TIME_HORIZON_NS (2000 * 1000 * 1000) +#define NS_PER_SEC 1000000000 +#define ECN_HORIZON_NS 5000000 +#define THROTTLE_RATE_BPS (5 * 1000 * 1000) + +/* flow_key => last_tstamp timestamp used */ +struct bpf_map_def SEC("maps") flow_map = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(uint32_t), + .value_size = sizeof(uint64_t), + .max_entries = 1, +}; + +static inline int throttle_flow(struct __sk_buff *skb) +{ + int key = 0; + uint64_t *last_tstamp = bpf_map_lookup_elem(&flow_map, &key); + uint64_t delay_ns = ((uint64_t)skb->len) * NS_PER_SEC / + THROTTLE_RATE_BPS; + uint64_t now = bpf_ktime_get_ns(); + uint64_t tstamp, next_tstamp = 0; + + if (last_tstamp) + next_tstamp = *last_tstamp + delay_ns; + + tstamp = skb->tstamp; + if (tstamp < now) + tstamp = now; + + /* should we throttle? 
*/ + if (next_tstamp <= tstamp) { + if (bpf_map_update_elem(&flow_map, &key, &tstamp, BPF_ANY)) + return TC_ACT_SHOT; + return TC_ACT_OK; + } + + /* do not queue past the time horizon */ + if (next_tstamp - now >= TIME_HORIZON_NS) + return TC_ACT_SHOT; + + /* set ecn bit, if needed */ + if (next_tstamp - now >= ECN_HORIZON_NS) + bpf_skb_ecn_set_ce(skb); + + if (bpf_map_update_elem(&flow_map, &key, &next_tstamp, BPF_EXIST)) + return TC_ACT_SHOT; + skb->tstamp = next_tstamp; + + return TC_ACT_OK; +} + +static inline int handle_tcp(struct __sk_buff *skb, struct tcphdr *tcp) +{ + void *data_end = (void *)(long)skb->data_end; + + /* drop malformed packets */ + if ((void *)(tcp + 1) > data_end) + return TC_ACT_SHOT; + + if (tcp->dest == bpf_htons(9000)) + return throttle_flow(skb); + + return TC_ACT_OK; +} + +static inline int handle_ipv4(struct __sk_buff *skb) +{ + void *data_end = (void *)(long)skb->data_end; + void *data = (void *)(long)skb->data; + struct iphdr *iph; + uint32_t ihl; + + /* drop malformed packets */ + if (data + sizeof(struct ethhdr) > data_end) + return TC_ACT_SHOT; + iph = (struct iphdr *)(data + sizeof(struct ethhdr)); + if ((void *)(iph + 1) > data_end) + return TC_ACT_SHOT; + ihl = iph->ihl * 4; + if (((void *)iph) + ihl > data_end) + return TC_ACT_SHOT; + + if (iph->protocol == IPPROTO_TCP) + return handle_tcp(skb, (struct tcphdr *)(((void *)iph) + ihl)); + + return TC_ACT_OK; +} + +SEC("cls_test") int tc_prog(struct __sk_buff *skb) +{ + if (skb->protocol == bpf_htons(ETH_P_IP)) + return handle_ipv4(skb); + + return TC_ACT_OK; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_tc_edt.sh b/tools/testing/selftests/bpf/test_tc_edt.sh new file mode 100755 index 000000000000..f38567ef694b --- /dev/null +++ b/tools/testing/selftests/bpf/test_tc_edt.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# This test installs a TC bpf program that throttles a TCP flow +# with dst port = 9000 down to 5MBps. Then it measures actual +# throughput of the flow. 
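+# At the 5 MB/s throttle rate used here, the 20 second run below should
+# deliver roughly 100 MB; the check at the bottom of this script passes
+# if the measured rate is within 1% of the expected rate.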
+
+if [[ $EUID -ne 0 ]]; then
+	echo "This script must be run as root"
+	echo "FAIL"
+	exit 1
+fi
+
+# check that nc, dd, and timeout are present
+command -v nc >/dev/null 2>&1 || \
+	{ echo >&2 "nc is not available"; exit 1; }
+command -v dd >/dev/null 2>&1 || \
+	{ echo >&2 "dd is not available"; exit 1; }
+command -v timeout >/dev/null 2>&1 || \
+	{ echo >&2 "timeout is not available"; exit 1; }
+
+readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)"
+readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)"
+
+readonly IP_SRC="172.16.1.100"
+readonly IP_DST="172.16.2.100"
+
+cleanup()
+{
+	ip netns del ${NS_SRC}
+	ip netns del ${NS_DST}
+}
+
+trap cleanup EXIT
+
+set -e  # exit on error
+
+ip netns add "${NS_SRC}"
+ip netns add "${NS_DST}"
+ip link add veth_src type veth peer name veth_dst
+ip link set veth_src netns ${NS_SRC}
+ip link set veth_dst netns ${NS_DST}
+
+ip -netns ${NS_SRC} addr add ${IP_SRC}/24 dev veth_src
+ip -netns ${NS_DST} addr add ${IP_DST}/24 dev veth_dst
+
+ip -netns ${NS_SRC} link set dev veth_src up
+ip -netns ${NS_DST} link set dev veth_dst up
+
+ip -netns ${NS_SRC} route add ${IP_DST}/32 dev veth_src
+ip -netns ${NS_DST} route add ${IP_SRC}/32 dev veth_dst
+
+# set up TC on TX
+ip netns exec ${NS_SRC} tc qdisc add dev veth_src root fq
+ip netns exec ${NS_SRC} tc qdisc add dev veth_src clsact
+ip netns exec ${NS_SRC} tc filter add dev veth_src egress \
+	bpf da obj test_tc_edt.o sec cls_test
+
+
+# start the listener
+ip netns exec ${NS_DST} bash -c \
+	"nc -4 -l -s ${IP_DST} -p 9000 >/dev/null &"
+declare -i NC_PID=$!
+sleep 1
+
+declare -ir TIMEOUT=20
+declare -ir EXPECTED_BPS=5000000
+
+# run the load, capture RX bytes on DST
+declare -ir RX_BYTES_START=$( ip netns exec ${NS_DST} \
+	cat /sys/class/net/veth_dst/statistics/rx_bytes )
+
+set +e
+ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero \
+	bs=1000 count=1000000 > /dev/tcp/${IP_DST}/9000 2>/dev/null"
+set -e
+
+declare -ir RX_BYTES_END=$( ip netns exec ${NS_DST} \
+	cat /sys/class/net/veth_dst/statistics/rx_bytes )
+
+declare -ir ACTUAL_BPS=$(( ($RX_BYTES_END - $RX_BYTES_START) / $TIMEOUT ))
+
+echo $TIMEOUT $ACTUAL_BPS $EXPECTED_BPS | \
+	awk '{printf "elapsed: %d sec; bps difference: %.2f%%\n",
+		$1, ($2-$3)*100.0/$3}'
+
+# Pass the test if the actual bps is within 1% of the expected bps.
+# The difference is usually about 0.1% on a 20-sec test, and approaches
+# zero the longer the test runs.
+declare -ir RES=$( echo $ACTUAL_BPS $EXPECTED_BPS | \
+	awk 'function abs(x){return ((x < 0.0) ? -x : x)}
+	     {if (abs(($1-$2)*100.0/$2) > 1.0) { print "1" }
+	      else { print "0"} }' )
+if [ "${RES}" == "0" ] ; then
+	echo "PASS"
+else
+	echo "FAIL"
+	exit 1
+fi

From 62b31b42cff924c7d1e9a095b68ff3bbfc49b15b Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Sat, 23 Mar 2019 12:23:07 -0400
Subject: [PATCH 27/29] bpf: silence uninitialized var warning in bpf_skb_net_grow

These three variables are set in one branch and used in another with the same condition. But on some architectures they still generate compiler warnings of the kind:

warning: 'inner_trans' may be used uninitialized in this function [-Wmaybe-uninitialized]

Silence these false positives. Use the straightforward approach and always initialize them, even if a bit superfluous.
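The warning pattern, reduced to a standalone sketch (hypothetical names, not the kernel code; gcc with -Wmaybe-uninitialized at -O2 may flag inner_len even though both uses are guarded by the same condition):

#include <stdio.h>

static int use(int v)
{
	return v + 1;
}

static int grow(int encap)
{
	int inner_len;		/* assigned only when encap is set */

	if (encap)
		inner_len = 42;

	/* ... intervening work the compiler must reason across ... */

	if (encap)
		return use(inner_len);	/* same guard, but the compiler
					 * cannot always prove it */
	return 0;
}

int main(void)
{
	printf("%d\n", grow(1));
	return 0;
}

Initializing the variable at its declaration, as the patch does for mac_len, inner_net, and inner_trans, silences the warning without changing behavior.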
Fixes: 868d523535c2 ("bpf: add bpf_skb_adjust_room encap flags")
Reported-by: kbuild test robot
Signed-off-by: Willem de Bruijn
Signed-off-by: Alexei Starovoitov
---
 net/core/filter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 0a972fbf60df..22eb2edf5573 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2975,8 +2975,8 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
 			    u64 flags)
 {
 	bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
+	u16 mac_len = 0, inner_net = 0, inner_trans = 0;
 	unsigned int gso_type = SKB_GSO_DODGY;
-	u16 mac_len, inner_net, inner_trans;
 	int ret;

 	if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {

From 0c4ea7f87abbdb56df616678bc23f10e51a0b4f8 Mon Sep 17 00:00:00 2001
From: Alan Maguire
Date: Mon, 25 Mar 2019 09:36:37 +0000
Subject: [PATCH 28/29] bpf: test_tc_tunnel.sh needs reverse path filtering disabled

test_tc_tunnel.sh sets up a pair of namespaces connected by a veth pair to verify encap/decap using bpf_skb_adjust_room. In testing this, it uses tunnel links as the peer of the bpf-based encap/decap. However, because the same IP header is used for inner and outer IP, when packets arrive at the tunnel interface they will be dropped by reverse path filtering, since those packets are expected on the veth interface (where the destination IP of the decapped packet is configured).

To avoid this, ensure reverse path filtering is disabled for the namespace using tunneling.

Fixes: 98cdabcd0798 ("selftests/bpf: bpf tunnel encap test")
Signed-off-by: Alan Maguire
Acked-by: Willem de Bruijn
Signed-off-by: Daniel Borkmann
---
 tools/testing/selftests/bpf/test_tc_tunnel.sh | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh
index dcf320626931..c805adb88f3a 100755
--- a/tools/testing/selftests/bpf/test_tc_tunnel.sh
+++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh
@@ -160,6 +160,14 @@ server_listen
 # client can connect again
 ip netns exec "${ns2}" ip link add dev testtun0 type "${tuntype}" \
 	remote "${addr1}" local "${addr2}"
+# Because packets are decapped by the tunnel they arrive on testtun0 from
+# the IP stack perspective. Ensure reverse path filtering is disabled
+# otherwise we drop the TCP SYN as arriving on testtun0 instead of the
+# expected veth2 (veth2 is where 192.168.1.2 is configured).
+ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.rp_filter=0
+# rp needs to be disabled for both all and testtun0 as the rp value is
+# selected as the max of the "all" and device-specific values.
+ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.testtun0.rp_filter=0
 ip netns exec "${ns2}" ip link set dev testtun0 up
 echo "test bpf encap with tunnel device decap"
 client_connect

From b4b6aa83433ea4675d4ba1be56774623db81f14f Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev
Date: Tue, 19 Mar 2019 14:53:24 -0700
Subject: [PATCH 29/29] selftests: bpf: don't depend on hardcoded perf sample_freq

When running stacktrace_build_id_nmi, try to query the kernel.perf_event_max_sample_rate sysctl and use it as the sample_freq. If the sysctl cannot be read, fall back to 5000.

The kernel.perf_event_max_sample_rate sysctl can drift and/or can be adjusted by the perf tool, so assuming a fixed number might be problematic on a long running machine.
Signed-off-by: Stanislav Fomichev
Signed-off-by: Alexei Starovoitov
---
 .../bpf/prog_tests/stacktrace_build_id_nmi.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c
index 8a114bb1c379..1c1a2f75f3d8 100644
--- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c
@@ -1,13 +1,25 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <test_progs.h>

+static __u64 read_perf_max_sample_freq(void)
+{
+	__u64 sample_freq = 5000;	/* fallback to 5000 on error */
+	FILE *f;
+
+	f = fopen("/proc/sys/kernel/perf_event_max_sample_rate", "r");
+	if (f == NULL)
+		return sample_freq;
+	fscanf(f, "%llu", &sample_freq);
+	fclose(f);
+	return sample_freq;
+}
+
 void test_stacktrace_build_id_nmi(void)
 {
 	int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
 	const char *file = "./test_stacktrace_build_id.o";
 	int err, pmu_fd, prog_fd;
 	struct perf_event_attr attr = {
-		.sample_freq = 5000,
 		.freq = 1,
 		.type = PERF_TYPE_HARDWARE,
 		.config = PERF_COUNT_HW_CPU_CYCLES,
@@ -20,6 +32,8 @@ void test_stacktrace_build_id_nmi(void)
 	int build_id_matches = 0;
 	int retry = 1;

+	attr.sample_freq = read_perf_max_sample_freq();
+
 retry:
 	err = bpf_prog_load(file, BPF_PROG_TYPE_PERF_EVENT, &obj, &prog_fd);
 	if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno))
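A hedged hardening note on read_perf_max_sample_freq() above: the fscanf() return value is ignored, which is safe here only because sample_freq is pre-initialized to the fallback, and toolchains that enable -Wunused-result may still warn. A variant that checks the result explicitly could look like this (a sketch assuming the same includes as the file above, not part of the patch):

static __u64 read_perf_max_sample_freq(void)
{
	__u64 sample_freq = 5000;	/* fallback to 5000 on error */
	FILE *f;

	f = fopen("/proc/sys/kernel/perf_event_max_sample_rate", "r");
	if (f == NULL)
		return sample_freq;
	if (fscanf(f, "%llu", &sample_freq) != 1)
		sample_freq = 5000;	/* no conversion: keep the fallback */
	fclose(f);
	return sample_freq;
}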