selftests/bpf: Add selftests for raw syncookie helpers

This commit adds selftests for the new BPF helpers:
bpf_tcp_raw_{gen,check}_syncookie_ipv{4,6}.

xdp_synproxy_kern.c is a BPF program that generates SYN cookies on
allowed TCP ports and sends SYNACKs to clients, accelerating the
iptables SYNPROXY module.

xdp_synproxy.c is a userspace control application that allows
configuring the following options at runtime: list of allowed ports, MSS,
window scale, TTL.

A selftest is added to prog_tests that leverages the above programs to
test the functionality of the new helpers.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://lore.kernel.org/r/20220615134847.3753567-5-maximmi@nvidia.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
This commit is contained in:
Maxim Mikityanskiy 2022-06-15 16:48:45 +03:00 committed by Alexei Starovoitov
parent 33bf988504
commit fb5cd0ce70
5 changed files with 1330 additions and 1 deletions

View File

@ -43,3 +43,4 @@ test_cpp
*.tmp
xdpxceiver
xdp_redirect_multi
xdp_synproxy

View File

@ -82,7 +82,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \
TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \
xdpxceiver xdp_redirect_multi
xdpxceiver xdp_redirect_multi xdp_synproxy
TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read
@ -504,6 +504,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \
cap_helpers.c
TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \
$(OUTPUT)/liburandom_read.so \
$(OUTPUT)/xdp_synproxy \
ima_setup.sh \
$(wildcard progs/btf_dump_test_case_*.c)
TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE

View File

@ -0,0 +1,146 @@
// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
#include <test_progs.h>
#include <network_helpers.h>
#include <ctype.h>
/* Size of the buffer used to capture the output of a spawned command. */
#define CMD_OUT_BUF_SIZE 1023
/* Run a shell command with system(); on a non-zero result record a test
 * failure (tagged with the command text) and jump to the enclosing
 * function's `out` label.
 */
#define SYS(cmd) ({ \
if (!ASSERT_OK(system(cmd), (cmd))) \
goto out; \
})
/* Start a shell command with popen() for reading and evaluate to the
 * resulting FILE *. If popen() fails, record a test failure and jump to the
 * enclosing function's `out` label. The caller must pclose() the stream.
 */
#define SYS_OUT(cmd) ({ \
FILE *f = popen((cmd), "r"); \
if (!ASSERT_OK_PTR(f, (cmd))) \
goto out; \
f; \
})
/* Render the `size` bytes at `in` as a NUL-terminated, printable string.
 *
 * Printable characters are copied as is, except for backslash and single
 * quote, which - like every non-printable byte - are written as a
 * four-character "\xHH" escape with upper-case hex digits.
 *
 * out must be at least `size * 4 + 1` bytes long
 */
static void escape_str(char *out, const char *in, size_t size)
{
	static const char *hex = "0123456789ABCDEF";
	size_t i;

	for (i = 0; i < size; i++) {
		/* <ctype.h> classifiers require an unsigned char value (or
		 * EOF); handing them a negative plain char is undefined.
		 */
		unsigned char c = (unsigned char)in[i];

		if (isprint(c) && c != '\\' && c != '\'') {
			*out++ = (char)c;
		} else {
			*out++ = '\\';
			*out++ = 'x';
			*out++ = hex[c >> 4];
			*out++ = hex[c & 0xf];
		}
	}
	*out++ = '\0';
}
/* Compare the `size` bytes in `buf` with the NUL-terminated string `str`.
 *
 * Reports the outcome through CHECK() under the tag `name`; on mismatch the
 * message shows both strings in escaped form. Returns true when they match.
 *
 * Both `size` and strlen(str) must not exceed CMD_OUT_BUF_SIZE, which is what
 * the escape buffers below are dimensioned for.
 */
static bool expect_str(char *buf, size_t size, const char *str, const char *name)
{
	/* escape_str() may expand every input byte to four characters and
	 * always appends a terminating NUL, hence the "+ 1".
	 */
	static char escbuf_expected[CMD_OUT_BUF_SIZE * 4 + 1];
	static char escbuf_actual[CMD_OUT_BUF_SIZE * 4 + 1];
	/* NOTE(review): presumably referenced implicitly by CHECK() - confirm. */
	static int duration = 0;
	bool ok;

	ok = size == strlen(str) && !memcmp(buf, str, size);

	/* Only pay for escaping when the strings are going to be printed. */
	if (!ok) {
		escape_str(escbuf_expected, str, strlen(str));
		escape_str(escbuf_actual, buf, size);
	}
	CHECK(!ok, name, "unexpected %s: actual '%s' != expected '%s'\n",
	      name, escbuf_actual, escbuf_expected);

	return ok;
}
/* End-to-end test of the XDP synproxy program.
 *
 * Builds a veth pair between the current network namespace (tmp0,
 * 198.18.0.1) and a new "synproxy" namespace (tmp1, 198.18.0.2), configures
 * the iptables SYNPROXY rules and runs ./xdp_synproxy on tmp1 inside that
 * namespace. A TCP connection to port 8080 is then made from outside the
 * namespace, and the tool's counter is expected to go from 0 to 1 SYNACK.
 * Every failure jumps to `out`, which releases whatever was set up.
 */
void test_xdp_synproxy(void)
{
int server_fd = -1, client_fd = -1, accept_fd = -1;
struct nstoken *ns = NULL;
FILE *ctrl_file = NULL;
char buf[CMD_OUT_BUF_SIZE];
size_t size;
/* Topology: tmp0 stays here, tmp1 moves into the synproxy namespace. */
SYS("ip netns add synproxy");
SYS("ip link add tmp0 type veth peer name tmp1");
SYS("ip link set tmp1 netns synproxy");
SYS("ip link set tmp0 up");
SYS("ip addr replace 198.18.0.1/24 dev tmp0");
/* When checksum offload is enabled, the XDP program sees wrong
 * checksums and drops packets.
 */
SYS("ethtool -K tmp0 tx off");
/* Workaround required for veth. */
SYS("ip link set tmp0 xdp object xdp_dummy.o section xdp 2> /dev/null");
/* The commands below run inside the synproxy namespace. */
ns = open_netns("synproxy");
if (!ASSERT_OK_PTR(ns, "setns"))
goto out;
SYS("ip link set lo up");
SYS("ip link set tmp1 up");
SYS("ip addr replace 198.18.0.2/24 dev tmp1");
SYS("sysctl -w net.ipv4.tcp_syncookies=2");
SYS("sysctl -w net.ipv4.tcp_timestamps=1");
SYS("sysctl -w net.netfilter.nf_conntrack_tcp_loose=0");
/* SYNs to port 8080 bypass conntrack and are handed to SYNPROXY; other
 * packets in INVALID state are dropped.
 */
SYS("iptables -t raw -I PREROUTING \
-i tmp1 -p tcp -m tcp --syn --dport 8080 -j CT --notrack");
SYS("iptables -t filter -A INPUT \
-i tmp1 -p tcp -m tcp --dport 8080 -m state --state INVALID,UNTRACKED \
-j SYNPROXY --sack-perm --timestamp --wscale 7 --mss 1460");
SYS("iptables -t filter -A INPUT \
-i tmp1 -m state --state INVALID -j DROP");
/* Run the control tool once on tmp1; it prints the SYNACK counter, which
 * must still be zero.
 */
ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --ports 8080 --single \
--mss4 1460 --mss6 1440 --wscale 7 --ttl 64");
size = fread(buf, 1, sizeof(buf), ctrl_file);
pclose(ctrl_file);
if (!expect_str(buf, size, "Total SYNACKs generated: 0\n",
"initial SYNACKs"))
goto out;
server_fd = start_server(AF_INET, SOCK_STREAM, "198.18.0.2", 8080, 0);
if (!ASSERT_GE(server_fd, 0, "start_server"))
goto out;
/* The server socket was created inside the namespace; the client connects
 * after the namespace token has been released.
 * NOTE(review): presumably close_netns() switches back to the original
 * namespace - confirm.
 */
close_netns(ns);
ns = NULL;
client_fd = connect_to_fd(server_fd, 10000);
if (!ASSERT_GE(client_fd, 0, "connect_to_fd"))
goto out;
accept_fd = accept(server_fd, NULL, NULL);
if (!ASSERT_GE(accept_fd, 0, "accept"))
goto out;
/* Re-enter the namespace to read the counter again: exactly one SYNACK
 * is expected for the single connection.
 */
ns = open_netns("synproxy");
if (!ASSERT_OK_PTR(ns, "setns"))
goto out;
ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --single");
size = fread(buf, 1, sizeof(buf), ctrl_file);
pclose(ctrl_file);
if (!expect_str(buf, size, "Total SYNACKs generated: 1\n",
"SYNACKs after connection"))
goto out;
out:
/* Best-effort cleanup; results of the final commands are not checked. */
if (accept_fd >= 0)
close(accept_fd);
if (client_fd >= 0)
close(client_fd);
if (server_fd >= 0)
close(server_fd);
if (ns)
close_netns(ns);
system("ip link del tmp0");
system("ip netns del synproxy");
}

View File

@ -0,0 +1,763 @@
// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
#include <asm/errno.h>
/* Constants used by this program.
 * NOTE(review): presumably defined locally because vmlinux.h provides types
 * but no preprocessor macros - confirm.
 */
#define NSEC_PER_SEC 1000000000L
/* Ethernet: address length and the EtherTypes handled by tcp_dissect(). */
#define ETH_ALEN 6
#define ETH_P_IP 0x0800
#define ETH_P_IPV6 0x86DD
/* 32-bit word of the TCP header that contains the flags and doff. */
#define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3])
/* Bits of the IPv4 frag_off field, checked in tcp_lookup(). */
#define IP_DF 0x4000
#define IP_MF 0x2000
#define IP_OFFSET 0x1fff
/* IPv6 next-header value accepted by tcp_dissect(). */
#define NEXTHDR_TCP 6
/* TCP option kinds. */
#define TCPOPT_NOP 1
#define TCPOPT_EOL 0
#define TCPOPT_MSS 2
#define TCPOPT_WINDOW 3
#define TCPOPT_SACK_PERM 4
#define TCPOPT_TIMESTAMP 8
/* TCP option lengths, including the kind and length bytes. */
#define TCPOLEN_MSS 4
#define TCPOLEN_WINDOW 3
#define TCPOLEN_SACK_PERM 2
#define TCPOLEN_TIMESTAMP 10
/* Timestamp ticks per second; see tcp_ns_to_ts(). */
#define TCP_TS_HZ 1000
/* Layout of the low TSBITS bits of the timestamp cookie built by
 * tscookie_init(): window scale, SACK flag, ECN flag.
 */
#define TS_OPT_WSCALE_MASK 0xf
#define TS_OPT_SACK (1 << 4)
#define TS_OPT_ECN (1 << 5)
#define TSBITS 6
#define TSMASK (((__u32)1 << TSBITS) - 1)
/* Upper bound applied to a received window scale value. */
#define TCP_MAX_WSCALE 14U
/* Header sizes, in bytes, used for the bounds checks in syncookie_xdp()
 * and syncookie_handle_syn().
 */
#define IPV4_MAXLEN 60
#define TCP_MAXLEN 60
/* Values used when userspace has not stored options in the `values` map. */
#define DEFAULT_MSS4 1460
#define DEFAULT_MSS6 1440
#define DEFAULT_WSCALE 7
#define DEFAULT_TTL 64
/* Capacity of the `allowed_ports` map. */
#define MAX_ALLOWED_PORTS 8
/* Exchange the values of two lvalues of the same type. */
#define swap(a, b) \
do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
/* Read a `type` from `ptr` through a packed single-member struct, so the
 * compiler makes no alignment assumption about the address.
 */
#define __get_unaligned_t(type, ptr) ({ \
const struct { type x; } __attribute__((__packed__)) *__pptr = (typeof(__pptr))(ptr); \
__pptr->x; \
})
/* Unaligned read of the object `ptr` points to, typed after *ptr. */
#define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr))
/* Two-entry array map shared with userspace:
 *   index 0 - packed TCP/IP options, decoded by values_get_tcpipopts()
 *             (bits 0-15 IPv4 MSS, 16-19 window scale, 24-31 TTL,
 *             32-47 IPv6 MSS); zero means "use the defaults";
 *   index 1 - SYNACK counter, incremented by values_inc_synacks().
 */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, __u32);
__type(value, __u64);
__uint(max_entries, 2);
} values SEC(".maps");
/* List of TCP destination ports, in host byte order, for which SYN cookies
 * are issued. check_port_allowed() scans it from index 0 and stops at the
 * first entry equal to 0.
 */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, __u32);
__type(value, __u16);
__uint(max_entries, MAX_ALLOWED_PORTS);
} allowed_ports SEC(".maps");
/* Kernel functions resolved through __ksym and used by tcp_lookup(): look up
 * a conntrack entry for a tuple, and release the entry that was returned.
 */
extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx,
struct bpf_sock_tuple *bpf_tuple,
__u32 len_tuple,
struct bpf_ct_opts *opts,
__u32 len_opts) __ksym;
extern void bpf_ct_release(struct nf_conn *ct) __ksym;
/* Exchange two ETH_ALEN-byte Ethernet addresses in place. */
static __always_inline void swap_eth_addr(__u8 *a, __u8 *b)
{
__u8 tmp[ETH_ALEN];
__builtin_memcpy(tmp, a, ETH_ALEN);
__builtin_memcpy(a, b, ETH_ALEN);
__builtin_memcpy(b, tmp, ETH_ALEN);
}
/* Reduce a 32-bit checksum accumulator to 16 bits by adding the upper half
 * into the lower half twice (the second pass absorbs the carry of the first)
 * and return the bitwise complement of the result.
 */
static __always_inline __u16 csum_fold(__u32 csum)
{
	__u32 folded;

	folded = (csum >> 16) + (csum & 0xffff);
	folded = (folded >> 16) + (folded & 0xffff);

	return (__u16)~folded;
}
/* Finish a TCP/UDP checksum over IPv4: add the pseudo-header fields (source
 * and destination address, protocol and length) to the partial sum `csum`
 * and return the folded, complemented 16-bit result.
 *
 * The addresses are added as they appear in the packet; protocol and length
 * are shifted on little-endian builds.
 * NOTE(review): the shift presumably places them in network byte order
 * within the sum - confirm.
 */
static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
__u32 len, __u8 proto,
__u32 csum)
{
/* 64-bit accumulator so the 32-bit additions cannot lose a carry. */
__u64 s = csum;
s += (__u32)saddr;
s += (__u32)daddr;
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
s += proto + len;
#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
s += (proto + len) << 8;
#else
#error Unknown endian
#endif
/* Fold 64 -> 32 bits; the second pass absorbs the carry of the first. */
s = (s & 0xffffffff) + (s >> 32);
s = (s & 0xffffffff) + (s >> 32);
return csum_fold((__u32)s);
}
/* Finish a TCP/UDP checksum over IPv6: add the pseudo-header fields (both
 * 128-bit addresses as four 32-bit words each, length and protocol) to the
 * partial sum `csum` and return the folded, complemented 16-bit result.
 */
static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr,
const struct in6_addr *daddr,
__u32 len, __u8 proto, __u32 csum)
{
/* 64-bit accumulator so the 32-bit additions cannot lose a carry. */
__u64 sum = csum;
int i;
#pragma unroll
for (i = 0; i < 4; i++)
sum += (__u32)saddr->in6_u.u6_addr32[i];
#pragma unroll
for (i = 0; i < 4; i++)
sum += (__u32)daddr->in6_u.u6_addr32[i];
/* Don't combine additions to avoid 32-bit overflow. */
sum += bpf_htonl(len);
sum += bpf_htonl(proto);
/* Fold 64 -> 32 bits; the second pass absorbs the carry of the first. */
sum = (sum & 0xffffffff) + (sum >> 32);
sum = (sum & 0xffffffff) + (sum >> 32);
return csum_fold((__u32)sum);
}
/* Current time in nanoseconds, as reported by bpf_ktime_get_ns(). */
static __always_inline __u64 tcp_clock_ns(void)
{
return bpf_ktime_get_ns();
}
/* Convert nanoseconds to timestamp ticks of 1/TCP_TS_HZ seconds; the result
 * is truncated to 32 bits.
 */
static __always_inline __u32 tcp_ns_to_ts(__u64 ns)
{
	const __u64 ns_per_tick = NSEC_PER_SEC / TCP_TS_HZ;

	return ns / ns_per_tick;
}
/* Current time expressed in timestamp ticks (see tcp_ns_to_ts()). */
static __always_inline __u32 tcp_time_stamp_raw(void)
{
return tcp_ns_to_ts(tcp_clock_ns());
}
/* State carried across the iterations of the TCP option parser. */
struct tcpopt_context {
/* Next option byte to parse. */
__u8 *ptr;
/* End of the TCP header as announced by its data offset. */
__u8 *end;
/* End of the packet data. */
void *data_end;
/* Where to store the peer's timestamp value. */
__be32 *tsecr;
/* Window scale found so far. */
__u8 wscale;
/* Set when a well-formed timestamp option was seen. */
bool option_timestamp;
/* Set when a well-formed SACK-permitted option was seen. */
bool option_sack;
};
/* Parse the single TCP option at ctx->ptr and advance past it.
 *
 * Records window scale, timestamp and SACK-permitted options in *ctx.
 * Returns 0 when parsing may continue with the next option, and 1 when it
 * must stop: end of options reached, end-of-list option, or malformed
 * length.
 */
static int tscookie_tcpopt_parse(struct tcpopt_context *ctx)
{
__u8 opcode, opsize;
/* Each access is bounded both by the TCP header end and by the packet
 * end.
 * NOTE(review): the data_end checks are presumably what the verifier
 * needs to allow the packet reads - confirm.
 */
if (ctx->ptr >= ctx->end)
return 1;
if (ctx->ptr >= ctx->data_end)
return 1;
opcode = ctx->ptr[0];
if (opcode == TCPOPT_EOL)
return 1;
/* NOP is a single byte without a length field. */
if (opcode == TCPOPT_NOP) {
++ctx->ptr;
return 0;
}
if (ctx->ptr + 1 >= ctx->end)
return 1;
if (ctx->ptr + 1 >= ctx->data_end)
return 1;
opsize = ctx->ptr[1];
/* The length covers the kind and length bytes, so it is at least 2. */
if (opsize < 2)
return 1;
if (ctx->ptr + opsize > ctx->end)
return 1;
switch (opcode) {
case TCPOPT_WINDOW:
/* Store the shift count, capped at TCP_MAX_WSCALE. */
if (opsize == TCPOLEN_WINDOW && ctx->ptr + TCPOLEN_WINDOW <= ctx->data_end)
ctx->wscale = ctx->ptr[2] < TCP_MAX_WSCALE ? ctx->ptr[2] : TCP_MAX_WSCALE;
break;
case TCPOPT_TIMESTAMP:
if (opsize == TCPOLEN_TIMESTAMP && ctx->ptr + TCPOLEN_TIMESTAMP <= ctx->data_end) {
ctx->option_timestamp = true;
/* Client's tsval becomes our tsecr. */
*ctx->tsecr = get_unaligned((__be32 *)(ctx->ptr + 2));
}
break;
case TCPOPT_SACK_PERM:
if (opsize == TCPOLEN_SACK_PERM)
ctx->option_sack = true;
break;
}
/* Options of any other kind are skipped using their length. */
ctx->ptr += opsize;
return 0;
}
/* bpf_loop() callback: parse up to seven TCP options per invocation.
 * `context` is a struct tcpopt_context *; `index` is not used. Returns 1 as
 * soon as tscookie_tcpopt_parse() asks to stop, otherwise 0.
 */
static int tscookie_tcpopt_parse_batch(__u32 index, void *context)
{
int i;
for (i = 0; i < 7; i++)
if (tscookie_tcpopt_parse(context))
return 1;
return 0;
}
/* Build the timestamp option values for a SYNACK from the options of the
 * received SYN.
 *
 * On success *tsecr holds the peer's timestamp value and *tsval holds, in
 * network byte order, the current timestamp with its low TSBITS bits
 * replaced by the peer's window scale, a SACK-permitted flag and an ECN
 * flag. Returns false, leaving *tsval untouched, when the SYN carries no
 * well-formed timestamp option.
 */
static __always_inline bool tscookie_init(struct tcphdr *tcp_header,
__u16 tcp_len, __be32 *tsval,
__be32 *tsecr, void *data_end)
{
struct tcpopt_context loop_ctx = {
.ptr = (__u8 *)(tcp_header + 1),
.end = (__u8 *)tcp_header + tcp_len,
.data_end = data_end,
.tsecr = tsecr,
/* 0xf is what tcp_mkoptions() treats as "no window scale". */
.wscale = TS_OPT_WSCALE_MASK,
.option_timestamp = false,
.option_sack = false,
};
u32 cookie;
/* 6 callback invocations of up to 7 options each. */
bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0);
if (!loop_ctx.option_timestamp)
return false;
cookie = tcp_time_stamp_raw() & ~TSMASK;
cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK;
if (loop_ctx.option_sack)
cookie |= TS_OPT_SACK;
if (tcp_header->ece && tcp_header->cwr)
cookie |= TS_OPT_ECN;
*tsval = bpf_htonl(cookie);
return true;
}
/* Fetch the MSS, window scale and TTL to advertise in generated SYNACKs.
 *
 * The settings come packed in entry 0 of the `values` map: bits 0-15 hold
 * the IPv4 MSS, bits 16-19 the window scale, bits 24-31 the TTL and bits
 * 32-47 the IPv6 MSS. When the entry is missing or zero, the DEFAULT_*
 * constants are used instead. `ipv6` selects which MSS is returned.
 */
static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale,
						 __u8 *ttl, bool ipv6)
{
	__u32 slot = 0;
	__u64 *entry = bpf_map_lookup_elem(&values, &slot);
	__u64 packed = entry ? *entry : 0;

	if (packed == 0) {
		*mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4;
		*wscale = DEFAULT_WSCALE;
		*ttl = DEFAULT_TTL;
		return;
	}

	*mss = (ipv6 ? packed >> 32 : packed) & 0xffff;
	*wscale = (packed >> 16) & 0xf;
	*ttl = (packed >> 24) & 0xff;
}
static __always_inline void values_inc_synacks(void)
{
__u32 key = 1;
__u32 *value;
value = bpf_map_lookup_elem(&values, &key);
if (value)
__sync_fetch_and_add(value, 1);
}
/* Return true when `port` (host byte order) is listed in the
 * `allowed_ports` map. The scan covers at most MAX_ALLOWED_PORTS entries
 * and ends early at a failed lookup or at an entry equal to 0.
 */
static __always_inline bool check_port_allowed(__u16 port)
{
__u32 i;
for (i = 0; i < MAX_ALLOWED_PORTS; i++) {
__u32 key = i;
__u16 *value;
value = bpf_map_lookup_elem(&allowed_ports, &key);
if (!value)
break;
/* 0 is a terminator value. Check it first to avoid matching on
 * a forbidden port == 0 and returning true.
 */
if (*value == 0)
break;
if (*value == port)
return true;
}
return false;
}
/* Pointers to the headers of the packet being processed. Exactly one of
 * ipv4/ipv6 is set by tcp_dissect(); the other one is NULL.
 */
struct header_pointers {
struct ethhdr *eth;
struct iphdr *ipv4;
struct ipv6hdr *ipv6;
struct tcphdr *tcp;
/* TCP header length in bytes (doff * 4). */
__u16 tcp_len;
};
/* Locate the Ethernet, IPv4/IPv6 and TCP headers in [data, data_end) and
 * store them in *hdr.
 *
 * Returns XDP_TX when *hdr is filled in for a TCP packet (the caller treats
 * this value as "keep processing"), XDP_PASS for packets this program does
 * not handle, and XDP_DROP for truncated or malformed headers.
 */
static __always_inline int tcp_dissect(void *data, void *data_end,
struct header_pointers *hdr)
{
hdr->eth = data;
if (hdr->eth + 1 > data_end)
return XDP_DROP;
switch (bpf_ntohs(hdr->eth->h_proto)) {
case ETH_P_IP:
hdr->ipv6 = NULL;
hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
if (hdr->ipv4 + 1 > data_end)
return XDP_DROP;
/* ihl counts 32-bit words and must cover at least the fixed header. */
if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4))
return XDP_DROP;
if (hdr->ipv4->version != 4)
return XDP_DROP;
if (hdr->ipv4->protocol != IPPROTO_TCP)
return XDP_PASS;
hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
break;
case ETH_P_IPV6:
hdr->ipv4 = NULL;
hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
if (hdr->ipv6 + 1 > data_end)
return XDP_DROP;
if (hdr->ipv6->version != 6)
return XDP_DROP;
/* XXX: Extension headers are not supported and could circumvent
 * XDP SYN flood protection.
 */
if (hdr->ipv6->nexthdr != NEXTHDR_TCP)
return XDP_PASS;
hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
break;
default:
/* XXX: VLANs will circumvent XDP SYN flood protection. */
return XDP_PASS;
}
if (hdr->tcp + 1 > data_end)
return XDP_DROP;
/* doff counts 32-bit words and must cover at least the fixed header. */
hdr->tcp_len = hdr->tcp->doff * 4;
if (hdr->tcp_len < sizeof(*hdr->tcp))
return XDP_DROP;
return XDP_TX;
}
/* Look up the packet's connection in conntrack.
 *
 * Returns XDP_PASS when the packet belongs to a confirmed connection,
 * XDP_TX when there is no entry or the entry is not confirmed yet (the
 * caller treats this value as "keep processing"), XDP_DROP for IPv4 packets
 * whose fragment field is anything other than "don't fragment", and
 * XDP_ABORTED on lookup errors other than -ENOENT.
 */
static __always_inline int tcp_lookup(struct xdp_md *ctx, struct header_pointers *hdr)
{
	struct bpf_ct_opts ct_lookup_opts = {
		.netns_id = BPF_F_CURRENT_NETNS,
		.l4proto = IPPROTO_TCP,
	};
	struct bpf_sock_tuple tup = {};
	struct nf_conn *ct;
	__u32 tup_size;

	if (hdr->ipv4) {
		/* TCP doesn't normally use fragments, and XDP can't reassemble
		 * them.
		 */
		if ((hdr->ipv4->frag_off & bpf_htons(IP_DF | IP_MF | IP_OFFSET)) != bpf_htons(IP_DF))
			return XDP_DROP;

		tup.ipv4.saddr = hdr->ipv4->saddr;
		tup.ipv4.daddr = hdr->ipv4->daddr;
		tup.ipv4.sport = hdr->tcp->source;
		tup.ipv4.dport = hdr->tcp->dest;
		tup_size = sizeof(tup.ipv4);
	} else if (hdr->ipv6) {
		__builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr));
		__builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr));
		tup.ipv6.sport = hdr->tcp->source;
		tup.ipv6.dport = hdr->tcp->dest;
		tup_size = sizeof(tup.ipv6);
	} else {
		/* The verifier can't track that either ipv4 or ipv6 is not
		 * NULL.
		 */
		return XDP_ABORTED;
	}

	ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
	if (ct) {
		/* Copy the status out before dropping the reference. */
		unsigned long status = ct->status;

		bpf_ct_release(ct);
		/* IPS_CONFIRMED_BIT is a bit number, not a mask: shift it
		 * into place before testing the status word.
		 */
		if (status & (1UL << IPS_CONFIRMED_BIT))
			return XDP_PASS;
	} else if (ct_lookup_opts.error != -ENOENT) {
		return XDP_ABORTED;
	}

	/* error == -ENOENT || the entry is not confirmed */
	return XDP_TX;
}
/* Write the TCP options of a SYNACK to `buf` and return the number of
 * 32-bit words written.
 *
 * An MSS option is always emitted. When `tsopt` is not NULL, it points to
 * the timestamp value/echo pair prepared by tscookie_init(); a timestamp
 * option is then added, preceded by SACK-permitted (or two NOPs) depending
 * on bit 4 of the value, and followed by a window scale option unless the
 * low four bits of the value are all ones.
 */
static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss,
__u8 wscale)
{
__be32 *start = buf;
*buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
if (!tsopt)
return buf - start;
/* Bit 4 is the SACK flag stored by tscookie_init() (TS_OPT_SACK). */
if (tsopt[0] & bpf_htonl(1 << 4))
*buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) |
(TCPOLEN_SACK_PERM << 16) |
(TCPOPT_TIMESTAMP << 8) |
TCPOLEN_TIMESTAMP);
else
*buf++ = bpf_htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_TIMESTAMP << 8) |
TCPOLEN_TIMESTAMP);
*buf++ = tsopt[0];
*buf++ = tsopt[1];
/* 0xf in the window scale field means the SYN had no such option. */
if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf))
*buf++ = bpf_htonl((TCPOPT_NOP << 24) |
(TCPOPT_WINDOW << 16) |
(TCPOLEN_WINDOW << 8) |
wscale);
return buf - start;
}
/* Turn the received SYN's TCP header into a SYNACK in place: set the flags,
 * swap the ports, acknowledge the peer's sequence number, use `cookie` as
 * our sequence number and append the options. The checksum is left at zero
 * for the caller to fill in.
 */
static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header,
__u32 cookie, __be32 *tsopt,
__u16 mss, __u8 wscale)
{
void *tcp_options;
tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK;
/* Bit 5 is the ECN flag stored by tscookie_init() (TS_OPT_ECN). */
if (tsopt && (tsopt[0] & bpf_htonl(1 << 5)))
tcp_flag_word(tcp_header) |= TCP_FLAG_ECE;
tcp_header->doff = 5; /* doff is part of tcp_flag_word. */
swap(tcp_header->source, tcp_header->dest);
tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1);
tcp_header->seq = bpf_htonl(cookie);
tcp_header->window = 0;
tcp_header->urg_ptr = 0;
tcp_header->check = 0; /* Calculate checksum later. */
tcp_options = (void *)(tcp_header + 1);
/* tcp_mkoptions() returns a count of 32-bit words, the unit of doff. */
tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale);
}
/* Rewrite an IPv4 SYN into a SYNACK in place: swap the Ethernet and IP
 * addresses, reset tos/id, apply the configured TTL, build the TCP part and
 * update hdr->tcp_len and the IP total length. The IPv4 header is assumed
 * to be sizeof(struct iphdr) long at this point; checksums are computed by
 * the caller.
 */
static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr,
__u32 cookie, __be32 *tsopt)
{
__u8 wscale;
__u16 mss;
__u8 ttl;
values_get_tcpipopts(&mss, &wscale, &ttl, false);
swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
swap(hdr->ipv4->saddr, hdr->ipv4->daddr);
hdr->ipv4->check = 0; /* Calculate checksum later. */
hdr->ipv4->tos = 0;
hdr->ipv4->id = 0;
hdr->ipv4->ttl = ttl;
tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
hdr->tcp_len = hdr->tcp->doff * 4;
hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len);
}
/* Rewrite an IPv6 SYN into a SYNACK in place: swap the Ethernet and IP
 * addresses, reset the first 32-bit word of the IPv6 header to version 6
 * with all other bits zero, apply the configured hop limit, build the TCP
 * part and update hdr->tcp_len and the payload length. The TCP checksum is
 * computed by the caller.
 */
static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr,
__u32 cookie, __be32 *tsopt)
{
__u8 wscale;
__u16 mss;
__u8 ttl;
values_get_tcpipopts(&mss, &wscale, &ttl, true);
swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
swap(hdr->ipv6->saddr, hdr->ipv6->daddr);
*(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000);
hdr->ipv6->hop_limit = ttl;
tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
hdr->tcp_len = hdr->tcp->doff * 4;
hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len);
}
/* Answer a SYN with a SYN-cookie SYNACK built in place of the request.
 *
 * Steps: reject SYNs with FIN/RST set or to ports that are not allowed,
 * verify the received checksums, obtain a cookie from the
 * bpf_tcp_raw_gen_syncookie_ipv{4,6} helper, rewrite the headers into a
 * SYNACK, recompute the checksums, trim the packet to the new length and
 * count the SYNACK.
 *
 * Returns XDP_TX when the SYNACK is ready to be sent back, XDP_DROP for
 * rejected packets and XDP_ABORTED when a helper fails or the packet is too
 * short.
 */
static __always_inline int syncookie_handle_syn(struct header_pointers *hdr,
struct xdp_md *ctx,
void *data, void *data_end)
{
__u32 old_pkt_size, new_pkt_size;
/* Unlike clang 10, clang 11 and 12 generate code that doesn't pass the
 * BPF verifier if tsopt is not volatile. Volatile forces it to store
 * the pointer value and use it directly, otherwise tcp_mkoptions is
 * (mis)compiled like this:
 * if (!tsopt)
 * return buf - start;
 * reg = stored_return_value_of_tscookie_init;
 * if (reg)
 * tsopt = tsopt_buf;
 * else
 * tsopt = NULL;
 * ...
 * *buf++ = tsopt[1];
 * It creates a dead branch where tsopt is assigned NULL, but the
 * verifier can't prove it's dead and blocks the program.
 */
__be32 * volatile tsopt = NULL;
/* tsopt_buf[0] is our timestamp value, tsopt_buf[1] the echoed one. */
__be32 tsopt_buf[2] = {};
__u16 ip_len;
__u32 cookie;
__s64 value;
/* Checksum is not yet verified, but both checksum failure and TCP
 * header checks return XDP_DROP, so the order doesn't matter.
 */
if (hdr->tcp->fin || hdr->tcp->rst)
return XDP_DROP;
/* Issue SYN cookies on allowed ports, drop SYN packets on blocked
 * ports.
 */
if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest)))
return XDP_DROP;
if (hdr->ipv4) {
/* Check the IPv4 and TCP checksums before creating a SYNACK. */
value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0);
if (value < 0)
return XDP_ABORTED;
if (csum_fold(value) != 0)
return XDP_DROP; /* Bad IPv4 checksum. */
value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
if (value < 0)
return XDP_ABORTED;
if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr,
hdr->tcp_len, IPPROTO_TCP, value) != 0)
return XDP_DROP; /* Bad TCP checksum. */
/* Length of the IPv4 header of the SYNACK, which has no options. */
ip_len = sizeof(*hdr->ipv4);
value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp,
hdr->tcp_len);
} else if (hdr->ipv6) {
/* Check the TCP checksum before creating a SYNACK. */
value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
if (value < 0)
return XDP_ABORTED;
if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr,
hdr->tcp_len, IPPROTO_TCP, value) != 0)
return XDP_DROP; /* Bad TCP checksum. */
ip_len = sizeof(*hdr->ipv6);
value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp,
hdr->tcp_len);
} else {
return XDP_ABORTED;
}
if (value < 0)
return XDP_ABORTED;
/* The cookie is taken from the low 32 bits of the helper's result. */
cookie = (__u32)value;
/* Timestamp options are used only if the SYN carried a timestamp. */
if (tscookie_init((void *)hdr->tcp, hdr->tcp_len,
&tsopt_buf[0], &tsopt_buf[1], data_end))
tsopt = tsopt_buf;
/* Check that there is enough space for a SYNACK. It also covers
 * the check that the destination of the __builtin_memmove below
 * doesn't overflow.
 */
if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end)
return XDP_ABORTED;
if (hdr->ipv4) {
/* Drop IPv4 options: move the fixed TCP header so that it directly
 * follows a minimal IPv4 header.
 */
if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) {
struct tcphdr *new_tcp_header;
new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4);
__builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp));
hdr->tcp = new_tcp_header;
hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4;
}
tcpv4_gen_synack(hdr, cookie, tsopt);
} else if (hdr->ipv6) {
tcpv6_gen_synack(hdr, cookie, tsopt);
} else {
return XDP_ABORTED;
}
/* Recalculate checksums. */
hdr->tcp->check = 0;
value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
if (value < 0)
return XDP_ABORTED;
if (hdr->ipv4) {
hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr,
hdr->ipv4->daddr,
hdr->tcp_len,
IPPROTO_TCP,
value);
hdr->ipv4->check = 0;
value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0);
if (value < 0)
return XDP_ABORTED;
hdr->ipv4->check = csum_fold(value);
} else if (hdr->ipv6) {
hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr,
&hdr->ipv6->daddr,
hdr->tcp_len,
IPPROTO_TCP,
value);
} else {
return XDP_ABORTED;
}
/* Set the new packet size. */
old_pkt_size = data_end - data;
new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4;
/* The unsigned difference is passed as the tail adjustment delta. */
if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size))
return XDP_ABORTED;
values_inc_synacks();
return XDP_TX;
}
/* Validate the SYN cookie carried by an ACK.
 *
 * Returns XDP_PASS when the bpf_tcp_raw_check_syncookie_ipv{4,6} helper
 * accepts the packet, XDP_DROP when it rejects it or the RST flag is set,
 * and XDP_ABORTED when neither IP header pointer is available.
 */
static __always_inline int syncookie_handle_ack(struct header_pointers *hdr)
{
	int rc;

	if (hdr->tcp->rst)
		return XDP_DROP;

	if (hdr->ipv4) {
		rc = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp);
	} else if (hdr->ipv6) {
		rc = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp);
	} else {
		return XDP_ABORTED;
	}

	return rc ? XDP_DROP : XDP_PASS;
}
/* XDP entry point of the SYN proxy.
 *
 * Non-TCP traffic and packets of confirmed connections are passed to the
 * stack. For the remaining TCP packets exactly one of SYN and ACK must be
 * set: a SYN is answered with a SYN-cookie SYNACK (XDP_TX), an ACK is
 * passed only if its cookie is valid, anything else is dropped.
 */
SEC("xdp")
int syncookie_xdp(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data = (void *)(long)ctx->data;
	struct header_pointers hdr;
	int ret;

	/* XDP_TX from the two helpers below means "keep processing". */
	ret = tcp_dissect(data, data_end, &hdr);
	if (ret != XDP_TX)
		return ret;

	ret = tcp_lookup(ctx, &hdr);
	if (ret != XDP_TX)
		return ret;

	/* Packet is TCP and doesn't belong to an established connection. */
	if ((hdr.tcp->syn ^ hdr.tcp->ack) != 1)
		return XDP_DROP;

	/* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr.tcp_len
	 * to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier.
	 */
	if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr.tcp_len))
		return XDP_ABORTED;

	/* The packet pointers are reloaded from ctx after the adjustment and
	 * the header pointers are derived again from them.
	 */
	data_end = (void *)(long)ctx->data_end;
	data = (void *)(long)ctx->data;

	if (hdr.ipv4) {
		hdr.eth = data;
		hdr.ipv4 = (void *)hdr.eth + sizeof(*hdr.eth);
		/* IPV4_MAXLEN is needed when calculating checksum.
		 * At least sizeof(struct iphdr) is needed here to access ihl.
		 */
		if ((void *)hdr.ipv4 + IPV4_MAXLEN > data_end)
			return XDP_ABORTED;
		hdr.tcp = (void *)hdr.ipv4 + hdr.ipv4->ihl * 4;
	} else if (hdr.ipv6) {
		hdr.eth = data;
		hdr.ipv6 = (void *)hdr.eth + sizeof(*hdr.eth);
		hdr.tcp = (void *)hdr.ipv6 + sizeof(*hdr.ipv6);
	} else {
		return XDP_ABORTED;
	}

	if ((void *)hdr.tcp + TCP_MAXLEN > data_end)
		return XDP_ABORTED;

	/* We run out of registers, tcp_len gets spilled to the stack, and the
	 * verifier forgets its min and max values checked above in tcp_dissect.
	 */
	hdr.tcp_len = hdr.tcp->doff * 4;
	if (hdr.tcp_len < sizeof(*hdr.tcp))
		return XDP_ABORTED;

	return hdr.tcp->syn ? syncookie_handle_syn(&hdr, ctx, data, data_end) :
			      syncookie_handle_ack(&hdr);
}
/* License string placed in the object's "license" section. */
char _license[] SEC("license") = "GPL";

View File

@ -0,0 +1,418 @@
// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
#include <stdnoreturn.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <getopt.h>
#include <signal.h>
#include <sys/types.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include <net/if.h>
#include <linux/if_link.h>
#include <linux/limits.h>
/* Interface the tool operates on; also read by the signal handler. */
static unsigned int ifindex;
/* ID of the XDP program attached by this process, or 0 if it attached none.
 * cleanup() detaches the program only when this is non-zero.
 */
static __u32 attached_prog_id;
static void noreturn cleanup(int sig)
{
DECLARE_LIBBPF_OPTS(bpf_xdp_attach_opts, opts);
int prog_fd;
int err;
if (attached_prog_id == 0)
exit(0);
prog_fd = bpf_prog_get_fd_by_id(attached_prog_id);
if (prog_fd < 0) {
fprintf(stderr, "Error: bpf_prog_get_fd_by_id: %s\n", strerror(-prog_fd));
err = bpf_xdp_attach(ifindex, -1, 0, NULL);
if (err < 0) {
fprintf(stderr, "Error: bpf_set_link_xdp_fd: %s\n", strerror(-err));
fprintf(stderr, "Failed to detach XDP program\n");
exit(1);
}
} else {
opts.old_prog_fd = prog_fd;
err = bpf_xdp_attach(ifindex, -1, XDP_FLAGS_REPLACE, &opts);
close(prog_fd);
if (err < 0) {
fprintf(stderr, "Error: bpf_set_link_xdp_fd_opts: %s\n", strerror(-err));
/* Not an error if already replaced by someone else. */
if (err != -EEXIST) {
fprintf(stderr, "Failed to detach XDP program\n");
exit(1);
}
}
}
exit(0);
}
/* Print the command-line synopsis to stderr and exit with status 1.
 * `progname` is the name shown in the synopsis.
 */
static noreturn void usage(const char *progname)
{
fprintf(stderr, "Usage: %s [--iface <iface>|--prog <prog_id>] [--mss4 <mss ipv4> --mss6 <mss ipv6> --wscale <wscale> --ttl <ttl>] [--ports <port1>,<port2>,...] [--single]\n",
progname);
exit(1);
}
/* Parse `arg` as an unsigned decimal number not greater than `limit`.
 *
 * The whole string must consist of decimal digits. On any violation the
 * usage text for `progname` is printed and the process exits, so the
 * function only returns for valid input.
 */
static unsigned long parse_arg_ul(const char *progname, const char *arg, unsigned long limit)
{
	unsigned long res;
	char *endptr;

	/* strtoul() skips leading whitespace and accepts a sign, negating the
	 * value of "-N" instead of failing. Require a leading digit so that
	 * such input (and the empty string) is rejected.
	 */
	if (arg[0] < '0' || arg[0] > '9')
		usage(progname);

	errno = 0;
	res = strtoul(arg, &endptr, 10);
	if (errno != 0 || *endptr != '\0' || res > limit)
		usage(progname);

	return res;
}
/* Parse the command line into the output parameters.
 *
 * Outputs:
 *   *ifindex   - index of the interface given with --iface, else 0;
 *   *prog_id   - program ID given with --prog, else 0;
 *   *tcpipopts - packed TCP/IP options (bits 0-15 IPv4 MSS, 16-23 window
 *                scale, 24-31 TTL, 32-47 IPv6 MSS), or 0 when none of
 *                --mss4/--mss6/--wscale/--ttl was given;
 *   *ports     - the raw argument of --ports, or NULL;
 *   *single    - true when --single was given.
 *
 * Exactly one of --iface and --prog is required, and the four TCP/IP
 * options must be given together with non-zero values. Any violation prints
 * the usage text and exits.
 */
static void parse_options(int argc, char *argv[], unsigned int *ifindex, __u32 *prog_id,
			  __u64 *tcpipopts, char **ports, bool *single)
{
	static struct option long_options[] = {
		{ "help", no_argument, NULL, 'h' },
		{ "iface", required_argument, NULL, 'i' },
		{ "prog", required_argument, NULL, 'x' },
		{ "mss4", required_argument, NULL, 4 },
		{ "mss6", required_argument, NULL, 6 },
		{ "wscale", required_argument, NULL, 'w' },
		{ "ttl", required_argument, NULL, 't' },
		{ "ports", required_argument, NULL, 'p' },
		{ "single", no_argument, NULL, 's' },
		{ NULL, 0, NULL, 0 },
	};
	unsigned long mss4 = 0, mss6 = 0, wscale = 0, ttl = 0;
	/* One bit per TCP/IP option seen: mss4, mss6, wscale, ttl. */
	unsigned int tcpipopts_mask = 0;

	if (argc < 2)
		usage(argv[0]);

	*ifindex = 0;
	*prog_id = 0;
	*tcpipopts = 0;
	*ports = NULL;
	*single = false;

	while (true) {
		int opt;

		opt = getopt_long(argc, argv, "", long_options, NULL);
		if (opt == -1)
			break;

		switch (opt) {
		case 'h':
			usage(argv[0]);
			break;
		case 'i':
			*ifindex = if_nametoindex(optarg);
			if (*ifindex == 0)
				usage(argv[0]);
			break;
		case 'x':
			*prog_id = parse_arg_ul(argv[0], optarg, UINT32_MAX);
			if (*prog_id == 0)
				usage(argv[0]);
			break;
		case 4:
			mss4 = parse_arg_ul(argv[0], optarg, UINT16_MAX);
			tcpipopts_mask |= 1 << 0;
			break;
		case 6:
			mss6 = parse_arg_ul(argv[0], optarg, UINT16_MAX);
			tcpipopts_mask |= 1 << 1;
			break;
		case 'w':
			wscale = parse_arg_ul(argv[0], optarg, 14);
			tcpipopts_mask |= 1 << 2;
			break;
		case 't':
			ttl = parse_arg_ul(argv[0], optarg, UINT8_MAX);
			tcpipopts_mask |= 1 << 3;
			break;
		case 'p':
			*ports = optarg;
			break;
		case 's':
			*single = true;
			break;
		default:
			usage(argv[0]);
		}
	}
	/* Positional arguments are not accepted. */
	if (optind < argc)
		usage(argv[0]);

	if (tcpipopts_mask == 0xf) {
		if (mss4 == 0 || mss6 == 0 || wscale == 0 || ttl == 0)
			usage(argv[0]);
		/* Widen before shifting: unsigned long may be only 32 bits
		 * wide, in which case shifting it by 32 is undefined.
		 */
		*tcpipopts = ((__u64)mss6 << 32) | ((__u64)ttl << 24) |
			     ((__u64)wscale << 16) | (__u64)mss4;
	} else if (tcpipopts_mask != 0) {
		/* The TCP/IP options are all-or-nothing. */
		usage(argv[0]);
	}

	if (*ifindex != 0 && *prog_id != 0)
		usage(argv[0]);
	if (*ifindex == 0 && *prog_id == 0)
		usage(argv[0]);
}
/* Load "<argv0>_kern.o" and attach its syncookie_xdp program to `ifindex`.
 *
 * On success the program ID is stored in attached_prog_id and SIGINT/SIGTERM
 * handlers are installed so that the program gets detached on termination.
 * The attachment is made with XDP_FLAGS_UPDATE_IF_NOEXIST, i.e. it fails if
 * the interface already has a program.
 *
 * Returns 0 on success or a negative error code. The BPF object is closed
 * on every path once it has been opened.
 * NOTE(review): closing the object on success presumably leaves the program
 * attached because the interface keeps its own reference - confirm.
 */
static int syncookie_attach(const char *argv0, unsigned int ifindex)
{
	struct bpf_prog_info info = {};
	__u32 info_len = sizeof(info);
	char xdp_filename[PATH_MAX];
	struct bpf_program *prog;
	struct bpf_object *obj;
	int prog_fd;
	int len;
	int err;

	len = snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv0);
	if (len < 0 || (size_t)len >= sizeof(xdp_filename)) {
		fprintf(stderr, "Error: BPF object file name is too long\n");
		return -ENAMETOOLONG;
	}

	obj = bpf_object__open_file(xdp_filename, NULL);
	err = libbpf_get_error(obj);
	if (err < 0) {
		fprintf(stderr, "Error: bpf_object__open_file: %s\n", strerror(-err));
		return err;
	}

	err = bpf_object__load(obj);
	if (err < 0) {
		fprintf(stderr, "Error: bpf_object__load: %s\n", strerror(-err));
		goto out;
	}

	prog = bpf_object__find_program_by_name(obj, "syncookie_xdp");
	if (!prog) {
		fprintf(stderr, "Error: bpf_object__find_program_by_name: program syncookie_xdp was not found\n");
		err = -ENOENT;
		goto out;
	}

	prog_fd = bpf_program__fd(prog);

	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
	if (err < 0) {
		fprintf(stderr, "Error: bpf_obj_get_info_by_fd: %s\n", strerror(-err));
		goto out;
	}
	/* Publish the ID before attaching, so that a signal arriving right
	 * after the attach already finds what it has to detach.
	 */
	attached_prog_id = info.id;
	signal(SIGINT, cleanup);
	signal(SIGTERM, cleanup);

	err = bpf_xdp_attach(ifindex, prog_fd, XDP_FLAGS_UPDATE_IF_NOEXIST, NULL);
	if (err < 0) {
		fprintf(stderr, "Error: bpf_set_link_xdp_fd: %s\n", strerror(-err));
		signal(SIGINT, SIG_DFL);
		signal(SIGTERM, SIG_DFL);
		attached_prog_id = 0;
		goto out;
	}
	err = 0;
out:
	bpf_object__close(obj);
	return err;
}
/* Open the "values" and "allowed_ports" maps used by the XDP program with
 * ID `prog_id`.
 *
 * On success returns 0 and stores one file descriptor per map in
 * *values_map_fd and *ports_map_fd; the caller must close them. On failure
 * returns a negative error code and both outputs are -1. Fails with -ENOENT
 * when the program is not an XDP program or either map is not found among
 * the map IDs that were retrieved.
 */
static int syncookie_open_bpf_maps(__u32 prog_id, int *values_map_fd, int *ports_map_fd)
{
	struct bpf_prog_info prog_info;
	__u32 map_ids[8];
	const __u32 max_map_ids = sizeof(map_ids) / sizeof(map_ids[0]);
	__u32 nr_map_ids;
	__u32 info_len;
	int prog_fd;
	int err;
	__u32 i;

	*values_map_fd = -1;
	*ports_map_fd = -1;

	prog_fd = bpf_prog_get_fd_by_id(prog_id);
	if (prog_fd < 0) {
		fprintf(stderr, "Error: bpf_prog_get_fd_by_id: %s\n", strerror(-prog_fd));
		return prog_fd;
	}

	prog_info = (struct bpf_prog_info) {
		.nr_map_ids = max_map_ids,
		/* Go through unsigned long so the pointer is widened
		 * correctly where pointers are narrower than 64 bits.
		 */
		.map_ids = (__u64)(unsigned long)map_ids,
	};
	info_len = sizeof(prog_info);

	err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &info_len);
	if (err != 0) {
		fprintf(stderr, "Error: bpf_obj_get_info_by_fd: %s\n", strerror(-err));
		goto out;
	}

	if (prog_info.type != BPF_PROG_TYPE_XDP) {
		fprintf(stderr, "Error: BPF prog type is not BPF_PROG_TYPE_XDP\n");
		err = -ENOENT;
		goto out;
	}
	if (prog_info.nr_map_ids < 2) {
		fprintf(stderr, "Error: Found %u BPF maps, expected at least 2\n",
			prog_info.nr_map_ids);
		err = -ENOENT;
		goto out;
	}

	/* nr_map_ids is written back with the program's map count, which may
	 * be larger than the number of IDs that fit into map_ids. Never index
	 * past the buffer.
	 */
	nr_map_ids = prog_info.nr_map_ids;
	if (nr_map_ids > max_map_ids)
		nr_map_ids = max_map_ids;

	for (i = 0; i < nr_map_ids; i++) {
		struct bpf_map_info map_info = {};
		int map_fd;

		err = bpf_map_get_fd_by_id(map_ids[i]);
		if (err < 0) {
			fprintf(stderr, "Error: bpf_map_get_fd_by_id: %s\n", strerror(-err));
			goto err_close_map_fds;
		}
		map_fd = err;

		info_len = sizeof(map_info);
		err = bpf_obj_get_info_by_fd(map_fd, &map_info, &info_len);
		if (err != 0) {
			fprintf(stderr, "Error: bpf_obj_get_info_by_fd: %s\n", strerror(-err));
			close(map_fd);
			goto err_close_map_fds;
		}
		/* Keep the descriptors of the two maps of interest. */
		if (strcmp(map_info.name, "values") == 0) {
			*values_map_fd = map_fd;
			continue;
		}
		if (strcmp(map_info.name, "allowed_ports") == 0) {
			*ports_map_fd = map_fd;
			continue;
		}
		close(map_fd);
	}

	if (*values_map_fd != -1 && *ports_map_fd != -1) {
		err = 0;
		goto out;
	}

	err = -ENOENT;

err_close_map_fds:
	if (*values_map_fd != -1)
		close(*values_map_fd);
	if (*ports_map_fd != -1)
		close(*ports_map_fd);
	*values_map_fd = -1;
	*ports_map_fd = -1;

out:
	close(prog_fd);
	return err;
}
int main(int argc, char *argv[])
{
int values_map_fd, ports_map_fd;
__u64 tcpipopts;
bool firstiter;
__u64 prevcnt;
__u32 prog_id;
char *ports;
bool single;
int err = 0;
parse_options(argc, argv, &ifindex, &prog_id, &tcpipopts, &ports, &single);
if (prog_id == 0) {
err = bpf_xdp_query_id(ifindex, 0, &prog_id);
if (err < 0) {
fprintf(stderr, "Error: bpf_get_link_xdp_id: %s\n", strerror(-err));
goto out;
}
if (prog_id == 0) {
err = syncookie_attach(argv[0], ifindex);
if (err < 0)
goto out;
prog_id = attached_prog_id;
}
}
err = syncookie_open_bpf_maps(prog_id, &values_map_fd, &ports_map_fd);
if (err < 0)
goto out;
if (ports) {
__u16 port_last = 0;
__u32 port_idx = 0;
char *p = ports;
fprintf(stderr, "Replacing allowed ports\n");
while (p && *p != '\0') {
char *token = strsep(&p, ",");
__u16 port;
port = parse_arg_ul(argv[0], token, UINT16_MAX);
err = bpf_map_update_elem(ports_map_fd, &port_idx, &port, BPF_ANY);
if (err != 0) {
fprintf(stderr, "Error: bpf_map_update_elem: %s\n", strerror(-err));
fprintf(stderr, "Failed to add port %u (index %u)\n",
port, port_idx);
goto out_close_maps;
}
fprintf(stderr, "Added port %u\n", port);
port_idx++;
}
err = bpf_map_update_elem(ports_map_fd, &port_idx, &port_last, BPF_ANY);
if (err != 0) {
fprintf(stderr, "Error: bpf_map_update_elem: %s\n", strerror(-err));
fprintf(stderr, "Failed to add the terminator value 0 (index %u)\n",
port_idx);
goto out_close_maps;
}
}
if (tcpipopts) {
__u32 key = 0;
fprintf(stderr, "Replacing TCP/IP options\n");
err = bpf_map_update_elem(values_map_fd, &key, &tcpipopts, BPF_ANY);
if (err != 0) {
fprintf(stderr, "Error: bpf_map_update_elem: %s\n", strerror(-err));
goto out_close_maps;
}
}
if ((ports || tcpipopts) && attached_prog_id == 0 && !single)
goto out_close_maps;
prevcnt = 0;
firstiter = true;
while (true) {
__u32 key = 1;
__u64 value;
err = bpf_map_lookup_elem(values_map_fd, &key, &value);
if (err != 0) {
fprintf(stderr, "Error: bpf_map_lookup_elem: %s\n", strerror(-err));
goto out_close_maps;
}
if (firstiter) {
prevcnt = value;
firstiter = false;
}
if (single) {
printf("Total SYNACKs generated: %llu\n", value);
break;
}
printf("SYNACKs generated: %llu (total %llu)\n", value - prevcnt, value);
prevcnt = value;
sleep(1);
}
out_close_maps:
close(values_map_fd);
close(ports_map_fd);
out:
return err == 0 ? 0 : 1;
}