From 47dceb8ecdc1c3ad1818dfea3d659a05b74c3fc2 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Aug 2015 22:31:34 -0400 Subject: [PATCH 1/4] packet: add classic BPF fanout mode Add fanout mode PACKET_FANOUT_CBPF that accepts a classic BPF program to select a socket. This avoids having to keep adding special case fanout modes. One example use case is application layer load balancing. The QUIC protocol, for instance, encodes a connection ID in UDP payload. Also add socket option SOL_PACKET/PACKET_FANOUT_DATA that updates data associated with the socket group. Fanout mode PACKET_FANOUT_CBPF is the only user so far. Signed-off-by: Willem de Bruijn Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/if_packet.h | 2 + net/packet/af_packet.c | 99 +++++++++++++++++++++++++++++++++- net/packet/internal.h | 5 +- 3 files changed, 104 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index d3d715f8c88f..a4bb16fa822e 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -55,6 +55,7 @@ struct sockaddr_ll { #define PACKET_TX_HAS_OFF 19 #define PACKET_QDISC_BYPASS 20 #define PACKET_ROLLOVER_STATS 21 +#define PACKET_FANOUT_DATA 22 #define PACKET_FANOUT_HASH 0 #define PACKET_FANOUT_LB 1 @@ -62,6 +63,7 @@ struct sockaddr_ll { #define PACKET_FANOUT_ROLLOVER 3 #define PACKET_FANOUT_RND 4 #define PACKET_FANOUT_QM 5 +#define PACKET_FANOUT_CBPF 6 #define PACKET_FANOUT_FLAG_ROLLOVER 0x1000 #define PACKET_FANOUT_FLAG_DEFRAG 0x8000 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b5afe538bb88..8869d07013e6 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -92,6 +92,7 @@ #ifdef CONFIG_INET #include #endif +#include #include "internal.h" @@ -1410,6 +1411,22 @@ static unsigned int fanout_demux_qm(struct packet_fanout *f, return skb_get_queue_mapping(skb) % num; } +static unsigned int fanout_demux_bpf(struct packet_fanout *f, + struct sk_buff *skb, + unsigned int num) +{ + struct bpf_prog *prog; + unsigned int ret = 0; + + rcu_read_lock(); + prog = rcu_dereference(f->bpf_prog); + if (prog) + ret = BPF_PROG_RUN(prog, skb) % num; + rcu_read_unlock(); + + return ret; +} + static bool fanout_has_flag(struct packet_fanout *f, u16 flag) { return f->flags & (flag >> 8); @@ -1454,6 +1471,9 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, case PACKET_FANOUT_ROLLOVER: idx = fanout_demux_rollover(f, skb, 0, false, num); break; + case PACKET_FANOUT_CBPF: + idx = fanout_demux_bpf(f, skb, num); + break; } if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER)) @@ -1502,6 +1522,74 @@ static bool match_fanout_group(struct packet_type *ptype, struct sock *sk) return false; } +static void fanout_init_data(struct packet_fanout *f) +{ + switch (f->type) { + case PACKET_FANOUT_LB: + atomic_set(&f->rr_cur, 0); + break; + case PACKET_FANOUT_CBPF: + RCU_INIT_POINTER(f->bpf_prog, NULL); + break; + } +} + +static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new) +{ + struct bpf_prog *old; + + spin_lock(&f->lock); + old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock)); + rcu_assign_pointer(f->bpf_prog, new); + spin_unlock(&f->lock); + + if (old) { + synchronize_net(); + bpf_prog_destroy(old); + } +} + +static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data, + unsigned int len) +{ + struct bpf_prog *new; + struct sock_fprog fprog; + int ret; + + if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) + return -EPERM; + if (len != sizeof(fprog)) + return -EINVAL; + if (copy_from_user(&fprog, data, len)) + return -EFAULT; + + ret = bpf_prog_create_from_user(&new, &fprog, NULL); + if (ret) + return ret; + + __fanout_set_data_bpf(po->fanout, new); + return 0; +} + +static int fanout_set_data(struct packet_sock *po, char __user *data, + unsigned int len) +{ + switch (po->fanout->type) { + case PACKET_FANOUT_CBPF: + return fanout_set_data_cbpf(po, data, len); + default: + return -EINVAL; + }; +} + +static void fanout_release_data(struct packet_fanout *f) +{ + switch (f->type) { + case PACKET_FANOUT_CBPF: + __fanout_set_data_bpf(f, NULL); + }; +} + static int fanout_add(struct sock *sk, u16 id, u16 type_flags) { struct packet_sock *po = pkt_sk(sk); @@ -1519,6 +1607,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) case PACKET_FANOUT_CPU: case PACKET_FANOUT_RND: case PACKET_FANOUT_QM: + case PACKET_FANOUT_CBPF: break; default: return -EINVAL; @@ -1561,10 +1650,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) match->id = id; match->type = type; match->flags = flags; - atomic_set(&match->rr_cur, 0); INIT_LIST_HEAD(&match->list); spin_lock_init(&match->lock); atomic_set(&match->sk_ref, 0); + fanout_init_data(match); match->prot_hook.type = po->prot_hook.type; match->prot_hook.dev = po->prot_hook.dev; match->prot_hook.func = packet_rcv_fanout; @@ -1610,6 +1699,7 @@ static void fanout_release(struct sock *sk) if (atomic_dec_and_test(&f->sk_ref)) { list_del(&f->list); dev_remove_pack(&f->prot_hook); + fanout_release_data(f); kfree(f); } mutex_unlock(&fanout_mutex); @@ -3529,6 +3619,13 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv return fanout_add(sk, val & 0xffff, val >> 16); } + case PACKET_FANOUT_DATA: + { + if (!po->fanout) + return -EINVAL; + + return fanout_set_data(po, optval, optlen); + } case PACKET_TX_HAS_OFF: { unsigned int val; diff --git a/net/packet/internal.h b/net/packet/internal.h index e20b3e8829b8..9ee46314b7d7 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -79,7 +79,10 @@ struct packet_fanout { u16 id; u8 type; u8 flags; - atomic_t rr_cur; + union { + atomic_t rr_cur; + struct bpf_prog __rcu *bpf_prog; + }; struct list_head list; struct sock *arr[PACKET_FANOUT_MAX]; spinlock_t lock; From f2e520956a1ab636698f8160194c9b8ac0989aab Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Aug 2015 22:31:35 -0400 Subject: [PATCH 2/4] packet: add extended BPF fanout mode Add fanout mode PACKET_FANOUT_EBPF that accepts an en extended BPF program to select a socket. Update the internal eBPF program by passing to socket option SOL_PACKET/PACKET_FANOUT_DATA a file descriptor returned by bpf(). Signed-off-by: Willem de Bruijn Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/if_packet.h | 1 + net/packet/af_packet.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index a4bb16fa822e..9e7edfd8141e 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -64,6 +64,7 @@ struct sockaddr_ll { #define PACKET_FANOUT_RND 4 #define PACKET_FANOUT_QM 5 #define PACKET_FANOUT_CBPF 6 +#define PACKET_FANOUT_EBPF 7 #define PACKET_FANOUT_FLAG_ROLLOVER 0x1000 #define PACKET_FANOUT_FLAG_DEFRAG 0x8000 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8869d07013e6..7b8e39a22387 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1472,6 +1472,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, idx = fanout_demux_rollover(f, skb, 0, false, num); break; case PACKET_FANOUT_CBPF: + case PACKET_FANOUT_EBPF: idx = fanout_demux_bpf(f, skb, num); break; } @@ -1529,6 +1530,7 @@ static void fanout_init_data(struct packet_fanout *f) atomic_set(&f->rr_cur, 0); break; case PACKET_FANOUT_CBPF: + case PACKET_FANOUT_EBPF: RCU_INIT_POINTER(f->bpf_prog, NULL); break; } @@ -1571,12 +1573,39 @@ static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data, return 0; } +static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data, + unsigned int len) +{ + struct bpf_prog *new; + u32 fd; + + if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) + return -EPERM; + if (len != sizeof(fd)) + return -EINVAL; + if (copy_from_user(&fd, data, len)) + return -EFAULT; + + new = bpf_prog_get(fd); + if (IS_ERR(new)) + return PTR_ERR(new); + if (new->type != BPF_PROG_TYPE_SOCKET_FILTER) { + bpf_prog_put(new); + return -EINVAL; + } + + __fanout_set_data_bpf(po->fanout, new); + return 0; +} + static int fanout_set_data(struct packet_sock *po, char __user *data, unsigned int len) { switch (po->fanout->type) { case PACKET_FANOUT_CBPF: return fanout_set_data_cbpf(po, data, len); + case PACKET_FANOUT_EBPF: + return fanout_set_data_ebpf(po, data, len); default: return -EINVAL; }; @@ -1586,6 +1615,7 @@ static void fanout_release_data(struct packet_fanout *f) { switch (f->type) { case PACKET_FANOUT_CBPF: + case PACKET_FANOUT_EBPF: __fanout_set_data_bpf(f, NULL); }; } @@ -1608,6 +1638,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) case PACKET_FANOUT_RND: case PACKET_FANOUT_QM: case PACKET_FANOUT_CBPF: + case PACKET_FANOUT_EBPF: break; default: return -EINVAL; From 95e22792fa5460e579a69171777ff79d8725be29 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Aug 2015 22:31:36 -0400 Subject: [PATCH 3/4] selftests/net: test classic bpf fanout mode Test PACKET_FANOUT_CBPF by inserting a cBPF program that selects a socket by payload. Requires modifying the test program to send packets with multiple payloads. Also fix a bug in testing the return value of mmap() Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- tools/testing/selftests/net/psock_fanout.c | 16 +++++++++--- tools/testing/selftests/net/psock_lib.h | 29 ++++++++++++++++------ 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/net/psock_fanout.c b/tools/testing/selftests/net/psock_fanout.c index 08c2a36ef7a9..baf46a20e1b3 100644 --- a/tools/testing/selftests/net/psock_fanout.c +++ b/tools/testing/selftests/net/psock_fanout.c @@ -19,6 +19,7 @@ * - PACKET_FANOUT_LB * - PACKET_FANOUT_CPU * - PACKET_FANOUT_ROLLOVER + * - PACKET_FANOUT_CBPF * * Todo: * - functionality: PACKET_FANOUT_FLAG_DEFRAG @@ -115,8 +116,8 @@ static char *sock_fanout_open_ring(int fd) ring = mmap(0, req.tp_block_size * req.tp_block_nr, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (!ring) { - fprintf(stderr, "packetsock ring mmap\n"); + if (ring == MAP_FAILED) { + perror("packetsock ring mmap"); exit(1); } @@ -209,6 +210,7 @@ static int test_datapath(uint16_t typeflags, int port_off, { const int expect0[] = { 0, 0 }; char *rings[2]; + uint8_t type = typeflags & 0xFF; int fds[2], fds_udp[2][2], ret; fprintf(stderr, "test: datapath 0x%hx\n", typeflags); @@ -219,6 +221,9 @@ static int test_datapath(uint16_t typeflags, int port_off, fprintf(stderr, "ERROR: failed open\n"); exit(1); } + if (type == PACKET_FANOUT_CBPF) + sock_setfilter(fds[0], SOL_PACKET, PACKET_FANOUT_DATA); + rings[0] = sock_fanout_open_ring(fds[0]); rings[1] = sock_fanout_open_ring(fds[1]); pair_udp_open(fds_udp[0], PORT_BASE); @@ -227,11 +232,11 @@ static int test_datapath(uint16_t typeflags, int port_off, /* Send data, but not enough to overflow a queue */ pair_udp_send(fds_udp[0], 15); - pair_udp_send(fds_udp[1], 5); + pair_udp_send_char(fds_udp[1], 5, DATA_CHAR_1); ret = sock_fanout_read(fds, rings, expect1); /* Send more data, overflow the queue */ - pair_udp_send(fds_udp[0], 15); + pair_udp_send_char(fds_udp[0], 15, DATA_CHAR_1); /* TODO: ensure consistent order between expect1 and expect2 */ ret |= sock_fanout_read(fds, rings, expect2); @@ -275,6 +280,7 @@ int main(int argc, char **argv) const int expect_rb[2][2] = { { 15, 5 }, { 20, 15 } }; const int expect_cpu0[2][2] = { { 20, 0 }, { 20, 0 } }; const int expect_cpu1[2][2] = { { 0, 20 }, { 0, 20 } }; + const int expect_bpf[2][2] = { { 15, 5 }, { 15, 20 } }; int port_off = 2, tries = 5, ret; test_control_single(); @@ -295,6 +301,8 @@ int main(int argc, char **argv) port_off, expect_lb[0], expect_lb[1]); ret |= test_datapath(PACKET_FANOUT_ROLLOVER, port_off, expect_rb[0], expect_rb[1]); + ret |= test_datapath(PACKET_FANOUT_CBPF, + port_off, expect_bpf[0], expect_bpf[1]); set_cpuaffinity(0); ret |= test_datapath(PACKET_FANOUT_CPU, port_off, diff --git a/tools/testing/selftests/net/psock_lib.h b/tools/testing/selftests/net/psock_lib.h index 37da54ac85a9..24bc7ec1be7d 100644 --- a/tools/testing/selftests/net/psock_lib.h +++ b/tools/testing/selftests/net/psock_lib.h @@ -30,6 +30,7 @@ #define DATA_LEN 100 #define DATA_CHAR 'a' +#define DATA_CHAR_1 'b' #define PORT_BASE 8000 @@ -37,29 +38,36 @@ # define __maybe_unused __attribute__ ((__unused__)) #endif -static __maybe_unused void pair_udp_setfilter(int fd) +static __maybe_unused void sock_setfilter(int fd, int lvl, int optnum) { struct sock_filter bpf_filter[] = { { 0x80, 0, 0, 0x00000000 }, /* LD pktlen */ - { 0x35, 0, 5, DATA_LEN }, /* JGE DATA_LEN [f goto nomatch]*/ + { 0x35, 0, 4, DATA_LEN }, /* JGE DATA_LEN [f goto nomatch]*/ { 0x30, 0, 0, 0x00000050 }, /* LD ip[80] */ - { 0x15, 0, 3, DATA_CHAR }, /* JEQ DATA_CHAR [f goto nomatch]*/ - { 0x30, 0, 0, 0x00000051 }, /* LD ip[81] */ - { 0x15, 0, 1, DATA_CHAR }, /* JEQ DATA_CHAR [f goto nomatch]*/ + { 0x15, 1, 0, DATA_CHAR }, /* JEQ DATA_CHAR [t goto match]*/ + { 0x15, 0, 1, DATA_CHAR_1}, /* JEQ DATA_CHAR_1 [t goto match]*/ { 0x06, 0, 0, 0x00000060 }, /* RET match */ { 0x06, 0, 0, 0x00000000 }, /* RET no match */ }; struct sock_fprog bpf_prog; + if (lvl == SOL_PACKET && optnum == PACKET_FANOUT_DATA) + bpf_filter[5].code = 0x16; /* RET A */ + bpf_prog.filter = bpf_filter; bpf_prog.len = sizeof(bpf_filter) / sizeof(struct sock_filter); - if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &bpf_prog, + if (setsockopt(fd, lvl, optnum, &bpf_prog, sizeof(bpf_prog))) { perror("setsockopt SO_ATTACH_FILTER"); exit(1); } } +static __maybe_unused void pair_udp_setfilter(int fd) +{ + sock_setfilter(fd, SOL_SOCKET, SO_ATTACH_FILTER); +} + static __maybe_unused void pair_udp_open(int fds[], uint16_t port) { struct sockaddr_in saddr, daddr; @@ -96,11 +104,11 @@ static __maybe_unused void pair_udp_open(int fds[], uint16_t port) } } -static __maybe_unused void pair_udp_send(int fds[], int num) +static __maybe_unused void pair_udp_send_char(int fds[], int num, char payload) { char buf[DATA_LEN], rbuf[DATA_LEN]; - memset(buf, DATA_CHAR, sizeof(buf)); + memset(buf, payload, sizeof(buf)); while (num--) { /* Should really handle EINTR and EAGAIN */ if (write(fds[0], buf, sizeof(buf)) != sizeof(buf)) { @@ -118,6 +126,11 @@ static __maybe_unused void pair_udp_send(int fds[], int num) } } +static __maybe_unused void pair_udp_send(int fds[], int num) +{ + return pair_udp_send_char(fds, num, DATA_CHAR); +} + static __maybe_unused void pair_udp_close(int fds[]) { close(fds[0]); From 30da679e679dbe7bb103c5d9c137ec5a070b7150 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Aug 2015 22:31:37 -0400 Subject: [PATCH 4/4] selftests/net: test extended BPF fanout mode Test PACKET_FANOUT_EBPF by inserting a program into the the kernel with bpf(), then attaching it to the fanout group. Observe the same payload-based distribution as in the PACKET_FANOUT_CBPF test. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- tools/testing/selftests/net/psock_fanout.c | 53 ++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/tools/testing/selftests/net/psock_fanout.c b/tools/testing/selftests/net/psock_fanout.c index baf46a20e1b3..412459369686 100644 --- a/tools/testing/selftests/net/psock_fanout.c +++ b/tools/testing/selftests/net/psock_fanout.c @@ -20,6 +20,7 @@ * - PACKET_FANOUT_CPU * - PACKET_FANOUT_ROLLOVER * - PACKET_FANOUT_CBPF + * - PACKET_FANOUT_EBPF * * Todo: * - functionality: PACKET_FANOUT_FLAG_DEFRAG @@ -45,7 +46,9 @@ #include #include #include +#include /* for __NR_bpf */ #include +#include #include #include #include @@ -92,6 +95,51 @@ static int sock_fanout_open(uint16_t typeflags, int num_packets) return fd; } +static void sock_fanout_set_ebpf(int fd) +{ + const int len_off = __builtin_offsetof(struct __sk_buff, len); + struct bpf_insn prog[] = { + { BPF_ALU64 | BPF_MOV | BPF_X, 6, 1, 0, 0 }, + { BPF_LDX | BPF_W | BPF_MEM, 0, 6, len_off, 0 }, + { BPF_JMP | BPF_JGE | BPF_K, 0, 0, 1, DATA_LEN }, + { BPF_JMP | BPF_JA | BPF_K, 0, 0, 4, 0 }, + { BPF_LD | BPF_B | BPF_ABS, 0, 0, 0, 0x50 }, + { BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 2, DATA_CHAR }, + { BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1, DATA_CHAR_1 }, + { BPF_ALU | BPF_MOV | BPF_K, 0, 0, 0, 0 }, + { BPF_JMP | BPF_EXIT, 0, 0, 0, 0 } + }; + char log_buf[512]; + union bpf_attr attr; + int pfd; + + memset(&attr, 0, sizeof(attr)); + attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + attr.insns = (unsigned long) prog; + attr.insn_cnt = sizeof(prog) / sizeof(prog[0]); + attr.license = (unsigned long) "GPL"; + attr.log_buf = (unsigned long) log_buf, + attr.log_size = sizeof(log_buf), + attr.log_level = 1, + + pfd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); + if (pfd < 0) { + perror("bpf"); + fprintf(stderr, "bpf verifier:\n%s\n", log_buf); + exit(1); + } + + if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT_DATA, &pfd, sizeof(pfd))) { + perror("fanout data ebpf"); + exit(1); + } + + if (close(pfd)) { + perror("close ebpf"); + exit(1); + } +} + static char *sock_fanout_open_ring(int fd) { struct tpacket_req req = { @@ -223,6 +271,8 @@ static int test_datapath(uint16_t typeflags, int port_off, } if (type == PACKET_FANOUT_CBPF) sock_setfilter(fds[0], SOL_PACKET, PACKET_FANOUT_DATA); + else if (type == PACKET_FANOUT_EBPF) + sock_fanout_set_ebpf(fds[0]); rings[0] = sock_fanout_open_ring(fds[0]); rings[1] = sock_fanout_open_ring(fds[1]); @@ -301,8 +351,11 @@ int main(int argc, char **argv) port_off, expect_lb[0], expect_lb[1]); ret |= test_datapath(PACKET_FANOUT_ROLLOVER, port_off, expect_rb[0], expect_rb[1]); + ret |= test_datapath(PACKET_FANOUT_CBPF, port_off, expect_bpf[0], expect_bpf[1]); + ret |= test_datapath(PACKET_FANOUT_EBPF, + port_off, expect_bpf[0], expect_bpf[1]); set_cpuaffinity(0); ret |= test_datapath(PACKET_FANOUT_CPU, port_off,