From 96f657e6cfc25b27d8189cb6b9eac66e1b1ff4f3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Jun 2019 09:52:26 -0700 Subject: [PATCH 1/8] net/packet: constify __packet_get_status() argument struct packet_sock is only read. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 7fa847dcea30..66fcfd5b51f8 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -384,7 +384,7 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status) smp_wmb(); } -static int __packet_get_status(struct packet_sock *po, void *frame) +static int __packet_get_status(const struct packet_sock *po, void *frame) { union tpacket_uhdr h; From d4b5bd98faec345933d6080989e035a833b1077a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Jun 2019 09:52:27 -0700 Subject: [PATCH 2/8] net/packet: constify packet_lookup_frame() and __tpacket_has_room() Goal is to be able to use __tpacket_has_room() without holding a lock. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 66fcfd5b51f8..273bffd2130d 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -460,10 +460,10 @@ static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame, return ts_status; } -static void *packet_lookup_frame(struct packet_sock *po, - struct packet_ring_buffer *rb, - unsigned int position, - int status) +static void *packet_lookup_frame(const struct packet_sock *po, + const struct packet_ring_buffer *rb, + unsigned int position, + int status) { unsigned int pg_vec_pos, frame_offset; union tpacket_uhdr h; @@ -1198,12 +1198,12 @@ static void packet_free_pending(struct packet_sock *po) #define ROOM_LOW 0x1 #define ROOM_NORMAL 0x2 -static bool __tpacket_has_room(struct packet_sock *po, int pow_off) +static bool __tpacket_has_room(const struct packet_sock *po, int pow_off) { int idx, len; - len = po->rx_ring.frame_max + 1; - idx = po->rx_ring.head; + len = READ_ONCE(po->rx_ring.frame_max) + 1; + idx = READ_ONCE(po->rx_ring.head); if (pow_off) idx += len >> pow_off; if (idx >= len) From dcf70cefaf99a40cfa25face160f29c1cf3bf84d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Jun 2019 09:52:28 -0700 Subject: [PATCH 3/8] net/packet: constify prb_lookup_block() and __tpacket_v3_has_room() Goal is to be able to use __tpacket_v3_has_room() without holding a lock. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 273bffd2130d..5ef63d0c3ad0 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1082,10 +1082,10 @@ static void *packet_current_rx_frame(struct packet_sock *po, } } -static void *prb_lookup_block(struct packet_sock *po, - struct packet_ring_buffer *rb, - unsigned int idx, - int status) +static void *prb_lookup_block(const struct packet_sock *po, + const struct packet_ring_buffer *rb, + unsigned int idx, + int status) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx); @@ -1211,12 +1211,12 @@ static bool __tpacket_has_room(const struct packet_sock *po, int pow_off) return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } -static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off) +static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off) { int idx, len; - len = po->rx_ring.prb_bdqc.knum_blocks; - idx = po->rx_ring.prb_bdqc.kactive_blk_num; + len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks); + idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num); if (pow_off) idx += len >> pow_off; if (idx >= len) From 0338a14523e3864524dafb6a66c7b82e94201317 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Jun 2019 09:52:29 -0700 Subject: [PATCH 4/8] net/packet: constify __packet_rcv_has_room() Goal is use the helper without lock being held. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 5ef63d0c3ad0..a0564855ed9d 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1224,15 +1224,18 @@ static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off) return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } -static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) +static int __packet_rcv_has_room(const struct packet_sock *po, + const struct sk_buff *skb) { - struct sock *sk = &po->sk; + const struct sock *sk = &po->sk; int ret = ROOM_NONE; if (po->prot_hook.func != tpacket_rcv) { - int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc) - - (skb ? skb->truesize : 0); - if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF)) + int rcvbuf = READ_ONCE(sk->sk_rcvbuf); + int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc) + - (skb ? skb->truesize : 0); + + if (avail > (rcvbuf >> ROOM_POW_OFF)) return ROOM_NORMAL; else if (avail > 0) return ROOM_LOW; From 8e8e2951e3095732d7e780c241f61ea130955a57 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Jun 2019 09:52:30 -0700 Subject: [PATCH 5/8] net/packet: make tp_drops atomic Under DDOS, we want to be able to increment tp_drops without touching the spinlock. This will help readers to drain the receive queue slightly faster :/ Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 20 +++++++++++--------- net/packet/internal.h | 1 + 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a0564855ed9d..2d499679811a 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -758,7 +758,7 @@ static void prb_close_block(struct tpacket_kbdq_core *pkc1, struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; struct sock *sk = &po->sk; - if (po->stats.stats3.tp_drops) + if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; last_pkt = (struct tpacket3_hdr *)pkc1->prev; @@ -2128,10 +2128,8 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, drop_n_acct: is_drop_n_account = true; - spin_lock(&sk->sk_receive_queue.lock); - po->stats.stats1.tp_drops++; + atomic_inc(&po->tp_drops); atomic_inc(&sk->sk_drops); - spin_unlock(&sk->sk_receive_queue.lock); drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { @@ -2265,7 +2263,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, * Anyways, moving it for V1/V2 only as V3 doesn't need this * at packet level. */ - if (po->stats.stats1.tp_drops) + if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; } @@ -2381,9 +2379,9 @@ drop: return 0; drop_n_account: - is_drop_n_account = true; - po->stats.stats1.tp_drops++; spin_unlock(&sk->sk_receive_queue.lock); + atomic_inc(&po->tp_drops); + is_drop_n_account = true; sk->sk_data_ready(sk); kfree_skb(copy_skb); @@ -3879,6 +3877,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, void *data = &val; union tpacket_stats_u st; struct tpacket_rollover_stats rstats; + int drops; if (level != SOL_PACKET) return -ENOPROTOOPT; @@ -3895,14 +3894,17 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, memcpy(&st, &po->stats, sizeof(st)); memset(&po->stats, 0, sizeof(po->stats)); spin_unlock_bh(&sk->sk_receive_queue.lock); + drops = atomic_xchg(&po->tp_drops, 0); if (po->tp_version == TPACKET_V3) { lv = sizeof(struct tpacket_stats_v3); - st.stats3.tp_packets += st.stats3.tp_drops; + st.stats3.tp_drops = drops; + st.stats3.tp_packets += drops; data = &st.stats3; } else { lv = sizeof(struct tpacket_stats); - st.stats1.tp_packets += st.stats1.tp_drops; + st.stats1.tp_drops = drops; + st.stats1.tp_packets += drops; data = &st.stats1; } diff --git a/net/packet/internal.h b/net/packet/internal.h index 3bb7c5fb3bff..b5bcff2b7a43 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -131,6 +131,7 @@ struct packet_sock { struct net_device __rcu *cached_dev; int (*xmit)(struct sk_buff *skb); struct packet_type prot_hook ____cacheline_aligned_in_smp; + atomic_t tp_drops ____cacheline_aligned_in_smp; }; static struct packet_sock *pkt_sk(struct sock *sk) From 2c51c6273c73ff04099f48fb42dfdd2acc7a1b70 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Jun 2019 09:52:31 -0700 Subject: [PATCH 6/8] net/packet: implement shortcut in tpacket_rcv() tpacket_rcv() can be hit under DDOS quite hard, since it will always grab a socket spinlock, to eventually find there is no room for an additional packet. Using tcpdump [1] on a busy host can lead to catastrophic consequences, because of all cpus spinning on a contended spinlock. This replicates a similar strategy used in packet_rcv() [1] Also some applications mistakenly use af_packet socket bound to ETH_P_ALL only to send packets. Receive queue is never drained and immediately full. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 2d499679811a..860ca3e6abf5 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2193,6 +2193,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (!res) goto drop_n_restore; + /* If we are flooded, just give up */ + if (__packet_rcv_has_room(po, skb) == ROOM_NONE) { + atomic_inc(&po->tp_drops); + goto drop_n_restore; + } + if (skb->ip_summed == CHECKSUM_PARTIAL) status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && From 3a2bb84e4aeab5c90d16e1cd17dbaa17631af529 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Jun 2019 09:52:32 -0700 Subject: [PATCH 7/8] net/packet: remove locking from packet_rcv_has_room() __packet_rcv_has_room() can now be run without lock being held. po->pressure is only a non persistent hint, we can mark all read/write accesses with READ_ONCE()/WRITE_ONCE() to document the fact that the field could be written without any synchronization. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 860ca3e6abf5..d409e2fdaa7e 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1260,15 +1260,13 @@ static int __packet_rcv_has_room(const struct packet_sock *po, static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) { - int ret; - bool has_room; + int pressure, ret; - spin_lock_bh(&po->sk.sk_receive_queue.lock); ret = __packet_rcv_has_room(po, skb); - has_room = ret == ROOM_NORMAL; - if (po->pressure == has_room) - po->pressure = !has_room; - spin_unlock_bh(&po->sk.sk_receive_queue.lock); + pressure = ret != ROOM_NORMAL; + + if (READ_ONCE(po->pressure) != pressure) + WRITE_ONCE(po->pressure, pressure); return ret; } @@ -1353,7 +1351,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, i = j = min_t(int, po->rollover->sock, num - 1); do { po_next = pkt_sk(f->arr[i]); - if (po_next != po_skip && !po_next->pressure && + if (po_next != po_skip && !READ_ONCE(po_next->pressure) && packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) { if (i != j) po->rollover->sock = i; @@ -3310,7 +3308,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (skb == NULL) goto out; - if (pkt_sk(sk)->pressure) + if (READ_ONCE(pkt_sk(sk)->pressure)) packet_rcv_has_room(pkt_sk(sk), NULL); if (pkt_sk(sk)->has_vnet_hdr) { @@ -4129,8 +4127,8 @@ static __poll_t packet_poll(struct file *file, struct socket *sock, TP_STATUS_KERNEL)) mask |= EPOLLIN | EPOLLRDNORM; } - if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) - po->pressure = 0; + if (READ_ONCE(po->pressure) && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) + WRITE_ONCE(po->pressure, 0); spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (po->tx_ring.pg_vec) { From 9bb6cd652454969fa0ae6c52c8ebd31472d0171c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Jun 2019 09:52:33 -0700 Subject: [PATCH 8/8] net/packet: introduce packet_rcv_try_clear_pressure() helper There are two places where we want to clear the pressure if possible, add a helper to make it more obvious. Signed-off-by: Eric Dumazet Suggested-by: Willem de Bruijn Acked-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- net/packet/af_packet.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d409e2fdaa7e..8c27e198268a 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1271,6 +1271,13 @@ static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) return ret; } +static void packet_rcv_try_clear_pressure(struct packet_sock *po) +{ + if (READ_ONCE(po->pressure) && + __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) + WRITE_ONCE(po->pressure, 0); +} + static void packet_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_error_queue); @@ -3308,8 +3315,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (skb == NULL) goto out; - if (READ_ONCE(pkt_sk(sk)->pressure)) - packet_rcv_has_room(pkt_sk(sk), NULL); + packet_rcv_try_clear_pressure(pkt_sk(sk)); if (pkt_sk(sk)->has_vnet_hdr) { err = packet_rcv_vnet(msg, skb, &len); @@ -4127,8 +4133,7 @@ static __poll_t packet_poll(struct file *file, struct socket *sock, TP_STATUS_KERNEL)) mask |= EPOLLIN | EPOLLRDNORM; } - if (READ_ONCE(po->pressure) && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) - WRITE_ONCE(po->pressure, 0); + packet_rcv_try_clear_pressure(po); spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (po->tx_ring.pg_vec) {