PACKET_RX_RING can cause multiple writers to access the same slot if a fast writer wraps the ring while a slow writer is still copying. This is particularly likely with few, large slots (e.g., GSO packets).

Synchronize kernel thread ownership of rx ring slots with a bitmap.

Writers acquire a slot race-free by testing tp_status for TP_STATUS_KERNEL while holding the sk receive queue lock. They release this lock before copying and set tp_status to TP_STATUS_USER to release the slot to userspace when done. During copying, another writer may take the lock, also see TP_STATUS_KERNEL, and start writing to the same slot.

Introduce a new rx_owner_map bitmap with a bit per slot. To acquire a slot, test and set the bit with the lock held. To release race-free, update tp_status and the owner bit as a transaction, so take the lock again.

This is one of a variety of discussed options (see Link below):

* Instead of a shadow ring, embed the data in the slot itself, such as in tp_padding. But any test for this field may match a value left by userspace, causing deadlock.

* Avoid the lock on release. This leaves a small race if releasing the shadow slot before setting TP_STATUS_USER. The reproducer below showed that this race is not academic. If releasing the slot after tp_status, the race is more subtle. See the first link for details.

* Add a new tp_status TP_KERNEL_OWNED to avoid the transactional store of two fields. But legacy applications may interpret all non-zero tp_status as owned by the user, as libpcap does. So this is possible only as an opt-in by newer processes. It can be added as an optional mode.

* Embed the struct at the tail of pg_vec to avoid an extra allocation. The implementation proved no less complex than a separate field.

The additional locking cost on release adds contention, no different than scaling on multicore or multiqueue h/w. In practice, neither the reproducer below nor small-packet tcpdump showed a noticeable change in perf report in cycles spent in spinlock. Where contention is problematic, packet sockets support mitigation through PACKET_FANOUT. And we can consider adding the opt-in state TP_KERNEL_OWNED.

Easy to reproduce by running multiple netperf or similar TCP_STREAM flows concurrently with `tcpdump -B 129 -n greater 60000`.

Based on an earlier patchset by Jon Rosen. See links below.

I believe this issue goes back to the introduction of tpacket_rcv, which predates git history.

Link: https://www.mail-archive.com/netdev@vger.kernel.org/msg237222.html
Suggested-by: Jon Rosen <jrosen@cisco.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Jon Rosen <jrosen@cisco.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
		
			
				
	
	
		
			147 lines
		
	
	
		
			3.3 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			147 lines
		
	
	
		
			3.3 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /* SPDX-License-Identifier: GPL-2.0 */
 | |
| #ifndef __PACKET_INTERNAL_H__
 | |
| #define __PACKET_INTERNAL_H__
 | |
| 
 | |
| #include <linux/refcount.h>
 | |
| 
 | |
| struct packet_mclist {
 | |
| 	struct packet_mclist	*next;
 | |
| 	int			ifindex;
 | |
| 	int			count;
 | |
| 	unsigned short		type;
 | |
| 	unsigned short		alen;
 | |
| 	unsigned char		addr[MAX_ADDR_LEN];
 | |
| };
 | |
| 
 | |
| /* kbdq - kernel block descriptor queue */
 | |
| struct tpacket_kbdq_core {
 | |
| 	struct pgv	*pkbdq;
 | |
| 	unsigned int	feature_req_word;
 | |
| 	unsigned int	hdrlen;
 | |
| 	unsigned char	reset_pending_on_curr_blk;
 | |
| 	unsigned char   delete_blk_timer;
 | |
| 	unsigned short	kactive_blk_num;
 | |
| 	unsigned short	blk_sizeof_priv;
 | |
| 
 | |
| 	/* last_kactive_blk_num:
 | |
| 	 * trick to see if user-space has caught up
 | |
| 	 * in order to avoid refreshing timer when every single pkt arrives.
 | |
| 	 */
 | |
| 	unsigned short	last_kactive_blk_num;
 | |
| 
 | |
| 	char		*pkblk_start;
 | |
| 	char		*pkblk_end;
 | |
| 	int		kblk_size;
 | |
| 	unsigned int	max_frame_len;
 | |
| 	unsigned int	knum_blocks;
 | |
| 	uint64_t	knxt_seq_num;
 | |
| 	char		*prev;
 | |
| 	char		*nxt_offset;
 | |
| 	struct sk_buff	*skb;
 | |
| 
 | |
| 	atomic_t	blk_fill_in_prog;
 | |
| 
 | |
| 	/* Default is set to 8ms */
 | |
| #define DEFAULT_PRB_RETIRE_TOV	(8)
 | |
| 
 | |
| 	unsigned short  retire_blk_tov;
 | |
| 	unsigned short  version;
 | |
| 	unsigned long	tov_in_jiffies;
 | |
| 
 | |
| 	/* timer to retire an outstanding block */
 | |
| 	struct timer_list retire_blk_timer;
 | |
| };
 | |
| 
 | |
/*
 * Thin wrapper around a single buffer pointer.  Used through pointers by
 * struct packet_ring_buffer (->pg_vec) and struct tpacket_kbdq_core
 * (->pkbdq).
 */
struct pgv {
	char *buffer;
};
 | |
| 
 | |
| struct packet_ring_buffer {
 | |
| 	struct pgv		*pg_vec;
 | |
| 
 | |
| 	unsigned int		head;
 | |
| 	unsigned int		frames_per_block;
 | |
| 	unsigned int		frame_size;
 | |
| 	unsigned int		frame_max;
 | |
| 
 | |
| 	unsigned int		pg_vec_order;
 | |
| 	unsigned int		pg_vec_pages;
 | |
| 	unsigned int		pg_vec_len;
 | |
| 
 | |
| 	unsigned int __percpu	*pending_refcnt;
 | |
| 
 | |
| 	union {
 | |
| 		unsigned long			*rx_owner_map;
 | |
| 		struct tpacket_kbdq_core	prb_bdqc;
 | |
| 	};
 | |
| };
 | |
| 
 | |
/* Declared here, defined in another translation unit. */
extern struct mutex fanout_mutex;
/* Number of socket slots in struct packet_fanout's arr[]. */
#define PACKET_FANOUT_MAX	256
 | |
| 
 | |
| struct packet_fanout {
 | |
| 	possible_net_t		net;
 | |
| 	unsigned int		num_members;
 | |
| 	u16			id;
 | |
| 	u8			type;
 | |
| 	u8			flags;
 | |
| 	union {
 | |
| 		atomic_t		rr_cur;
 | |
| 		struct bpf_prog __rcu	*bpf_prog;
 | |
| 	};
 | |
| 	struct list_head	list;
 | |
| 	struct sock		*arr[PACKET_FANOUT_MAX];
 | |
| 	spinlock_t		lock;
 | |
| 	refcount_t		sk_ref;
 | |
| 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
 | |
| };
 | |
| 
 | |
| struct packet_rollover {
 | |
| 	int			sock;
 | |
| 	atomic_long_t		num;
 | |
| 	atomic_long_t		num_huge;
 | |
| 	atomic_long_t		num_failed;
 | |
| #define ROLLOVER_HLEN	(L1_CACHE_BYTES / sizeof(u32))
 | |
| 	u32			history[ROLLOVER_HLEN] ____cacheline_aligned;
 | |
| } ____cacheline_aligned_in_smp;
 | |
| 
 | |
| struct packet_sock {
 | |
| 	/* struct sock has to be the first member of packet_sock */
 | |
| 	struct sock		sk;
 | |
| 	struct packet_fanout	*fanout;
 | |
| 	union  tpacket_stats_u	stats;
 | |
| 	struct packet_ring_buffer	rx_ring;
 | |
| 	struct packet_ring_buffer	tx_ring;
 | |
| 	int			copy_thresh;
 | |
| 	spinlock_t		bind_lock;
 | |
| 	struct mutex		pg_vec_lock;
 | |
| 	unsigned int		running;	/* bind_lock must be held */
 | |
| 	unsigned int		auxdata:1,	/* writer must hold sock lock */
 | |
| 				origdev:1,
 | |
| 				has_vnet_hdr:1,
 | |
| 				tp_loss:1,
 | |
| 				tp_tx_has_off:1;
 | |
| 	int			pressure;
 | |
| 	int			ifindex;	/* bound device		*/
 | |
| 	__be16			num;
 | |
| 	struct packet_rollover	*rollover;
 | |
| 	struct packet_mclist	*mclist;
 | |
| 	atomic_t		mapped;
 | |
| 	enum tpacket_versions	tp_version;
 | |
| 	unsigned int		tp_hdrlen;
 | |
| 	unsigned int		tp_reserve;
 | |
| 	unsigned int		tp_tstamp;
 | |
| 	struct completion	skb_completion;
 | |
| 	struct net_device __rcu	*cached_dev;
 | |
| 	int			(*xmit)(struct sk_buff *skb);
 | |
| 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
 | |
| 	atomic_t		tp_drops ____cacheline_aligned_in_smp;
 | |
| };
 | |
| 
 | |
/*
 * pkt_sk - view a struct sock as the struct packet_sock that contains it
 * @sk: socket pointer; may be NULL, in which case NULL is returned
 *
 * The cast is valid only because struct sock is the first member of
 * struct packet_sock, so both objects start at the same address.  No check
 * is made that @sk really belongs to a packet socket; that is the caller's
 * responsibility.
 *
 * Marked inline because this is defined in a header: a plain static
 * function would trigger an unused-function warning in every translation
 * unit that includes the header without calling it.
 */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	return (struct packet_sock *)sk;
}
 | |
| 
 | |
| #endif
 |