bec1f6f697
Support generic segmentation offload for udp datagrams. Callers can
concatenate and send at once the payload of multiple datagrams with
the same destination.
To set segment size, the caller sets socket option UDP_SEGMENT to the
length of each discrete payload. This value must be smaller than or
equal to the relevant MTU.
A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a
per send call basis.
Total byte length may then exceed MTU. If not an exact multiple of
segment size, the last segment will be shorter.
The implementation adds a gso_size field to the udp socket, ip(v6)
cmsg cookie and inet_cork structure to be able to set the value at
setsockopt or cmsg time and to work with both lockless and corked
paths.
Initial benchmark numbers show UDP GSO about as expensive as TCP GSO.
tcp tso
3197 MB/s 54232 msg/s 54232 calls/s
6,457,754,262 cycles
tcp gso
1765 MB/s 29939 msg/s 29939 calls/s
11,203,021,806 cycles
tcp without tso/gso *
739 MB/s 12548 msg/s 12548 calls/s
11,205,483,630 cycles
udp
876 MB/s 14873 msg/s 624666 calls/s
11,205,777,429 cycles
udp gso
2139 MB/s 36282 msg/s 36282 calls/s
11,204,374,561 cycles
[*] after reverting commit 0a6b2a1dc2
("tcp: switch to GSO being always on")
Measured total system cycles ('-a') for one core while pinning both
the network receive path and benchmark process to that core:
perf stat -a -C 12 -e cycles \
./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4
Note the reduction in calls/s with GSO. Bytes per syscall
increases from 1470 to 61818.
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
362 lines
9.2 KiB
C
362 lines
9.2 KiB
C
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Definitions for inet_sock
 *
 * Authors:	Many, reorganised here by
 *		Arnaldo Carvalho de Melo <acme@mandriva.com>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
|
|
#ifndef _INET_SOCK_H
|
|
#define _INET_SOCK_H
|
|
|
|
#include <linux/bitops.h>
|
|
#include <linux/string.h>
|
|
#include <linux/types.h>
|
|
#include <linux/jhash.h>
|
|
#include <linux/netdevice.h>
|
|
|
|
#include <net/flow.h>
|
|
#include <net/sock.h>
|
|
#include <net/request_sock.h>
|
|
#include <net/netns/hash.h>
|
|
#include <net/tcp_states.h>
|
|
#include <net/l3mdev.h>
|
|
|
|
/** struct ip_options - IP Options
|
|
*
|
|
* @faddr - Saved first hop address
|
|
* @nexthop - Saved nexthop address in LSRR and SSRR
|
|
* @is_strictroute - Strict source route
|
|
* @srr_is_hit - Packet destination addr was our one
|
|
* @is_changed - IP checksum more not valid
|
|
* @rr_needaddr - Need to record addr of outgoing dev
|
|
* @ts_needtime - Need to record timestamp
|
|
* @ts_needaddr - Need to record addr of outgoing dev
|
|
*/
|
|
struct ip_options {
|
|
__be32 faddr;
|
|
__be32 nexthop;
|
|
unsigned char optlen;
|
|
unsigned char srr;
|
|
unsigned char rr;
|
|
unsigned char ts;
|
|
unsigned char is_strictroute:1,
|
|
srr_is_hit:1,
|
|
is_changed:1,
|
|
rr_needaddr:1,
|
|
ts_needtime:1,
|
|
ts_needaddr:1;
|
|
unsigned char router_alert;
|
|
unsigned char cipso;
|
|
unsigned char __pad2;
|
|
unsigned char __data[0];
|
|
};
|
|
|
|
struct ip_options_rcu {
|
|
struct rcu_head rcu;
|
|
struct ip_options opt;
|
|
};
|
|
|
|
struct ip_options_data {
|
|
struct ip_options_rcu opt;
|
|
char data[40];
|
|
};
|
|
|
|
struct inet_request_sock {
|
|
struct request_sock req;
|
|
#define ir_loc_addr req.__req_common.skc_rcv_saddr
|
|
#define ir_rmt_addr req.__req_common.skc_daddr
|
|
#define ir_num req.__req_common.skc_num
|
|
#define ir_rmt_port req.__req_common.skc_dport
|
|
#define ir_v6_rmt_addr req.__req_common.skc_v6_daddr
|
|
#define ir_v6_loc_addr req.__req_common.skc_v6_rcv_saddr
|
|
#define ir_iif req.__req_common.skc_bound_dev_if
|
|
#define ir_cookie req.__req_common.skc_cookie
|
|
#define ireq_net req.__req_common.skc_net
|
|
#define ireq_state req.__req_common.skc_state
|
|
#define ireq_family req.__req_common.skc_family
|
|
|
|
u16 snd_wscale : 4,
|
|
rcv_wscale : 4,
|
|
tstamp_ok : 1,
|
|
sack_ok : 1,
|
|
wscale_ok : 1,
|
|
ecn_ok : 1,
|
|
acked : 1,
|
|
no_srccheck: 1,
|
|
smc_ok : 1;
|
|
u32 ir_mark;
|
|
union {
|
|
struct ip_options_rcu __rcu *ireq_opt;
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
struct {
|
|
struct ipv6_txoptions *ipv6_opt;
|
|
struct sk_buff *pktopts;
|
|
};
|
|
#endif
|
|
};
|
|
};
|
|
|
|
/*
 * Convert a request_sock pointer to the inet_request_sock that contains it.
 *
 * This is a plain cast, which relies on 'req' being the first member of
 * struct inet_request_sock. The const qualifier of the argument is dropped
 * on the returned pointer.
 */
static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
{
	return (struct inet_request_sock *)sk;
}
|
|
|
|
static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
if (!sk->sk_mark && sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept)
|
|
return skb->mark;
|
|
|
|
return sk->sk_mark;
|
|
}
|
|
|
|
static inline int inet_request_bound_dev_if(const struct sock *sk,
|
|
struct sk_buff *skb)
|
|
{
|
|
#ifdef CONFIG_NET_L3_MASTER_DEV
|
|
struct net *net = sock_net(sk);
|
|
|
|
if (!sk->sk_bound_dev_if && net->ipv4.sysctl_tcp_l3mdev_accept)
|
|
return l3mdev_master_ifindex_by_index(net, skb->skb_iif);
|
|
#endif
|
|
|
|
return sk->sk_bound_dev_if;
|
|
}
|
|
|
|
static inline struct ip_options_rcu *ireq_opt_deref(const struct inet_request_sock *ireq)
|
|
{
|
|
return rcu_dereference_check(ireq->ireq_opt,
|
|
refcount_read(&ireq->req.rsk_refcnt) > 0);
|
|
}
|
|
|
|
struct inet_cork {
|
|
unsigned int flags;
|
|
__be32 addr;
|
|
struct ip_options *opt;
|
|
unsigned int fragsize;
|
|
int length; /* Total length of all frames */
|
|
struct dst_entry *dst;
|
|
u8 tx_flags;
|
|
__u8 ttl;
|
|
__s16 tos;
|
|
char priority;
|
|
__u16 gso_size;
|
|
};
|
|
|
|
struct inet_cork_full {
|
|
struct inet_cork base;
|
|
struct flowi fl;
|
|
};
|
|
|
|
/* Forward declarations: only pointers to ip_mc_socklist and ipv6_pinfo are used below. */
struct ip_mc_socklist;
struct ipv6_pinfo;
struct rtable;
|
|
|
|
/** struct inet_sock - representation of INET sockets
|
|
*
|
|
* @sk - ancestor class
|
|
* @pinet6 - pointer to IPv6 control block
|
|
* @inet_daddr - Foreign IPv4 addr
|
|
* @inet_rcv_saddr - Bound local IPv4 addr
|
|
* @inet_dport - Destination port
|
|
* @inet_num - Local port
|
|
* @inet_saddr - Sending source
|
|
* @uc_ttl - Unicast TTL
|
|
* @inet_sport - Source port
|
|
* @inet_id - ID counter for DF pkts
|
|
* @tos - TOS
|
|
* @mc_ttl - Multicasting TTL
|
|
* @is_icsk - is this an inet_connection_sock?
|
|
* @uc_index - Unicast outgoing device index
|
|
* @mc_index - Multicast device index
|
|
* @mc_list - Group array
|
|
* @cork - info to build ip hdr on each ip frag while socket is corked
|
|
*/
|
|
struct inet_sock {
|
|
/* sk and pinet6 has to be the first two members of inet_sock */
|
|
struct sock sk;
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
struct ipv6_pinfo *pinet6;
|
|
#endif
|
|
/* Socket demultiplex comparisons on incoming packets. */
|
|
#define inet_daddr sk.__sk_common.skc_daddr
|
|
#define inet_rcv_saddr sk.__sk_common.skc_rcv_saddr
|
|
#define inet_dport sk.__sk_common.skc_dport
|
|
#define inet_num sk.__sk_common.skc_num
|
|
|
|
__be32 inet_saddr;
|
|
__s16 uc_ttl;
|
|
__u16 cmsg_flags;
|
|
__be16 inet_sport;
|
|
__u16 inet_id;
|
|
|
|
struct ip_options_rcu __rcu *inet_opt;
|
|
int rx_dst_ifindex;
|
|
__u8 tos;
|
|
__u8 min_ttl;
|
|
__u8 mc_ttl;
|
|
__u8 pmtudisc;
|
|
__u8 recverr:1,
|
|
is_icsk:1,
|
|
freebind:1,
|
|
hdrincl:1,
|
|
mc_loop:1,
|
|
transparent:1,
|
|
mc_all:1,
|
|
nodefrag:1;
|
|
__u8 bind_address_no_port:1,
|
|
defer_connect:1; /* Indicates that fastopen_connect is set
|
|
* and cookie exists so we defer connect
|
|
* until first data frame is written
|
|
*/
|
|
__u8 rcv_tos;
|
|
__u8 convert_csum;
|
|
int uc_index;
|
|
int mc_index;
|
|
__be32 mc_addr;
|
|
struct ip_mc_socklist __rcu *mc_list;
|
|
struct inet_cork_full cork;
|
|
};
|
|
|
|
/* NOTE(review): presumably bit values for inet_cork.flags - confirm at the users. */
#define IPCORK_OPT	1	/* ip-options has been held in ipcork.opt */
#define IPCORK_ALLFRAG	2	/* always fragment (for ipv6 for now) */
|
|
|
|
/*
 * cmsg flags for inet
 *
 * One bit per flag; the highest one in use is bit 8, so a 16-bit field
 * (NOTE(review): presumably inet_sock.cmsg_flags - confirm) holds them all.
 */
#define IP_CMSG_PKTINFO		BIT(0)
#define IP_CMSG_TTL		BIT(1)
#define IP_CMSG_TOS		BIT(2)
#define IP_CMSG_RECVOPTS	BIT(3)
#define IP_CMSG_RETOPTS		BIT(4)
#define IP_CMSG_PASSSEC		BIT(5)
#define IP_CMSG_ORIGDSTADDR	BIT(6)
#define IP_CMSG_CHECKSUM	BIT(7)
#define IP_CMSG_RECVFRAGSIZE	BIT(8)
|
|
|
|
/**
 * sk_to_full_sk - Access to a full socket
 * @sk: pointer to a socket (may be NULL)
 *
 * SYNACK messages might be attached to request sockets.
 * Some places want to reach the listener in this case.
 *
 * With CONFIG_INET, a non-NULL socket in state TCP_NEW_SYN_RECV is replaced
 * by the rsk_listener of the corresponding request sock. In every other
 * case (including a NULL @sk, or CONFIG_INET disabled) @sk is returned
 * unchanged.
 */
static inline struct sock *sk_to_full_sk(struct sock *sk)
{
#ifdef CONFIG_INET
	if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
		sk = inet_reqsk(sk)->rsk_listener;
#endif
	return sk;
}
|
|
|
|
/*
 * sk_to_full_sk() variant with a const argument.
 *
 * Same selection logic, but the request sock is reached through a
 * const-preserving cast rather than inet_reqsk(), and the result is
 * returned as a pointer to const.
 */
static inline const struct sock *sk_const_to_full_sk(const struct sock *sk)
{
#ifdef CONFIG_INET
	if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
		sk = ((const struct request_sock *)sk)->rsk_listener;
#endif
	return sk;
}
|
|
|
|
static inline struct sock *skb_to_full_sk(const struct sk_buff *skb)
|
|
{
|
|
return sk_to_full_sk(skb->sk);
|
|
}
|
|
|
|
/*
 * Convert a sock pointer to the inet_sock that contains it.
 *
 * This is a plain cast, which relies on 'sk' being the first member of
 * struct inet_sock. The const qualifier of the argument is dropped on the
 * returned pointer.
 */
static inline struct inet_sock *inet_sk(const struct sock *sk)
{
	return (struct inet_sock *)sk;
}
|
|
|
|
static inline void __inet_sk_copy_descendant(struct sock *sk_to,
|
|
const struct sock *sk_from,
|
|
const int ancestor_size)
|
|
{
|
|
memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1,
|
|
sk_from->sk_prot->obj_size - ancestor_size);
|
|
}
|
|
#if !(IS_ENABLED(CONFIG_IPV6))
/*
 * Copy everything that follows struct inet_sock from @sk_from to @sk_to.
 * Only provided by this header when IPv6 support is not enabled.
 */
static inline void inet_sk_copy_descendant(struct sock *sk_to,
					   const struct sock *sk_from)
{
	__inet_sk_copy_descendant(sk_to, sk_from, sizeof(struct inet_sock));
}
#endif
|
|
|
|
/* Prototype only; the definition is not part of this header. */
int inet_sk_rebuild_header(struct sock *sk);
|
|
|
|
/**
|
|
* inet_sk_state_load - read sk->sk_state for lockless contexts
|
|
* @sk: socket pointer
|
|
*
|
|
* Paired with inet_sk_state_store(). Used in places we don't hold socket lock:
|
|
* tcp_diag_get_info(), tcp_get_info(), tcp_poll(), get_tcp4_sock() ...
|
|
*/
|
|
static inline int inet_sk_state_load(const struct sock *sk)
|
|
{
|
|
/* state change might impact lockless readers. */
|
|
return smp_load_acquire(&sk->sk_state);
|
|
}
|
|
|
|
/**
 * inet_sk_state_store - update sk->sk_state
 * @sk: socket pointer
 * @newstate: new state
 *
 * Paired with inet_sk_state_load(). Should be used in contexts where
 * state change might impact lockless readers.
 */
void inet_sk_state_store(struct sock *sk, int newstate);

/* Prototype only; the definition is not part of this header. */
void inet_sk_set_state(struct sock *sk, int state);
|
|
|
|
static inline unsigned int __inet_ehashfn(const __be32 laddr,
|
|
const __u16 lport,
|
|
const __be32 faddr,
|
|
const __be16 fport,
|
|
u32 initval)
|
|
{
|
|
return jhash_3words((__force __u32) laddr,
|
|
(__force __u32) faddr,
|
|
((__u32) lport) << 16 | (__force __u32)fport,
|
|
initval);
|
|
}
|
|
|
|
struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
|
|
struct sock *sk_listener,
|
|
bool attach_listener);
|
|
|
|
static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
|
|
{
|
|
__u8 flags = 0;
|
|
|
|
if (inet_sk(sk)->transparent || inet_sk(sk)->hdrincl)
|
|
flags |= FLOWI_FLAG_ANYSRC;
|
|
return flags;
|
|
}
|
|
|
|
static inline void inet_inc_convert_csum(struct sock *sk)
|
|
{
|
|
inet_sk(sk)->convert_csum++;
|
|
}
|
|
|
|
static inline void inet_dec_convert_csum(struct sock *sk)
|
|
{
|
|
if (inet_sk(sk)->convert_csum > 0)
|
|
inet_sk(sk)->convert_csum--;
|
|
}
|
|
|
|
static inline bool inet_get_convert_csum(struct sock *sk)
|
|
{
|
|
return !!inet_sk(sk)->convert_csum;
|
|
}
|
|
|
|
#endif /* _INET_SOCK_H */
|