Merge branch 'udp-msg_zerocopy'

Willem de Bruijn says:

====================
udp msg_zerocopy

Enable MSG_ZEROCOPY for udp sockets

Patch 1/3 is the main patch, a rework of the RFC patch
  http://patchwork.ozlabs.org/patch/899630/
  See the patch commit message for more details.

Patch 2/3 is an optimization that removes a branch from the UDP hot path
  and a refcount_inc/refcount_dec_and_test pair when zerocopy is used.
  In v2 this was still part of the first patch.

Patch 3/3 runs the already existing udp zerocopy tests
  as part of kselftest

See also the recent Linux Plumbers presentation
  https://linuxplumbersconf.org/event/2/contributions/106/attachments/104/128/willemdebruijn-lpc2018-udpgso-presentation-20181113.pdf

Changes:
  v1 -> v2
    - Fixup reverse christmas tree violation
  v2 -> v3
    - Split refcount avoidance optimization into separate patch
      - Fix refcount leak on error in fragmented case
        (thanks to Paolo Abeni for pointing this one out!)
      - Fix refcount inc on zero
  v3 -> v4
    - Move skb_zcopy_set below the only kfree_skb that might cause
      a premature uarg destroy before sock_zerocopy_put_abort
      - Move the entire skb_shinfo assignment block, to keep that
        cacheline access in one place
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2018-12-03 15:58:33 -08:00
commit 6e360f7331
9 changed files with 90 additions and 27 deletions

View File

@ -481,10 +481,11 @@ static inline void sock_zerocopy_get(struct ubuf_info *uarg)
} }
void sock_zerocopy_put(struct ubuf_info *uarg); void sock_zerocopy_put(struct ubuf_info *uarg);
void sock_zerocopy_put_abort(struct ubuf_info *uarg); void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
void sock_zerocopy_callback(struct ubuf_info *uarg, bool success); void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);
int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len);
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len, struct msghdr *msg, int len,
struct ubuf_info *uarg); struct ubuf_info *uarg);
@ -1325,10 +1326,14 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
return is_zcopy ? skb_uarg(skb) : NULL; return is_zcopy ? skb_uarg(skb) : NULL;
} }
static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg) static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg,
bool *have_ref)
{ {
if (skb && uarg && !skb_zcopy(skb)) { if (skb && uarg && !skb_zcopy(skb)) {
sock_zerocopy_get(uarg); if (unlikely(have_ref && *have_ref))
*have_ref = false;
else
sock_zerocopy_get(uarg);
skb_shinfo(skb)->destructor_arg = uarg; skb_shinfo(skb)->destructor_arg = uarg;
skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG; skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
} }
@ -1373,7 +1378,7 @@ static inline void skb_zcopy_abort(struct sk_buff *skb)
struct ubuf_info *uarg = skb_zcopy(skb); struct ubuf_info *uarg = skb_zcopy(skb);
if (uarg) { if (uarg) {
sock_zerocopy_put_abort(uarg); sock_zerocopy_put_abort(uarg, false);
skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG; skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
} }
} }

View File

@ -1089,7 +1089,7 @@ void sock_zerocopy_put(struct ubuf_info *uarg)
} }
EXPORT_SYMBOL_GPL(sock_zerocopy_put); EXPORT_SYMBOL_GPL(sock_zerocopy_put);
void sock_zerocopy_put_abort(struct ubuf_info *uarg) void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{ {
if (uarg) { if (uarg) {
struct sock *sk = skb_from_uarg(uarg)->sk; struct sock *sk = skb_from_uarg(uarg)->sk;
@ -1097,7 +1097,8 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
atomic_dec(&sk->sk_zckey); atomic_dec(&sk->sk_zckey);
uarg->len--; uarg->len--;
sock_zerocopy_put(uarg); if (have_uref)
sock_zerocopy_put(uarg);
} }
} }
EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
@ -1105,6 +1106,12 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
struct iov_iter *from, size_t length); struct iov_iter *from, size_t length);
/*
 * skb_zerocopy_iter_dgram - forward a datagram zerocopy request to
 * __zerocopy_sg_from_iter().
 * @skb: buffer passed through to the helper; its ->sk is used as the socket
 * @msg: message whose msg_iter is handed to the helper
 * @len: length argument passed through unchanged
 *
 * Returns whatever __zerocopy_sg_from_iter() returns; the callers visible in
 * this diff treat a negative value as an error.
 * NOTE(review): relies on skb->sk being set by the caller - confirm.
 */
int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len)
{
return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram);
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len, struct msghdr *msg, int len,
struct ubuf_info *uarg) struct ubuf_info *uarg)
@ -1131,7 +1138,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
return err; return err;
} }
skb_zcopy_set(skb, uarg); skb_zcopy_set(skb, uarg, NULL);
return skb->len - orig_len; return skb->len - orig_len;
} }
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
@ -1151,7 +1158,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
if (skb_copy_ubufs(nskb, GFP_ATOMIC)) if (skb_copy_ubufs(nskb, GFP_ATOMIC))
return -EIO; return -EIO;
} }
skb_zcopy_set(nskb, skb_uarg(orig)); skb_zcopy_set(nskb, skb_uarg(orig), NULL);
} }
return 0; return 0;
} }

View File

@ -1018,7 +1018,10 @@ set_rcvbuf:
case SO_ZEROCOPY: case SO_ZEROCOPY:
if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
if (sk->sk_protocol != IPPROTO_TCP) if (!((sk->sk_type == SOCK_STREAM &&
sk->sk_protocol == IPPROTO_TCP) ||
(sk->sk_type == SOCK_DGRAM &&
sk->sk_protocol == IPPROTO_UDP)))
ret = -ENOTSUPP; ret = -ENOTSUPP;
} else if (sk->sk_family != PF_RDS) { } else if (sk->sk_family != PF_RDS) {
ret = -ENOTSUPP; ret = -ENOTSUPP;

View File

@ -867,6 +867,7 @@ static int __ip_append_data(struct sock *sk,
unsigned int flags) unsigned int flags)
{ {
struct inet_sock *inet = inet_sk(sk); struct inet_sock *inet = inet_sk(sk);
struct ubuf_info *uarg = NULL;
struct sk_buff *skb; struct sk_buff *skb;
struct ip_options *opt = cork->opt; struct ip_options *opt = cork->opt;
@ -880,8 +881,8 @@ static int __ip_append_data(struct sock *sk,
int csummode = CHECKSUM_NONE; int csummode = CHECKSUM_NONE;
struct rtable *rt = (struct rtable *)cork->dst; struct rtable *rt = (struct rtable *)cork->dst;
unsigned int wmem_alloc_delta = 0; unsigned int wmem_alloc_delta = 0;
bool paged, extra_uref;
u32 tskey = 0; u32 tskey = 0;
bool paged;
skb = skb_peek_tail(queue); skb = skb_peek_tail(queue);
@ -916,6 +917,20 @@ static int __ip_append_data(struct sock *sk,
(!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM))) (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
csummode = CHECKSUM_PARTIAL; csummode = CHECKSUM_PARTIAL;
if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
if (!uarg)
return -ENOBUFS;
extra_uref = true;
if (rt->dst.dev->features & NETIF_F_SG &&
csummode == CHECKSUM_PARTIAL) {
paged = true;
} else {
uarg->zerocopy = 0;
skb_zcopy_set(skb, uarg, &extra_uref);
}
}
cork->length += length; cork->length += length;
/* So, what's going on in the loop below? /* So, what's going on in the loop below?
@ -1001,12 +1016,6 @@ alloc_new_skb:
skb->csum = 0; skb->csum = 0;
skb_reserve(skb, hh_len); skb_reserve(skb, hh_len);
/* only the initial fragment is time stamped */
skb_shinfo(skb)->tx_flags = cork->tx_flags;
cork->tx_flags = 0;
skb_shinfo(skb)->tskey = tskey;
tskey = 0;
/* /*
* Find where to start putting bytes. * Find where to start putting bytes.
*/ */
@ -1039,6 +1048,13 @@ alloc_new_skb:
exthdrlen = 0; exthdrlen = 0;
csummode = CHECKSUM_NONE; csummode = CHECKSUM_NONE;
/* only the initial fragment is time stamped */
skb_shinfo(skb)->tx_flags = cork->tx_flags;
cork->tx_flags = 0;
skb_shinfo(skb)->tskey = tskey;
tskey = 0;
skb_zcopy_set(skb, uarg, &extra_uref);
if ((flags & MSG_CONFIRM) && !skb_prev) if ((flags & MSG_CONFIRM) && !skb_prev)
skb_set_dst_pending_confirm(skb, 1); skb_set_dst_pending_confirm(skb, 1);
@ -1068,7 +1084,7 @@ alloc_new_skb:
err = -EFAULT; err = -EFAULT;
goto error; goto error;
} }
} else { } else if (!uarg || !uarg->zerocopy) {
int i = skb_shinfo(skb)->nr_frags; int i = skb_shinfo(skb)->nr_frags;
err = -ENOMEM; err = -ENOMEM;
@ -1098,6 +1114,10 @@ alloc_new_skb:
skb->data_len += copy; skb->data_len += copy;
skb->truesize += copy; skb->truesize += copy;
wmem_alloc_delta += copy; wmem_alloc_delta += copy;
} else {
err = skb_zerocopy_iter_dgram(skb, from, copy);
if (err < 0)
goto error;
} }
offset += copy; offset += copy;
length -= copy; length -= copy;
@ -1110,6 +1130,7 @@ alloc_new_skb:
error_efault: error_efault:
err = -EFAULT; err = -EFAULT;
error: error:
sock_zerocopy_put_abort(uarg, extra_uref);
cork->length -= length; cork->length -= length;
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);

View File

@ -1423,7 +1423,7 @@ do_error:
if (copied + copied_syn) if (copied + copied_syn)
goto out; goto out;
out_err: out_err:
sock_zerocopy_put_abort(uarg); sock_zerocopy_put_abort(uarg, true);
err = sk_stream_error(sk, flags, err); err = sk_stream_error(sk, flags, err);
/* make sure we wake any epoll edge trigger waiter */ /* make sure we wake any epoll edge trigger waiter */
if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&

View File

@ -1245,6 +1245,7 @@ static int __ip6_append_data(struct sock *sk,
{ {
struct sk_buff *skb, *skb_prev = NULL; struct sk_buff *skb, *skb_prev = NULL;
unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
struct ubuf_info *uarg = NULL;
int exthdrlen = 0; int exthdrlen = 0;
int dst_exthdrlen = 0; int dst_exthdrlen = 0;
int hh_len; int hh_len;
@ -1257,7 +1258,7 @@ static int __ip6_append_data(struct sock *sk,
int csummode = CHECKSUM_NONE; int csummode = CHECKSUM_NONE;
unsigned int maxnonfragsize, headersize; unsigned int maxnonfragsize, headersize;
unsigned int wmem_alloc_delta = 0; unsigned int wmem_alloc_delta = 0;
bool paged; bool paged, extra_uref;
skb = skb_peek_tail(queue); skb = skb_peek_tail(queue);
if (!skb) { if (!skb) {
@ -1322,6 +1323,20 @@ emsgsize:
rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
csummode = CHECKSUM_PARTIAL; csummode = CHECKSUM_PARTIAL;
if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
if (!uarg)
return -ENOBUFS;
extra_uref = true;
if (rt->dst.dev->features & NETIF_F_SG &&
csummode == CHECKSUM_PARTIAL) {
paged = true;
} else {
uarg->zerocopy = 0;
skb_zcopy_set(skb, uarg, &extra_uref);
}
}
/* /*
* Let's try using as much space as possible. * Let's try using as much space as possible.
* Use MTU if total length of the message fits into the MTU. * Use MTU if total length of the message fits into the MTU.
@ -1440,12 +1455,6 @@ alloc_new_skb:
skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
dst_exthdrlen); dst_exthdrlen);
/* Only the initial fragment is time stamped */
skb_shinfo(skb)->tx_flags = cork->tx_flags;
cork->tx_flags = 0;
skb_shinfo(skb)->tskey = tskey;
tskey = 0;
/* /*
* Find where to start putting bytes * Find where to start putting bytes
*/ */
@ -1477,6 +1486,13 @@ alloc_new_skb:
exthdrlen = 0; exthdrlen = 0;
dst_exthdrlen = 0; dst_exthdrlen = 0;
/* Only the initial fragment is time stamped */
skb_shinfo(skb)->tx_flags = cork->tx_flags;
cork->tx_flags = 0;
skb_shinfo(skb)->tskey = tskey;
tskey = 0;
skb_zcopy_set(skb, uarg, &extra_uref);
if ((flags & MSG_CONFIRM) && !skb_prev) if ((flags & MSG_CONFIRM) && !skb_prev)
skb_set_dst_pending_confirm(skb, 1); skb_set_dst_pending_confirm(skb, 1);
@ -1506,7 +1522,7 @@ alloc_new_skb:
err = -EFAULT; err = -EFAULT;
goto error; goto error;
} }
} else { } else if (!uarg || !uarg->zerocopy) {
int i = skb_shinfo(skb)->nr_frags; int i = skb_shinfo(skb)->nr_frags;
err = -ENOMEM; err = -ENOMEM;
@ -1536,6 +1552,10 @@ alloc_new_skb:
skb->data_len += copy; skb->data_len += copy;
skb->truesize += copy; skb->truesize += copy;
wmem_alloc_delta += copy; wmem_alloc_delta += copy;
} else {
err = skb_zerocopy_iter_dgram(skb, from, copy);
if (err < 0)
goto error;
} }
offset += copy; offset += copy;
length -= copy; length -= copy;
@ -1548,6 +1568,7 @@ alloc_new_skb:
error_efault: error_efault:
err = -EFAULT; err = -EFAULT;
error: error:
sock_zerocopy_put_abort(uarg, extra_uref);
cork->length -= length; cork->length -= length;
IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);

View File

@ -651,12 +651,13 @@ static void do_flush_datagram(int fd, int type)
static void do_rx(int domain, int type, int protocol) static void do_rx(int domain, int type, int protocol)
{ {
const int cfg_receiver_wait_ms = 400;
uint64_t tstop; uint64_t tstop;
int fd; int fd;
fd = do_setup_rx(domain, type, protocol); fd = do_setup_rx(domain, type, protocol);
tstop = gettimeofday_ms() + cfg_runtime_ms; tstop = gettimeofday_ms() + cfg_runtime_ms + cfg_receiver_wait_ms;
do { do {
if (type == SOCK_STREAM) if (type == SOCK_STREAM)
do_flush_tcp(fd); do_flush_tcp(fd);

View File

@ -25,6 +25,8 @@ readonly path_sysctl_mem="net.core.optmem_max"
if [[ "$#" -eq "0" ]]; then if [[ "$#" -eq "0" ]]; then
$0 4 tcp -t 1 $0 4 tcp -t 1
$0 6 tcp -t 1 $0 6 tcp -t 1
$0 4 udp -t 1
$0 6 udp -t 1
echo "OK. All tests passed" echo "OK. All tests passed"
exit 0 exit 0
fi fi

View File

@ -35,6 +35,9 @@ run_udp() {
echo "udp gso" echo "udp gso"
run_in_netns ${args} -S 0 run_in_netns ${args} -S 0
echo "udp gso zerocopy"
run_in_netns ${args} -S 0 -z
} }
run_tcp() { run_tcp() {