From 9e2db9d3993e270b24fbc4ce1ca7e09756e8df25 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 27 Jun 2024 13:59:41 +0100 Subject: [PATCH 1/5] net: always try to set ubuf in skb_zerocopy_iter_stream skb_zcopy_set() does nothing if there is already a ubuf_info associated with an skb, and since ->link_skb should have set it several lines above, the check here essentially does nothing and can be removed. It's also safer this way, because even if the callback is faulty we'll have it set. Signed-off-by: Pavel Begunkov Reviewed-by: Willem de Bruijn Reviewed-by: Jens Axboe Signed-off-by: Paolo Abeni --- net/core/skbuff.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index eb9a7e65b5c8..52986e1ce13e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1899,8 +1899,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, return err; } - if (!uarg->ops->link_skb) - skb_zcopy_set(skb, uarg, NULL); + skb_zcopy_set(skb, uarg, NULL); return skb->len - orig_len; } EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); From 7fb05423fed41686ccc1a76c20d486728f62023f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 27 Jun 2024 13:59:42 +0100 Subject: [PATCH 2/5] net: split __zerocopy_sg_from_iter() Split a function out of __zerocopy_sg_from_iter() that only cares about the traditional path with refcounted pages and doesn't need to know about ->sg_from_iter. This is a preparation patch; we'll improve on the function later. 
Signed-off-by: Pavel Begunkov Reviewed-by: Willem de Bruijn Reviewed-by: Jens Axboe Signed-off-by: Paolo Abeni --- net/core/datagram.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/net/core/datagram.c b/net/core/datagram.c index e614cfd8e14a..ef81d6ecbe1e 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -610,16 +610,10 @@ fault: } EXPORT_SYMBOL(skb_copy_datagram_from_iter); -int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, - struct sk_buff *skb, struct iov_iter *from, - size_t length) +static int zerocopy_fill_skb_from_iter(struct sock *sk, struct sk_buff *skb, + struct iov_iter *from, size_t length) { - int frag; - - if (msg && msg->msg_ubuf && msg->sg_from_iter) - return msg->sg_from_iter(sk, skb, from, length); - - frag = skb_shinfo(skb)->nr_frags; + int frag = skb_shinfo(skb)->nr_frags; while (length && iov_iter_count(from)) { struct page *head, *last_head = NULL; @@ -692,6 +686,16 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, } return 0; } + +int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb, struct iov_iter *from, + size_t length) +{ + if (msg && msg->msg_ubuf && msg->sg_from_iter) + return msg->sg_from_iter(sk, skb, from, length); + else + return zerocopy_fill_skb_from_iter(sk, skb, from, length); +} EXPORT_SYMBOL(__zerocopy_sg_from_iter); /** From aeb320fc05c74e1d3b429aa0e3a777b8a931c189 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 27 Jun 2024 13:59:43 +0100 Subject: [PATCH 3/5] net: batch zerocopy_fill_skb_from_iter accounting Instead of accounting every page range against the socket separately, do it in batch based on the change in skb->truesize. It's also moved into __zerocopy_sg_from_iter(), so that zerocopy_fill_skb_from_iter() is simpler and responsible for setting frags but not the accounting. 
Signed-off-by: Pavel Begunkov Reviewed-by: Willem de Bruijn Reviewed-by: Jens Axboe Signed-off-by: Paolo Abeni --- net/core/datagram.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/net/core/datagram.c b/net/core/datagram.c index ef81d6ecbe1e..b0dccefd4a09 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -610,7 +610,7 @@ fault: } EXPORT_SYMBOL(skb_copy_datagram_from_iter); -static int zerocopy_fill_skb_from_iter(struct sock *sk, struct sk_buff *skb, +static int zerocopy_fill_skb_from_iter(struct sk_buff *skb, struct iov_iter *from, size_t length) { int frag = skb_shinfo(skb)->nr_frags; @@ -621,7 +621,6 @@ static int zerocopy_fill_skb_from_iter(struct sock *sk, struct sk_buff *skb, int refs, order, n = 0; size_t start; ssize_t copied; - unsigned long truesize; if (frag == MAX_SKB_FRAGS) return -EMSGSIZE; @@ -633,17 +632,9 @@ static int zerocopy_fill_skb_from_iter(struct sock *sk, struct sk_buff *skb, length -= copied; - truesize = PAGE_ALIGN(copied + start); skb->data_len += copied; skb->len += copied; - skb->truesize += truesize; - if (sk && sk->sk_type == SOCK_STREAM) { - sk_wmem_queued_add(sk, truesize); - if (!skb_zcopy_pure(skb)) - sk_mem_charge(sk, truesize); - } else { - refcount_add(truesize, &skb->sk->sk_wmem_alloc); - } + skb->truesize += PAGE_ALIGN(copied + start); head = compound_head(pages[n]); order = compound_order(head); @@ -691,10 +682,24 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length) { + unsigned long orig_size = skb->truesize; + unsigned long truesize; + int ret; + if (msg && msg->msg_ubuf && msg->sg_from_iter) return msg->sg_from_iter(sk, skb, from, length); - else - return zerocopy_fill_skb_from_iter(sk, skb, from, length); + + ret = zerocopy_fill_skb_from_iter(skb, from, length); + truesize = skb->truesize - orig_size; + + if (sk && sk->sk_type == SOCK_STREAM) { + sk_wmem_queued_add(sk, truesize); + 
if (!skb_zcopy_pure(skb)) + sk_mem_charge(sk, truesize); + } else { + refcount_add(truesize, &skb->sk->sk_wmem_alloc); + } + return ret; } EXPORT_SYMBOL(__zerocopy_sg_from_iter); From 060f4ba6e40338a70932603a3564903acf5f5734 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 27 Jun 2024 13:59:44 +0100 Subject: [PATCH 4/5] io_uring/net: move charging socket out of zc io_uring Currently, io_uring's io_sg_from_iter() duplicates the part of __zerocopy_sg_from_iter() charging pages to the socket. It'd be too easy to miss while changing it in net/, the chunk is not the most straightforward for outside users and full of internal implementation details. io_uring is not a good place to keep it, deduplicate it by moving out of the callback into __zerocopy_sg_from_iter(). Signed-off-by: Pavel Begunkov Reviewed-by: Willem de Bruijn Reviewed-by: Jens Axboe Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 3 +++ include/linux/socket.h | 2 +- io_uring/net.c | 16 ++++------------ net/core/datagram.c | 10 +++++----- 4 files changed, 13 insertions(+), 18 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f4cda3fbdb75..9c29bdd5596d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1703,6 +1703,9 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length); +int zerocopy_fill_skb_from_iter(struct sk_buff *skb, + struct iov_iter *from, size_t length); + static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) { diff --git a/include/linux/socket.h b/include/linux/socket.h index 89d16b90370b..2a1ff91d1914 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -76,7 +76,7 @@ struct msghdr { __kernel_size_t msg_controllen; /* ancillary data buffer length */ struct kiocb *msg_iocb; /* ptr to iocb for async requests */ struct ubuf_info *msg_ubuf; - int (*sg_from_iter)(struct sock *sk, struct sk_buff *skb, + int 
(*sg_from_iter)(struct sk_buff *skb, struct iov_iter *from, size_t length); }; diff --git a/io_uring/net.c b/io_uring/net.c index 7c98c4d50946..84a7602bcef1 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1265,14 +1265,14 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG_ZC); } -static int io_sg_from_iter_iovec(struct sock *sk, struct sk_buff *skb, +static int io_sg_from_iter_iovec(struct sk_buff *skb, struct iov_iter *from, size_t length) { skb_zcopy_downgrade_managed(skb); - return __zerocopy_sg_from_iter(NULL, sk, skb, from, length); + return zerocopy_fill_skb_from_iter(skb, from, length); } -static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb, +static int io_sg_from_iter(struct sk_buff *skb, struct iov_iter *from, size_t length) { struct skb_shared_info *shinfo = skb_shinfo(skb); @@ -1285,7 +1285,7 @@ static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb, if (!frag) shinfo->flags |= SKBFL_MANAGED_FRAG_REFS; else if (unlikely(!skb_zcopy_managed(skb))) - return __zerocopy_sg_from_iter(NULL, sk, skb, from, length); + return zerocopy_fill_skb_from_iter(skb, from, length); bi.bi_size = min(from->count, length); bi.bi_bvec_done = from->iov_offset; @@ -1312,14 +1312,6 @@ static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb, skb->data_len += copied; skb->len += copied; skb->truesize += truesize; - - if (sk && sk->sk_type == SOCK_STREAM) { - sk_wmem_queued_add(sk, truesize); - if (!skb_zcopy_pure(skb)) - sk_mem_charge(sk, truesize); - } else { - refcount_add(truesize, &skb->sk->sk_wmem_alloc); - } return ret; } diff --git a/net/core/datagram.c b/net/core/datagram.c index b0dccefd4a09..16507b7cc4fb 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -610,8 +610,8 @@ fault: } EXPORT_SYMBOL(skb_copy_datagram_from_iter); -static int zerocopy_fill_skb_from_iter(struct sk_buff *skb, - struct iov_iter *from, size_t length) +int 
zerocopy_fill_skb_from_iter(struct sk_buff *skb, + struct iov_iter *from, size_t length) { int frag = skb_shinfo(skb)->nr_frags; @@ -687,11 +687,11 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, int ret; if (msg && msg->msg_ubuf && msg->sg_from_iter) - return msg->sg_from_iter(sk, skb, from, length); + ret = msg->sg_from_iter(skb, from, length); + else + ret = zerocopy_fill_skb_from_iter(skb, from, length); - ret = zerocopy_fill_skb_from_iter(skb, from, length); truesize = skb->truesize - orig_size; - if (sk && sk->sk_type == SOCK_STREAM) { sk_wmem_queued_add(sk, truesize); if (!skb_zcopy_pure(skb)) From 2ca58ed21cefdda45520a0a2b1980c008efe9874 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 27 Jun 2024 13:59:45 +0100 Subject: [PATCH 5/5] net: limit scope of a skb_zerocopy_iter_stream var skb_zerocopy_iter_stream() only uses @orig_uarg in the !link_skb path, and we can move the local variable into the appropriate block. Signed-off-by: Pavel Begunkov Reviewed-by: Willem de Bruijn Reviewed-by: Jens Axboe Signed-off-by: Paolo Abeni --- net/core/skbuff.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 52986e1ce13e..0ed4d00d258c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1871,7 +1871,6 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg) { - struct ubuf_info *orig_uarg = skb_zcopy(skb); int err, orig_len = skb->len; if (uarg->ops->link_skb) { @@ -1879,6 +1878,8 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, if (err) return err; } else { + struct ubuf_info *orig_uarg = skb_zcopy(skb); + /* An skb can only point to one uarg. This edge case happens * when TCP appends to an skb, but zerocopy_realloc triggered * a new alloc.