net-zerocopy: Defer vm zap unless actually needed.
Zapping pages is required only if we are calling vm_insert_page into a region where pages had previously been mapped. Receive zerocopy allows reusing such regions, and has hitherto called zap_page_range() before calling vm_insert_page() in that range.

zap_page_range() can also be triggered from userspace with madvise(MADV_DONTNEED). If userspace is configured to call this before reusing a segment, or if there was nothing mapped at this virtual address to begin with, we can avoid calling zap_page_range() under the socket lock. That said, if userspace does not do that, then we are still responsible for calling zap_page_range().

This patch adds a flag that the user can use to hint to the kernel that a zap is not required. If the flag is not set, or if an older user application does not have a flags field at all, then the kernel calls zap_page_range() as before. Also, if the flag is set but a zap is still required, the kernel performs that zap as necessary. Thus incorrectly indicating that a zap can be avoided does not change the correctness of operation. The patch also increases the batch size for vm_insert_pages() and prefetches the page struct for each page in the batch, since we are about to bump its refcount.

An alternative mechanism would be to have no flag at all, assume by default that a zap is not needed, and fall back to zapping when required. However, that would hurt performance for older applications for which a zap is necessary, so we implement it with an explicit flag that newer applications can opt in to.

When using RPC-style traffic with medium-sized (tens of KB) RPCs, this change yields an efficiency improvement of about 30% in QPS per unit of CPU usage.

Signed-off-by: Arjun Roy <arjunroy@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
commit 94ab9eb9b2
parent 0c3936d32f
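As a usage note (not part of the patch): a minimal userspace sketch of how a newer application might opt in to the hint. It assumes uapi headers that already carry the new flags field, drives receive zerocopy through getsockopt(TCP_ZEROCOPY_RECEIVE), and the helper name zerocopy_rx_once() is purely illustrative. As the commit message states, a wrong hint only costs a fallback zap, never correctness.

/* Illustrative only: opt in to TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT,
 * assuming the caller clears the mapping itself with MADV_DONTNEED before
 * reusing it. If the hint turns out to be wrong, the kernel still zaps
 * and retries on -EBUSY, so only performance is at stake.
 */
#include <string.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>

static int zerocopy_rx_once(int fd, void *addr, size_t len)
{
	struct tcp_zerocopy_receive zc;
	socklen_t zc_len = sizeof(zc);

	/* Userspace zaps the old mappings outside the socket lock... */
	madvise(addr, len, MADV_DONTNEED);

	memset(&zc, 0, sizeof(zc));
	zc.address = (__u64)(unsigned long)addr;
	zc.length = len;
	/* ...and tells the kernel it may skip zap_page_range(). */
	zc.flags = TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT;

	return getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE,
			  &zc, &zc_len);
}

An application that leaves flags at zero, or that passes the old, shorter struct, keeps the previous always-zap behaviour.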
include/uapi/linux/tcp.h

@@ -343,6 +343,7 @@ struct tcp_diag_md5sig {
 
 /* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */
 
+#define TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT 0x1
 struct tcp_zerocopy_receive {
 	__u64 address;		/* in: address of mapping */
 	__u32 length;		/* in/out: number of bytes to map/mapped */
@@ -351,5 +352,6 @@ struct tcp_zerocopy_receive {
 	__s32 err; /* out: socket error */
 	__u64 copybuf_address;	/* in: copybuf address (small reads) */
 	__s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */
+	__u32 flags; /* in: flags */
 };
 #endif /* _UAPI_LINUX_TCP_H */
net/ipv4/tcp.c (147 changed lines)
@@ -1924,51 +1924,101 @@ static int tcp_zerocopy_handle_leftover_data(struct tcp_zerocopy_receive *zc,
 	return zc->copybuf_len < 0 ? 0 : copylen;
 }
 
+static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
+					      struct page **pending_pages,
+					      unsigned long pages_remaining,
+					      unsigned long *address,
+					      u32 *length,
+					      u32 *seq,
+					      struct tcp_zerocopy_receive *zc,
+					      u32 total_bytes_to_map,
+					      int err)
+{
+	/* At least one page did not map. Try zapping if we skipped earlier. */
+	if (err == -EBUSY &&
+	    zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) {
+		u32 maybe_zap_len;
+
+		maybe_zap_len = total_bytes_to_map -  /* All bytes to map */
+				*length +	      /* Mapped or pending */
+				(pages_remaining * PAGE_SIZE); /* Failed map. */
+		zap_page_range(vma, *address, maybe_zap_len);
+		err = 0;
+	}
+
+	if (!err) {
+		unsigned long leftover_pages = pages_remaining;
+		int bytes_mapped;
+
+		/* We called zap_page_range, try to reinsert. */
+		err = vm_insert_pages(vma, *address,
+				      pending_pages,
+				      &pages_remaining);
+		bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining);
+		*seq += bytes_mapped;
+		*address += bytes_mapped;
+	}
+	if (err) {
+		/* Either we were unable to zap, OR we zapped, retried an
+		 * insert, and still had an issue. Either ways, pages_remaining
+		 * is the number of pages we were unable to map, and we unroll
+		 * some state we speculatively touched before.
+		 */
+		const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
+
+		*length -= bytes_not_mapped;
+		zc->recv_skip_hint += bytes_not_mapped;
+	}
+	return err;
+}
+
 static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
 					struct page **pages,
-					unsigned long pages_to_map,
-					unsigned long *insert_addr,
-					u32 *length_with_pending,
+					unsigned int pages_to_map,
+					unsigned long *address,
+					u32 *length,
 					u32 *seq,
-					struct tcp_zerocopy_receive *zc)
+					struct tcp_zerocopy_receive *zc,
+					u32 total_bytes_to_map)
 {
 	unsigned long pages_remaining = pages_to_map;
-	int bytes_mapped;
-	int ret;
+	unsigned int pages_mapped;
+	unsigned int bytes_mapped;
+	int err;
 
-	ret = vm_insert_pages(vma, *insert_addr, pages, &pages_remaining);
-	bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining);
+	err = vm_insert_pages(vma, *address, pages, &pages_remaining);
+	pages_mapped = pages_to_map - (unsigned int)pages_remaining;
+	bytes_mapped = PAGE_SIZE * pages_mapped;
 	/* Even if vm_insert_pages fails, it may have partially succeeded in
 	 * mapping (some but not all of the pages).
 	 */
 	*seq += bytes_mapped;
-	*insert_addr += bytes_mapped;
-	if (ret) {
-		/* But if vm_insert_pages did fail, we have to unroll some state
-		 * we speculatively touched before.
-		 */
-		const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
-		*length_with_pending -= bytes_not_mapped;
-		zc->recv_skip_hint += bytes_not_mapped;
-	}
-	return ret;
+	*address += bytes_mapped;
+
+	if (likely(!err))
+		return 0;
+
+	/* Error: maybe zap and retry + rollback state for failed inserts. */
+	return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped,
+		pages_remaining, address, length, seq, zc, total_bytes_to_map,
+		err);
 }
 
+#define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
 static int tcp_zerocopy_receive(struct sock *sk,
 				struct tcp_zerocopy_receive *zc)
 {
-	u32 length = 0, offset, vma_len, avail_len, aligned_len, copylen = 0;
+	u32 length = 0, offset, vma_len, avail_len, copylen = 0;
 	unsigned long address = (unsigned long)zc->address;
+	struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE];
 	s32 copybuf_len = zc->copybuf_len;
 	struct tcp_sock *tp = tcp_sk(sk);
-	#define PAGE_BATCH_SIZE 8
-	struct page *pages[PAGE_BATCH_SIZE];
 	const skb_frag_t *frags = NULL;
+	unsigned int pages_to_map = 0;
 	struct vm_area_struct *vma;
 	struct sk_buff *skb = NULL;
-	unsigned long pg_idx = 0;
-	unsigned long curr_addr;
 	u32 seq = tp->copied_seq;
+	u32 total_bytes_to_map;
 	int inq = tcp_inq(sk);
 	int ret;
 
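An aside on the maybe_zap_len computation in the hunk above (not part of the patch): it covers everything from the current insert address to the end of the intended mapping window, i.e. the pages that just failed to insert plus the bytes not yet attempted. A tiny standalone check of that arithmetic, with made-up page counts for illustration only:

#include <assert.h>

#define EX_PAGE_SIZE 4096UL	/* example value; the kernel uses PAGE_SIZE */

int main(void)
{
	unsigned long total_bytes_to_map = 64 * EX_PAGE_SIZE; /* whole window */
	unsigned long length = 32 * EX_PAGE_SIZE;  /* mapped or pending so far */
	unsigned long pages_remaining = 8;  /* pages vm_insert_pages() refused */

	/* "All bytes to map" - "mapped or pending" + "failed map":
	 * the 8 failed pages plus the 32 pages not yet attempted.
	 */
	unsigned long maybe_zap_len = total_bytes_to_map - length +
				      pages_remaining * EX_PAGE_SIZE;

	assert(maybe_zap_len == 40 * EX_PAGE_SIZE);
	return 0;
}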
@@ -2002,34 +2052,24 @@ static int tcp_zerocopy_receive(struct sock *sk,
 	}
 	vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
 	avail_len = min_t(u32, vma_len, inq);
-	aligned_len = avail_len & ~(PAGE_SIZE - 1);
-	if (aligned_len) {
-		zap_page_range(vma, address, aligned_len);
-		zc->length = aligned_len;
+	total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1);
+	if (total_bytes_to_map) {
+		if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT))
+			zap_page_range(vma, address, total_bytes_to_map);
+
+		zc->length = total_bytes_to_map;
 		zc->recv_skip_hint = 0;
 	} else {
 		zc->length = avail_len;
 		zc->recv_skip_hint = avail_len;
 	}
 	ret = 0;
-	curr_addr = address;
 	while (length + PAGE_SIZE <= zc->length) {
 		int mappable_offset;
+		struct page *page;
 
 		if (zc->recv_skip_hint < PAGE_SIZE) {
 			u32 offset_frag;
 
-			/* If we're here, finish the current batch. */
-			if (pg_idx) {
-				ret = tcp_zerocopy_vm_insert_batch(vma, pages,
-								   pg_idx,
-								   &curr_addr,
-								   &length,
-								   &seq, zc);
-				if (ret)
-					goto out;
-				pg_idx = 0;
-			}
 			if (skb) {
 				if (zc->recv_skip_hint > 0)
 					break;
@@ -2050,24 +2090,31 @@ static int tcp_zerocopy_receive(struct sock *sk,
 			zc->recv_skip_hint = mappable_offset;
 			break;
 		}
-		pages[pg_idx] = skb_frag_page(frags);
-		pg_idx++;
+		page = skb_frag_page(frags);
+		prefetchw(page);
+		pages[pages_to_map++] = page;
 		length += PAGE_SIZE;
 		zc->recv_skip_hint -= PAGE_SIZE;
 		frags++;
-		if (pg_idx == PAGE_BATCH_SIZE) {
-			ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
-							   &curr_addr, &length,
-							   &seq, zc);
+		if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE ||
+		    zc->recv_skip_hint < PAGE_SIZE) {
+			/* Either full batch, or we're about to go to next skb
+			 * (and we cannot unroll failed ops across skbs).
+			 */
+			ret = tcp_zerocopy_vm_insert_batch(vma, pages,
+							   pages_to_map,
+							   &address, &length,
+							   &seq, zc,
+							   total_bytes_to_map);
 			if (ret)
 				goto out;
-			pg_idx = 0;
+			pages_to_map = 0;
 		}
 	}
-	if (pg_idx) {
-		ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
-						   &curr_addr, &length, &seq,
-						   zc);
+	if (pages_to_map) {
+		ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map,
+						   &address, &length, &seq,
+						   zc, total_bytes_to_map);
 	}
 out:
 	mmap_read_unlock(current->mm);