Merge branch 'RDS-zerocopy-support'

Sowmini Varadhan says:

====================
RDS: zerocopy support

This is version 3 of the series, following up on review comments for
 http://patchwork.ozlabs.org/project/netdev/list/?series=28530

Review comments addressed
Patch 4
  - fix fragile use of skb->cb[], do not set ee_code incorrectly.
Patch 5:
  - remove needless bzero of skb->cb[], consolidate err cleanup

A brief overview of this feature follows.

This patch series provides support for MSG_ZERCOCOPY
on a PF_RDS socket based on the APIs and infrastructure added
by Commit f214f915e7db ("tcp: enable MSG_ZEROCOPY")

For single threaded rds-stress testing using rds-tcp with the
ixgbe driver using 1M message sizes (-a 1M -q 1M) preliminary
results show that  there is a significant reduction in latency: about
90 usec with zerocopy, compared with 200 usec without zerocopy.

This patchset modifies the above for zerocopy in the following manner.
- if the MSG_ZEROCOPY flag is specified with rds_sendmsg(), and,
- if the SO_ZEROCOPY  socket option has been set on the PF_RDS socket,
application pages sent down with rds_sendmsg are pinned. The pinning
uses the accounting infrastructure added by a91dbff551a6 ("sock: ulimit
on MSG_ZEROCOPY pages"). The message is unpinned when all references
to the message go down to 0, and the message is freed by rds_message_purge.

A multithreaded application using this infrastructure must send down
a unique 32 bit cookie as ancillary data with each sendmsg invocation.
The format of this ancillary data is described in Patch 5 of the series.
The cookie is passed up to the application on the sk_error_queue when
the message is unpinned, indicating to the application that it is now
safe to free/reuse the message buffer. The details of the completion
notification are provided in Patch 4 of this series.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2018-02-16 16:04:18 -05:00
commit 80c6d2b8d8
11 changed files with 339 additions and 35 deletions

View File

@ -466,6 +466,9 @@ struct ubuf_info {
#define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))
int mm_account_pinned_pages(struct mmpin *mmp, size_t size);
void mm_unaccount_pinned_pages(struct mmpin *mmp);
struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size);
struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
struct ubuf_info *uarg);

View File

@ -20,11 +20,13 @@ struct sock_extended_err {
#define SO_EE_ORIGIN_ICMP6 3
#define SO_EE_ORIGIN_TXSTATUS 4
#define SO_EE_ORIGIN_ZEROCOPY 5
#define SO_EE_ORIGIN_ZCOOKIE 6
#define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS
#define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1))
#define SO_EE_CODE_ZEROCOPY_COPIED 1
#define SO_EE_ORIGIN_MAX_ZCOOKIES 8
/**
* struct scm_timestamping - timestamps exposed through cmsg

View File

@ -103,6 +103,7 @@
#define RDS_CMSG_MASKED_ATOMIC_FADD 8
#define RDS_CMSG_MASKED_ATOMIC_CSWP 9
#define RDS_CMSG_RXPATH_LATENCY 11
#define RDS_CMSG_ZCOPY_COOKIE 12
#define RDS_INFO_FIRST 10000
#define RDS_INFO_COUNTERS 10000

View File

@ -890,7 +890,7 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
}
EXPORT_SYMBOL_GPL(skb_morph);
static int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
{
unsigned long max_pg, num_pg, new_pg, old_pg;
struct user_struct *user;
@ -919,14 +919,16 @@ static int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
return 0;
}
EXPORT_SYMBOL_GPL(mm_account_pinned_pages);
static void mm_unaccount_pinned_pages(struct mmpin *mmp)
void mm_unaccount_pinned_pages(struct mmpin *mmp)
{
if (mmp->user) {
atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
free_uid(mmp->user);
}
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
{

View File

@ -1049,18 +1049,21 @@ set_rcvbuf:
break;
case SO_ZEROCOPY:
if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
if (sk->sk_protocol != IPPROTO_TCP)
ret = -ENOTSUPP;
else if (sk->sk_state != TCP_CLOSE)
ret = -EBUSY;
} else if (sk->sk_family != PF_RDS) {
ret = -ENOTSUPP;
else if (sk->sk_protocol != IPPROTO_TCP)
ret = -ENOTSUPP;
else if (sk->sk_state != TCP_CLOSE)
ret = -EBUSY;
else if (val < 0 || val > 1)
ret = -EINVAL;
else
sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
break;
}
if (!ret) {
if (val < 0 || val > 1)
ret = -EINVAL;
else
sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
break;
}
default:
ret = -ENOPROTOOPT;
break;

View File

@ -182,6 +182,8 @@ static __poll_t rds_poll(struct file *file, struct socket *sock,
mask |= (EPOLLIN | EPOLLRDNORM);
if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
mask |= (EPOLLOUT | EPOLLWRNORM);
if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
mask |= POLLERR;
read_unlock_irqrestore(&rs->rs_recv_lock, flags);
/* clear state any time we wake a seen-congested socket */

View File

@ -33,6 +33,9 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/skbuff.h>
#include <linux/list.h>
#include <linux/errqueue.h>
#include "rds.h"
@ -53,20 +56,92 @@ void rds_message_addref(struct rds_message *rm)
}
EXPORT_SYMBOL_GPL(rds_message_addref);
static inline bool skb_zcookie_add(struct sk_buff *skb, u32 cookie)
{
struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
int ncookies;
u32 *ptr;
if (serr->ee.ee_origin != SO_EE_ORIGIN_ZCOOKIE)
return false;
ncookies = serr->ee.ee_data;
if (ncookies == SO_EE_ORIGIN_MAX_ZCOOKIES)
return false;
ptr = skb_put(skb, sizeof(u32));
*ptr = cookie;
serr->ee.ee_data = ++ncookies;
return true;
}
static void rds_rm_zerocopy_callback(struct rds_sock *rs,
struct rds_znotifier *znotif)
{
struct sock *sk = rds_rs_to_sk(rs);
struct sk_buff *skb, *tail;
struct sock_exterr_skb *serr;
unsigned long flags;
struct sk_buff_head *q;
u32 cookie = znotif->z_cookie;
q = &sk->sk_error_queue;
spin_lock_irqsave(&q->lock, flags);
tail = skb_peek_tail(q);
if (tail && skb_zcookie_add(tail, cookie)) {
spin_unlock_irqrestore(&q->lock, flags);
mm_unaccount_pinned_pages(&znotif->z_mmp);
consume_skb(rds_skb_from_znotifier(znotif));
sk->sk_error_report(sk);
return;
}
skb = rds_skb_from_znotifier(znotif);
serr = SKB_EXT_ERR(skb);
memset(&serr->ee, 0, sizeof(serr->ee));
serr->ee.ee_errno = 0;
serr->ee.ee_origin = SO_EE_ORIGIN_ZCOOKIE;
serr->ee.ee_info = 0;
WARN_ON(!skb_zcookie_add(skb, cookie));
__skb_queue_tail(q, skb);
spin_unlock_irqrestore(&q->lock, flags);
sk->sk_error_report(sk);
mm_unaccount_pinned_pages(&znotif->z_mmp);
}
/*
* This relies on dma_map_sg() not touching sg[].page during merging.
*/
static void rds_message_purge(struct rds_message *rm)
{
unsigned long i;
unsigned long i, flags;
bool zcopy = false;
if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
return;
spin_lock_irqsave(&rm->m_rs_lock, flags);
if (rm->m_rs) {
struct rds_sock *rs = rm->m_rs;
if (rm->data.op_mmp_znotifier) {
zcopy = true;
rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier);
rm->data.op_mmp_znotifier = NULL;
}
sock_put(rds_rs_to_sk(rs));
rm->m_rs = NULL;
}
spin_unlock_irqrestore(&rm->m_rs_lock, flags);
for (i = 0; i < rm->data.op_nents; i++) {
rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i]));
/* XXX will have to put_page for page refs */
__free_page(sg_page(&rm->data.op_sg[i]));
if (!zcopy)
__free_page(sg_page(&rm->data.op_sg[i]));
else
put_page(sg_page(&rm->data.op_sg[i]));
}
rm->data.op_nents = 0;
@ -266,12 +341,14 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
return rm;
}
int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from)
int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from,
bool zcopy)
{
unsigned long to_copy, nbytes;
unsigned long sg_off;
struct scatterlist *sg;
int ret = 0;
int length = iov_iter_count(from);
rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from));
@ -281,6 +358,53 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from)
sg = rm->data.op_sg;
sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
if (zcopy) {
int total_copied = 0;
struct sk_buff *skb;
skb = alloc_skb(SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(u32),
GFP_KERNEL);
if (!skb)
return -ENOMEM;
rm->data.op_mmp_znotifier = RDS_ZCOPY_SKB(skb);
if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp,
length)) {
ret = -ENOMEM;
goto err;
}
while (iov_iter_count(from)) {
struct page *pages;
size_t start;
ssize_t copied;
copied = iov_iter_get_pages(from, &pages, PAGE_SIZE,
1, &start);
if (copied < 0) {
struct mmpin *mmp;
int i;
for (i = 0; i < rm->data.op_nents; i++)
put_page(sg_page(&rm->data.op_sg[i]));
mmp = &rm->data.op_mmp_znotifier->z_mmp;
mm_unaccount_pinned_pages(mmp);
ret = -EFAULT;
goto err;
}
total_copied += copied;
iov_iter_advance(from, copied);
length -= copied;
sg_set_page(sg, pages, copied, start);
rm->data.op_nents++;
sg++;
}
WARN_ON_ONCE(length != 0);
return ret;
err:
consume_skb(skb);
rm->data.op_mmp_znotifier = NULL;
return ret;
} /* zcopy */
while (iov_iter_count(from)) {
if (!sg_page(sg)) {
ret = rds_page_remainder_alloc(sg, iov_iter_count(from),

View File

@ -356,6 +356,19 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
#define RDS_MSG_PAGEVEC 7
#define RDS_MSG_FLUSH 8
struct rds_znotifier {
struct list_head z_list;
struct mmpin z_mmp;
u32 z_cookie;
};
#define RDS_ZCOPY_SKB(__skb) ((struct rds_znotifier *)&((__skb)->cb[0]))
static inline struct sk_buff *rds_skb_from_znotifier(struct rds_znotifier *z)
{
return container_of((void *)z, struct sk_buff, cb);
}
struct rds_message {
refcount_t m_refcount;
struct list_head m_sock_item;
@ -436,6 +449,7 @@ struct rds_message {
unsigned int op_count;
unsigned int op_dmasg;
unsigned int op_dmaoff;
struct rds_znotifier *op_mmp_znotifier;
struct scatterlist *op_sg;
} data;
};
@ -771,7 +785,8 @@ rds_conn_connecting(struct rds_connection *conn)
/* message.c */
struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from);
int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from,
bool zcopy);
struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
__be16 dport, u64 seq);

View File

@ -594,6 +594,8 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
if (msg_flags & MSG_OOB)
goto out;
if (msg_flags & MSG_ERRQUEUE)
return sock_recv_errqueue(sk, msg, size, SOL_IP, IP_RECVERR);
while (1) {
/* If there are pending notifications, do those - and nothing else */

View File

@ -649,7 +649,6 @@ static void rds_send_remove_from_sock(struct list_head *messages, int status)
rm->rdma.op_notifier = NULL;
}
was_on_sock = 1;
rm->m_rs = NULL;
}
spin_unlock(&rs->rs_lock);
@ -756,9 +755,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
*/
if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
spin_unlock_irqrestore(&cp->cp_lock, flags);
spin_lock_irqsave(&rm->m_rs_lock, flags);
rm->m_rs = NULL;
spin_unlock_irqrestore(&rm->m_rs_lock, flags);
continue;
}
list_del_init(&rm->m_conn_item);
@ -774,7 +770,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
spin_unlock(&rs->rs_lock);
rm->m_rs = NULL;
spin_unlock_irqrestore(&rm->m_rs_lock, flags);
rds_message_put(rm);
@ -798,7 +793,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
spin_unlock(&rs->rs_lock);
rm->m_rs = NULL;
spin_unlock_irqrestore(&rm->m_rs_lock, flags);
rds_message_put(rm);
@ -849,6 +843,7 @@ static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
rds_message_addref(rm);
sock_hold(rds_rs_to_sk(rs));
rm->m_rs = rs;
/* The code ordering is a little weird, but we're
@ -880,12 +875,13 @@ out:
* rds_message is getting to be quite complicated, and we'd like to allocate
* it all in one go. This figures out how big it needs to be up front.
*/
static int rds_rm_size(struct msghdr *msg, int data_len)
static int rds_rm_size(struct msghdr *msg, int num_sgs)
{
struct cmsghdr *cmsg;
int size = 0;
int cmsg_groups = 0;
int retval;
bool zcopy_cookie = false;
for_each_cmsghdr(cmsg, msg) {
if (!CMSG_OK(msg, cmsg))
@ -904,6 +900,8 @@ static int rds_rm_size(struct msghdr *msg, int data_len)
break;
case RDS_CMSG_ZCOPY_COOKIE:
zcopy_cookie = true;
case RDS_CMSG_RDMA_DEST:
case RDS_CMSG_RDMA_MAP:
cmsg_groups |= 2;
@ -924,7 +922,10 @@ static int rds_rm_size(struct msghdr *msg, int data_len)
}
size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
if ((msg->msg_flags & MSG_ZEROCOPY) && !zcopy_cookie)
return -EINVAL;
size += num_sgs * sizeof(struct scatterlist);
/* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
if (cmsg_groups == 3)
@ -933,6 +934,18 @@ static int rds_rm_size(struct msghdr *msg, int data_len)
return size;
}
static int rds_cmsg_zcopy(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg)
{
u32 *cookie;
if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie)))
return -EINVAL;
cookie = CMSG_DATA(cmsg);
rm->data.op_mmp_znotifier->z_cookie = *cookie;
return 0;
}
static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
struct msghdr *msg, int *allocated_mr)
{
@ -975,6 +988,10 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
ret = rds_cmsg_atomic(rs, rm, cmsg);
break;
case RDS_CMSG_ZCOPY_COOKIE:
ret = rds_cmsg_zcopy(rs, rm, cmsg);
break;
default:
return -EINVAL;
}
@ -1045,10 +1062,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
long timeo = sock_sndtimeo(sk, nonblock);
struct rds_conn_path *cpath;
size_t total_payload_len = payload_len, rdma_payload_len = 0;
bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) &&
sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY));
int num_sgs = ceil(payload_len, PAGE_SIZE);
/* Mirror Linux UDP mirror of BSD error message compatibility */
/* XXX: Perhaps MSG_MORE someday */
if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) {
if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT | MSG_ZEROCOPY)) {
ret = -EOPNOTSUPP;
goto out;
}
@ -1092,8 +1112,15 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
goto out;
}
if (zcopy) {
if (rs->rs_transport->t_type != RDS_TRANS_TCP) {
ret = -EOPNOTSUPP;
goto out;
}
num_sgs = iov_iter_npages(&msg->msg_iter, INT_MAX);
}
/* size of rm including all sgs */
ret = rds_rm_size(msg, payload_len);
ret = rds_rm_size(msg, num_sgs);
if (ret < 0)
goto out;
@ -1105,12 +1132,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
/* Attach data to the rm */
if (payload_len) {
rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
if (!rm->data.op_sg) {
ret = -ENOMEM;
goto out;
}
ret = rds_message_copy_from_user(rm, &msg->msg_iter);
ret = rds_message_copy_from_user(rm, &msg->msg_iter, zcopy);
if (ret)
goto out;
}

View File

@ -14,6 +14,9 @@
* - SOCK_DGRAM
* - SOCK_RAW
*
* PF_RDS
* - SOCK_SEQPACKET
*
* Start this program on two connected hosts, one in send mode and
* the other with option '-r' to put it in receiver mode.
*
@ -53,6 +56,7 @@
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <linux/rds.h>
#ifndef SO_EE_ORIGIN_ZEROCOPY
#define SO_EE_ORIGIN_ZEROCOPY 5
@ -164,17 +168,39 @@ static int do_accept(int fd)
return fd;
}
static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy)
static void add_zcopy_cookie(struct msghdr *msg, uint32_t cookie)
{
struct cmsghdr *cm;
if (!msg->msg_control)
error(1, errno, "NULL cookie");
cm = (void *)msg->msg_control;
cm->cmsg_len = CMSG_LEN(sizeof(cookie));
cm->cmsg_level = SOL_RDS;
cm->cmsg_type = RDS_CMSG_ZCOPY_COOKIE;
memcpy(CMSG_DATA(cm), &cookie, sizeof(cookie));
}
static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy, int domain)
{
int ret, len, i, flags;
static uint32_t cookie;
char ckbuf[CMSG_SPACE(sizeof(cookie))];
len = 0;
for (i = 0; i < msg->msg_iovlen; i++)
len += msg->msg_iov[i].iov_len;
flags = MSG_DONTWAIT;
if (do_zerocopy)
if (do_zerocopy) {
flags |= MSG_ZEROCOPY;
if (domain == PF_RDS) {
memset(&msg->msg_control, 0, sizeof(msg->msg_control));
msg->msg_controllen = CMSG_SPACE(sizeof(cookie));
msg->msg_control = (struct cmsghdr *)ckbuf;
add_zcopy_cookie(msg, ++cookie);
}
}
ret = sendmsg(fd, msg, flags);
if (ret == -1 && errno == EAGAIN)
@ -190,6 +216,10 @@ static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy)
if (do_zerocopy && ret)
expected_completions++;
}
if (do_zerocopy && domain == PF_RDS) {
msg->msg_control = NULL;
msg->msg_controllen = 0;
}
return true;
}
@ -216,7 +246,9 @@ static void do_sendmsg_corked(int fd, struct msghdr *msg)
msg->msg_iov[0].iov_len = payload_len + extra_len;
extra_len = 0;
do_sendmsg(fd, msg, do_zerocopy);
do_sendmsg(fd, msg, do_zerocopy,
(cfg_dst_addr.ss_family == AF_INET ?
PF_INET : PF_INET6));
}
do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
@ -300,13 +332,38 @@ static int do_setup_tx(int domain, int type, int protocol)
if (cfg_zerocopy)
do_setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, 1);
if (domain != PF_PACKET)
if (domain != PF_PACKET && domain != PF_RDS)
if (connect(fd, (void *) &cfg_dst_addr, cfg_alen))
error(1, errno, "connect");
if (domain == PF_RDS) {
if (bind(fd, (void *) &cfg_src_addr, cfg_alen))
error(1, errno, "bind");
}
return fd;
}
static int do_process_zerocopy_cookies(struct sock_extended_err *serr,
uint32_t *ckbuf, size_t nbytes)
{
int ncookies, i;
if (serr->ee_errno != 0)
error(1, 0, "serr: wrong error code: %u", serr->ee_errno);
ncookies = serr->ee_data;
if (ncookies > SO_EE_ORIGIN_MAX_ZCOOKIES)
error(1, 0, "Returned %d cookies, max expected %d\n",
ncookies, SO_EE_ORIGIN_MAX_ZCOOKIES);
if (nbytes != ncookies * sizeof(uint32_t))
error(1, 0, "Expected %d cookies, got %ld\n",
ncookies, nbytes/sizeof(uint32_t));
for (i = 0; i < ncookies; i++)
if (cfg_verbose >= 2)
fprintf(stderr, "%d\n", ckbuf[i]);
return ncookies;
}
static bool do_recv_completion(int fd)
{
struct sock_extended_err *serr;
@ -315,10 +372,17 @@ static bool do_recv_completion(int fd)
uint32_t hi, lo, range;
int ret, zerocopy;
char control[100];
uint32_t ckbuf[SO_EE_ORIGIN_MAX_ZCOOKIES];
struct iovec iov;
msg.msg_control = control;
msg.msg_controllen = sizeof(control);
iov.iov_base = ckbuf;
iov.iov_len = (SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(ckbuf[0]));
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
if (ret == -1 && errno == EAGAIN)
return false;
@ -337,6 +401,11 @@ static bool do_recv_completion(int fd)
cm->cmsg_level, cm->cmsg_type);
serr = (void *) CMSG_DATA(cm);
if (serr->ee_origin == SO_EE_ORIGIN_ZCOOKIE) {
completions += do_process_zerocopy_cookies(serr, ckbuf, ret);
return true;
}
if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
error(1, 0, "serr: wrong origin: %u", serr->ee_origin);
if (serr->ee_errno != 0)
@ -444,6 +513,13 @@ static void do_tx(int domain, int type, int protocol)
msg.msg_iovlen++;
}
if (domain == PF_RDS) {
msg.msg_name = &cfg_dst_addr;
msg.msg_namelen = (cfg_dst_addr.ss_family == AF_INET ?
sizeof(struct sockaddr_in) :
sizeof(struct sockaddr_in6));
}
iov[2].iov_base = payload;
iov[2].iov_len = cfg_payload_len;
msg.msg_iovlen++;
@ -454,7 +530,7 @@ static void do_tx(int domain, int type, int protocol)
if (cfg_cork)
do_sendmsg_corked(fd, &msg);
else
do_sendmsg(fd, &msg, cfg_zerocopy);
do_sendmsg(fd, &msg, cfg_zerocopy, domain);
while (!do_poll(fd, POLLOUT)) {
if (cfg_zerocopy)
@ -555,6 +631,40 @@ static void do_flush_datagram(int fd, int type)
bytes += cfg_payload_len;
}
static void do_recvmsg(int fd)
{
int ret, off = 0;
char *buf;
struct iovec iov;
struct msghdr msg;
struct sockaddr_storage din;
buf = calloc(cfg_payload_len, sizeof(char));
iov.iov_base = buf;
iov.iov_len = cfg_payload_len;
memset(&msg, 0, sizeof(msg));
msg.msg_name = &din;
msg.msg_namelen = sizeof(din);
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
ret = recvmsg(fd, &msg, MSG_TRUNC);
if (ret == -1)
error(1, errno, "recv");
if (ret != cfg_payload_len)
error(1, 0, "recv: ret=%u != %u", ret, cfg_payload_len);
if (memcmp(buf + off, payload, ret))
error(1, 0, "recv: data mismatch");
free(buf);
packets++;
bytes += cfg_payload_len;
}
static void do_rx(int domain, int type, int protocol)
{
uint64_t tstop;
@ -566,6 +676,8 @@ static void do_rx(int domain, int type, int protocol)
do {
if (type == SOCK_STREAM)
do_flush_tcp(fd);
else if (domain == PF_RDS)
do_recvmsg(fd);
else
do_flush_datagram(fd, type);
@ -610,6 +722,7 @@ static void parse_opts(int argc, char **argv)
40 /* max tcp options */;
int c;
char *daddr = NULL, *saddr = NULL;
char *cfg_test;
cfg_payload_len = max_payload_len;
@ -667,6 +780,14 @@ static void parse_opts(int argc, char **argv)
break;
}
}
cfg_test = argv[argc - 1];
if (strcmp(cfg_test, "rds") == 0) {
if (!daddr)
error(1, 0, "-D <server addr> required for PF_RDS\n");
if (!cfg_rx && !saddr)
error(1, 0, "-S <client addr> required for PF_RDS\n");
}
setup_sockaddr(cfg_family, daddr, &cfg_dst_addr);
setup_sockaddr(cfg_family, saddr, &cfg_src_addr);
@ -699,6 +820,8 @@ int main(int argc, char **argv)
do_test(cfg_family, SOCK_STREAM, 0);
else if (!strcmp(cfg_test, "udp"))
do_test(cfg_family, SOCK_DGRAM, 0);
else if (!strcmp(cfg_test, "rds"))
do_test(PF_RDS, SOCK_SEQPACKET, 0);
else
error(1, 0, "unknown cfg_test %s", cfg_test);