/* SPDX-License-Identifier: MIT */
/* based on linux-kernel/tools/testing/selftests/net/msg_zerocopy.c */
# include <assert.h>
# include <errno.h>
# include <error.h>
# include <fcntl.h>
# include <limits.h>
# include <stdbool.h>
# include <stdint.h>
# include <stdio.h>
# include <stdlib.h>
# include <string.h>
# include <unistd.h>
# include <arpa/inet.h>
# include <linux/errqueue.h>
# include <linux/if_packet.h>
# include <linux/io_uring.h>
# include <linux/ipv6.h>
# include <linux/socket.h>
# include <linux/sockios.h>
# include <net/ethernet.h>
# include <net/if.h>
# include <netinet/in.h>
# include <netinet/ip.h>
# include <netinet/ip6.h>
# include <netinet/tcp.h>
# include <netinet/udp.h>
# include <sys/ioctl.h>
# include <sys/mman.h>
# include <sys/resource.h>
# include <sys/socket.h>
# include <sys/stat.h>
# include <sys/time.h>
# include <sys/types.h>
# include <sys/un.h>
# include <sys/wait.h>
# define NOTIF_TAG 0xfffffffULL
# define NONZC_TAG 0
# define ZC_TAG 1
enum {
MODE_NONZC = 0 ,
MODE_ZC = 1 ,
MODE_ZC_FIXED = 2 ,
MODE_MIXED = 3 ,
} ;
/* Run-time configuration, filled in by parse_opts(). */
static bool cfg_cork = false;			/* -c: toggle UDP_CORK around each batch */
static int cfg_mode = MODE_ZC_FIXED;		/* -m: one of the MODE_* values */
static int cfg_nr_reqs = 8;			/* -n: requests queued per batch */
static int cfg_family = PF_UNSPEC;		/* -4 / -6 */
static int cfg_payload_len;			/* -s: bytes per send */
static int cfg_port = 8000;			/* -p: destination port */
static int cfg_runtime_ms = 4200;		/* -t: seconds * 1000 plus 200 */
static socklen_t cfg_alen;			/* size of the address stored in cfg_dst_addr */
static struct sockaddr_storage cfg_dst_addr;	/* destination passed to connect() */

/* Send buffer; aligned to 4096 bytes and also registered with the ring in do_tx(). */
static char payload[IP_MAXPACKET] __attribute__((aligned(4096)));
/*
 * Set of pointers describing a submission ring.
 * NOTE(review): nothing in the visible code uses this type; struct
 * io_uring_sq is what the helpers below operate on.
 */
struct io_sq_ring {
	unsigned int *head;
	unsigned int *tail;
	unsigned int *ring_mask;
	unsigned int *ring_entries;
	unsigned int *flags;
	unsigned int *array;
};
/*
 * Set of pointers describing a completion ring.
 * NOTE(review): nothing in the visible code uses this type; struct
 * io_uring_cq is what the helpers below operate on.
 */
struct io_cq_ring {
	unsigned int *head;
	unsigned int *tail;
	unsigned int *ring_mask;
	unsigned int *ring_entries;
	struct io_uring_cqe *cqes;
};
/*
 * Userspace view of the submission queue.
 *
 * The k* pointers and @array are set by io_uring_mmap() to addresses inside
 * the SQ ring mapping; @sqes points at the separately mapped SQE array.
 */
struct io_uring_sq {
	unsigned int *khead;
	unsigned int *ktail;
	unsigned int *kring_mask;
	unsigned int *kring_entries;
	unsigned int *kflags;
	unsigned int *kdropped;
	unsigned int *array;
	struct io_uring_sqe *sqes;

	/* local cursors: SQEs handed out by io_uring_get_sqe() but not yet published */
	unsigned int sqe_head;
	unsigned int sqe_tail;

	/* byte length of the SQ ring mapping */
	size_t ring_sz;
};
/*
 * Userspace view of the completion queue.
 *
 * All pointers are set by io_uring_mmap() to addresses inside the CQ ring
 * mapping.
 */
struct io_uring_cq {
	unsigned int *khead;
	unsigned int *ktail;
	unsigned int *kring_mask;
	unsigned int *kring_entries;
	unsigned int *koverflow;
	struct io_uring_cqe *cqes;

	/* byte length of the CQ ring mapping */
	size_t ring_sz;
};
/* One io_uring instance: both queues plus the fd returned by io_uring_setup(). */
struct io_uring {
	struct io_uring_sq sq;
	struct io_uring_cq cq;
	int ring_fd;
};
# ifdef __alpha__
# ifndef __NR_io_uring_setup
# define __NR_io_uring_setup 535
# endif
# ifndef __NR_io_uring_enter
# define __NR_io_uring_enter 536
# endif
# ifndef __NR_io_uring_register
# define __NR_io_uring_register 537
# endif
# else /* !__alpha__ */
# ifndef __NR_io_uring_setup
# define __NR_io_uring_setup 425
# endif
# ifndef __NR_io_uring_enter
# define __NR_io_uring_enter 426
# endif
# ifndef __NR_io_uring_register
# define __NR_io_uring_register 427
# endif
# endif
# if defined(__x86_64) || defined(__i386__)
# define read_barrier() __asm__ __volatile__("":::"memory")
# define write_barrier() __asm__ __volatile__("":::"memory")
# else
# define read_barrier() __sync_synchronize()
# define write_barrier() __sync_synchronize()
# endif
/*
 * Raw wrapper for the io_uring_setup syscall.
 *
 * Returns whatever syscall() returns, narrowed to int; the caller is
 * responsible for inspecting errno on a negative result.
 */
static int io_uring_setup(unsigned int entries, struct io_uring_params *p)
{
	int ring_fd;

	ring_fd = syscall(__NR_io_uring_setup, entries, p);
	return ring_fd;
}
/*
 * Raw wrapper for the io_uring_enter syscall.
 *
 * The final syscall argument is the signal-set size, passed as _NSIG / 8.
 * Returns whatever syscall() returns, narrowed to int; errno is left for
 * the caller to inspect.
 */
static int io_uring_enter(int fd, unsigned int to_submit,
			  unsigned int min_complete,
			  unsigned int flags, sigset_t *sig)
{
	int rc;

	rc = syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
		     flags, sig, _NSIG / 8);
	return rc;
}
static int io_uring_register_buffers ( struct io_uring * ring ,
const struct iovec * iovecs ,
unsigned nr_iovecs )
{
int ret ;
ret = syscall ( __NR_io_uring_register , ring - > ring_fd ,
IORING_REGISTER_BUFFERS , iovecs , nr_iovecs ) ;
return ( ret < 0 ) ? - errno : ret ;
}
static int io_uring_mmap ( int fd , struct io_uring_params * p ,
struct io_uring_sq * sq , struct io_uring_cq * cq )
{
size_t size ;
void * ptr ;
int ret ;
sq - > ring_sz = p - > sq_off . array + p - > sq_entries * sizeof ( unsigned ) ;
ptr = mmap ( 0 , sq - > ring_sz , PROT_READ | PROT_WRITE ,
MAP_SHARED | MAP_POPULATE , fd , IORING_OFF_SQ_RING ) ;
if ( ptr = = MAP_FAILED )
return - errno ;
sq - > khead = ptr + p - > sq_off . head ;
sq - > ktail = ptr + p - > sq_off . tail ;
sq - > kring_mask = ptr + p - > sq_off . ring_mask ;
sq - > kring_entries = ptr + p - > sq_off . ring_entries ;
sq - > kflags = ptr + p - > sq_off . flags ;
sq - > kdropped = ptr + p - > sq_off . dropped ;
sq - > array = ptr + p - > sq_off . array ;
size = p - > sq_entries * sizeof ( struct io_uring_sqe ) ;
sq - > sqes = mmap ( 0 , size , PROT_READ | PROT_WRITE ,
MAP_SHARED | MAP_POPULATE , fd , IORING_OFF_SQES ) ;
if ( sq - > sqes = = MAP_FAILED ) {
ret = - errno ;
err :
munmap ( sq - > khead , sq - > ring_sz ) ;
return ret ;
}
cq - > ring_sz = p - > cq_off . cqes + p - > cq_entries * sizeof ( struct io_uring_cqe ) ;
ptr = mmap ( 0 , cq - > ring_sz , PROT_READ | PROT_WRITE ,
MAP_SHARED | MAP_POPULATE , fd , IORING_OFF_CQ_RING ) ;
if ( ptr = = MAP_FAILED ) {
ret = - errno ;
munmap ( sq - > sqes , p - > sq_entries * sizeof ( struct io_uring_sqe ) ) ;
goto err ;
}
cq - > khead = ptr + p - > cq_off . head ;
cq - > ktail = ptr + p - > cq_off . tail ;
cq - > kring_mask = ptr + p - > cq_off . ring_mask ;
cq - > kring_entries = ptr + p - > cq_off . ring_entries ;
cq - > koverflow = ptr + p - > cq_off . overflow ;
cq - > cqes = ptr + p - > cq_off . cqes ;
return 0 ;
}
static int io_uring_queue_init ( unsigned entries , struct io_uring * ring ,
unsigned flags )
{
struct io_uring_params p ;
int fd , ret ;
memset ( ring , 0 , sizeof ( * ring ) ) ;
memset ( & p , 0 , sizeof ( p ) ) ;
p . flags = flags ;
fd = io_uring_setup ( entries , & p ) ;
if ( fd < 0 )
return fd ;
ret = io_uring_mmap ( fd , & p , & ring - > sq , & ring - > cq ) ;
if ( ! ret )
ring - > ring_fd = fd ;
else
close ( fd ) ;
return ret ;
}
static int io_uring_submit ( struct io_uring * ring )
{
struct io_uring_sq * sq = & ring - > sq ;
const unsigned mask = * sq - > kring_mask ;
unsigned ktail , submitted , to_submit ;
int ret ;
read_barrier ( ) ;
if ( * sq - > khead ! = * sq - > ktail ) {
submitted = * sq - > kring_entries ;
goto submit ;
}
if ( sq - > sqe_head = = sq - > sqe_tail )
return 0 ;
ktail = * sq - > ktail ;
to_submit = sq - > sqe_tail - sq - > sqe_head ;
for ( submitted = 0 ; submitted < to_submit ; submitted + + ) {
read_barrier ( ) ;
sq - > array [ ktail + + & mask ] = sq - > sqe_head + + & mask ;
}
if ( ! submitted )
return 0 ;
if ( * sq - > ktail ! = ktail ) {
write_barrier ( ) ;
* sq - > ktail = ktail ;
write_barrier ( ) ;
}
submit :
ret = io_uring_enter ( ring - > ring_fd , submitted , 0 ,
IORING_ENTER_GETEVENTS , NULL ) ;
return ret < 0 ? - errno : ret ;
}
/*
 * Initialise @sqe as an IORING_OP_SEND request that sends @len bytes from
 * @buf on @sockfd with the given send @flags. Every other SQE field is
 * cleared to zero.
 */
static inline void io_uring_prep_send(struct io_uring_sqe *sqe, int sockfd,
				      const void *buf, size_t len, int flags)
{
	/* start from an all-zero entry so unused fields are well defined */
	memset(sqe, 0, sizeof *sqe);

	sqe->fd = sockfd;
	sqe->opcode = (__u8)IORING_OP_SEND;
	sqe->len = len;
	sqe->addr = (unsigned long)buf;
	sqe->msg_flags = (__u32)flags;
}
static inline void io_uring_prep_sendzc ( struct io_uring_sqe * sqe , int sockfd ,
const void * buf , size_t len , int flags ,
2022-09-01 13:54:05 +03:00
unsigned zc_flags )
2022-07-12 23:52:51 +03:00
{
io_uring_prep_send ( sqe , sockfd , buf , len , flags ) ;
2022-09-01 13:54:05 +03:00
sqe - > opcode = ( __u8 ) IORING_OP_SEND_ZC ;
2022-07-12 23:52:51 +03:00
sqe - > ioprio = zc_flags ;
}
static struct io_uring_sqe * io_uring_get_sqe ( struct io_uring * ring )
{
struct io_uring_sq * sq = & ring - > sq ;
if ( sq - > sqe_tail + 1 - sq - > sqe_head > * sq - > kring_entries )
return NULL ;
return & sq - > sqes [ sq - > sqe_tail + + & * sq - > kring_mask ] ;
}
static int io_uring_wait_cqe ( struct io_uring * ring , struct io_uring_cqe * * cqe_ptr )
{
struct io_uring_cq * cq = & ring - > cq ;
const unsigned mask = * cq - > kring_mask ;
unsigned head = * cq - > khead ;
int ret ;
* cqe_ptr = NULL ;
do {
read_barrier ( ) ;
if ( head ! = * cq - > ktail ) {
* cqe_ptr = & cq - > cqes [ head & mask ] ;
break ;
}
ret = io_uring_enter ( ring - > ring_fd , 0 , 1 ,
IORING_ENTER_GETEVENTS , NULL ) ;
if ( ret < 0 )
return - errno ;
} while ( 1 ) ;
return 0 ;
}
static inline void io_uring_cqe_seen ( struct io_uring * ring )
{
* ( & ring - > cq ) - > khead + = 1 ;
write_barrier ( ) ;
}
/* Return the current gettimeofday() time converted to milliseconds. */
static unsigned long gettimeofday_ms(void)
{
	struct timeval now;

	gettimeofday(&now, NULL);
	return (now.tv_usec / 1000) + (now.tv_sec * 1000);
}
/*
 * Set an integer socket option on @fd.
 * Exits the program via error() if setsockopt() fails.
 */
static void do_setsockopt(int fd, int level, int optname, int val)
{
	int rc = setsockopt(fd, level, optname, &val, sizeof(val));

	if (rc != 0)
		error(1, errno, " setsockopt %d.%d: %d ", level, optname, val);
}
/*
 * Create the sending socket, set SO_SNDBUF to 1 << 21 and connect it to
 * cfg_dst_addr.
 *
 * Returns the connected socket fd; any failure exits via error().
 */
static int do_setup_tx(int domain, int type, int protocol)
{
	int sock = socket(domain, type, protocol);

	if (sock == -1)
		error(1, errno, " socket t ");

	do_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, 1 << 21);

	if (connect(sock, (void *)&cfg_dst_addr, cfg_alen) != 0)
		error(1, errno, " connect ");

	return sock;
}
static void do_tx ( int domain , int type , int protocol )
{
struct io_uring_sqe * sqe ;
struct io_uring_cqe * cqe ;
unsigned long packets = 0 , bytes = 0 ;
struct io_uring ring ;
struct iovec iov ;
uint64_t tstop ;
int i , fd , ret ;
int compl_cqes = 0 ;
fd = do_setup_tx ( domain , type , protocol ) ;
ret = io_uring_queue_init ( 512 , & ring , 0 ) ;
if ( ret )
error ( 1 , ret , " io_uring: queue init " ) ;
iov . iov_base = payload ;
iov . iov_len = cfg_payload_len ;
ret = io_uring_register_buffers ( & ring , & iov , 1 ) ;
if ( ret )
error ( 1 , ret , " io_uring: buffer registration " ) ;
tstop = gettimeofday_ms ( ) + cfg_runtime_ms ;
do {
if ( cfg_cork )
do_setsockopt ( fd , IPPROTO_UDP , UDP_CORK , 1 ) ;
for ( i = 0 ; i < cfg_nr_reqs ; i + + ) {
unsigned zc_flags = 0 ;
unsigned buf_idx = 0 ;
unsigned mode = cfg_mode ;
2022-09-01 13:54:05 +03:00
unsigned msg_flags = MSG_WAITALL ;
2022-07-12 23:52:51 +03:00
if ( cfg_mode = = MODE_MIXED )
mode = rand ( ) % 3 ;
sqe = io_uring_get_sqe ( & ring ) ;
if ( mode = = MODE_NONZC ) {
io_uring_prep_send ( sqe , fd , payload ,
cfg_payload_len , msg_flags ) ;
sqe - > user_data = NONZC_TAG ;
} else {
io_uring_prep_sendzc ( sqe , fd , payload ,
cfg_payload_len ,
2022-09-01 13:54:05 +03:00
msg_flags , zc_flags ) ;
2022-07-12 23:52:51 +03:00
if ( mode = = MODE_ZC_FIXED ) {
sqe - > ioprio | = IORING_RECVSEND_FIXED_BUF ;
sqe - > buf_index = buf_idx ;
}
sqe - > user_data = ZC_TAG ;
}
}
ret = io_uring_submit ( & ring ) ;
if ( ret ! = cfg_nr_reqs )
error ( 1 , ret , " submit " ) ;
2022-09-01 13:54:05 +03:00
if ( cfg_cork )
do_setsockopt ( fd , IPPROTO_UDP , UDP_CORK , 0 ) ;
2022-07-12 23:52:51 +03:00
for ( i = 0 ; i < cfg_nr_reqs ; i + + ) {
ret = io_uring_wait_cqe ( & ring , & cqe ) ;
if ( ret )
error ( 1 , ret , " wait cqe " ) ;
2022-09-01 13:54:05 +03:00
if ( cqe - > user_data ! = NONZC_TAG & &
cqe - > user_data ! = ZC_TAG )
error ( 1 , - EINVAL , " invalid cqe->user_data " ) ;
if ( cqe - > flags & IORING_CQE_F_NOTIF ) {
if ( cqe - > flags & IORING_CQE_F_MORE )
error ( 1 , - EINVAL , " invalid notif flags " ) ;
2022-09-23 20:12:09 +03:00
if ( compl_cqes < = 0 )
error ( 1 , - EINVAL , " notification mismatch " ) ;
2022-07-12 23:52:51 +03:00
compl_cqes - - ;
i - - ;
2022-09-23 20:12:09 +03:00
io_uring_cqe_seen ( & ring ) ;
continue ;
}
if ( cqe - > flags & IORING_CQE_F_MORE ) {
if ( cqe - > user_data ! = ZC_TAG )
error ( 1 , cqe - > res , " unexpected F_MORE " ) ;
compl_cqes + + ;
}
if ( cqe - > res > = 0 ) {
2022-09-01 13:54:05 +03:00
packets + + ;
bytes + = cqe - > res ;
2022-09-23 20:12:09 +03:00
} else if ( cqe - > res ! = - EAGAIN ) {
error ( 1 , cqe - > res , " send failed " ) ;
2022-07-12 23:52:51 +03:00
}
io_uring_cqe_seen ( & ring ) ;
}
} while ( gettimeofday_ms ( ) < tstop ) ;
while ( compl_cqes ) {
ret = io_uring_wait_cqe ( & ring , & cqe ) ;
if ( ret )
error ( 1 , ret , " wait cqe " ) ;
2022-09-01 13:54:05 +03:00
if ( cqe - > flags & IORING_CQE_F_MORE )
error ( 1 , - EINVAL , " invalid notif flags " ) ;
if ( ! ( cqe - > flags & IORING_CQE_F_NOTIF ) )
error ( 1 , - EINVAL , " missing notif flag " ) ;
2022-07-12 23:52:51 +03:00
io_uring_cqe_seen ( & ring ) ;
compl_cqes - - ;
}
2022-09-01 13:54:05 +03:00
fprintf ( stderr , " tx=%lu (MB=%lu), tx/s=%lu (MB/s=%lu) \n " ,
packets , bytes > > 20 ,
packets / ( cfg_runtime_ms / 1000 ) ,
( bytes > > 20 ) / ( cfg_runtime_ms / 1000 ) ) ;
if ( close ( fd ) )
error ( 1 , errno , " close " ) ;
2022-07-12 23:52:51 +03:00
}
/*
 * Fill the whole payload buffer with the repeating pattern 'a'..'z' and
 * run the transmit loop for the given socket parameters.
 */
static void do_test(int domain, int type, int protocol)
{
	int i;

	/* IP_MAXPACKET is the declared size of payload[] */
	for (i = 0; i < IP_MAXPACKET; i++)
		payload[i] = 'a' + (i % 26);

	do_tx(domain, type, protocol);
}
/*
 * Print the command-line synopsis and exit with status 1.
 * @filepath is the program name to show (callers pass argv[0]).
 */
static void usage(const char *filepath)
{
	error(1, 0, " Usage: %s (-4|-6) (udp|tcp) -D<dst_ip> [-s<payload size>] "
		    " [-t<time s>] [-n<batch>] [-p<port>] [-m<mode>] [-c<cork>] ", filepath);
}
static void parse_opts ( int argc , char * * argv )
{
const int max_payload_len = sizeof ( payload ) -
sizeof ( struct ipv6hdr ) -
sizeof ( struct tcphdr ) -
40 /* max tcp options */ ;
struct sockaddr_in6 * addr6 = ( void * ) & cfg_dst_addr ;
struct sockaddr_in * addr4 = ( void * ) & cfg_dst_addr ;
char * daddr = NULL ;
int c ;
if ( argc < = 1 )
usage ( argv [ 0 ] ) ;
cfg_payload_len = max_payload_len ;
2022-09-01 13:54:05 +03:00
while ( ( c = getopt ( argc , argv , " 46D:p:s:t:n:c:m: " ) ) ! = - 1 ) {
2022-07-12 23:52:51 +03:00
switch ( c ) {
case ' 4 ' :
if ( cfg_family ! = PF_UNSPEC )
error ( 1 , 0 , " Pass one of -4 or -6 " ) ;
cfg_family = PF_INET ;
cfg_alen = sizeof ( struct sockaddr_in ) ;
break ;
case ' 6 ' :
if ( cfg_family ! = PF_UNSPEC )
error ( 1 , 0 , " Pass one of -4 or -6 " ) ;
cfg_family = PF_INET6 ;
cfg_alen = sizeof ( struct sockaddr_in6 ) ;
break ;
case ' D ' :
daddr = optarg ;
break ;
case ' p ' :
cfg_port = strtoul ( optarg , NULL , 0 ) ;
break ;
case ' s ' :
cfg_payload_len = strtoul ( optarg , NULL , 0 ) ;
break ;
case ' t ' :
cfg_runtime_ms = 200 + strtoul ( optarg , NULL , 10 ) * 1000 ;
break ;
case ' n ' :
cfg_nr_reqs = strtoul ( optarg , NULL , 0 ) ;
break ;
case ' c ' :
cfg_cork = strtol ( optarg , NULL , 0 ) ;
break ;
case ' m ' :
cfg_mode = strtol ( optarg , NULL , 0 ) ;
break ;
}
}
switch ( cfg_family ) {
case PF_INET :
memset ( addr4 , 0 , sizeof ( * addr4 ) ) ;
addr4 - > sin_family = AF_INET ;
addr4 - > sin_port = htons ( cfg_port ) ;
if ( daddr & &
inet_pton ( AF_INET , daddr , & ( addr4 - > sin_addr ) ) ! = 1 )
error ( 1 , 0 , " ipv4 parse error: %s " , daddr ) ;
break ;
case PF_INET6 :
memset ( addr6 , 0 , sizeof ( * addr6 ) ) ;
addr6 - > sin6_family = AF_INET6 ;
addr6 - > sin6_port = htons ( cfg_port ) ;
if ( daddr & &
inet_pton ( AF_INET6 , daddr , & ( addr6 - > sin6_addr ) ) ! = 1 )
error ( 1 , 0 , " ipv6 parse error: %s " , daddr ) ;
break ;
default :
error ( 1 , 0 , " illegal domain " ) ;
}
if ( cfg_payload_len > max_payload_len )
error ( 1 , 0 , " -s: payload exceeds max (%d) " , max_payload_len ) ;
if ( optind ! = argc - 1 )
usage ( argv [ 0 ] ) ;
}
int main ( int argc , char * * argv )
{
const char * cfg_test = argv [ argc - 1 ] ;
parse_opts ( argc , argv ) ;
if ( ! strcmp ( cfg_test , " tcp " ) )
do_test ( cfg_family , SOCK_STREAM , 0 ) ;
else if ( ! strcmp ( cfg_test , " udp " ) )
do_test ( cfg_family , SOCK_DGRAM , 0 ) ;
else
error ( 1 , 0 , " unknown cfg_test %s " , cfg_test ) ;
return 0 ;
}