38e7571c07
-----BEGIN PGP SIGNATURE----- iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAlyAJvAQHGF4Ym9lQGtl cm5lbC5kawAKCRD301j7KXHgphb+EACFaKI2HIdjExQ5T7Cxebzwky+Qiro3FV55 ziW00FZrkJ5g0h4ItBzh/5SDlcNQYZDMlA3s4xzWIMadWl5PjMPq1uJul0cITbSl WIJO5hpgNMXeUEhvcXUl6+f/WzpgYUxN40uW8N5V7EKlooaFVfudDqJGlvEv+UgB g8NWQYThSG+/e7r9OGwK0xDRVKfpjxVvmqmnDH3DrxKaDgSOwTf4xn1u41wKwfQ3 3uPfQ+GBeTqt4a2AhOi7K6KQFNnj5Jz5CXYMiOZI2JGtLPcL6dmyBVD7K0a0HUr+ rs4ghNdd1+puvPGNK4TX8qV0uiNrMctoRNVA/JDd1ZTYEKTmNLxeFf+olfYHlwuK K5FRs60/lgNzNkzcUpFvJHitPwYtxYJdB36PyswE1FZP1YviEeVoKNt9W8aIhEoA 549uj90brfA74eCINGhq98pJqj9CNyCPw3bfi76f5Ej2utwYDb9S5Cp2gfSa853X qc/qNda9efEq7ikwCbPzhekRMXZo6TSXtaSmC2C+Vs5+mD1Scc4kdAvdCKGQrtr9 aoy0iQMYO2NDZ/G5fppvXtMVuEPAZWbsGftyOe15IlMysjRze2ycJV8cFahKEVM9 uBeXLyH1pqGU/j7ABP4+XRZ/sbHJTwjKJbnXhTgBsdU8XO/CR3U+kRQFTsidKMfH Wlo3uH2h2A== =p78E -----END PGP SIGNATURE----- Merge tag 'io_uring-2019-03-06' of git://git.kernel.dk/linux-block Pull io_uring IO interface from Jens Axboe: "Second attempt at adding the io_uring interface. Since the first one, we've added basic unit testing of the three system calls, that resides in liburing like the other unit tests that we have so far. It'll take a while to get full coverage of it, but we're working towards it. I've also added two basic test programs to tools/io_uring. One uses the raw interface and has support for all the various features that io_uring supports outside of standard IO, like fixed files, fixed IO buffers, and polled IO. The other uses the liburing API, and is a simplified version of cp(1). This adds support for a new IO interface, io_uring. io_uring allows an application to communicate with the kernel through two rings, the submission queue (SQ) and completion queue (CQ) ring. This allows for very efficient handling of IOs, see the v5 posting for some basic numbers: https://lore.kernel.org/linux-block/20190116175003.17880-1-axboe@kernel.dk/ Outside of just efficiency, the interface is also flexible and extendable, and allows for future use cases like the upcoming NVMe key-value store API, networked IO, and so on. It also supports async buffered IO, something that we've always failed to support in the kernel. Outside of basic IO features, it supports async polled IO as well. This particular feature has already been tested at Facebook months ago for flash storage boxes, with 25-33% improvements. It makes polled IO actually useful for real world use cases, where even basic flash sees a nice win in terms of efficiency, latency, and performance. These boxes were IOPS bound before, now they are not. This series adds three new system calls. One for setting up an io_uring instance (io_uring_setup(2)), one for submitting/completing IO (io_uring_enter(2)), and one for aux functions like registrating file sets, buffers, etc (io_uring_register(2)). Through the help of Arnd, I've coordinated the syscall numbers so merge on that front should be painless. Jon did a writeup of the interface a while back, which (except for minor details that have been tweaked) is still accurate. Find that here: https://lwn.net/Articles/776703/ Huge thanks to Al Viro for helping getting the reference cycle code correct, and to Jann Horn for his extensive reviews focused on both security and bugs in general. There's a userspace library that provides basic functionality for applications that don't need or want to care about how to fiddle with the rings directly. It has helpers to allow applications to easily set up an io_uring instance, and submit/complete IO through it without knowing about the intricacies of the rings. It also includes man pages (thanks to Jeff Moyer), and will continue to grow support helper functions and features as time progresses. Find it here: git://git.kernel.dk/liburing Fio has full support for the raw interface, both in the form of an IO engine (io_uring), but also with a small test application (t/io_uring) that can exercise and benchmark the interface" * tag 'io_uring-2019-03-06' of git://git.kernel.dk/linux-block: io_uring: add a few test tools io_uring: allow workqueue item to handle multiple buffered requests io_uring: add support for IORING_OP_POLL io_uring: add io_kiocb ref count io_uring: add submission polling io_uring: add file set registration net: split out functions related to registering inflight socket files io_uring: add support for pre-mapped user IO buffers block: implement bio helper to add iter bvec pages to bio io_uring: batch io_kiocb allocation io_uring: use fget/fput_many() for file references fs: add fget_many() and fput_many() io_uring: support for IO polling io_uring: add fsync support Add io_uring IO interface
452 lines
9.2 KiB
C
452 lines
9.2 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
#include <linux/linkage.h>
|
|
#include <linux/errno.h>
|
|
|
|
#include <asm/unistd.h>
|
|
|
|
#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
|
|
/* Architectures may override COND_SYSCALL and COND_SYSCALL_COMPAT */
|
|
#include <asm/syscall_wrapper.h>
|
|
#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */
|
|
|
|
/* we can't #include <linux/syscalls.h> here,
|
|
but tell gcc to not warn with -Wmissing-prototypes */
|
|
asmlinkage long sys_ni_syscall(void);
|
|
|
|
/*
|
|
* Non-implemented system calls get redirected here.
|
|
*/
|
|
asmlinkage long sys_ni_syscall(void)
|
|
{
|
|
return -ENOSYS;
|
|
}
|
|
|
|
#ifndef COND_SYSCALL
|
|
#define COND_SYSCALL(name) cond_syscall(sys_##name)
|
|
#endif /* COND_SYSCALL */
|
|
|
|
#ifndef COND_SYSCALL_COMPAT
|
|
#define COND_SYSCALL_COMPAT(name) cond_syscall(compat_sys_##name)
|
|
#endif /* COND_SYSCALL_COMPAT */
|
|
|
|
/*
|
|
* This list is kept in the same order as include/uapi/asm-generic/unistd.h.
|
|
* Architecture specific entries go below, followed by deprecated or obsolete
|
|
* system calls.
|
|
*/
|
|
|
|
COND_SYSCALL(io_setup);
|
|
COND_SYSCALL_COMPAT(io_setup);
|
|
COND_SYSCALL(io_destroy);
|
|
COND_SYSCALL(io_submit);
|
|
COND_SYSCALL_COMPAT(io_submit);
|
|
COND_SYSCALL(io_cancel);
|
|
COND_SYSCALL(io_getevents_time32);
|
|
COND_SYSCALL(io_getevents);
|
|
COND_SYSCALL(io_pgetevents_time32);
|
|
COND_SYSCALL(io_pgetevents);
|
|
COND_SYSCALL_COMPAT(io_pgetevents_time32);
|
|
COND_SYSCALL_COMPAT(io_pgetevents);
|
|
COND_SYSCALL(io_uring_setup);
|
|
COND_SYSCALL(io_uring_enter);
|
|
COND_SYSCALL(io_uring_register);
|
|
|
|
/* fs/xattr.c */
|
|
|
|
/* fs/dcache.c */
|
|
|
|
/* fs/cookies.c */
|
|
COND_SYSCALL(lookup_dcookie);
|
|
COND_SYSCALL_COMPAT(lookup_dcookie);
|
|
|
|
/* fs/eventfd.c */
|
|
COND_SYSCALL(eventfd2);
|
|
|
|
/* fs/eventfd.c */
|
|
COND_SYSCALL(epoll_create1);
|
|
COND_SYSCALL(epoll_ctl);
|
|
COND_SYSCALL(epoll_pwait);
|
|
COND_SYSCALL_COMPAT(epoll_pwait);
|
|
|
|
/* fs/fcntl.c */
|
|
|
|
/* fs/inotify_user.c */
|
|
COND_SYSCALL(inotify_init1);
|
|
COND_SYSCALL(inotify_add_watch);
|
|
COND_SYSCALL(inotify_rm_watch);
|
|
|
|
/* fs/ioctl.c */
|
|
|
|
/* fs/ioprio.c */
|
|
COND_SYSCALL(ioprio_set);
|
|
COND_SYSCALL(ioprio_get);
|
|
|
|
/* fs/locks.c */
|
|
COND_SYSCALL(flock);
|
|
|
|
/* fs/namei.c */
|
|
|
|
/* fs/namespace.c */
|
|
|
|
/* fs/nfsctl.c */
|
|
|
|
/* fs/open.c */
|
|
|
|
/* fs/pipe.c */
|
|
|
|
/* fs/quota.c */
|
|
COND_SYSCALL(quotactl);
|
|
|
|
/* fs/readdir.c */
|
|
|
|
/* fs/read_write.c */
|
|
|
|
/* fs/sendfile.c */
|
|
|
|
/* fs/select.c */
|
|
|
|
/* fs/signalfd.c */
|
|
COND_SYSCALL(signalfd4);
|
|
COND_SYSCALL_COMPAT(signalfd4);
|
|
|
|
/* fs/splice.c */
|
|
|
|
/* fs/stat.c */
|
|
|
|
/* fs/sync.c */
|
|
|
|
/* fs/timerfd.c */
|
|
COND_SYSCALL(timerfd_create);
|
|
COND_SYSCALL(timerfd_settime);
|
|
COND_SYSCALL(timerfd_settime32);
|
|
COND_SYSCALL(timerfd_gettime);
|
|
COND_SYSCALL(timerfd_gettime32);
|
|
|
|
/* fs/utimes.c */
|
|
|
|
/* kernel/acct.c */
|
|
COND_SYSCALL(acct);
|
|
|
|
/* kernel/capability.c */
|
|
COND_SYSCALL(capget);
|
|
COND_SYSCALL(capset);
|
|
|
|
/* kernel/exec_domain.c */
|
|
|
|
/* kernel/exit.c */
|
|
|
|
/* kernel/fork.c */
|
|
|
|
/* kernel/futex.c */
|
|
COND_SYSCALL(futex);
|
|
COND_SYSCALL(futex_time32);
|
|
COND_SYSCALL(set_robust_list);
|
|
COND_SYSCALL_COMPAT(set_robust_list);
|
|
COND_SYSCALL(get_robust_list);
|
|
COND_SYSCALL_COMPAT(get_robust_list);
|
|
|
|
/* kernel/hrtimer.c */
|
|
|
|
/* kernel/itimer.c */
|
|
|
|
/* kernel/kexec.c */
|
|
COND_SYSCALL(kexec_load);
|
|
COND_SYSCALL_COMPAT(kexec_load);
|
|
|
|
/* kernel/module.c */
|
|
COND_SYSCALL(init_module);
|
|
COND_SYSCALL(delete_module);
|
|
|
|
/* kernel/posix-timers.c */
|
|
|
|
/* kernel/printk.c */
|
|
COND_SYSCALL(syslog);
|
|
|
|
/* kernel/ptrace.c */
|
|
|
|
/* kernel/sched/core.c */
|
|
|
|
/* kernel/signal.c */
|
|
|
|
/* kernel/sys.c */
|
|
COND_SYSCALL(setregid);
|
|
COND_SYSCALL(setgid);
|
|
COND_SYSCALL(setreuid);
|
|
COND_SYSCALL(setuid);
|
|
COND_SYSCALL(setresuid);
|
|
COND_SYSCALL(getresuid);
|
|
COND_SYSCALL(setresgid);
|
|
COND_SYSCALL(getresgid);
|
|
COND_SYSCALL(setfsuid);
|
|
COND_SYSCALL(setfsgid);
|
|
COND_SYSCALL(setgroups);
|
|
COND_SYSCALL(getgroups);
|
|
|
|
/* kernel/time.c */
|
|
|
|
/* kernel/timer.c */
|
|
|
|
/* ipc/mqueue.c */
|
|
COND_SYSCALL(mq_open);
|
|
COND_SYSCALL_COMPAT(mq_open);
|
|
COND_SYSCALL(mq_unlink);
|
|
COND_SYSCALL(mq_timedsend);
|
|
COND_SYSCALL(mq_timedsend_time32);
|
|
COND_SYSCALL(mq_timedreceive);
|
|
COND_SYSCALL(mq_timedreceive_time32);
|
|
COND_SYSCALL(mq_notify);
|
|
COND_SYSCALL_COMPAT(mq_notify);
|
|
COND_SYSCALL(mq_getsetattr);
|
|
COND_SYSCALL_COMPAT(mq_getsetattr);
|
|
|
|
/* ipc/msg.c */
|
|
COND_SYSCALL(msgget);
|
|
COND_SYSCALL(old_msgctl);
|
|
COND_SYSCALL(msgctl);
|
|
COND_SYSCALL_COMPAT(msgctl);
|
|
COND_SYSCALL_COMPAT(old_msgctl);
|
|
COND_SYSCALL(msgrcv);
|
|
COND_SYSCALL_COMPAT(msgrcv);
|
|
COND_SYSCALL(msgsnd);
|
|
COND_SYSCALL_COMPAT(msgsnd);
|
|
|
|
/* ipc/sem.c */
|
|
COND_SYSCALL(semget);
|
|
COND_SYSCALL(old_semctl);
|
|
COND_SYSCALL(semctl);
|
|
COND_SYSCALL_COMPAT(semctl);
|
|
COND_SYSCALL_COMPAT(old_semctl);
|
|
COND_SYSCALL(semtimedop);
|
|
COND_SYSCALL(semtimedop_time32);
|
|
COND_SYSCALL(semop);
|
|
|
|
/* ipc/shm.c */
|
|
COND_SYSCALL(shmget);
|
|
COND_SYSCALL(old_shmctl);
|
|
COND_SYSCALL(shmctl);
|
|
COND_SYSCALL_COMPAT(shmctl);
|
|
COND_SYSCALL_COMPAT(old_shmctl);
|
|
COND_SYSCALL(shmat);
|
|
COND_SYSCALL_COMPAT(shmat);
|
|
COND_SYSCALL(shmdt);
|
|
|
|
/* net/socket.c */
|
|
COND_SYSCALL(socket);
|
|
COND_SYSCALL(socketpair);
|
|
COND_SYSCALL(bind);
|
|
COND_SYSCALL(listen);
|
|
COND_SYSCALL(accept);
|
|
COND_SYSCALL(connect);
|
|
COND_SYSCALL(getsockname);
|
|
COND_SYSCALL(getpeername);
|
|
COND_SYSCALL(setsockopt);
|
|
COND_SYSCALL_COMPAT(setsockopt);
|
|
COND_SYSCALL(getsockopt);
|
|
COND_SYSCALL_COMPAT(getsockopt);
|
|
COND_SYSCALL(sendto);
|
|
COND_SYSCALL(shutdown);
|
|
COND_SYSCALL(recvfrom);
|
|
COND_SYSCALL_COMPAT(recvfrom);
|
|
COND_SYSCALL(sendmsg);
|
|
COND_SYSCALL_COMPAT(sendmsg);
|
|
COND_SYSCALL(recvmsg);
|
|
COND_SYSCALL_COMPAT(recvmsg);
|
|
|
|
/* mm/filemap.c */
|
|
|
|
/* mm/nommu.c, also with MMU */
|
|
COND_SYSCALL(mremap);
|
|
|
|
/* security/keys/keyctl.c */
|
|
COND_SYSCALL(add_key);
|
|
COND_SYSCALL(request_key);
|
|
COND_SYSCALL(keyctl);
|
|
COND_SYSCALL_COMPAT(keyctl);
|
|
|
|
/* arch/example/kernel/sys_example.c */
|
|
|
|
/* mm/fadvise.c */
|
|
COND_SYSCALL(fadvise64_64);
|
|
|
|
/* mm/, CONFIG_MMU only */
|
|
COND_SYSCALL(swapon);
|
|
COND_SYSCALL(swapoff);
|
|
COND_SYSCALL(mprotect);
|
|
COND_SYSCALL(msync);
|
|
COND_SYSCALL(mlock);
|
|
COND_SYSCALL(munlock);
|
|
COND_SYSCALL(mlockall);
|
|
COND_SYSCALL(munlockall);
|
|
COND_SYSCALL(mincore);
|
|
COND_SYSCALL(madvise);
|
|
COND_SYSCALL(remap_file_pages);
|
|
COND_SYSCALL(mbind);
|
|
COND_SYSCALL_COMPAT(mbind);
|
|
COND_SYSCALL(get_mempolicy);
|
|
COND_SYSCALL_COMPAT(get_mempolicy);
|
|
COND_SYSCALL(set_mempolicy);
|
|
COND_SYSCALL_COMPAT(set_mempolicy);
|
|
COND_SYSCALL(migrate_pages);
|
|
COND_SYSCALL_COMPAT(migrate_pages);
|
|
COND_SYSCALL(move_pages);
|
|
COND_SYSCALL_COMPAT(move_pages);
|
|
|
|
COND_SYSCALL(perf_event_open);
|
|
COND_SYSCALL(accept4);
|
|
COND_SYSCALL(recvmmsg);
|
|
COND_SYSCALL(recvmmsg_time32);
|
|
COND_SYSCALL_COMPAT(recvmmsg_time32);
|
|
COND_SYSCALL_COMPAT(recvmmsg_time64);
|
|
|
|
/*
|
|
* Architecture specific syscalls: see further below
|
|
*/
|
|
|
|
/* fanotify */
|
|
COND_SYSCALL(fanotify_init);
|
|
COND_SYSCALL(fanotify_mark);
|
|
|
|
/* open by handle */
|
|
COND_SYSCALL(name_to_handle_at);
|
|
COND_SYSCALL(open_by_handle_at);
|
|
COND_SYSCALL_COMPAT(open_by_handle_at);
|
|
|
|
COND_SYSCALL(sendmmsg);
|
|
COND_SYSCALL_COMPAT(sendmmsg);
|
|
COND_SYSCALL(process_vm_readv);
|
|
COND_SYSCALL_COMPAT(process_vm_readv);
|
|
COND_SYSCALL(process_vm_writev);
|
|
COND_SYSCALL_COMPAT(process_vm_writev);
|
|
|
|
/* compare kernel pointers */
|
|
COND_SYSCALL(kcmp);
|
|
|
|
COND_SYSCALL(finit_module);
|
|
|
|
/* operate on Secure Computing state */
|
|
COND_SYSCALL(seccomp);
|
|
|
|
COND_SYSCALL(memfd_create);
|
|
|
|
/* access BPF programs and maps */
|
|
COND_SYSCALL(bpf);
|
|
|
|
/* execveat */
|
|
COND_SYSCALL(execveat);
|
|
|
|
COND_SYSCALL(userfaultfd);
|
|
|
|
/* membarrier */
|
|
COND_SYSCALL(membarrier);
|
|
|
|
COND_SYSCALL(mlock2);
|
|
|
|
COND_SYSCALL(copy_file_range);
|
|
|
|
/* memory protection keys */
|
|
COND_SYSCALL(pkey_mprotect);
|
|
COND_SYSCALL(pkey_alloc);
|
|
COND_SYSCALL(pkey_free);
|
|
|
|
|
|
/*
|
|
* Architecture specific weak syscall entries.
|
|
*/
|
|
|
|
/* pciconfig: alpha, arm, arm64, ia64, sparc */
|
|
COND_SYSCALL(pciconfig_read);
|
|
COND_SYSCALL(pciconfig_write);
|
|
COND_SYSCALL(pciconfig_iobase);
|
|
|
|
/* sys_socketcall: arm, mips, x86, ... */
|
|
COND_SYSCALL(socketcall);
|
|
COND_SYSCALL_COMPAT(socketcall);
|
|
|
|
/* compat syscalls for arm64, x86, ... */
|
|
COND_SYSCALL_COMPAT(sysctl);
|
|
COND_SYSCALL_COMPAT(fanotify_mark);
|
|
|
|
/* x86 */
|
|
COND_SYSCALL(vm86old);
|
|
COND_SYSCALL(modify_ldt);
|
|
COND_SYSCALL_COMPAT(quotactl32);
|
|
COND_SYSCALL(vm86);
|
|
COND_SYSCALL(kexec_file_load);
|
|
|
|
/* s390 */
|
|
COND_SYSCALL(s390_pci_mmio_read);
|
|
COND_SYSCALL(s390_pci_mmio_write);
|
|
COND_SYSCALL(s390_ipc);
|
|
COND_SYSCALL_COMPAT(s390_ipc);
|
|
|
|
/* powerpc */
|
|
COND_SYSCALL(rtas);
|
|
COND_SYSCALL(spu_run);
|
|
COND_SYSCALL(spu_create);
|
|
COND_SYSCALL(subpage_prot);
|
|
|
|
|
|
/*
|
|
* Deprecated system calls which are still defined in
|
|
* include/uapi/asm-generic/unistd.h and wanted by >= 1 arch
|
|
*/
|
|
|
|
/* __ARCH_WANT_SYSCALL_NO_FLAGS */
|
|
COND_SYSCALL(epoll_create);
|
|
COND_SYSCALL(inotify_init);
|
|
COND_SYSCALL(eventfd);
|
|
COND_SYSCALL(signalfd);
|
|
COND_SYSCALL_COMPAT(signalfd);
|
|
|
|
/* __ARCH_WANT_SYSCALL_OFF_T */
|
|
COND_SYSCALL(fadvise64);
|
|
|
|
/* __ARCH_WANT_SYSCALL_DEPRECATED */
|
|
COND_SYSCALL(epoll_wait);
|
|
COND_SYSCALL(recv);
|
|
COND_SYSCALL_COMPAT(recv);
|
|
COND_SYSCALL(send);
|
|
COND_SYSCALL(bdflush);
|
|
COND_SYSCALL(uselib);
|
|
|
|
|
|
/*
|
|
* The syscalls below are not found in include/uapi/asm-generic/unistd.h
|
|
*/
|
|
|
|
/* obsolete: SGETMASK_SYSCALL */
|
|
COND_SYSCALL(sgetmask);
|
|
COND_SYSCALL(ssetmask);
|
|
|
|
/* obsolete: SYSFS_SYSCALL */
|
|
COND_SYSCALL(sysfs);
|
|
|
|
/* obsolete: __ARCH_WANT_SYS_IPC */
|
|
COND_SYSCALL(ipc);
|
|
COND_SYSCALL_COMPAT(ipc);
|
|
|
|
/* obsolete: UID16 */
|
|
COND_SYSCALL(chown16);
|
|
COND_SYSCALL(fchown16);
|
|
COND_SYSCALL(getegid16);
|
|
COND_SYSCALL(geteuid16);
|
|
COND_SYSCALL(getgid16);
|
|
COND_SYSCALL(getgroups16);
|
|
COND_SYSCALL(getresgid16);
|
|
COND_SYSCALL(getresuid16);
|
|
COND_SYSCALL(getuid16);
|
|
COND_SYSCALL(lchown16);
|
|
COND_SYSCALL(setfsgid16);
|
|
COND_SYSCALL(setfsuid16);
|
|
COND_SYSCALL(setgid16);
|
|
COND_SYSCALL(setgroups16);
|
|
COND_SYSCALL(setregid16);
|
|
COND_SYSCALL(setresgid16);
|
|
COND_SYSCALL(setresuid16);
|
|
COND_SYSCALL(setreuid16);
|
|
COND_SYSCALL(setuid16);
|
|
|
|
/* restartable sequence */
|
|
COND_SYSCALL(rseq);
|