2017-11-01 15:08:43 +01:00
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
2012-10-04 18:20:15 +01:00
# include <asm/bitsperlong.h>
/*
* This file contains the system call numbers, based on the
* layout of the x86-64 architecture, which embeds the
* pointer to the syscall in the table.
*
* As a basic principle, no duplication of functionality
* should be added, e.g. we don't use lseek when llseek
* is present. New architectures should use this file
* and implement the less feature-full calls in user space.
*/
/*
* Table-entry helpers.
*
* Every entry below is written as __SYSCALL(nr, handler) or as one of
* the __SC_* selectors, which all end up expanding to __SYSCALL().
* If the including file has not defined __SYSCALL, it is defined here
* to expand to nothing, so only the __NR_* number definitions remain.
*/
# ifndef __SYSCALL
# define __SYSCALL(x, y)
# endif
/*
* __SC_3264(_nr, _32, _64): pick between two handlers.
* _32 is used when __BITS_PER_LONG is 32 or when __SYSCALL_COMPAT is
* defined; otherwise _64 is used.
*/
# if __BITS_PER_LONG == 32 || defined(__SYSCALL_COMPAT)
# define __SC_3264(_nr, _32, _64) __SYSCALL(_nr, _32)
# else
# define __SC_3264(_nr, _32, _64) __SYSCALL(_nr, _64)
# endif
/*
* __SC_COMP(_nr, _sys, _comp): use _comp when __SYSCALL_COMPAT is
* defined, _sys otherwise.
*
* __SC_COMP_3264(_nr, _32, _64, _comp): use _comp when __SYSCALL_COMPAT
* is defined; otherwise defer to __SC_3264(), which in that case
* chooses between _32 and _64 on __BITS_PER_LONG alone.
*/
# ifdef __SYSCALL_COMPAT
# define __SC_COMP(_nr, _sys, _comp) __SYSCALL(_nr, _comp)
# define __SC_COMP_3264(_nr, _32, _64, _comp) __SYSCALL(_nr, _comp)
# else
# define __SC_COMP(_nr, _sys, _comp) __SYSCALL(_nr, _sys)
# define __SC_COMP_3264(_nr, _32, _64, _comp) __SC_3264(_nr, _32, _64)
# endif
# define __NR_io_setup 0
__SC_COMP ( __NR_io_setup , sys_io_setup , compat_sys_io_setup )
# define __NR_io_destroy 1
__SYSCALL ( __NR_io_destroy , sys_io_destroy )
# define __NR_io_submit 2
__SC_COMP ( __NR_io_submit , sys_io_submit , compat_sys_io_submit )
# define __NR_io_cancel 3
__SYSCALL ( __NR_io_cancel , sys_io_cancel )
2019-02-18 17:30:06 +01:00
# if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
2012-10-04 18:20:15 +01:00
# define __NR_io_getevents 4
2019-01-01 01:13:32 +01:00
__SC_3264 ( __NR_io_getevents , sys_io_getevents_time32 , sys_io_getevents )
2019-02-18 17:30:06 +01:00
# endif
2012-10-04 18:20:15 +01:00
/* fs/xattr.c */
# define __NR_setxattr 5
__SYSCALL ( __NR_setxattr , sys_setxattr )
# define __NR_lsetxattr 6
__SYSCALL ( __NR_lsetxattr , sys_lsetxattr )
# define __NR_fsetxattr 7
__SYSCALL ( __NR_fsetxattr , sys_fsetxattr )
# define __NR_getxattr 8
__SYSCALL ( __NR_getxattr , sys_getxattr )
# define __NR_lgetxattr 9
__SYSCALL ( __NR_lgetxattr , sys_lgetxattr )
# define __NR_fgetxattr 10
__SYSCALL ( __NR_fgetxattr , sys_fgetxattr )
# define __NR_listxattr 11
__SYSCALL ( __NR_listxattr , sys_listxattr )
# define __NR_llistxattr 12
__SYSCALL ( __NR_llistxattr , sys_llistxattr )
# define __NR_flistxattr 13
__SYSCALL ( __NR_flistxattr , sys_flistxattr )
# define __NR_removexattr 14
__SYSCALL ( __NR_removexattr , sys_removexattr )
# define __NR_lremovexattr 15
__SYSCALL ( __NR_lremovexattr , sys_lremovexattr )
# define __NR_fremovexattr 16
__SYSCALL ( __NR_fremovexattr , sys_fremovexattr )
/* fs/dcache.c */
# define __NR_getcwd 17
__SYSCALL ( __NR_getcwd , sys_getcwd )
/* fs/cookies.c */
# define __NR_lookup_dcookie 18
__SC_COMP ( __NR_lookup_dcookie , sys_lookup_dcookie , compat_sys_lookup_dcookie )
/* fs/eventfd.c */
# define __NR_eventfd2 19
__SYSCALL ( __NR_eventfd2 , sys_eventfd2 )
/* fs/eventpoll.c */
# define __NR_epoll_create1 20
__SYSCALL ( __NR_epoll_create1 , sys_epoll_create1 )
# define __NR_epoll_ctl 21
__SYSCALL ( __NR_epoll_ctl , sys_epoll_ctl )
# define __NR_epoll_pwait 22
__SC_COMP ( __NR_epoll_pwait , sys_epoll_pwait , compat_sys_epoll_pwait )
/* fs/fcntl.c */
# define __NR_dup 23
__SYSCALL ( __NR_dup , sys_dup )
# define __NR_dup3 24
__SYSCALL ( __NR_dup3 , sys_dup3 )
# define __NR3264_fcntl 25
__SC_COMP_3264 ( __NR3264_fcntl , sys_fcntl64 , sys_fcntl , compat_sys_fcntl64 )
/* fs/inotify_user.c */
# define __NR_inotify_init1 26
__SYSCALL ( __NR_inotify_init1 , sys_inotify_init1 )
# define __NR_inotify_add_watch 27
__SYSCALL ( __NR_inotify_add_watch , sys_inotify_add_watch )
# define __NR_inotify_rm_watch 28
__SYSCALL ( __NR_inotify_rm_watch , sys_inotify_rm_watch )
/* fs/ioctl.c */
# define __NR_ioctl 29
__SC_COMP ( __NR_ioctl , sys_ioctl , compat_sys_ioctl )
/* fs/ioprio.c */
# define __NR_ioprio_set 30
__SYSCALL ( __NR_ioprio_set , sys_ioprio_set )
# define __NR_ioprio_get 31
__SYSCALL ( __NR_ioprio_get , sys_ioprio_get )
/* fs/locks.c */
# define __NR_flock 32
__SYSCALL ( __NR_flock , sys_flock )
/* fs/namei.c */
# define __NR_mknodat 33
__SYSCALL ( __NR_mknodat , sys_mknodat )
# define __NR_mkdirat 34
__SYSCALL ( __NR_mkdirat , sys_mkdirat )
# define __NR_unlinkat 35
__SYSCALL ( __NR_unlinkat , sys_unlinkat )
# define __NR_symlinkat 36
__SYSCALL ( __NR_symlinkat , sys_symlinkat )
# define __NR_linkat 37
__SYSCALL ( __NR_linkat , sys_linkat )
2016-04-29 22:29:26 +01:00
# ifdef __ARCH_WANT_RENAMEAT
/* renameat is superseded with flags by renameat2 */
2012-10-04 18:20:15 +01:00
# define __NR_renameat 38
__SYSCALL ( __NR_renameat , sys_renameat )
2016-04-29 22:29:26 +01:00
# endif /* __ARCH_WANT_RENAMEAT */
2012-10-04 18:20:15 +01:00
/* fs/namespace.c */
# define __NR_umount2 39
__SYSCALL ( __NR_umount2 , sys_umount )
# define __NR_mount 40
2020-09-17 10:22:34 +02:00
__SYSCALL ( __NR_mount , sys_mount )
2012-10-04 18:20:15 +01:00
# define __NR_pivot_root 41
__SYSCALL ( __NR_pivot_root , sys_pivot_root )
/* fs/nfsctl.c */
# define __NR_nfsservctl 42
__SYSCALL ( __NR_nfsservctl , sys_ni_syscall )
/* fs/open.c */
# define __NR3264_statfs 43
__SC_COMP_3264 ( __NR3264_statfs , sys_statfs64 , sys_statfs , \
compat_sys_statfs64 )
# define __NR3264_fstatfs 44
__SC_COMP_3264 ( __NR3264_fstatfs , sys_fstatfs64 , sys_fstatfs , \
compat_sys_fstatfs64 )
# define __NR3264_truncate 45
__SC_COMP_3264 ( __NR3264_truncate , sys_truncate64 , sys_truncate , \
compat_sys_truncate64 )
# define __NR3264_ftruncate 46
__SC_COMP_3264 ( __NR3264_ftruncate , sys_ftruncate64 , sys_ftruncate , \
compat_sys_ftruncate64 )
# define __NR_fallocate 47
__SC_COMP ( __NR_fallocate , sys_fallocate , compat_sys_fallocate )
# define __NR_faccessat 48
__SYSCALL ( __NR_faccessat , sys_faccessat )
# define __NR_chdir 49
__SYSCALL ( __NR_chdir , sys_chdir )
# define __NR_fchdir 50
__SYSCALL ( __NR_fchdir , sys_fchdir )
# define __NR_chroot 51
__SYSCALL ( __NR_chroot , sys_chroot )
# define __NR_fchmod 52
__SYSCALL ( __NR_fchmod , sys_fchmod )
# define __NR_fchmodat 53
__SYSCALL ( __NR_fchmodat , sys_fchmodat )
# define __NR_fchownat 54
__SYSCALL ( __NR_fchownat , sys_fchownat )
# define __NR_fchown 55
__SYSCALL ( __NR_fchown , sys_fchown )
# define __NR_openat 56
2018-05-16 11:18:48 +03:00
__SYSCALL ( __NR_openat , sys_openat )
2012-10-04 18:20:15 +01:00
# define __NR_close 57
__SYSCALL ( __NR_close , sys_close )
# define __NR_vhangup 58
__SYSCALL ( __NR_vhangup , sys_vhangup )
/* fs/pipe.c */
# define __NR_pipe2 59
__SYSCALL ( __NR_pipe2 , sys_pipe2 )
/* fs/quota.c */
# define __NR_quotactl 60
__SYSCALL ( __NR_quotactl , sys_quotactl )
/* fs/readdir.c */
# define __NR_getdents64 61
2017-04-08 14:34:51 -04:00
__SYSCALL ( __NR_getdents64 , sys_getdents64 )
2012-10-04 18:20:15 +01:00
/* fs/read_write.c */
# define __NR3264_lseek 62
__SC_3264 ( __NR3264_lseek , sys_llseek , sys_lseek )
# define __NR_read 63
__SYSCALL ( __NR_read , sys_read )
# define __NR_write 64
__SYSCALL ( __NR_write , sys_write )
# define __NR_readv 65
2020-09-25 06:51:43 +02:00
__SC_COMP ( __NR_readv , sys_readv , sys_readv )
2012-10-04 18:20:15 +01:00
# define __NR_writev 66
2020-09-25 06:51:43 +02:00
__SC_COMP ( __NR_writev , sys_writev , sys_writev )
2012-10-04 18:20:15 +01:00
# define __NR_pread64 67
__SC_COMP ( __NR_pread64 , sys_pread64 , compat_sys_pread64 )
# define __NR_pwrite64 68
__SC_COMP ( __NR_pwrite64 , sys_pwrite64 , compat_sys_pwrite64 )
# define __NR_preadv 69
__SC_COMP ( __NR_preadv , sys_preadv , compat_sys_preadv )
# define __NR_pwritev 70
__SC_COMP ( __NR_pwritev , sys_pwritev , compat_sys_pwritev )
/* fs/sendfile.c */
# define __NR3264_sendfile 71
__SYSCALL ( __NR3264_sendfile , sys_sendfile64 )
/* fs/select.c */
2019-02-18 17:30:06 +01:00
# if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
2012-10-04 18:20:15 +01:00
# define __NR_pselect6 72
2019-01-01 01:13:32 +01:00
__SC_COMP_3264 ( __NR_pselect6 , sys_pselect6_time32 , sys_pselect6 , compat_sys_pselect6_time32 )
2012-10-04 18:20:15 +01:00
# define __NR_ppoll 73
2019-01-01 01:13:32 +01:00
__SC_COMP_3264 ( __NR_ppoll , sys_ppoll_time32 , sys_ppoll , compat_sys_ppoll_time32 )
2019-02-18 17:30:06 +01:00
# endif
2012-10-04 18:20:15 +01:00
/* fs/signalfd.c */
# define __NR_signalfd4 74
__SC_COMP ( __NR_signalfd4 , sys_signalfd4 , compat_sys_signalfd4 )
/* fs/splice.c */
# define __NR_vmsplice 75
2020-09-25 06:51:44 +02:00
__SYSCALL ( __NR_vmsplice , sys_vmsplice )
2012-10-04 18:20:15 +01:00
# define __NR_splice 76
__SYSCALL ( __NR_splice , sys_splice )
# define __NR_tee 77
__SYSCALL ( __NR_tee , sys_tee )
/* fs/stat.c */
# define __NR_readlinkat 78
__SYSCALL ( __NR_readlinkat , sys_readlinkat )
2018-04-13 12:19:47 +02:00
# if defined(__ARCH_WANT_NEW_STAT) || defined(__ARCH_WANT_STAT64)
2012-10-04 18:20:15 +01:00
# define __NR3264_fstatat 79
__SC_3264 ( __NR3264_fstatat , sys_fstatat64 , sys_newfstatat )
# define __NR3264_fstat 80
__SC_3264 ( __NR3264_fstat , sys_fstat64 , sys_newfstat )
2018-04-13 12:19:47 +02:00
# endif
2012-10-04 18:20:15 +01:00
/* fs/sync.c */
# define __NR_sync 81
__SYSCALL ( __NR_sync , sys_sync )
# define __NR_fsync 82
__SYSCALL ( __NR_fsync , sys_fsync )
# define __NR_fdatasync 83
__SYSCALL ( __NR_fdatasync , sys_fdatasync )
/*
* Number 84 is shared by two mutually exclusive entries: when
* __ARCH_WANT_SYNC_FILE_RANGE2 is defined the slot is named
* __NR_sync_file_range2, otherwise it is named __NR_sync_file_range.
* Only one of the two __NR_ names is ever defined.
*/
# ifdef __ARCH_WANT_SYNC_FILE_RANGE2
# define __NR_sync_file_range2 84
__SC_COMP ( __NR_sync_file_range2 , sys_sync_file_range2 , \
compat_sys_sync_file_range2 )
# else
# define __NR_sync_file_range 84
__SC_COMP ( __NR_sync_file_range , sys_sync_file_range , \
compat_sys_sync_file_range )
# endif
/* fs/timerfd.c */
# define __NR_timerfd_create 85
__SYSCALL ( __NR_timerfd_create , sys_timerfd_create )
2019-02-18 17:30:06 +01:00
# if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
2012-10-04 18:20:15 +01:00
# define __NR_timerfd_settime 86
2019-01-01 01:13:32 +01:00
__SC_3264 ( __NR_timerfd_settime , sys_timerfd_settime32 , \
sys_timerfd_settime )
2012-10-04 18:20:15 +01:00
# define __NR_timerfd_gettime 87
2019-01-01 01:13:32 +01:00
__SC_3264 ( __NR_timerfd_gettime , sys_timerfd_gettime32 , \
sys_timerfd_gettime )
2019-02-18 17:30:06 +01:00
# endif
2012-10-04 18:20:15 +01:00
/* fs/utimes.c */
2019-02-18 17:30:06 +01:00
# if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
2012-10-04 18:20:15 +01:00
# define __NR_utimensat 88
2019-01-01 01:13:32 +01:00
__SC_3264 ( __NR_utimensat , sys_utimensat_time32 , sys_utimensat )
2019-02-18 17:30:06 +01:00
# endif
2012-10-04 18:20:15 +01:00
/* kernel/acct.c */
# define __NR_acct 89
__SYSCALL ( __NR_acct , sys_acct )
/* kernel/capability.c */
# define __NR_capget 90
__SYSCALL ( __NR_capget , sys_capget )
# define __NR_capset 91
__SYSCALL ( __NR_capset , sys_capset )
/* kernel/exec_domain.c */
# define __NR_personality 92
__SYSCALL ( __NR_personality , sys_personality )
/* kernel/exit.c */
# define __NR_exit 93
__SYSCALL ( __NR_exit , sys_exit )
# define __NR_exit_group 94
__SYSCALL ( __NR_exit_group , sys_exit_group )
# define __NR_waitid 95
__SC_COMP ( __NR_waitid , sys_waitid , compat_sys_waitid )
/* kernel/fork.c */
# define __NR_set_tid_address 96
__SYSCALL ( __NR_set_tid_address , sys_set_tid_address )
# define __NR_unshare 97
__SYSCALL ( __NR_unshare , sys_unshare )
/* kernel/futex.c */
2019-02-18 17:30:06 +01:00
# if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
2012-10-04 18:20:15 +01:00
# define __NR_futex 98
2019-01-01 01:13:32 +01:00
__SC_3264 ( __NR_futex , sys_futex_time32 , sys_futex )
2019-02-18 17:30:06 +01:00
# endif
2012-10-04 18:20:15 +01:00
# define __NR_set_robust_list 99
__SC_COMP ( __NR_set_robust_list , sys_set_robust_list , \
compat_sys_set_robust_list )
# define __NR_get_robust_list 100
__SC_COMP ( __NR_get_robust_list , sys_get_robust_list , \
compat_sys_get_robust_list )
/* kernel/hrtimer.c */
2019-02-18 17:30:06 +01:00
# if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
2012-10-04 18:20:15 +01:00
# define __NR_nanosleep 101
2019-01-01 01:13:32 +01:00
__SC_3264 ( __NR_nanosleep , sys_nanosleep_time32 , sys_nanosleep )
2019-02-18 17:30:06 +01:00
# endif
2012-10-04 18:20:15 +01:00
/* kernel/itimer.c */
# define __NR_getitimer 102
__SC_COMP ( __NR_getitimer , sys_getitimer , compat_sys_getitimer )
# define __NR_setitimer 103
__SC_COMP ( __NR_setitimer , sys_setitimer , compat_sys_setitimer )
/* kernel/kexec.c */
# define __NR_kexec_load 104
__SC_COMP ( __NR_kexec_load , sys_kexec_load , compat_sys_kexec_load )
/* kernel/module.c */
# define __NR_init_module 105
__SYSCALL ( __NR_init_module , sys_init_module )
# define __NR_delete_module 106
__SYSCALL ( __NR_delete_module , sys_delete_module )
/* kernel/posix-timers.c */
# define __NR_timer_create 107
__SC_COMP ( __NR_timer_create , sys_timer_create , compat_sys_timer_create )
2019-02-18 17:30:06 +01:00
# if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
2012-10-04 18:20:15 +01:00
# define __NR_timer_gettime 108
2019-01-01 01:13:32 +01:00
__SC_3264 ( __NR_timer_gettime , sys_timer_gettime32 , sys_timer_gettime )
2019-02-18 17:30:06 +01:00
# endif
2012-10-04 18:20:15 +01:00
# define __NR_timer_getoverrun 109
__SYSCALL ( __NR_timer_getoverrun , sys_timer_getoverrun )
2019-02-18 17:30:06 +01:00
# if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
2012-10-04 18:20:15 +01:00
# define __NR_timer_settime 110
2019-01-01 01:13:32 +01:00
__SC_3264 ( __NR_timer_settime , sys_timer_settime32 , sys_timer_settime )
2019-02-18 17:30:06 +01:00
# endif
2012-10-04 18:20:15 +01:00
# define __NR_timer_delete 111
__SYSCALL ( __NR_timer_delete , sys_timer_delete )
2019-02-18 17:30:06 +01:00
# if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
2012-10-04 18:20:15 +01:00
# define __NR_clock_settime 112
2019-01-01 01:13:32 +01:00
__SC_3264 ( __NR_clock_settime , sys_clock_settime32 , sys_clock_settime )
2012-10-04 18:20:15 +01:00
# define __NR_clock_gettime 113
2019-01-01 01:13:32 +01:00
__SC_3264 ( __NR_clock_gettime , sys_clock_gettime32 , sys_clock_gettime )
2012-10-04 18:20:15 +01:00
# define __NR_clock_getres 114
2019-01-01 01:13:32 +01:00
__SC_3264 ( __NR_clock_getres , sys_clock_getres_time32 , sys_clock_getres )
2012-10-04 18:20:15 +01:00
# define __NR_clock_nanosleep 115
2019-01-01 01:13:32 +01:00
__SC_3264 ( __NR_clock_nanosleep , sys_clock_nanosleep_time32 , \
sys_clock_nanosleep )
2019-02-18 17:30:06 +01:00
# endif
2012-10-04 18:20:15 +01:00
/* kernel/printk.c */
# define __NR_syslog 116
__SYSCALL ( __NR_syslog , sys_syslog )
/* kernel/ptrace.c */
# define __NR_ptrace 117
2022-04-05 15:13:01 +08:00
__SC_COMP ( __NR_ptrace , sys_ptrace , compat_sys_ptrace )
2012-10-04 18:20:15 +01:00
2013-06-04 13:10:24 +05:30
/* kernel/sched/core.c */
2012-10-04 18:20:15 +01:00
# define __NR_sched_setparam 118
__SYSCALL ( __NR_sched_setparam , sys_sched_setparam )
# define __NR_sched_setscheduler 119
__SYSCALL ( __NR_sched_setscheduler , sys_sched_setscheduler )
# define __NR_sched_getscheduler 120
__SYSCALL ( __NR_sched_getscheduler , sys_sched_getscheduler )
# define __NR_sched_getparam 121
__SYSCALL ( __NR_sched_getparam , sys_sched_getparam )
# define __NR_sched_setaffinity 122
__SC_COMP ( __NR_sched_setaffinity , sys_sched_setaffinity , \
compat_sys_sched_setaffinity )
# define __NR_sched_getaffinity 123
__SC_COMP ( __NR_sched_getaffinity , sys_sched_getaffinity , \
compat_sys_sched_getaffinity )
# define __NR_sched_yield 124
__SYSCALL ( __NR_sched_yield , sys_sched_yield )
# define __NR_sched_get_priority_max 125
__SYSCALL ( __NR_sched_get_priority_max , sys_sched_get_priority_max )
# define __NR_sched_get_priority_min 126
__SYSCALL ( __NR_sched_get_priority_min , sys_sched_get_priority_min )
2019-02-18 17:30:06 +01:00
# if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
2012-10-04 18:20:15 +01:00
# define __NR_sched_rr_get_interval 127
2019-01-01 01:13:32 +01:00
__SC_3264 ( __NR_sched_rr_get_interval , sys_sched_rr_get_interval_time32 , \
sys_sched_rr_get_interval )
2019-02-18 17:30:06 +01:00
# endif
2012-10-04 18:20:15 +01:00
/* kernel/signal.c */
# define __NR_restart_syscall 128
__SYSCALL ( __NR_restart_syscall , sys_restart_syscall )
# define __NR_kill 129
__SYSCALL ( __NR_kill , sys_kill )
# define __NR_tkill 130
__SYSCALL ( __NR_tkill , sys_tkill )
# define __NR_tgkill 131
__SYSCALL ( __NR_tgkill , sys_tgkill )
# define __NR_sigaltstack 132
__SC_COMP ( __NR_sigaltstack , sys_sigaltstack , compat_sys_sigaltstack )
# define __NR_rt_sigsuspend 133
__SC_COMP ( __NR_rt_sigsuspend , sys_rt_sigsuspend , compat_sys_rt_sigsuspend )
# define __NR_rt_sigaction 134
__SC_COMP ( __NR_rt_sigaction , sys_rt_sigaction , compat_sys_rt_sigaction )
# define __NR_rt_sigprocmask 135
2012-12-25 13:58:56 -05:00
__SC_COMP ( __NR_rt_sigprocmask , sys_rt_sigprocmask , compat_sys_rt_sigprocmask )
2012-10-04 18:20:15 +01:00
# define __NR_rt_sigpending 136
2012-12-25 13:58:56 -05:00
__SC_COMP ( __NR_rt_sigpending , sys_rt_sigpending , compat_sys_rt_sigpending )
2019-02-18 17:30:06 +01:00
# if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
2012-10-04 18:20:15 +01:00
# define __NR_rt_sigtimedwait 137
2019-01-01 01:13:32 +01:00
__SC_COMP_3264 ( __NR_rt_sigtimedwait , sys_rt_sigtimedwait_time32 , \
sys_rt_sigtimedwait , compat_sys_rt_sigtimedwait_time32 )
2019-02-18 17:30:06 +01:00
# endif
2012-10-04 18:20:15 +01:00
# define __NR_rt_sigqueueinfo 138
__SC_COMP ( __NR_rt_sigqueueinfo , sys_rt_sigqueueinfo , \
compat_sys_rt_sigqueueinfo )
# define __NR_rt_sigreturn 139
__SC_COMP ( __NR_rt_sigreturn , sys_rt_sigreturn , compat_sys_rt_sigreturn )
/* kernel/sys.c */
# define __NR_setpriority 140
__SYSCALL ( __NR_setpriority , sys_setpriority )
# define __NR_getpriority 141
__SYSCALL ( __NR_getpriority , sys_getpriority )
# define __NR_reboot 142
__SYSCALL ( __NR_reboot , sys_reboot )
# define __NR_setregid 143
__SYSCALL ( __NR_setregid , sys_setregid )
# define __NR_setgid 144
__SYSCALL ( __NR_setgid , sys_setgid )
# define __NR_setreuid 145
__SYSCALL ( __NR_setreuid , sys_setreuid )
# define __NR_setuid 146
__SYSCALL ( __NR_setuid , sys_setuid )
# define __NR_setresuid 147
__SYSCALL ( __NR_setresuid , sys_setresuid )
# define __NR_getresuid 148
__SYSCALL ( __NR_getresuid , sys_getresuid )
# define __NR_setresgid 149
__SYSCALL ( __NR_setresgid , sys_setresgid )
# define __NR_getresgid 150
__SYSCALL ( __NR_getresgid , sys_getresgid )
# define __NR_setfsuid 151
__SYSCALL ( __NR_setfsuid , sys_setfsuid )
# define __NR_setfsgid 152
__SYSCALL ( __NR_setfsgid , sys_setfsgid )
# define __NR_times 153
__SC_COMP ( __NR_times , sys_times , compat_sys_times )
# define __NR_setpgid 154
__SYSCALL ( __NR_setpgid , sys_setpgid )
# define __NR_getpgid 155
__SYSCALL ( __NR_getpgid , sys_getpgid )
# define __NR_getsid 156
__SYSCALL ( __NR_getsid , sys_getsid )
# define __NR_setsid 157
__SYSCALL ( __NR_setsid , sys_setsid )
# define __NR_getgroups 158
__SYSCALL ( __NR_getgroups , sys_getgroups )
# define __NR_setgroups 159
__SYSCALL ( __NR_setgroups , sys_setgroups )
# define __NR_uname 160
__SYSCALL ( __NR_uname , sys_newuname )
# define __NR_sethostname 161
__SYSCALL ( __NR_sethostname , sys_sethostname )
# define __NR_setdomainname 162
__SYSCALL ( __NR_setdomainname , sys_setdomainname )
2018-05-16 11:18:50 +03:00
# ifdef __ARCH_WANT_SET_GET_RLIMIT
/* getrlimit and setrlimit are superseded with prlimit64 */
2012-10-04 18:20:15 +01:00
# define __NR_getrlimit 163
__SC_COMP ( __NR_getrlimit , sys_getrlimit , compat_sys_getrlimit )
# define __NR_setrlimit 164
__SC_COMP ( __NR_setrlimit , sys_setrlimit , compat_sys_setrlimit )
2018-05-16 11:18:50 +03:00
# endif
2012-10-04 18:20:15 +01:00
# define __NR_getrusage 165
__SC_COMP ( __NR_getrusage , sys_getrusage , compat_sys_getrusage )
# define __NR_umask 166
__SYSCALL ( __NR_umask , sys_umask )
# define __NR_prctl 167
__SYSCALL ( __NR_prctl , sys_prctl )
# define __NR_getcpu 168
__SYSCALL ( __NR_getcpu , sys_getcpu )
/* kernel/time.c */
2019-02-18 17:30:06 +01:00
# if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
2012-10-04 18:20:15 +01:00
# define __NR_gettimeofday 169
__SC_COMP ( __NR_gettimeofday , sys_gettimeofday , compat_sys_gettimeofday )
# define __NR_settimeofday 170
__SC_COMP ( __NR_settimeofday , sys_settimeofday , compat_sys_settimeofday )
# define __NR_adjtimex 171
2019-01-01 01:13:32 +01:00
__SC_3264 ( __NR_adjtimex , sys_adjtimex_time32 , sys_adjtimex )
2019-02-18 17:30:06 +01:00
# endif
2012-10-04 18:20:15 +01:00
2020-11-12 16:56:57 -05:00
/* kernel/sys.c */
2012-10-04 18:20:15 +01:00
# define __NR_getpid 172
__SYSCALL ( __NR_getpid , sys_getpid )
# define __NR_getppid 173
__SYSCALL ( __NR_getppid , sys_getppid )
# define __NR_getuid 174
__SYSCALL ( __NR_getuid , sys_getuid )
# define __NR_geteuid 175
__SYSCALL ( __NR_geteuid , sys_geteuid )
# define __NR_getgid 176
__SYSCALL ( __NR_getgid , sys_getgid )
# define __NR_getegid 177
__SYSCALL ( __NR_getegid , sys_getegid )
# define __NR_gettid 178
__SYSCALL ( __NR_gettid , sys_gettid )
# define __NR_sysinfo 179
__SC_COMP ( __NR_sysinfo , sys_sysinfo , compat_sys_sysinfo )
/* ipc/mqueue.c */
# define __NR_mq_open 180
__SC_COMP ( __NR_mq_open , sys_mq_open , compat_sys_mq_open )
# define __NR_mq_unlink 181
__SYSCALL ( __NR_mq_unlink , sys_mq_unlink )
2019-02-18 17:30:06 +01:00
# if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
2012-10-04 18:20:15 +01:00
# define __NR_mq_timedsend 182
2019-01-01 01:13:32 +01:00
__SC_3264 ( __NR_mq_timedsend , sys_mq_timedsend_time32 , sys_mq_timedsend )
2012-10-04 18:20:15 +01:00
# define __NR_mq_timedreceive 183
2019-01-01 01:13:32 +01:00
__SC_3264 ( __NR_mq_timedreceive , sys_mq_timedreceive_time32 , \
sys_mq_timedreceive )
2019-02-18 17:30:06 +01:00
# endif
2012-10-04 18:20:15 +01:00
# define __NR_mq_notify 184
__SC_COMP ( __NR_mq_notify , sys_mq_notify , compat_sys_mq_notify )
# define __NR_mq_getsetattr 185
__SC_COMP ( __NR_mq_getsetattr , sys_mq_getsetattr , compat_sys_mq_getsetattr )
/* ipc/msg.c */
# define __NR_msgget 186
__SYSCALL ( __NR_msgget , sys_msgget )
# define __NR_msgctl 187
__SC_COMP ( __NR_msgctl , sys_msgctl , compat_sys_msgctl )
# define __NR_msgrcv 188
__SC_COMP ( __NR_msgrcv , sys_msgrcv , compat_sys_msgrcv )
# define __NR_msgsnd 189
__SC_COMP ( __NR_msgsnd , sys_msgsnd , compat_sys_msgsnd )
/* ipc/sem.c */
# define __NR_semget 190
__SYSCALL ( __NR_semget , sys_semget )
# define __NR_semctl 191
__SC_COMP ( __NR_semctl , sys_semctl , compat_sys_semctl )
2019-02-18 17:30:06 +01:00
# if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
2012-10-04 18:20:15 +01:00
# define __NR_semtimedop 192
ipc: fix semtimedop for generic 32-bit architectures
As Vincent noticed, the y2038 conversion of semtimedop in linux-5.1
broke when commit 00bf25d693e7 ("y2038: use time32 syscall names on
32-bit") changed all system calls on all architectures that take
a 32-bit time_t to point to the _time32 implementation, but left out
semtimedop in the asm-generic header.
This affects all 32-bit architectures using asm-generic/unistd.h:
h8300, unicore32, openrisc, nios2, hexagon, c6x, arc, nds32 and csky.
The notable exception is riscv32, which has dropped support for the
time32 system calls entirely.
Reported-by: Vincent Chen <deanbo422@gmail.com>
Cc: stable@vger.kernel.org
Cc: Vincent Chen <deanbo422@gmail.com>
Cc: Greentime Hu <green.hu@gmail.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Guan Xuetao <gxt@pku.edu.cn>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Stefan Kristiansson <stefan.kristiansson@saunalahti.fi>
Cc: Ley Foon Tan <lftan@altera.com>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: Mark Salter <msalter@redhat.com>
Cc: Aurelien Jacquiot <jacquiot.aurelien@gmail.com>
Cc: Guo Ren <guoren@kernel.org>
Fixes: 00bf25d693e7 ("y2038: use time32 syscall names on 32-bit")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
2019-09-05 16:20:09 +02:00
__SC_3264 ( __NR_semtimedop , sys_semtimedop_time32 , sys_semtimedop )
2019-02-18 17:30:06 +01:00
# endif
2012-10-04 18:20:15 +01:00
# define __NR_semop 193
__SYSCALL ( __NR_semop , sys_semop )
/* ipc/shm.c */
# define __NR_shmget 194
__SYSCALL ( __NR_shmget , sys_shmget )
# define __NR_shmctl 195
__SC_COMP ( __NR_shmctl , sys_shmctl , compat_sys_shmctl )
# define __NR_shmat 196
__SC_COMP ( __NR_shmat , sys_shmat , compat_sys_shmat )
# define __NR_shmdt 197
__SYSCALL ( __NR_shmdt , sys_shmdt )
/* net/socket.c */
# define __NR_socket 198
__SYSCALL ( __NR_socket , sys_socket )
# define __NR_socketpair 199
__SYSCALL ( __NR_socketpair , sys_socketpair )
# define __NR_bind 200
__SYSCALL ( __NR_bind , sys_bind )
# define __NR_listen 201
__SYSCALL ( __NR_listen , sys_listen )
# define __NR_accept 202
__SYSCALL ( __NR_accept , sys_accept )
# define __NR_connect 203
__SYSCALL ( __NR_connect , sys_connect )
# define __NR_getsockname 204
__SYSCALL ( __NR_getsockname , sys_getsockname )
# define __NR_getpeername 205
__SYSCALL ( __NR_getpeername , sys_getpeername )
# define __NR_sendto 206
__SYSCALL ( __NR_sendto , sys_sendto )
# define __NR_recvfrom 207
__SC_COMP ( __NR_recvfrom , sys_recvfrom , compat_sys_recvfrom )
# define __NR_setsockopt 208
2020-07-17 08:23:15 +02:00
__SC_COMP ( __NR_setsockopt , sys_setsockopt , sys_setsockopt )
2012-10-04 18:20:15 +01:00
# define __NR_getsockopt 209
2020-07-17 08:23:15 +02:00
__SC_COMP ( __NR_getsockopt , sys_getsockopt , sys_getsockopt )
2012-10-04 18:20:15 +01:00
# define __NR_shutdown 210
__SYSCALL ( __NR_shutdown , sys_shutdown )
# define __NR_sendmsg 211
__SC_COMP ( __NR_sendmsg , sys_sendmsg , compat_sys_sendmsg )
# define __NR_recvmsg 212
__SC_COMP ( __NR_recvmsg , sys_recvmsg , compat_sys_recvmsg )
/* mm/filemap.c */
# define __NR_readahead 213
__SC_COMP ( __NR_readahead , sys_readahead , compat_sys_readahead )
/* mm/nommu.c, also with MMU */
# define __NR_brk 214
__SYSCALL ( __NR_brk , sys_brk )
# define __NR_munmap 215
__SYSCALL ( __NR_munmap , sys_munmap )
# define __NR_mremap 216
__SYSCALL ( __NR_mremap , sys_mremap )
/* security/keys/keyctl.c */
# define __NR_add_key 217
__SYSCALL ( __NR_add_key , sys_add_key )
# define __NR_request_key 218
__SYSCALL ( __NR_request_key , sys_request_key )
# define __NR_keyctl 219
__SC_COMP ( __NR_keyctl , sys_keyctl , compat_sys_keyctl )
/* arch/example/kernel/sys_example.c */
# define __NR_clone 220
__SYSCALL ( __NR_clone , sys_clone )
# define __NR_execve 221
__SC_COMP ( __NR_execve , sys_execve , compat_sys_execve )
# define __NR3264_mmap 222
__SC_3264 ( __NR3264_mmap , sys_mmap2 , sys_mmap )
/* mm/fadvise.c */
# define __NR3264_fadvise64 223
__SC_COMP ( __NR3264_fadvise64 , sys_fadvise64_64 , compat_sys_fadvise64_64 )
/* mm/, CONFIG_MMU only */
# ifndef __ARCH_NOMMU
# define __NR_swapon 224
__SYSCALL ( __NR_swapon , sys_swapon )
# define __NR_swapoff 225
__SYSCALL ( __NR_swapoff , sys_swapoff )
# define __NR_mprotect 226
__SYSCALL ( __NR_mprotect , sys_mprotect )
# define __NR_msync 227
__SYSCALL ( __NR_msync , sys_msync )
# define __NR_mlock 228
__SYSCALL ( __NR_mlock , sys_mlock )
# define __NR_munlock 229
__SYSCALL ( __NR_munlock , sys_munlock )
# define __NR_mlockall 230
__SYSCALL ( __NR_mlockall , sys_mlockall )
# define __NR_munlockall 231
__SYSCALL ( __NR_munlockall , sys_munlockall )
# define __NR_mincore 232
__SYSCALL ( __NR_mincore , sys_mincore )
# define __NR_madvise 233
__SYSCALL ( __NR_madvise , sys_madvise )
# define __NR_remap_file_pages 234
__SYSCALL ( __NR_remap_file_pages , sys_remap_file_pages )
# define __NR_mbind 235
2021-09-08 15:18:25 -07:00
__SYSCALL ( __NR_mbind , sys_mbind )
2012-10-04 18:20:15 +01:00
# define __NR_get_mempolicy 236
2021-09-08 15:18:25 -07:00
__SYSCALL ( __NR_get_mempolicy , sys_get_mempolicy )
2012-10-04 18:20:15 +01:00
# define __NR_set_mempolicy 237
2021-09-08 15:18:25 -07:00
__SYSCALL ( __NR_set_mempolicy , sys_set_mempolicy )
2012-10-04 18:20:15 +01:00
# define __NR_migrate_pages 238
2021-09-08 15:18:25 -07:00
__SYSCALL ( __NR_migrate_pages , sys_migrate_pages )
2012-10-04 18:20:15 +01:00
# define __NR_move_pages 239
2021-09-08 15:18:25 -07:00
__SYSCALL ( __NR_move_pages , sys_move_pages )
2012-10-04 18:20:15 +01:00
# endif
# define __NR_rt_tgsigqueueinfo 240
__SC_COMP ( __NR_rt_tgsigqueueinfo , sys_rt_tgsigqueueinfo , \
compat_sys_rt_tgsigqueueinfo )
# define __NR_perf_event_open 241
__SYSCALL ( __NR_perf_event_open , sys_perf_event_open )
# define __NR_accept4 242
__SYSCALL ( __NR_accept4 , sys_accept4 )
2019-02-18 17:30:06 +01:00
# if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
2012-10-04 18:20:15 +01:00
# define __NR_recvmmsg 243
2019-01-01 01:13:32 +01:00
__SC_COMP_3264 ( __NR_recvmmsg , sys_recvmmsg_time32 , sys_recvmmsg , compat_sys_recvmmsg_time32 )
2019-02-18 17:30:06 +01:00
# endif
2012-10-04 18:20:15 +01:00
/*
* Architectures may provide up to 16 syscalls of their own
* starting with this value (i.e. numbers 244 to 259; the generic
* numbering resumes at 260).
*/
# define __NR_arch_specific_syscall 244
2019-02-18 17:30:06 +01:00
# if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
2012-10-04 18:20:15 +01:00
# define __NR_wait4 260
__SC_COMP ( __NR_wait4 , sys_wait4 , compat_sys_wait4 )
2019-02-18 17:30:06 +01:00
# endif
2012-10-04 18:20:15 +01:00
# define __NR_prlimit64 261
__SYSCALL ( __NR_prlimit64 , sys_prlimit64 )
# define __NR_fanotify_init 262
__SYSCALL ( __NR_fanotify_init , sys_fanotify_init )
# define __NR_fanotify_mark 263
__SYSCALL ( __NR_fanotify_mark , sys_fanotify_mark )
# define __NR_name_to_handle_at 264
__SYSCALL ( __NR_name_to_handle_at , sys_name_to_handle_at )
# define __NR_open_by_handle_at 265
2018-05-16 11:18:48 +03:00
__SYSCALL ( __NR_open_by_handle_at , sys_open_by_handle_at )
2019-02-18 17:30:06 +01:00
# if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
2012-10-04 18:20:15 +01:00
# define __NR_clock_adjtime 266
2019-01-01 01:13:32 +01:00
__SC_3264 ( __NR_clock_adjtime , sys_clock_adjtime32 , sys_clock_adjtime )
2019-02-18 17:30:06 +01:00
# endif
2012-10-04 18:20:15 +01:00
# define __NR_syncfs 267
__SYSCALL ( __NR_syncfs , sys_syncfs )
# define __NR_setns 268
__SYSCALL ( __NR_setns , sys_setns )
# define __NR_sendmmsg 269
__SC_COMP ( __NR_sendmmsg , sys_sendmmsg , compat_sys_sendmmsg )
# define __NR_process_vm_readv 270
2020-09-25 06:51:45 +02:00
__SYSCALL ( __NR_process_vm_readv , sys_process_vm_readv )
2012-10-04 18:20:15 +01:00
# define __NR_process_vm_writev 271
2020-09-25 06:51:45 +02:00
__SYSCALL ( __NR_process_vm_writev , sys_process_vm_writev )
2012-10-04 18:20:15 +01:00
# define __NR_kcmp 272
__SYSCALL ( __NR_kcmp , sys_kcmp )
2012-10-16 12:40:03 +10:30
# define __NR_finit_module 273
__SYSCALL ( __NR_finit_module , sys_finit_module )
2014-02-03 15:33:37 +00:00
# define __NR_sched_setattr 274
__SYSCALL ( __NR_sched_setattr , sys_sched_setattr )
# define __NR_sched_getattr 275
__SYSCALL ( __NR_sched_getattr , sys_sched_getattr )
2014-04-23 11:08:05 +01:00
# define __NR_renameat2 276
__SYSCALL ( __NR_renameat2 , sys_renameat2 )
2014-06-25 16:08:24 -07:00
# define __NR_seccomp 277
__SYSCALL ( __NR_seccomp , sys_seccomp )
random: introduce getrandom(2) system call
The getrandom(2) system call was requested by the LibreSSL Portable
developers. It is analogous to the getentropy(2) system call in
OpenBSD.
The rationale of this system call is to provide resilience against
file descriptor exhaustion attacks, where the attacker consumes all
available file descriptors, forcing the use of the fallback code where
/dev/[u]random is not available. Since the fallback code is often not
well-tested, it is better to eliminate this potential failure mode
entirely.
The other feature provided by this new system call is the ability to
request randomness from the /dev/urandom entropy pool, but to block
until at least 128 bits of entropy has been accumulated in the
/dev/urandom entropy pool. Historically, the emphasis in the
/dev/urandom development has been to ensure that urandom pool is
initialized as quickly as possible after system boot, and preferably
before the init scripts start execution.
This is because changing /dev/urandom reads to block represents an
interface change that could potentially break userspace which is not
acceptable. In practice, on most x86 desktop and server systems, in
general the entropy pool can be initialized before it is needed (and
in modern kernels, we will printk a warning message if not). However,
on an embedded system, this may not be the case. And so with this new
interface, we can provide the functionality of blocking until the
urandom pool has been initialized. Any userspace program which uses
this new functionality must take care to assure that if it is used
during the boot process, that it will not cause the init scripts or
other portions of the system startup to hang indefinitely.
SYNOPSIS
#include <linux/random.h>
int getrandom(void *buf, size_t buflen, unsigned int flags);
DESCRIPTION
The system call getrandom() fills the buffer pointed to by buf
with up to buflen random bytes which can be used to seed user
space random number generators (i.e., DRBG's) or for other
cryptographic uses. It should not be used for Monte Carlo
simulations or other programs/algorithms which are doing
probabilistic sampling.
If the GRND_RANDOM flags bit is set, then draw from the
/dev/random pool instead of the /dev/urandom pool. The
/dev/random pool is limited based on the entropy that can be
obtained from environmental noise, so if there is insufficient
entropy, the requested number of bytes may not be returned.
If there is no entropy available at all, getrandom(2) will
either block, or return an error with errno set to EAGAIN if
the GRND_NONBLOCK bit is set in flags.
If the GRND_RANDOM bit is not set, then the /dev/urandom pool
will be used. Unlike using read(2) to fetch data from
/dev/urandom, if the urandom pool has not been sufficiently
initialized, getrandom(2) will block (or return -1 with the
errno set to EAGAIN if the GRND_NONBLOCK bit is set in flags).
The getentropy(2) system call in OpenBSD can be emulated using
the following function:
int getentropy(void *buf, size_t buflen)
{
int ret;
if (buflen > 256)
goto failure;
ret = getrandom(buf, buflen, 0);
if (ret < 0)
return ret;
if (ret == buflen)
return 0;
failure:
errno = EIO;
return -1;
}
RETURN VALUE
On success, the number of bytes that was filled in the buf is
returned. This may not be all the bytes requested by the
caller via buflen if insufficient entropy was present in the
/dev/random pool, or if the system call was interrupted by a
signal.
On error, -1 is returned, and errno is set appropriately.
ERRORS
EINVAL An invalid flag was passed to getrandom(2)
EFAULT buf is outside the accessible address space.
EAGAIN The requested entropy was not available, and
getrandom(2) would have blocked if the
GRND_NONBLOCK flag was not set.
EINTR While blocked waiting for entropy, the call was
interrupted by a signal handler; see the description
of how interrupted read(2) calls on "slow" devices
are handled with and without the SA_RESTART flag
in the signal(7) man page.
NOTES
For small requests (buflen <= 256) getrandom(2) will not
return EINTR when reading from the urandom pool once the
entropy pool has been initialized, and it will return all of
the bytes that have been requested. This is the recommended
way to use getrandom(2), and is designed for compatibility
with OpenBSD's getentropy() system call.
However, if you are using GRND_RANDOM, then getrandom(2) may
block until the entropy accounting determines that sufficient
environmental noise has been gathered such that getrandom(2)
will be operating as a NRBG instead of a DRBG for those people
who are working in the NIST SP 800-90 regime. Since it may
block for a long time, these guarantees do *not* apply. The
user may want to interrupt a hanging process using a signal,
so blocking until all of the requested bytes are returned
would be unfriendly.
For this reason, the user of getrandom(2) MUST always check
the return value, in case it returns some error, or if fewer
bytes than requested was returned. In the case of
!GRND_RANDOM and small request, the latter should never
happen, but the careful userspace code (and all crypto code
should be careful) should check for this anyway!
Finally, unless you are doing long-term key generation (and
perhaps not even then), you probably shouldn't be using
GRND_RANDOM. The cryptographic algorithms used for
/dev/urandom are quite conservative, and so should be
sufficient for all purposes. The disadvantage of GRND_RANDOM
is that it can block, and the increased complexity required to
deal with partially fulfilled getrandom(2) requests.
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Zach Brown <zab@zabbo.net>
2014-07-17 04:13:05 -04:00
/*
 * Syscall numbers 278-280.  All three use plain __SYSCALL(), so a single
 * handler name is given and no compat-specific handler is selected here.
 */
# define __NR_getrandom 278
__SYSCALL ( __NR_getrandom , sys_getrandom )
2014-08-11 14:24:47 +01:00
# define __NR_memfd_create 279
__SYSCALL ( __NR_memfd_create , sys_memfd_create )
2014-09-26 00:16:58 -07:00
# define __NR_bpf 280
__SYSCALL ( __NR_bpf , sys_bpf )
syscalls: implement execveat() system call
This patchset adds execveat(2) for x86, and is derived from Meredydd
Luff's patch from Sept 2012 (https://lkml.org/lkml/2012/9/11/528).
The primary aim of adding an execveat syscall is to allow an
implementation of fexecve(3) that does not rely on the /proc filesystem,
at least for executables (rather than scripts). The current glibc version
of fexecve(3) is implemented via /proc, which causes problems in sandboxed
or otherwise restricted environments.
Given the desire for a /proc-free fexecve() implementation, HPA suggested
(https://lkml.org/lkml/2006/7/11/556) that an execveat(2) syscall would be
an appropriate generalization.
Also, having a new syscall means that it can take a flags argument without
back-compatibility concerns. The current implementation just defines the
AT_EMPTY_PATH and AT_SYMLINK_NOFOLLOW flags, but other flags could be
added in future -- for example, flags for new namespaces (as suggested at
https://lkml.org/lkml/2006/7/11/474).
Related history:
- https://lkml.org/lkml/2006/12/27/123 is an example of someone
realizing that fexecve() is likely to fail in a chroot environment.
- http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=514043 covered
documenting the /proc requirement of fexecve(3) in its manpage, to
"prevent other people from wasting their time".
- https://bugzilla.redhat.com/show_bug.cgi?id=241609 described a
problem where a process that did setuid() could not fexecve()
because it no longer had access to /proc/self/fd; this has since
been fixed.
This patch (of 4):
Add a new execveat(2) system call. execveat() is to execve() as openat()
is to open(): it takes a file descriptor that refers to a directory, and
resolves the filename relative to that.
In addition, if the filename is empty and AT_EMPTY_PATH is specified,
execveat() executes the file to which the file descriptor refers. This
replicates the functionality of fexecve(), which is a system call in other
UNIXen, but in Linux glibc it depends on opening "/proc/self/fd/<fd>" (and
so relies on /proc being mounted).
The filename fed to the executed program as argv[0] (or the name of the
script fed to a script interpreter) will be of the form "/dev/fd/<fd>"
(for an empty filename) or "/dev/fd/<fd>/<filename>", effectively
reflecting how the executable was found. This does however mean that
execution of a script in a /proc-less environment won't work; also, script
execution via an O_CLOEXEC file descriptor fails (as the file will not be
accessible after exec).
Based on patches by Meredydd Luff.
Signed-off-by: David Drysdale <drysdale@google.com>
Cc: Meredydd Luff <meredydd@senatehouse.org>
Cc: Shuah Khan <shuah.kh@samsung.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Rich Felker <dalias@aerifal.cx>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-12 16:57:29 -08:00
/* Syscall numbers 281-283. */
# define __NR_execveat 281
/* __SC_COMP passes compat_sys_execveat when __SYSCALL_COMPAT is defined, sys_execveat otherwise. */
__SC_COMP ( __NR_execveat , sys_execveat , compat_sys_execveat )
2015-09-22 14:59:09 -07:00
# define __NR_userfaultfd 282
__SYSCALL ( __NR_userfaultfd , sys_userfaultfd )
/* Only the number is defined here; the __SYSCALL() entry for membarrier appears further down. */
# define __NR_membarrier 283
sys_membarrier(): system-wide memory barrier (generic, x86)
Here is an implementation of a new system call, sys_membarrier(), which
executes a memory barrier on all threads running on the system. It is
implemented by calling synchronize_sched(). It can be used to
distribute the cost of user-space memory barriers asymmetrically by
transforming pairs of memory barriers into pairs consisting of
sys_membarrier() and a compiler barrier. For synchronization primitives
that distinguish between read-side and write-side (e.g. userspace RCU
[1], rwlocks), the read-side can be accelerated significantly by moving
the bulk of the memory barrier overhead to the write-side.
The existing applications of which I am aware that would be improved by
this system call are as follows:
* Through Userspace RCU library (http://urcu.so)
- DNS server (Knot DNS) https://www.knot-dns.cz/
- Network sniffer (http://netsniff-ng.org/)
- Distributed object storage (https://sheepdog.github.io/sheepdog/)
- User-space tracing (http://lttng.org)
- Network storage system (https://www.gluster.org/)
- Virtual routers (https://events.linuxfoundation.org/sites/events/files/slides/DPDK_RCU_0MQ.pdf)
- Financial software (https://lkml.org/lkml/2015/3/23/189)
Those projects use RCU in userspace to increase read-side speed and
scalability compared to locking. Especially in the case of RCU used by
libraries, sys_membarrier can speed up the read-side by moving the bulk of
the memory barrier cost to synchronize_rcu().
* Direct users of sys_membarrier
- core dotnet garbage collector (https://github.com/dotnet/coreclr/issues/198)
Microsoft core dotnet GC developers are planning to use the mprotect()
side-effect of issuing memory barriers through IPIs as a way to implement
Windows FlushProcessWriteBuffers() on Linux. They are referring to
sys_membarrier in their github thread, specifically stating that
sys_membarrier() is what they are looking for.
To explain the benefit of this scheme, let's introduce two example threads:
Thread A (non-frequent, e.g. executing liburcu synchronize_rcu())
Thread B (frequent, e.g. executing liburcu
rcu_read_lock()/rcu_read_unlock())
In a scheme where all smp_mb() in thread A are ordering memory accesses
with respect to smp_mb() present in Thread B, we can change each
smp_mb() within Thread A into calls to sys_membarrier() and each
smp_mb() within Thread B into compiler barriers "barrier()".
Before the change, we had, for each smp_mb() pairs:
Thread A Thread B
previous mem accesses previous mem accesses
smp_mb() smp_mb()
following mem accesses following mem accesses
After the change, these pairs become:
Thread A Thread B
prev mem accesses prev mem accesses
sys_membarrier() barrier()
follow mem accesses follow mem accesses
As we can see, there are two possible scenarios: either Thread B memory
accesses do not happen concurrently with Thread A accesses (1), or they
do (2).
1) Non-concurrent Thread A vs Thread B accesses:
Thread A Thread B
prev mem accesses
sys_membarrier()
follow mem accesses
prev mem accesses
barrier()
follow mem accesses
In this case, thread B accesses will be weakly ordered. This is OK,
because at that point, thread A is not particularly interested in
ordering them with respect to its own accesses.
2) Concurrent Thread A vs Thread B accesses
Thread A Thread B
prev mem accesses prev mem accesses
sys_membarrier() barrier()
follow mem accesses follow mem accesses
In this case, thread B accesses, which are ensured to be in program
order thanks to the compiler barrier, will be "upgraded" to full
smp_mb() by synchronize_sched().
* Benchmarks
On Intel Xeon E5405 (8 cores)
(one thread is calling sys_membarrier, the other 7 threads are busy
looping)
1000 non-expedited sys_membarrier calls in 33s = 33 milliseconds/call.
* User-space user of this system call: Userspace RCU library
Both the signal-based and the sys_membarrier userspace RCU schemes
permit us to remove the memory barrier from the userspace RCU
rcu_read_lock() and rcu_read_unlock() primitives, thus significantly
accelerating them. These memory barriers are replaced by compiler
barriers on the read-side, and all matching memory barriers on the
write-side are turned into an invocation of a memory barrier on all
active threads in the process. By letting the kernel perform this
synchronization rather than dumbly sending a signal to every process
threads (as we currently do), we diminish the number of unnecessary wake
ups and only issue the memory barriers on active threads. Non-running
threads do not need to execute such barrier anyway, because these are
implied by the scheduler context switches.
Results in liburcu:
Operations in 10s, 6 readers, 2 writers:
memory barriers in reader: 1701557485 reads, 2202847 writes
signal-based scheme: 9830061167 reads, 6700 writes
sys_membarrier: 9952759104 reads, 425 writes
sys_membarrier (dyn. check): 7970328887 reads, 425 writes
The dynamic sys_membarrier availability check adds some overhead to
the read-side compared to the signal-based scheme, but besides that,
sys_membarrier slightly outperforms the signal-based scheme. However,
this non-expedited sys_membarrier implementation has a much slower grace
period than signal and memory barrier schemes.
Besides diminishing the number of wake-ups, one major advantage of the
membarrier system call over the signal-based scheme is that it does not
need to reserve a signal. This plays much more nicely with libraries,
and with processes injected into for tracing purposes, for which we
cannot expect that signals will be unused by the application.
An expedited version of this system call can be added later on to speed
up the grace period. Its implementation will likely depend on reading
the cpu_curr()->mm without holding each CPU's rq lock.
This patch adds the system call to x86 and to asm-generic.
[1] http://urcu.so
membarrier(2) man page:
MEMBARRIER(2) Linux Programmer's Manual MEMBARRIER(2)
NAME
membarrier - issue memory barriers on a set of threads
SYNOPSIS
#include <linux/membarrier.h>
int membarrier(int cmd, int flags);
DESCRIPTION
The cmd argument is one of the following:
MEMBARRIER_CMD_QUERY
Query the set of supported commands. It returns a bitmask of
supported commands.
MEMBARRIER_CMD_SHARED
Execute a memory barrier on all threads running on the system.
Upon return from system call, the caller thread is ensured that
all running threads have passed through a state where all memory
accesses to user-space addresses match program order between
entry to and return from the system call (non-running threads
are de facto in such a state). This covers threads from all pro-
cesses running on the system. This command returns 0.
The flags argument needs to be 0. For future extensions.
All memory accesses performed in program order from each targeted
thread is guaranteed to be ordered with respect to sys_membarrier(). If
we use the semantic "barrier()" to represent a compiler barrier forcing
memory accesses to be performed in program order across the barrier,
and smp_mb() to represent explicit memory barriers forcing full memory
ordering across the barrier, we have the following ordering table for
each pair of barrier(), sys_membarrier() and smp_mb():
The pair ordering is detailed as (O: ordered, X: not ordered):
barrier() smp_mb() sys_membarrier()
barrier() X X O
smp_mb() X O O
sys_membarrier() O O O
RETURN VALUE
On success, these system calls return zero. On error, -1 is returned,
and errno is set appropriately. For a given command, with flags
argument set to 0, this system call is guaranteed to always return the
same value until reboot.
ERRORS
ENOSYS System call is not implemented.
EINVAL Invalid arguments.
Linux 2015-04-15 MEMBARRIER(2)
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nicholas Miell <nmiell@comcast.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Alan Cox <gnomes@lxorguk.ukuu.org.uk>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Pranith Kumar <bobby.prani@gmail.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-11 13:07:39 -07:00
/* Table entry for __NR_membarrier, whose number is defined earlier in the file. */
__SYSCALL ( __NR_membarrier , sys_membarrier )
2015-11-05 18:51:33 -08:00
/* Syscall numbers 284-291. */
# define __NR_mlock2 284
__SYSCALL ( __NR_mlock2 , sys_mlock2 )
2015-11-10 16:53:30 -05:00
# define __NR_copy_file_range 285
__SYSCALL ( __NR_copy_file_range , sys_copy_file_range )
2016-04-11 10:17:46 +01:00
# define __NR_preadv2 286
2016-05-02 19:12:47 +03:00
/* preadv2/pwritev2 use __SC_COMP: the compat_sys_* handler is passed when __SYSCALL_COMPAT is defined. */
__SC_COMP ( __NR_preadv2 , sys_preadv2 , compat_sys_preadv2 )
2016-04-11 10:17:46 +01:00
# define __NR_pwritev2 287
2016-05-02 19:12:47 +03:00
__SC_COMP ( __NR_pwritev2 , sys_pwritev2 , compat_sys_pwritev2 )
2016-07-29 09:30:18 -07:00
# define __NR_pkey_mprotect 288
__SYSCALL ( __NR_pkey_mprotect , sys_pkey_mprotect )
# define __NR_pkey_alloc 289
__SYSCALL ( __NR_pkey_alloc , sys_pkey_alloc )
# define __NR_pkey_free 290
__SYSCALL ( __NR_pkey_free , sys_pkey_free )
2017-03-13 23:45:21 +09:00
# define __NR_statx 291
__SYSCALL ( __NR_statx , sys_statx )
2019-02-18 17:30:06 +01:00
/*
 * The io_pgetevents entry (292) guarded below is only provided when
 * __ARCH_WANT_TIME32_SYSCALLS is defined or long is not 32 bits wide.
 */
# if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
aio: implement io_pgetevents
This is the io_getevents equivalent of ppoll/pselect and allows to
properly mix signals and aio completions (especially with IOCB_CMD_POLL)
and atomically executes the following sequence:
sigset_t origmask;
pthread_sigmask(SIG_SETMASK, &sigmask, &origmask);
ret = io_getevents(ctx, min_nr, nr, events, timeout);
pthread_sigmask(SIG_SETMASK, &origmask, NULL);
Note that unlike many other signal related calls we do not pass a sigmask
size, as that would get us to 7 arguments, which aren't easily supported
by the syscall infrastructure. It seems a lot less painful to just add a
new syscall variant in the unlikely case we're going to increase the
sigset size.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
2018-05-02 19:51:00 +02:00
# define __NR_io_pgetevents 292
2019-01-01 01:13:32 +01:00
/*
 * __SC_COMP_3264: with __SYSCALL_COMPAT defined the compat handler is
 * passed; otherwise __SC_3264 chooses sys_io_pgetevents_time32 when
 * __BITS_PER_LONG == 32 and sys_io_pgetevents when it is not.
 */
__SC_COMP_3264 ( __NR_io_pgetevents , sys_io_pgetevents_time32 , sys_io_pgetevents , compat_sys_io_pgetevents )
2019-02-18 17:30:06 +01:00
/* Closes the __ARCH_WANT_TIME32_SYSCALLS / __BITS_PER_LONG != 32 guard opened above. */
# endif
2018-06-20 14:58:10 +01:00
/* Syscall numbers 293-294: unconditional, plain __SYSCALL() entries. */
# define __NR_rseq 293
__SYSCALL ( __NR_rseq , sys_rseq )
2018-11-15 14:52:40 +09:00
# define __NR_kexec_file_load 294
__SYSCALL ( __NR_kexec_file_load , sys_kexec_file_load )
2018-12-31 14:38:26 +01:00
/* 295 through 402 are unassigned to sync up with generic numbers, don't use */
2022-04-05 15:13:01 +08:00
/*
 * Syscall numbers 403-423 are defined only for compat builds
 * (__SYSCALL_COMPAT) or when long is 32 bits wide.  Each *64 / *_time64
 * number is wired to the regular sys_* handler of the same base name;
 * presumably these are the 64-bit time variants of the older calls
 * (NOTE(review): confirm against the handlers).  Number 415 is not
 * defined in this range.
 */
# if defined(__SYSCALL_COMPAT) || __BITS_PER_LONG == 32
2019-01-10 12:45:11 +01:00
# define __NR_clock_gettime64 403
__SYSCALL ( __NR_clock_gettime64 , sys_clock_gettime )
# define __NR_clock_settime64 404
__SYSCALL ( __NR_clock_settime64 , sys_clock_settime )
# define __NR_clock_adjtime64 405
__SYSCALL ( __NR_clock_adjtime64 , sys_clock_adjtime )
# define __NR_clock_getres_time64 406
__SYSCALL ( __NR_clock_getres_time64 , sys_clock_getres )
# define __NR_clock_nanosleep_time64 407
__SYSCALL ( __NR_clock_nanosleep_time64 , sys_clock_nanosleep )
# define __NR_timer_gettime64 408
__SYSCALL ( __NR_timer_gettime64 , sys_timer_gettime )
# define __NR_timer_settime64 409
__SYSCALL ( __NR_timer_settime64 , sys_timer_settime )
# define __NR_timerfd_gettime64 410
__SYSCALL ( __NR_timerfd_gettime64 , sys_timerfd_gettime )
# define __NR_timerfd_settime64 411
__SYSCALL ( __NR_timerfd_settime64 , sys_timerfd_settime )
# define __NR_utimensat_time64 412
__SYSCALL ( __NR_utimensat_time64 , sys_utimensat )
# define __NR_pselect6_time64 413
/* __SC_COMP entries pass the compat_sys_*_time64 handler when __SYSCALL_COMPAT is defined. */
__SC_COMP ( __NR_pselect6_time64 , sys_pselect6 , compat_sys_pselect6_time64 )
# define __NR_ppoll_time64 414
__SC_COMP ( __NR_ppoll_time64 , sys_ppoll , compat_sys_ppoll_time64 )
/* 415 is skipped: the next number defined is 416. */
# define __NR_io_pgetevents_time64 416
__SYSCALL ( __NR_io_pgetevents_time64 , sys_io_pgetevents )
# define __NR_recvmmsg_time64 417
__SC_COMP ( __NR_recvmmsg_time64 , sys_recvmmsg , compat_sys_recvmmsg_time64 )
# define __NR_mq_timedsend_time64 418
__SYSCALL ( __NR_mq_timedsend_time64 , sys_mq_timedsend )
# define __NR_mq_timedreceive_time64 419
__SYSCALL ( __NR_mq_timedreceive_time64 , sys_mq_timedreceive )
# define __NR_semtimedop_time64 420
__SYSCALL ( __NR_semtimedop_time64 , sys_semtimedop )
# define __NR_rt_sigtimedwait_time64 421
__SC_COMP ( __NR_rt_sigtimedwait_time64 , sys_rt_sigtimedwait , compat_sys_rt_sigtimedwait_time64 )
# define __NR_futex_time64 422
__SYSCALL ( __NR_futex_time64 , sys_futex )
# define __NR_sched_rr_get_interval_time64 423
__SYSCALL ( __NR_sched_rr_get_interval_time64 , sys_sched_rr_get_interval )
# endif
2012-10-04 18:20:15 +01:00
signal: add pidfd_send_signal() syscall
The kill() syscall operates on process identifiers (pid). After a process
has exited its pid can be reused by another process. If a caller sends a
signal to a reused pid it will end up signaling the wrong process. This
issue has often surfaced and there has been a push to address this problem [1].
This patch uses file descriptors (fd) from proc/<pid> as stable handles on
struct pid. Even if a pid is recycled the handle will not change. The fd
can be used to send signals to the process it refers to.
Thus, the new syscall pidfd_send_signal() is introduced to solve this
problem. Instead of pids it operates on process fds (pidfd).
/* prototype and argument */
long pidfd_send_signal(int pidfd, int sig, siginfo_t *info, unsigned int flags);
/* syscall number 424 */
The syscall number was chosen to be 424 to align with Arnd's rework in his
y2038 to minimize merge conflicts (cf. [25]).
In addition to the pidfd and signal argument it takes an additional
siginfo_t and flags argument. If the siginfo_t argument is NULL then
pidfd_send_signal() is equivalent to kill(<positive-pid>, <signal>). If it
is not NULL pidfd_send_signal() is equivalent to rt_sigqueueinfo().
The flags argument is added to allow for future extensions of this syscall.
It currently needs to be passed as 0. Failing to do so will cause EINVAL.
/* pidfd_send_signal() replaces multiple pid-based syscalls */
The pidfd_send_signal() syscall currently takes on the job of
rt_sigqueueinfo(2) and parts of the functionality of kill(2), namely when a
positive pid is passed to kill(2). It will however be possible to also
replace tgkill(2) and rt_tgsigqueueinfo(2) if this syscall is extended.
/* sending signals to threads (tid) and process groups (pgid) */
Specifically, the pidfd_send_signal() syscall does currently not operate on
process groups or threads. This is left for future extensions.
In order to extend the syscall to allow sending signal to threads and
process groups appropriately named flags (e.g. PIDFD_TYPE_PGID, and
PIDFD_TYPE_TID) should be added. This implies that the flags argument will
determine what is signaled and not the file descriptor itself. Put in other
words, grouping in this api is a property of the flags argument not a
property of the file descriptor (cf. [13]). Clarification for this has been
requested by Eric (cf. [19]).
When appropriate extensions through the flags argument are added then
pidfd_send_signal() can additionally replace the part of kill(2) which
operates on process groups as well as the tgkill(2) and
rt_tgsigqueueinfo(2) syscalls.
How such an extension could be implemented has been very roughly sketched
in [14], [15], and [16]. However, this should not be taken as a commitment
to a particular implementation. There might be better ways to do it.
Right now this is intentionally left out to keep this patchset as simple as
possible (cf. [4]).
/* naming */
The syscall had various names throughout iterations of this patchset:
- procfd_signal()
- procfd_send_signal()
- taskfd_send_signal()
In the last round of reviews it was pointed out that given that if the
flags argument decides the scope of the signal instead of different types
of fds it might make sense to either settle for "procfd_" or "pidfd_" as
prefix. The community was willing to accept either (cf. [17] and [18]).
Given that one developer expressed strong preference for the "pidfd_"
prefix (cf. [13]) and with other developers less opinionated about the name
we should settle for "pidfd_" to avoid further bikeshedding.
The "_send_signal" suffix was chosen to reflect the fact that the syscall
takes on the job of multiple syscalls. It is therefore intentional that the
name is reminiscent of neither kill(2) nor rt_sigqueueinfo(2). Not the
former because it might imply that pidfd_send_signal() is a replacement for
kill(2), and not the latter because it is a hassle to remember the correct
spelling - especially for non-native speakers - and because it is not
descriptive enough of what the syscall actually does. The name
"pidfd_send_signal" makes it very clear that its job is to send signals.
/* zombies */
Zombies can be signaled just as any other process. No special error will be
reported since a zombie state is an unreliable state (cf. [3]). However,
this can be added as an extension through the @flags argument if the need
ever arises.
/* cross-namespace signals */
The patch currently enforces that the signaler and signalee either are in
the same pid namespace or that the signaler's pid namespace is an ancestor
of the signalee's pid namespace. This is done for the sake of simplicity
and because it is unclear to what values certain members of struct
siginfo_t would need to be set to (cf. [5], [6]).
/* compat syscalls */
It became clear that we would like to avoid adding compat syscalls
(cf. [7]). The compat syscall handling is now done in kernel/signal.c
itself by adding __copy_siginfo_from_user_generic() which lets us avoid
compat syscalls (cf. [8]). It should be noted that the addition of
__copy_siginfo_from_user_any() is caused by a bug in the original
implementation of rt_sigqueueinfo(2) (cf. [12]).
With upcoming rework for syscall handling things might improve
significantly (cf. [11]) and __copy_siginfo_from_user_any() will not gain
any additional callers.
/* testing */
This patch was tested on x64 and x86.
/* userspace usage */
An asciinema recording for the basic functionality can be found under [9].
With this patch a process can be killed via:
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
static inline int do_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
unsigned int flags)
{
#ifdef __NR_pidfd_send_signal
return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags);
#else
return -ENOSYS;
#endif
}
/*
 * Example driver: opens the directory named by argv[1] (e.g. /proc/<pid>)
 * to obtain a file descriptor for the process and sends it the signal whose
 * number is given in argv[2].
 *
 * Exits with EXIT_SUCCESS when the signal was sent and with EXIT_FAILURE on
 * missing arguments, a failed open, or a failed send.
 */
int main(int argc, char *argv[])
{
	int pidfd, signo, rc, err;

	if (argc < 3)
		exit(EXIT_FAILURE);

	pidfd = open(argv[1], O_DIRECTORY | O_CLOEXEC);
	if (pidfd < 0) {
		printf("%s - Failed to open \"%s\"\n", strerror(errno), argv[1]);
		exit(EXIT_FAILURE);
	}

	signo = atoi(argv[2]);
	printf("Sending signal %d to process %s\n", signo, argv[1]);

	rc = do_pidfd_send_signal(pidfd, signo, NULL, 0);

	/* close() may overwrite errno, so carry the send's value across it. */
	err = errno;
	close(pidfd);
	errno = err;

	if (rc >= 0)
		exit(EXIT_SUCCESS);

	printf("%s - Failed to send signal %d to process %s\n",
	       strerror(errno), signo, argv[1]);
	exit(EXIT_FAILURE);
}
/* Q&A
* Given that it seems the same questions get asked again by people who are
* late to the party it makes sense to add a Q&A section to the commit
* message so it's hopefully easier to avoid duplicate threads.
*
* For the sake of progress please consider these arguments settled unless
* there is a new point that desperately needs to be addressed. Please make
* sure to check the links to the threads in this commit message whether
* this has not already been covered.
*/
Q-01: (Florian Weimer [20], Andrew Morton [21])
What happens when the target process has exited?
A-01: Sending the signal will fail with ESRCH (cf. [22]).
Q-02: (Andrew Morton [21])
Is the task_struct pinned by the fd?
A-02: No. A reference to struct pid is kept. struct pid - as far as I
understand - was created exactly for the reason to not require to
pin struct task_struct (cf. [22]).
Q-03: (Andrew Morton [21])
Does the entire procfs directory remain visible? Just one entry
within it?
A-03: The same thing that happens right now when you hold a file descriptor
to /proc/<pid> open (cf. [22]).
Q-04: (Andrew Morton [21])
Does the pid remain reserved?
A-04: No. This patchset guarantees a stable handle, not that pids are not
recycled (cf. [22]).
Q-05: (Andrew Morton [21])
Do attempts to signal that fd return errors?
A-05: See {Q,A}-01.
Q-06: (Andrew Morton [22])
Is there a cleaner way of obtaining the fd? Another syscall perhaps.
A-06: Userspace can already trivially retrieve file descriptors from procfs
so this is something that we will need to support anyway. Hence,
there's no immediate need to add another syscall just to make
pidfd_send_signal() not dependent on the presence of procfs. However,
adding a syscall to get such file descriptors is planned for a
future patchset (cf. [22]).
Q-07: (Andrew Morton [21] and others)
This fd-for-a-process sounds like a handy thing and people may well
think up other uses for it in the future, probably unrelated to
signals. Are the code and the interface designed to permit such
future applications?
A-07: Yes (cf. [22]).
Q-08: (Andrew Morton [21] and others)
Now I think about it, why a new syscall? This thing is looking
rather like an ioctl?
A-08: This has been extensively discussed. It was agreed that a syscall is
preferred for a variety of reasons. Here are just a few taken from
prior threads. Syscalls are safer than ioctl()s especially when
signaling to fds. Processes are a core kernel concept so a syscall
seems more appropriate. The layout of the syscall with its four
arguments would require the addition of a custom struct for the
ioctl() thereby causing at least the same amount or even more
complexity for userspace than a simple syscall. The new syscall will
replace multiple other pid-based syscalls (see description above).
The file-descriptors-for-processes concept introduced with this
syscall will be extended with other syscalls in the future. See also
[22], [23] and various other threads already linked in here.
Q-09: (Florian Weimer [24])
What happens if you use the new interface with an O_PATH descriptor?
A-09:
pidfds opened as O_PATH fds cannot be used to send signals to a
process (cf. [2]). Signaling processes through pidfds is the
equivalent of writing to a file. Thus, this is not an operation that
operates "purely at the file descriptor level" as required by the
open(2) manpage. See also [4].
/* References */
[1]: https://lore.kernel.org/lkml/20181029221037.87724-1-dancol@google.com/
[2]: https://lore.kernel.org/lkml/874lbtjvtd.fsf@oldenburg2.str.redhat.com/
[3]: https://lore.kernel.org/lkml/20181204132604.aspfupwjgjx6fhva@brauner.io/
[4]: https://lore.kernel.org/lkml/20181203180224.fkvw4kajtbvru2ku@brauner.io/
[5]: https://lore.kernel.org/lkml/20181121213946.GA10795@mail.hallyn.com/
[6]: https://lore.kernel.org/lkml/20181120103111.etlqp7zop34v6nv4@brauner.io/
[7]: https://lore.kernel.org/lkml/36323361-90BD-41AF-AB5B-EE0D7BA02C21@amacapital.net/
[8]: https://lore.kernel.org/lkml/87tvjxp8pc.fsf@xmission.com/
[9]: https://asciinema.org/a/IQjuCHew6bnq1cr78yuMv16cy
[11]: https://lore.kernel.org/lkml/F53D6D38-3521-4C20-9034-5AF447DF62FF@amacapital.net/
[12]: https://lore.kernel.org/lkml/87zhtjn8ck.fsf@xmission.com/
[13]: https://lore.kernel.org/lkml/871s6u9z6u.fsf@xmission.com/
[14]: https://lore.kernel.org/lkml/20181206231742.xxi4ghn24z4h2qki@brauner.io/
[15]: https://lore.kernel.org/lkml/20181207003124.GA11160@mail.hallyn.com/
[16]: https://lore.kernel.org/lkml/20181207015423.4miorx43l3qhppfz@brauner.io/
[17]: https://lore.kernel.org/lkml/CAGXu5jL8PciZAXvOvCeCU3wKUEB_dU-O3q0tDw4uB_ojMvDEew@mail.gmail.com/
[18]: https://lore.kernel.org/lkml/20181206222746.GB9224@mail.hallyn.com/
[19]: https://lore.kernel.org/lkml/20181208054059.19813-1-christian@brauner.io/
[20]: https://lore.kernel.org/lkml/8736rebl9s.fsf@oldenburg.str.redhat.com/
[21]: https://lore.kernel.org/lkml/20181228152012.dbf0508c2508138efc5f2bbe@linux-foundation.org/
[22]: https://lore.kernel.org/lkml/20181228233725.722tdfgijxcssg76@brauner.io/
[23]: https://lwn.net/Articles/773459/
[24]: https://lore.kernel.org/lkml/8736rebl9s.fsf@oldenburg.str.redhat.com/
[25]: https://lore.kernel.org/lkml/CAK8P3a0ej9NcJM8wXNPbcGUyOUZYX+VLoDFdbenW3s3114oQZw@mail.gmail.com/
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Jann Horn <jannh@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Florian Weimer <fweimer@redhat.com>
Signed-off-by: Christian Brauner <christian@brauner.io>
Reviewed-by: Tycho Andersen <tycho@tycho.ws>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Serge Hallyn <serge@hallyn.com>
Acked-by: Aleksa Sarai <cyphar@cyphar.com>
2018-11-19 00:51:56 +01:00
/*
 * pidfd_send_signal: send a signal to a process that is identified by a
 * file descriptor instead of a pid. Generic syscall number 424, handled by
 * sys_pidfd_send_signal.
 */
# define __NR_pidfd_send_signal 424
__SYSCALL ( __NR_pidfd_send_signal , sys_pidfd_send_signal )
Add io_uring IO interface
The submission queue (SQ) and completion queue (CQ) rings are shared
between the application and the kernel. This eliminates the need to
copy data back and forth to submit and complete IO.
IO submissions use the io_uring_sqe data structure, and completions
are generated in the form of io_uring_cqe data structures. The SQ
ring is an index into the io_uring_sqe array, which makes it possible
to submit a batch of IOs without them being contiguous in the ring.
The CQ ring is always contiguous, as completion events are inherently
unordered, and hence any io_uring_cqe entry can point back to an
arbitrary submission.
Two new system calls are added for this:
io_uring_setup(entries, params)
Sets up an io_uring instance for doing async IO. On success,
returns a file descriptor that the application can mmap to
gain access to the SQ ring, CQ ring, and io_uring_sqes.
io_uring_enter(fd, to_submit, min_complete, flags, sigset, sigsetsize)
Initiates IO against the rings mapped to this fd, or waits for
them to complete, or both. The behavior is controlled by the
parameters passed in. If 'to_submit' is non-zero, then we'll
try and submit new IO. If IORING_ENTER_GETEVENTS is set, the
kernel will wait for 'min_complete' events, if they aren't
already available. It's valid to set IORING_ENTER_GETEVENTS
and 'min_complete' == 0 at the same time, this allows the
kernel to return already completed events without waiting
for them. This is useful only for polling, as for IRQ
driven IO, the application can just check the CQ ring
without entering the kernel.
With this setup, it's possible to do async IO with a single system
call. Future developments will enable polled IO with this interface,
and polled submission as well. The latter will enable an application
to do IO without doing ANY system calls at all.
For IRQ driven IO, an application only needs to enter the kernel for
completions if it wants to wait for them to occur.
Each io_uring is backed by a workqueue, to support buffered async IO
as well. We will only punt to an async context if the command would
need to wait for IO on the device side. Any data that can be accessed
directly in the page cache is done inline. This avoids the slowness
issue of usual threadpools, since cached data is accessed as quickly
as a sync interface.
Sample application: http://git.kernel.dk/cgit/fio/plain/t/io_uring.c
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-01-07 10:46:33 -07:00
/*
 * io_uring_setup: create an io_uring instance and return a file descriptor
 * that the application can mmap to reach the SQ ring, the CQ ring and the
 * io_uring_sqe array.
 */
# define __NR_io_uring_setup 425
__SYSCALL ( __NR_io_uring_setup , sys_io_uring_setup )
/*
 * io_uring_enter: submit IO on the rings mapped to an io_uring fd, wait for
 * completions, or both, depending on the arguments passed in.
 */
# define __NR_io_uring_enter 426
__SYSCALL ( __NR_io_uring_enter , sys_io_uring_enter )
io_uring: add support for pre-mapped user IO buffers
If we have fixed user buffers, we can map them into the kernel when we
set up the io_uring. That avoids the need to do get_user_pages() for
each and every IO.
To utilize this feature, the application must call io_uring_register()
after having set up an io_uring instance, passing in
IORING_REGISTER_BUFFERS as the opcode. The argument must be a pointer to
an iovec array, and the nr_args should contain how many iovecs the
application wishes to map.
If successful, these buffers are now mapped into the kernel, eligible
for IO. To use these fixed buffers, the application must use the
IORING_OP_READ_FIXED and IORING_OP_WRITE_FIXED opcodes, and then
set sqe->index to the desired buffer index. sqe->addr..sqe->addr+sqe->len
must point to somewhere inside the indexed buffer.
The application may register buffers throughout the lifetime of the
io_uring instance. It can call io_uring_register() with
IORING_UNREGISTER_BUFFERS as the opcode to unregister the current set of
buffers, and then register a new set. The application need not
unregister buffers explicitly before shutting down the io_uring
instance.
It's perfectly valid to set up a larger buffer, and then sometimes only
use parts of it for an IO. As long as the range is within the originally
mapped region, it will work just fine.
For now, buffers must not be file backed. If file backed buffers are
passed in, the registration will fail with -1/EOPNOTSUPP. This
restriction may be relaxed in the future.
RLIMIT_MEMLOCK is used to check how much memory we can pin. A somewhat
arbitrary 1G per buffer size is also imposed.
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-01-09 09:16:05 -07:00
/*
 * io_uring_register: register or unregister resources for an io_uring
 * instance; the opcode selects the operation (e.g. IORING_REGISTER_BUFFERS
 * and IORING_UNREGISTER_BUFFERS for pre-mapped user IO buffers).
 */
# define __NR_io_uring_register 427
__SYSCALL ( __NR_io_uring_register , sys_io_uring_register )
2019-05-16 12:52:34 +01:00
/*
 * Generic syscall numbers 428-433: open_tree, move_mount, fsopen, fsconfig,
 * fsmount and fspick, each handled by the sys_* function of the same name.
 * NOTE(review): these look like the fd-based mount API family -- confirm.
 */
# define __NR_open_tree 428
__SYSCALL ( __NR_open_tree , sys_open_tree )
# define __NR_move_mount 429
__SYSCALL ( __NR_move_mount , sys_move_mount )
# define __NR_fsopen 430
__SYSCALL ( __NR_fsopen , sys_fsopen )
# define __NR_fsconfig 431
__SYSCALL ( __NR_fsconfig , sys_fsconfig )
# define __NR_fsmount 432
__SYSCALL ( __NR_fsmount , sys_fsmount )
# define __NR_fspick 433
__SYSCALL ( __NR_fspick , sys_fspick )
2019-05-24 12:44:59 +02:00
/*
 * pidfd_open: generic syscall number 434, handled by sys_pidfd_open.
 * NOTE(review): presumably returns a pidfd for a given pid -- confirm.
 */
# define __NR_pidfd_open 434
__SYSCALL ( __NR_pidfd_open , sys_pidfd_open )
2019-07-14 21:22:05 +02:00
/*
 * clone3: generic syscall number 435, handled by sys_clone3. The entry is
 * only emitted when the including architecture defines
 * __ARCH_WANT_SYS_CLONE3; otherwise number 435 has no entry here.
 */
# ifdef __ARCH_WANT_SYS_CLONE3
2019-07-11 10:09:44 -07:00
# define __NR_clone3 435
2019-05-25 11:37:43 +02:00
__SYSCALL ( __NR_clone3 , sys_clone3 )
2019-07-14 21:22:05 +02:00
# endif
2019-05-24 11:31:44 +02:00
/* close_range: generic syscall number 436, handled by sys_close_range. */
# define __NR_close_range 436
__SYSCALL ( __NR_close_range , sys_close_range )
2012-10-04 18:20:15 +01:00
open: introduce openat2(2) syscall
/* Background. */
For a very long time, extending openat(2) with new features has been
incredibly frustrating. This stems from the fact that openat(2) is
possibly the most famous counter-example to the mantra "don't silently
accept garbage from userspace" -- it doesn't check whether unknown flags
are present[1].
This means that (generally) the addition of new flags to openat(2) has
been fraught with backwards-compatibility issues (O_TMPFILE has to be
defined as __O_TMPFILE|O_DIRECTORY|[O_RDWR or O_WRONLY] to ensure old
kernels gave errors, since it's insecure to silently ignore the
flag[2]). All new security-related flags therefore have a tough road to
being added to openat(2).
Userspace also has a hard time figuring out whether a particular flag is
supported on a particular kernel. While it is now possible with
contemporary kernels (thanks to [3]), older kernels will expose unknown
flag bits through fcntl(F_GETFL). Giving a clear -EINVAL during
openat(2) time matches modern syscall designs and is far more
fool-proof.
In addition, the newly-added path resolution restriction LOOKUP flags
(which we would like to expose to user-space) don't feel related to the
pre-existing O_* flag set -- they affect all components of path lookup.
We'd therefore like to add a new flag argument.
Adding a new syscall allows us to finally fix the flag-ignoring problem,
and we can make it extensible enough so that we will hopefully never
need an openat3(2).
/* Syscall Prototype. */
/*
* open_how is an extensible structure (similar in interface to
* clone3(2) or sched_setattr(2)). The size parameter must be set to
* sizeof(struct open_how), to allow for future extensions. All future
* extensions will be appended to open_how, with their zero value
* acting as a no-op default.
*/
struct open_how { /* ... */ };
int openat2(int dfd, const char *pathname,
struct open_how *how, size_t size);
/* Description. */
The initial version of 'struct open_how' contains the following fields:
flags
Used to specify openat(2)-style flags. However, any unknown flag
bits or otherwise incorrect flag combinations (like O_PATH|O_RDWR)
will result in -EINVAL. In addition, this field is 64-bits wide to
allow for more O_ flags than currently permitted with openat(2).
mode
The file mode for O_CREAT or O_TMPFILE.
Must be set to zero if flags does not contain O_CREAT or O_TMPFILE.
resolve
Restrict path resolution (in contrast to O_* flags they affect all
path components). The current set of flags are as follows (at the
moment, all of the RESOLVE_ flags are implemented as just passing
the corresponding LOOKUP_ flag).
RESOLVE_NO_XDEV => LOOKUP_NO_XDEV
RESOLVE_NO_SYMLINKS => LOOKUP_NO_SYMLINKS
RESOLVE_NO_MAGICLINKS => LOOKUP_NO_MAGICLINKS
RESOLVE_BENEATH => LOOKUP_BENEATH
RESOLVE_IN_ROOT => LOOKUP_IN_ROOT
open_how does not contain an embedded size field, because it is of
little benefit (userspace can figure out the kernel open_how size at
runtime fairly easily without it). It also only contains u64s (even
though ->mode arguably should be a u16) to avoid having padding fields
which are never used in the future.
Note that as a result of the new how->flags handling, O_PATH|O_TMPFILE
is no longer permitted for openat(2). As far as I can tell, this has
always been a bug and appears to not be used by userspace (and I've not
seen any problems on my machines by disallowing it). If it turns out
this breaks something, we can special-case it and only permit it for
openat(2) but not openat2(2).
After input from Florian Weimer, the new open_how and flag definitions
are inside a separate header from uapi/linux/fcntl.h, to avoid problems
that glibc has with importing that header.
/* Testing. */
In a follow-up patch there are over 200 selftests which ensure that this
syscall has the correct semantics and will correctly handle several
attack scenarios.
In addition, I've written a userspace library[4] which provides
convenient wrappers around openat2(RESOLVE_IN_ROOT) (this is necessary
because no other syscalls support RESOLVE_IN_ROOT, and thus lots of care
must be taken when using RESOLVE_IN_ROOT'd file descriptors with other
syscalls). During the development of this patch, I've run numerous
verification tests using libpathrs (showing that the API is reasonably
usable by userspace).
/* Future Work. */
Additional RESOLVE_ flags have been suggested during the review period.
These can be easily implemented separately (such as blocking auto-mount
during resolution).
Furthermore, there are some other proposed changes to the openat(2)
interface (the most obvious example is magic-link hardening[5]) which
would be a good opportunity to add a way for userspace to restrict how
O_PATH file descriptors can be re-opened.
Another possible avenue of future work would be some kind of
CHECK_FIELDS[6] flag which causes the kernel to indicate to userspace
which openat2(2) flags and fields are supported by the current kernel
(to avoid userspace having to go through several guesses to figure it
out).
[1]: https://lwn.net/Articles/588444/
[2]: https://lore.kernel.org/lkml/CA+55aFyyxJL1LyXZeBsf2ypriraj5ut1XkNDsunRBqgVjZU_6Q@mail.gmail.com
[3]: commit 629e014bb834 ("fs: completely ignore unknown open flags")
[4]: https://sourceware.org/bugzilla/show_bug.cgi?id=17523
[5]: https://lore.kernel.org/lkml/20190930183316.10190-2-cyphar@cyphar.com/
[6]: https://youtu.be/ggD-eb3yPVs
Suggested-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2020-01-18 23:07:59 +11:00
/*
 * openat2: extensible successor to openat(2). Takes a struct open_how plus
 * its size, and rejects unknown or invalid flag combinations with -EINVAL
 * instead of silently ignoring them.
 */
# define __NR_openat2 437
__SYSCALL ( __NR_openat2 , sys_openat2 )
2020-01-07 09:59:26 -08:00
/* pidfd_getfd: generic syscall number 438, handled by sys_pidfd_getfd. */
# define __NR_pidfd_getfd 438
__SYSCALL ( __NR_pidfd_getfd , sys_pidfd_getfd )
2020-05-14 16:44:25 +02:00
/* faccessat2: generic syscall number 439, handled by sys_faccessat2. */
# define __NR_faccessat2 439
__SYSCALL ( __NR_faccessat2 , sys_faccessat2 )
mm/madvise: introduce process_madvise() syscall: an external memory hinting API
There is a use case where System Management Software (SMS) wants to give a
memory hint like MADV_[COLD|PAGEOUT] to other processes; in the
case of Android, it is the ActivityManagerService.
The information required to make the reclaim decision is not known to the
app. Instead, it is known to the centralized userspace
daemon(ActivityManagerService), and that daemon must be able to initiate
reclaim on its own without any app involvement.
To solve the issue, this patch introduces a new syscall
process_madvise(2). It uses pidfd of an external process to give the
hint. It also supports vector address range because Android app has
thousands of vmas due to zygote so it's totally waste of CPU and power if
we should call the syscall one by one for each vma.(With testing 2000-vma
syscall vs 1-vector syscall, it showed 15% performance improvement. I
think it would be bigger in real practice because the testing ran very
cache friendly environment).
Another potential use case for the vector range is to amortize the cost
of TLB shootdowns for multiple ranges when using MADV_DONTNEED; this could
benefit users like TCP receive zerocopy and malloc implementations. In
the future, we could find more use cases for other advice values, so let's
make this part of the API now, since we are introducing a new syscall at
this moment. With that, existing madvise(2) users could replace it with
process_madvise(2) with their own pid if they want batch address range
support.
Since it could affect another process's address range, only a privileged
process (PTRACE_MODE_ATTACH_FSCREDS) or one that otherwise has the right to
ptrace the target process (e.g., by having the same UID) can use it successfully.
The flag argument is reserved for future use if we need to extend the API.
I think supporting in process_madvise every hint that madvise supports or
will support is rather risky. We are not sure that all hints make
sense from an external process, and the implementation of a hint may rely on
the caller being in the current context, so it could be error-prone. Thus,
I just limited the hints to MADV_[COLD|PAGEOUT] in this patch.
If someone wants to add other hints, we can hear the use case and review
it for each hint. That is safer for maintenance than introducing a
buggy syscall that is hard to fix later.
So finally, the API is as follows,
ssize_t process_madvise(int pidfd, const struct iovec *iovec,
unsigned long vlen, int advice, unsigned int flags);
DESCRIPTION
The process_madvise() system call is used to give advice or directions
to the kernel about the address ranges from external process as well as
local process. It provides the advice to address ranges of process
described by iovec and vlen. The goal of such advice is to improve
system or application performance.
The pidfd selects the process referred to by the PID file descriptor
specified in pidfd. (See pidfd_open(2) for further information.)
The pointer iovec points to an array of iovec structures, defined in
<sys/uio.h> as:
struct iovec {
void *iov_base; /* starting address */
size_t iov_len; /* number of bytes to be advised */
};
The iovec describes address ranges beginning at address(iov_base)
and with size length of bytes(iov_len).
The vlen represents the number of elements in iovec.
The advice is indicated in the advice argument, which is one of the
following at this moment if the target process specified by pidfd is
external.
MADV_COLD
MADV_PAGEOUT
Permission to provide a hint to external process is governed by a
ptrace access mode PTRACE_MODE_ATTACH_FSCREDS check; see ptrace(2).
The process_madvise supports every advice madvise(2) has if target
process is in same thread group with calling process so user could
use process_madvise(2) to extend existing madvise(2) to support
vector address ranges.
RETURN VALUE
On success, process_madvise() returns the number of bytes advised.
This return value may be less than the total number of requested
bytes, if an error occurred. The caller should check return value
to determine whether a partial advice occurred.
FAQ:
Q.1 - Why does any external entity have better knowledge?
Quote from Sandeep
"For Android, every application (including the special SystemServer)
are forked from Zygote. The reason of course is to share as many
libraries and classes between the two as possible to benefit from the
preloading during boot.
After applications start, (almost) all of the APIs end up calling into
this SystemServer process over IPC (binder) and back to the
application.
In a fully running system, the SystemServer monitors every single
process periodically to calculate their PSS / RSS and also decides
which process is "important" to the user for interactivity.
So, because of how these processes start _and_ the fact that the
SystemServer is looping to monitor each process, it does tend to *know*
which address range of the application is not used / useful.
Besides, we can never rely on applications to clean things up
themselves. We've had the "hey app1, the system is low on memory,
please trim your memory usage down" notifications for a long time[1].
They rely on applications honoring the broadcasts and very few do.
So, if we want to avoid the inevitable killing of the application and
restarting it, some way to be able to tell the OS about unimportant
memory in these applications will be useful.
- ssp
Q.2 - How to guarantee the race(i.e., object validation) between when
giving a hint from an external process and get the hint from the target
process?
process_madvise operates on the target process's address space as it
exists at the instant that process_madvise is called. If the
target process can run between the time the process_madvise process
inspects the target process address space and the time that
process_madvise is actually called, process_madvise may operate on
memory regions that the calling process does not expect. It's the
responsibility of the process calling process_madvise to close this
race condition. For example, the calling process can suspend the
target process with ptrace, SIGSTOP, or the freezer cgroup so that it
doesn't have an opportunity to change its own address space before
process_madvise is called. Another option is to operate on memory
regions that the caller knows a priori will be unchanged in the target
process. Yet another option is to accept the race for certain
process_madvise calls after reasoning that mistargeting will do no
harm. The suggested API itself does not provide synchronization. The
same applies to other APIs like move_pages, process_vm_write.
The race isn't really a problem though. Why is it so wrong to require
that callers do their own synchronization in some manner? Nobody
objects to write(2) merely because it's possible for two processes to
open the same file and clobber each other's writes --- instead, we tell
people to use flock or something. Think about mmap. It never
guarantees newly allocated address space is still valid when the user
tries to access it because other threads could unmap the memory right
before. That's where we need synchronization by using other API or
design from userside. It shouldn't be part of API itself. If someone
needs more fine-grained synchronization rather than process level,
there were two ideas suggested - cookie[2] and anon-fd[3]. Both are
applicable via using last reserved argument of the API but I don't
think it's necessary right now since we have already ways to prevent
the race so don't want to add additional complexity with more
fine-grained optimization model.
To keep the API extensible, an unsigned long is reserved as the last
argument so we could support it in the future if someone really needs it.
Q.3 - Why doesn't ptrace work?
Injecting an madvise in the target process using ptrace would not work
for us because such injected madvise would have to be executed by the
target process, which means that process would have to be runnable and
that creates the risk of the abovementioned race and hinting a wrong
VMA. Furthermore, we want to act on the hint in the caller's context, not the
callee's, because the callee is usually limited by cpuset/cgroups or is
even in a frozen state, so it can't act by itself quickly enough, which
causes more thrashing/kills. It also doesn't work if the target process is
already ptraced (e.g., strace, debugger, minidump) because a process can have at
most one ptracer.
[1] https://developer.android.com/topic/performance/memory"
[2] process_getinfo for getting the cookie which is updated whenever
vma of process address layout are changed - Daniel Colascione -
https://lore.kernel.org/lkml/20190520035254.57579-1-minchan@kernel.org/T/#m7694416fd179b2066a2c62b5b139b14e3894e224
[3] anonymous fd which is used for the object(i.e., address range)
validation - Michal Hocko -
https://lore.kernel.org/lkml/20200120112722.GY18451@dhcp22.suse.cz/
[minchan@kernel.org: fix process_madvise build break for arm64]
Link: http://lkml.kernel.org/r/20200303145756.GA219683@google.com
[minchan@kernel.org: fix build error for mips of process_madvise]
Link: http://lkml.kernel.org/r/20200508052517.GA197378@google.com
[akpm@linux-foundation.org: fix patch ordering issue]
[akpm@linux-foundation.org: fix arm64 whoops]
[minchan@kernel.org: make process_madvise() vlen arg have type size_t, per Florian]
[akpm@linux-foundation.org: fix i386 build]
[sfr@canb.auug.org.au: fix syscall numbering]
Link: https://lkml.kernel.org/r/20200905142639.49fc3f1a@canb.auug.org.au
[sfr@canb.auug.org.au: madvise.c needs compat.h]
Link: https://lkml.kernel.org/r/20200908204547.285646b4@canb.auug.org.au
[minchan@kernel.org: fix mips build]
Link: https://lkml.kernel.org/r/20200909173655.GC2435453@google.com
[yuehaibing@huawei.com: remove duplicate header which is included twice]
Link: https://lkml.kernel.org/r/20200915121550.30584-1-yuehaibing@huawei.com
[minchan@kernel.org: do not use helper functions for process_madvise]
Link: https://lkml.kernel.org/r/20200921175539.GB387368@google.com
[akpm@linux-foundation.org: pidfd_get_pid() gained an argument]
[sfr@canb.auug.org.au: fix up for "iov_iter: transparently handle compat iovecs in import_iovec"]
Link: https://lkml.kernel.org/r/20200928212542.468e1fef@canb.auug.org.au
Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Cc: Brian Geffon <bgeffon@google.com>
Cc: Christian Brauner <christian@brauner.io>
Cc: Daniel Colascione <dancol@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Dias <joaodias@google.com>
Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oleksandr Natalenko <oleksandr@redhat.com>
Cc: Sandeep Patil <sspatil@google.com>
Cc: SeongJae Park <sj38.park@gmail.com>
Cc: SeongJae Park <sjpark@amazon.de>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Sonny Rao <sonnyrao@google.com>
Cc: Tim Murray <timmurray@google.com>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
Cc: Florian Weimer <fw@deneb.enyo.de>
Cc: <linux-man@vger.kernel.org>
Link: http://lkml.kernel.org/r/20200302193630.68771-3-minchan@kernel.org
Link: http://lkml.kernel.org/r/20200508183320.GA125527@google.com
Link: http://lkml.kernel.org/r/20200622192900.22757-4-minchan@kernel.org
Link: https://lkml.kernel.org/r/20200901000633.1920247-4-minchan@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-10-17 16:14:59 -07:00
/*
 * process_madvise: give madvise-style advice for a vector of address ranges
 * (an iovec array) in the process referred to by a pidfd.
 */
# define __NR_process_madvise 440
__SYSCALL ( __NR_process_madvise , sys_process_madvise )
2020-12-18 14:05:41 -08:00
/*
 * epoll_pwait2: generic syscall number 441. __SC_COMP selects
 * compat_sys_epoll_pwait2 when __SYSCALL_COMPAT is defined and
 * sys_epoll_pwait2 otherwise.
 */
# define __NR_epoll_pwait2 441
__SC_COMP ( __NR_epoll_pwait2 , sys_epoll_pwait2 , compat_sys_epoll_pwait2 )
fs: add mount_setattr()
This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.
The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:
int mount_setattr(int dfd, const char *path, unsigned flags,
struct mount_attr *uattr, size_t usize);
Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.
The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:
struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.
The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.
The @userns_fd field lets the user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.
[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")
Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-01-21 14:19:53 +01:00
/*
 * mount_setattr(): syscall number 442.
 * __SYSCALL() pairs the number with sys_mount_setattr for includers that
 * define __SYSCALL; with the default empty definition it expands to nothing.
 */
# define __NR_mount_setattr 442
__SYSCALL ( __NR_mount_setattr , sys_mount_setattr )
2021-05-25 16:07:48 +02:00
/* quotactl_fd(): syscall number 443, paired with sys_quotactl_fd. */
# define __NR_quotactl_fd 443
__SYSCALL ( __NR_quotactl_fd , sys_quotactl_fd )
open: introduce openat2(2) syscall
/* Background. */
For a very long time, extending openat(2) with new features has been
incredibly frustrating. This stems from the fact that openat(2) is
possibly the most famous counter-example to the mantra "don't silently
accept garbage from userspace" -- it doesn't check whether unknown flags
are present[1].
This means that (generally) the addition of new flags to openat(2) has
been fraught with backwards-compatibility issues (O_TMPFILE has to be
defined as __O_TMPFILE|O_DIRECTORY|[O_RDWR or O_WRONLY] to ensure old
kernels gave errors, since it's insecure to silently ignore the
flag[2]). All new security-related flags therefore have a tough road to
being added to openat(2).
Userspace also has a hard time figuring out whether a particular flag is
supported on a particular kernel. While it is now possible with
contemporary kernels (thanks to [3]), older kernels will expose unknown
flag bits through fcntl(F_GETFL). Giving a clear -EINVAL during
openat(2) time matches modern syscall designs and is far more
fool-proof.
In addition, the newly-added path resolution restriction LOOKUP flags
(which we would like to expose to user-space) don't feel related to the
pre-existing O_* flag set -- they affect all components of path lookup.
We'd therefore like to add a new flag argument.
Adding a new syscall allows us to finally fix the flag-ignoring problem,
and we can make it extensible enough so that we will hopefully never
need an openat3(2).
/* Syscall Prototype. */
/*
* open_how is an extensible structure (similar in interface to
* clone3(2) or sched_setattr(2)). The size parameter must be set to
* sizeof(struct open_how), to allow for future extensions. All future
* extensions will be appended to open_how, with their zero value
* acting as a no-op default.
*/
struct open_how { /* ... */ };
int openat2(int dfd, const char *pathname,
struct open_how *how, size_t size);
/* Description. */
The initial version of 'struct open_how' contains the following fields:
flags
Used to specify openat(2)-style flags. However, any unknown flag
bits or otherwise incorrect flag combinations (like O_PATH|O_RDWR)
will result in -EINVAL. In addition, this field is 64 bits wide to
allow for more O_ flags than currently permitted with openat(2).
mode
The file mode for O_CREAT or O_TMPFILE.
Must be set to zero if flags does not contain O_CREAT or O_TMPFILE.
resolve
Restrict path resolution (in contrast to O_* flags they affect all
path components). The current set of flags is as follows (at the
moment, all of the RESOLVE_ flags are implemented as just passing
the corresponding LOOKUP_ flag).
RESOLVE_NO_XDEV => LOOKUP_NO_XDEV
RESOLVE_NO_SYMLINKS => LOOKUP_NO_SYMLINKS
RESOLVE_NO_MAGICLINKS => LOOKUP_NO_MAGICLINKS
RESOLVE_BENEATH => LOOKUP_BENEATH
RESOLVE_IN_ROOT => LOOKUP_IN_ROOT
open_how does not contain an embedded size field, because it is of
little benefit (userspace can figure out the kernel open_how size at
runtime fairly easily without it). It also only contains u64s (even
though ->mode arguably should be a u16) to avoid having padding fields
which are never used in the future.
Note that as a result of the new how->flags handling, O_PATH|O_TMPFILE
is no longer permitted for openat(2). As far as I can tell, this has
always been a bug and appears to not be used by userspace (and I've not
seen any problems on my machines by disallowing it). If it turns out
this breaks something, we can special-case it and only permit it for
openat(2) but not openat2(2).
After input from Florian Weimer, the new open_how and flag definitions
are inside a separate header from uapi/linux/fcntl.h, to avoid problems
that glibc has with importing that header.
/* Testing. */
In a follow-up patch there are over 200 selftests which ensure that this
syscall has the correct semantics and will correctly handle several
attack scenarios.
In addition, I've written a userspace library[4] which provides
convenient wrappers around openat2(RESOLVE_IN_ROOT) (this is necessary
because no other syscalls support RESOLVE_IN_ROOT, and thus lots of care
must be taken when using RESOLVE_IN_ROOT'd file descriptors with other
syscalls). During the development of this patch, I've run numerous
verification tests using libpathrs (showing that the API is reasonably
usable by userspace).
/* Future Work. */
Additional RESOLVE_ flags have been suggested during the review period.
These can be easily implemented separately (such as blocking auto-mount
during resolution).
Furthermore, there are some other proposed changes to the openat(2)
interface (the most obvious example is magic-link hardening[5]) which
would be a good opportunity to add a way for userspace to restrict how
O_PATH file descriptors can be re-opened.
Another possible avenue of future work would be some kind of
CHECK_FIELDS[6] flag which causes the kernel to indicate to userspace
which openat2(2) flags and fields are supported by the current kernel
(to avoid userspace having to go through several guesses to figure it
out).
[1]: https://lwn.net/Articles/588444/
[2]: https://lore.kernel.org/lkml/CA+55aFyyxJL1LyXZeBsf2ypriraj5ut1XkNDsunRBqgVjZU_6Q@mail.gmail.com
[3]: commit 629e014bb834 ("fs: completely ignore unknown open flags")
[4]: https://sourceware.org/bugzilla/show_bug.cgi?id=17523
[5]: https://lore.kernel.org/lkml/20190930183316.10190-2-cyphar@cyphar.com/
[6]: https://youtu.be/ggD-eb3yPVs
Suggested-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2020-01-18 23:07:59 +11:00
2021-04-22 17:41:19 +02:00
/*
 * The three Landlock syscalls occupy consecutive numbers 444-446, each
 * paired with its sys_landlock_* counterpart through __SYSCALL().
 */
# define __NR_landlock_create_ruleset 444
__SYSCALL ( __NR_landlock_create_ruleset , sys_landlock_create_ruleset )
# define __NR_landlock_add_rule 445
__SYSCALL ( __NR_landlock_add_rule , sys_landlock_add_rule )
# define __NR_landlock_restrict_self 446
__SYSCALL ( __NR_landlock_restrict_self , sys_landlock_restrict_self )
open: introduce openat2(2) syscall
/* Background. */
For a very long time, extending openat(2) with new features has been
incredibly frustrating. This stems from the fact that openat(2) is
possibly the most famous counter-example to the mantra "don't silently
accept garbage from userspace" -- it doesn't check whether unknown flags
are present[1].
This means that (generally) the addition of new flags to openat(2) has
been fraught with backwards-compatibility issues (O_TMPFILE has to be
defined as __O_TMPFILE|O_DIRECTORY|[O_RDWR or O_WRONLY] to ensure old
kernels gave errors, since it's insecure to silently ignore the
flag[2]). All new security-related flags therefore have a tough road to
being added to openat(2).
Userspace also has a hard time figuring out whether a particular flag is
supported on a particular kernel. While it is now possible with
contemporary kernels (thanks to [3]), older kernels will expose unknown
flag bits through fcntl(F_GETFL). Giving a clear -EINVAL during
openat(2) time matches modern syscall designs and is far more
fool-proof.
In addition, the newly-added path resolution restriction LOOKUP flags
(which we would like to expose to user-space) don't feel related to the
pre-existing O_* flag set -- they affect all components of path lookup.
We'd therefore like to add a new flag argument.
Adding a new syscall allows us to finally fix the flag-ignoring problem,
and we can make it extensible enough so that we will hopefully never
need an openat3(2).
/* Syscall Prototype. */
/*
* open_how is an extensible structure (similar in interface to
* clone3(2) or sched_setattr(2)). The size parameter must be set to
* sizeof(struct open_how), to allow for future extensions. All future
* extensions will be appended to open_how, with their zero value
* acting as a no-op default.
*/
struct open_how { /* ... */ };
int openat2(int dfd, const char *pathname,
struct open_how *how, size_t size);
/* Description. */
The initial version of 'struct open_how' contains the following fields:
flags
Used to specify openat(2)-style flags. However, any unknown flag
bits or otherwise incorrect flag combinations (like O_PATH|O_RDWR)
will result in -EINVAL. In addition, this field is 64 bits wide to
allow for more O_ flags than currently permitted with openat(2).
mode
The file mode for O_CREAT or O_TMPFILE.
Must be set to zero if flags does not contain O_CREAT or O_TMPFILE.
resolve
Restrict path resolution (in contrast to O_* flags they affect all
path components). The current set of flags is as follows (at the
moment, all of the RESOLVE_ flags are implemented as just passing
the corresponding LOOKUP_ flag).
RESOLVE_NO_XDEV => LOOKUP_NO_XDEV
RESOLVE_NO_SYMLINKS => LOOKUP_NO_SYMLINKS
RESOLVE_NO_MAGICLINKS => LOOKUP_NO_MAGICLINKS
RESOLVE_BENEATH => LOOKUP_BENEATH
RESOLVE_IN_ROOT => LOOKUP_IN_ROOT
open_how does not contain an embedded size field, because it is of
little benefit (userspace can figure out the kernel open_how size at
runtime fairly easily without it). It also only contains u64s (even
though ->mode arguably should be a u16) to avoid having padding fields
which are never used in the future.
Note that as a result of the new how->flags handling, O_PATH|O_TMPFILE
is no longer permitted for openat(2). As far as I can tell, this has
always been a bug and appears to not be used by userspace (and I've not
seen any problems on my machines by disallowing it). If it turns out
this breaks something, we can special-case it and only permit it for
openat(2) but not openat2(2).
After input from Florian Weimer, the new open_how and flag definitions
are inside a separate header from uapi/linux/fcntl.h, to avoid problems
that glibc has with importing that header.
/* Testing. */
In a follow-up patch there are over 200 selftests which ensure that this
syscall has the correct semantics and will correctly handle several
attack scenarios.
In addition, I've written a userspace library[4] which provides
convenient wrappers around openat2(RESOLVE_IN_ROOT) (this is necessary
because no other syscalls support RESOLVE_IN_ROOT, and thus lots of care
must be taken when using RESOLVE_IN_ROOT'd file descriptors with other
syscalls). During the development of this patch, I've run numerous
verification tests using libpathrs (showing that the API is reasonably
usable by userspace).
/* Future Work. */
Additional RESOLVE_ flags have been suggested during the review period.
These can be easily implemented separately (such as blocking auto-mount
during resolution).
Furthermore, there are some other proposed changes to the openat(2)
interface (the most obvious example is magic-link hardening[5]) which
would be a good opportunity to add a way for userspace to restrict how
O_PATH file descriptors can be re-opened.
Another possible avenue of future work would be some kind of
CHECK_FIELDS[6] flag which causes the kernel to indicate to userspace
which openat2(2) flags and fields are supported by the current kernel
(to avoid userspace having to go through several guesses to figure it
out).
[1]: https://lwn.net/Articles/588444/
[2]: https://lore.kernel.org/lkml/CA+55aFyyxJL1LyXZeBsf2ypriraj5ut1XkNDsunRBqgVjZU_6Q@mail.gmail.com
[3]: commit 629e014bb834 ("fs: completely ignore unknown open flags")
[4]: https://sourceware.org/bugzilla/show_bug.cgi?id=17523
[5]: https://lore.kernel.org/lkml/20190930183316.10190-2-cyphar@cyphar.com/
[6]: https://youtu.be/ggD-eb3yPVs
Suggested-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2020-01-18 23:07:59 +11:00
2021-07-07 18:08:11 -07:00
/*
 * memfd_secret(): syscall number 447.
 * Both the number and the table entry are only provided when
 * __ARCH_WANT_MEMFD_SECRET is defined; otherwise slot 447 gets no
 * definition from this header.
 */
# ifdef __ARCH_WANT_MEMFD_SECRET
# define __NR_memfd_secret 447
__SYSCALL ( __NR_memfd_secret , sys_memfd_secret )
# endif
2021-09-02 15:00:33 -07:00
/* process_mrelease(): syscall number 448, paired with sys_process_mrelease. */
# define __NR_process_mrelease 448
__SYSCALL ( __NR_process_mrelease , sys_process_mrelease )
2021-07-07 18:08:11 -07:00
futex: Implement sys_futex_waitv()
Add support to wait on multiple futexes. This is the interface
implemented by this syscall:
futex_waitv(struct futex_waitv *waiters, unsigned int nr_futexes,
unsigned int flags, struct timespec *timeout, clockid_t clockid)
struct futex_waitv {
__u64 val;
__u64 uaddr;
__u32 flags;
__u32 __reserved;
};
Given an array of struct futex_waitv, wait on each uaddr. The thread
wakes if a futex_wake() is performed at any uaddr. The syscall returns
immediately if any waiter has *uaddr != val. *timeout is an optional
absolute timeout value for the operation. This syscall supports only
64bit sized timeout structs. The flags argument of the syscall should be
empty, but it can be used for future extensions. Flags for shared
futexes, sizes, etc. should be used on the individual flags of each
waiter.
__reserved is used for explicit padding and should be 0, but it might be
used for future extensions. If the userspace uses 32-bit pointers, it
should make sure to explicitly cast it when assigning to waitv::uaddr.
Returns the array index of one of the woken futexes. No information is
given about how many were woken, or about any particular attribute of the
returned one (whether it was the first woken, whether it has the smallest index...).
Signed-off-by: André Almeida <andrealmeid@collabora.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210923171111.300673-17-andrealmeid@collabora.com
2021-09-23 14:11:05 -03:00
/* futex_waitv(): syscall number 449, paired with sys_futex_waitv. */
# define __NR_futex_waitv 449
__SYSCALL ( __NR_futex_waitv , sys_futex_waitv )
2022-01-14 14:08:21 -08:00
/*
 * set_mempolicy_home_node(): syscall number 450, paired with
 * sys_set_mempolicy_home_node.
 */
# define __NR_set_mempolicy_home_node 450
__SYSCALL ( __NR_set_mempolicy_home_node , sys_set_mempolicy_home_node )
2012-10-04 18:20:15 +01:00
/*
 * __NR_syscalls is one greater than the highest syscall number assigned
 * above (450). Any earlier definition is dropped first so the value set
 * here is the one that takes effect.
 */
# undef __NR_syscalls
2022-01-14 14:08:21 -08:00
# define __NR_syscalls 451
2012-10-04 18:20:15 +01:00
/*
* 32 bit systems traditionally used different
* syscalls for off_t and loff_t arguments , while
* 64 bit systems only need the off_t version .
* For new 32 bit platforms , there is no need to
* implement the old 32 bit off_t syscalls , so
* they take different names .
* Here we map the numbers so that both versions
* use the same syscall table layout .
*/
/*
 * 64-bit, non-compat build: the plain (off_t) names alias the shared
 * __NR3264_* numbers.
 */
# if __BITS_PER_LONG == 64 && !defined(__SYSCALL_COMPAT)
# define __NR_fcntl __NR3264_fcntl
# define __NR_statfs __NR3264_statfs
# define __NR_fstatfs __NR3264_fstatfs
# define __NR_truncate __NR3264_truncate
# define __NR_ftruncate __NR3264_ftruncate
# define __NR_lseek __NR3264_lseek
# define __NR_sendfile __NR3264_sendfile
2018-12-06 10:07:40 +08:00
/* fstat/newfstatat aliases are only provided when one of these is defined. */
# if defined(__ARCH_WANT_NEW_STAT) || defined(__ARCH_WANT_STAT64)
2012-10-04 18:20:15 +01:00
# define __NR_newfstatat __NR3264_fstatat
# define __NR_fstat __NR3264_fstat
2018-12-06 10:07:40 +08:00
# endif
2012-10-04 18:20:15 +01:00
# define __NR_mmap __NR3264_mmap
# define __NR_fadvise64 __NR3264_fadvise64
/* stat/lstat aliases exist only if __NR3264_stat was defined earlier. */
# ifdef __NR3264_stat
# define __NR_stat __NR3264_stat
# define __NR_lstat __NR3264_lstat
# endif
/*
 * Otherwise (__BITS_PER_LONG != 64, or __SYSCALL_COMPAT defined): the same
 * __NR3264_* numbers are exposed under the 64-suffixed / loff_t-style names.
 */
# else
# define __NR_fcntl64 __NR3264_fcntl
# define __NR_statfs64 __NR3264_statfs
# define __NR_fstatfs64 __NR3264_fstatfs
# define __NR_truncate64 __NR3264_truncate
# define __NR_ftruncate64 __NR3264_ftruncate
# define __NR_llseek __NR3264_lseek
# define __NR_sendfile64 __NR3264_sendfile
2018-12-06 10:07:40 +08:00
/* Same guard as the 64-bit branch, but yielding the *64 names. */
# if defined(__ARCH_WANT_NEW_STAT) || defined(__ARCH_WANT_STAT64)
2012-10-04 18:20:15 +01:00
# define __NR_fstatat64 __NR3264_fstatat
# define __NR_fstat64 __NR3264_fstat
2018-12-06 10:07:40 +08:00
# endif
2012-10-04 18:20:15 +01:00
# define __NR_mmap2 __NR3264_mmap
# define __NR_fadvise64_64 __NR3264_fadvise64
/* stat64/lstat64 aliases exist only if __NR3264_stat was defined earlier. */
# ifdef __NR3264_stat
# define __NR_stat64 __NR3264_stat
# define __NR_lstat64 __NR3264_lstat
# endif
/* End of the __BITS_PER_LONG == 64 && !__SYSCALL_COMPAT conditional. */
# endif