2011-11-11 15:55:49 -08:00
#
# 32-bit system call numbers and entry vectors
#
# The format is:
# <number> <abi> <name> <entry point> <compat entry point>
#
# The abi is always "i386" for this file.
#
0 i386 restart_syscall sys_restart_syscall
1 i386 exit sys_exit
2015-10-05 17:48:05 -07:00
2 i386 fork sys_fork sys_fork
2011-11-11 15:55:49 -08:00
3 i386 read sys_read
4 i386 write sys_write
5 i386 open sys_open compat_sys_open
6 i386 close sys_close
7 i386 waitpid sys_waitpid sys32_waitpid
8 i386 creat sys_creat
9 i386 link sys_link
10 i386 unlink sys_unlink
2015-10-05 17:48:05 -07:00
11 i386 execve sys_execve compat_sys_execve
2011-11-11 15:55:49 -08:00
12 i386 chdir sys_chdir
13 i386 time sys_time compat_sys_time
14 i386 mknod sys_mknod
15 i386 chmod sys_chmod
16 i386 lchown sys_lchown16
17 i386 break
18 i386 oldstat sys_stat
2013-02-24 10:52:26 -05:00
19 i386 lseek sys_lseek compat_sys_lseek
2011-11-11 15:55:49 -08:00
20 i386 getpid sys_getpid
21 i386 mount sys_mount compat_sys_mount
22 i386 umount sys_oldumount
23 i386 setuid sys_setuid16
24 i386 getuid sys_getuid16
25 i386 stime sys_stime compat_sys_stime
26 i386 ptrace sys_ptrace compat_sys_ptrace
27 i386 alarm sys_alarm
28 i386 oldfstat sys_fstat
29 i386 pause sys_pause
30 i386 utime sys_utime compat_sys_utime
31 i386 stty
32 i386 gtty
33 i386 access sys_access
34 i386 nice sys_nice
35 i386 ftime
36 i386 sync sys_sync
2013-02-24 14:00:48 -05:00
37 i386 kill sys_kill
2011-11-11 15:55:49 -08:00
38 i386 rename sys_rename
39 i386 mkdir sys_mkdir
40 i386 rmdir sys_rmdir
41 i386 dup sys_dup
42 i386 pipe sys_pipe
43 i386 times sys_times compat_sys_times
44 i386 prof
45 i386 brk sys_brk
46 i386 setgid sys_setgid16
47 i386 getgid sys_getgid16
48 i386 signal sys_signal
49 i386 geteuid sys_geteuid16
50 i386 getegid sys_getegid16
51 i386 acct sys_acct
52 i386 umount2 sys_umount
53 i386 lock
54 i386 ioctl sys_ioctl compat_sys_ioctl
55 i386 fcntl sys_fcntl compat_sys_fcntl64
56 i386 mpx
57 i386 setpgid sys_setpgid
58 i386 ulimit
59 i386 oldolduname sys_olduname
60 i386 umask sys_umask
61 i386 chroot sys_chroot
62 i386 ustat sys_ustat compat_sys_ustat
63 i386 dup2 sys_dup2
64 i386 getppid sys_getppid
65 i386 getpgrp sys_getpgrp
66 i386 setsid sys_setsid
2012-12-25 19:14:55 -05:00
67 i386 sigaction sys_sigaction compat_sys_sigaction
2011-11-11 15:55:49 -08:00
68 i386 sgetmask sys_sgetmask
69 i386 ssetmask sys_ssetmask
70 i386 setreuid sys_setreuid16
71 i386 setregid sys_setregid16
2012-12-25 16:09:20 -05:00
72 i386 sigsuspend sys_sigsuspend sys_sigsuspend
2011-11-11 15:55:49 -08:00
73 i386 sigpending sys_sigpending compat_sys_sigpending
74 i386 sethostname sys_sethostname
75 i386 setrlimit sys_setrlimit compat_sys_setrlimit
76 i386 getrlimit sys_old_getrlimit compat_sys_old_getrlimit
77 i386 getrusage sys_getrusage compat_sys_getrusage
78 i386 gettimeofday sys_gettimeofday compat_sys_gettimeofday
79 i386 settimeofday sys_settimeofday compat_sys_settimeofday
80 i386 getgroups sys_getgroups16
81 i386 setgroups sys_setgroups16
82 i386 select sys_old_select compat_sys_old_select
83 i386 symlink sys_symlink
84 i386 oldlstat sys_lstat
85 i386 readlink sys_readlink
86 i386 uselib sys_uselib
87 i386 swapon sys_swapon
88 i386 reboot sys_reboot
89 i386 readdir sys_old_readdir compat_sys_old_readdir
90 i386 mmap sys_old_mmap sys32_mmap
91 i386 munmap sys_munmap
2013-02-24 13:49:08 -05:00
92 i386 truncate sys_truncate compat_sys_truncate
93 i386 ftruncate sys_ftruncate compat_sys_ftruncate
2011-11-11 15:55:49 -08:00
94 i386 fchmod sys_fchmod
95 i386 fchown sys_fchown16
96 i386 getpriority sys_getpriority
97 i386 setpriority sys_setpriority
98 i386 profil
99 i386 statfs sys_statfs compat_sys_statfs
100 i386 fstatfs sys_fstatfs compat_sys_fstatfs
101 i386 ioperm sys_ioperm
102 i386 socketcall sys_socketcall compat_sys_socketcall
103 i386 syslog sys_syslog
104 i386 setitimer sys_setitimer compat_sys_setitimer
105 i386 getitimer sys_getitimer compat_sys_getitimer
106 i386 stat sys_newstat compat_sys_newstat
107 i386 lstat sys_newlstat compat_sys_newlstat
108 i386 fstat sys_newfstat compat_sys_newfstat
109 i386 olduname sys_uname
2012-11-19 22:00:52 -05:00
110 i386 iopl sys_iopl
2011-11-11 15:55:49 -08:00
111 i386 vhangup sys_vhangup
112 i386 idle
2015-03-03 22:31:34 -05:00
113 i386 vm86old sys_vm86old sys_ni_syscall
2011-11-11 15:55:49 -08:00
114 i386 wait4 sys_wait4 compat_sys_wait4
115 i386 swapoff sys_swapoff
116 i386 sysinfo sys_sysinfo compat_sys_sysinfo
2013-01-21 23:15:25 -05:00
117 i386 ipc sys_ipc compat_sys_ipc
2011-11-11 15:55:49 -08:00
118 i386 fsync sys_fsync
2015-10-05 17:48:05 -07:00
119 i386 sigreturn sys_sigreturn sys32_sigreturn
2012-10-22 22:34:11 -04:00
120 i386 clone sys_clone stub32_clone
2011-11-11 15:55:49 -08:00
121 i386 setdomainname sys_setdomainname
122 i386 uname sys_newuname
123 i386 modify_ldt sys_modify_ldt
124 i386 adjtimex sys_adjtimex compat_sys_adjtimex
2013-02-24 14:00:48 -05:00
125 i386 mprotect sys_mprotect
2011-11-11 15:55:49 -08:00
126 i386 sigprocmask sys_sigprocmask compat_sys_sigprocmask
127 i386 create_module
128 i386 init_module sys_init_module
129 i386 delete_module sys_delete_module
130 i386 get_kernel_syms
131 i386 quotactl sys_quotactl sys32_quotactl
132 i386 getpgid sys_getpgid
133 i386 fchdir sys_fchdir
134 i386 bdflush sys_bdflush
135 i386 sysfs sys_sysfs
136 i386 personality sys_personality
137 i386 afs_syscall
138 i386 setfsuid sys_setfsuid16
139 i386 setfsgid sys_setfsgid16
140 i386 _llseek sys_llseek
141 i386 getdents sys_getdents compat_sys_getdents
142 i386 _newselect sys_select compat_sys_select
143 i386 flock sys_flock
144 i386 msync sys_msync
145 i386 readv sys_readv compat_sys_readv
146 i386 writev sys_writev compat_sys_writev
147 i386 getsid sys_getsid
148 i386 fdatasync sys_fdatasync
149 i386 _sysctl sys_sysctl compat_sys_sysctl
150 i386 mlock sys_mlock
151 i386 munlock sys_munlock
152 i386 mlockall sys_mlockall
153 i386 munlockall sys_munlockall
154 i386 sched_setparam sys_sched_setparam
155 i386 sched_getparam sys_sched_getparam
156 i386 sched_setscheduler sys_sched_setscheduler
157 i386 sched_getscheduler sys_sched_getscheduler
158 i386 sched_yield sys_sched_yield
159 i386 sched_get_priority_max sys_sched_get_priority_max
160 i386 sched_get_priority_min sys_sched_get_priority_min
2012-12-25 17:19:57 -05:00
161 i386 sched_rr_get_interval sys_sched_rr_get_interval compat_sys_sched_rr_get_interval
2011-11-11 15:55:49 -08:00
162 i386 nanosleep sys_nanosleep compat_sys_nanosleep
163 i386 mremap sys_mremap
164 i386 setresuid sys_setresuid16
165 i386 getresuid sys_getresuid16
2015-03-03 22:31:34 -05:00
166 i386 vm86 sys_vm86 sys_ni_syscall
2011-11-11 15:55:49 -08:00
167 i386 query_module
168 i386 poll sys_poll
169 i386 nfsservctl
170 i386 setresgid sys_setresgid16
171 i386 getresgid sys_getresgid16
172 i386 prctl sys_prctl
2015-10-05 17:48:05 -07:00
173 i386 rt_sigreturn sys_rt_sigreturn sys32_rt_sigreturn
2012-12-25 18:42:26 -05:00
174 i386 rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction
2012-02-19 09:48:01 -08:00
175 i386 rt_sigprocmask sys_rt_sigprocmask
2012-12-25 14:46:17 -05:00
176 i386 rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending
2011-11-11 15:55:49 -08:00
177 i386 rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait
2012-12-25 15:26:55 -05:00
178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo
2011-11-11 15:55:49 -08:00
179 i386 rt_sigsuspend sys_rt_sigsuspend
180 i386 pread64 sys_pread64 sys32_pread
181 i386 pwrite64 sys_pwrite64 sys32_pwrite
182 i386 chown sys_chown16
183 i386 getcwd sys_getcwd
184 i386 capget sys_capget
185 i386 capset sys_capset
2012-12-14 14:47:53 -05:00
186 i386 sigaltstack sys_sigaltstack compat_sys_sigaltstack
2013-02-24 02:17:03 -05:00
187 i386 sendfile sys_sendfile compat_sys_sendfile
2011-11-11 15:55:49 -08:00
188 i386 getpmsg
189 i386 putpmsg
2015-10-05 17:48:05 -07:00
190 i386 vfork sys_vfork sys_vfork
2011-11-11 15:55:49 -08:00
191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit
192 i386 mmap2 sys_mmap_pgoff
193 i386 truncate64 sys_truncate64 sys32_truncate64
194 i386 ftruncate64 sys_ftruncate64 sys32_ftruncate64
195 i386 stat64 sys_stat64 sys32_stat64
196 i386 lstat64 sys_lstat64 sys32_lstat64
197 i386 fstat64 sys_fstat64 sys32_fstat64
198 i386 lchown32 sys_lchown
199 i386 getuid32 sys_getuid
200 i386 getgid32 sys_getgid
201 i386 geteuid32 sys_geteuid
202 i386 getegid32 sys_getegid
203 i386 setreuid32 sys_setreuid
204 i386 setregid32 sys_setregid
205 i386 getgroups32 sys_getgroups
206 i386 setgroups32 sys_setgroups
207 i386 fchown32 sys_fchown
208 i386 setresuid32 sys_setresuid
209 i386 getresuid32 sys_getresuid
210 i386 setresgid32 sys_setresgid
211 i386 getresgid32 sys_getresgid
212 i386 chown32 sys_chown
213 i386 setuid32 sys_setuid
214 i386 setgid32 sys_setgid
215 i386 setfsuid32 sys_setfsuid
216 i386 setfsgid32 sys_setfsgid
217 i386 pivot_root sys_pivot_root
218 i386 mincore sys_mincore
219 i386 madvise sys_madvise
220 i386 getdents64 sys_getdents64 compat_sys_getdents64
221 i386 fcntl64 sys_fcntl64 compat_sys_fcntl64
# 222 is unused
# 223 is unused
224 i386 gettid sys_gettid
225 i386 readahead sys_readahead sys32_readahead
226 i386 setxattr sys_setxattr
227 i386 lsetxattr sys_lsetxattr
228 i386 fsetxattr sys_fsetxattr
229 i386 getxattr sys_getxattr
230 i386 lgetxattr sys_lgetxattr
231 i386 fgetxattr sys_fgetxattr
232 i386 listxattr sys_listxattr
233 i386 llistxattr sys_llistxattr
234 i386 flistxattr sys_flistxattr
235 i386 removexattr sys_removexattr
236 i386 lremovexattr sys_lremovexattr
237 i386 fremovexattr sys_fremovexattr
238 i386 tkill sys_tkill
239 i386 sendfile64 sys_sendfile64
240 i386 futex sys_futex compat_sys_futex
241 i386 sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity
242 i386 sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity
243 i386 set_thread_area sys_set_thread_area
244 i386 get_thread_area sys_get_thread_area
245 i386 io_setup sys_io_setup compat_sys_io_setup
246 i386 io_destroy sys_io_destroy
247 i386 io_getevents sys_io_getevents compat_sys_io_getevents
248 i386 io_submit sys_io_submit compat_sys_io_submit
249 i386 io_cancel sys_io_cancel
250 i386 fadvise64 sys_fadvise64 sys32_fadvise64
# 251 is available for reuse (was briefly sys_set_zone_reclaim)
252 i386 exit_group sys_exit_group
2013-02-25 18:42:04 -05:00
253 i386 lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie
2011-11-11 15:55:49 -08:00
254 i386 epoll_create sys_epoll_create
255 i386 epoll_ctl sys_epoll_ctl
256 i386 epoll_wait sys_epoll_wait
257 i386 remap_file_pages sys_remap_file_pages
258 i386 set_tid_address sys_set_tid_address
259 i386 timer_create sys_timer_create compat_sys_timer_create
260 i386 timer_settime sys_timer_settime compat_sys_timer_settime
261 i386 timer_gettime sys_timer_gettime compat_sys_timer_gettime
262 i386 timer_getoverrun sys_timer_getoverrun
263 i386 timer_delete sys_timer_delete
264 i386 clock_settime sys_clock_settime compat_sys_clock_settime
265 i386 clock_gettime sys_clock_gettime compat_sys_clock_gettime
266 i386 clock_getres sys_clock_getres compat_sys_clock_getres
267 i386 clock_nanosleep sys_clock_nanosleep compat_sys_clock_nanosleep
268 i386 statfs64 sys_statfs64 compat_sys_statfs64
269 i386 fstatfs64 sys_fstatfs64 compat_sys_fstatfs64
270 i386 tgkill sys_tgkill
271 i386 utimes sys_utimes compat_sys_utimes
272 i386 fadvise64_64 sys_fadvise64_64 sys32_fadvise64_64
273 i386 vserver
274 i386 mbind sys_mbind
275 i386 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy
276 i386 set_mempolicy sys_set_mempolicy
277 i386 mq_open sys_mq_open compat_sys_mq_open
278 i386 mq_unlink sys_mq_unlink
279 i386 mq_timedsend sys_mq_timedsend compat_sys_mq_timedsend
280 i386 mq_timedreceive sys_mq_timedreceive compat_sys_mq_timedreceive
281 i386 mq_notify sys_mq_notify compat_sys_mq_notify
2012-03-21 22:50:08 +01:00
282 i386 mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr
2011-11-11 15:55:49 -08:00
283 i386 kexec_load sys_kexec_load compat_sys_kexec_load
284 i386 waitid sys_waitid compat_sys_waitid
# 285 sys_setaltroot
286 i386 add_key sys_add_key
287 i386 request_key sys_request_key
288 i386 keyctl sys_keyctl
289 i386 ioprio_set sys_ioprio_set
290 i386 ioprio_get sys_ioprio_get
291 i386 inotify_init sys_inotify_init
292 i386 inotify_add_watch sys_inotify_add_watch
293 i386 inotify_rm_watch sys_inotify_rm_watch
294 i386 migrate_pages sys_migrate_pages
295 i386 openat sys_openat compat_sys_openat
296 i386 mkdirat sys_mkdirat
297 i386 mknodat sys_mknodat
298 i386 fchownat sys_fchownat
299 i386 futimesat sys_futimesat compat_sys_futimesat
300 i386 fstatat64 sys_fstatat64 sys32_fstatat
301 i386 unlinkat sys_unlinkat
302 i386 renameat sys_renameat
303 i386 linkat sys_linkat
304 i386 symlinkat sys_symlinkat
305 i386 readlinkat sys_readlinkat
306 i386 fchmodat sys_fchmodat
307 i386 faccessat sys_faccessat
308 i386 pselect6 sys_pselect6 compat_sys_pselect6
309 i386 ppoll sys_ppoll compat_sys_ppoll
310 i386 unshare sys_unshare
311 i386 set_robust_list sys_set_robust_list compat_sys_set_robust_list
312 i386 get_robust_list sys_get_robust_list compat_sys_get_robust_list
313 i386 splice sys_splice
314 i386 sync_file_range sys_sync_file_range sys32_sync_file_range
315 i386 tee sys_tee
316 i386 vmsplice sys_vmsplice compat_sys_vmsplice
317 i386 move_pages sys_move_pages compat_sys_move_pages
318 i386 getcpu sys_getcpu
319 i386 epoll_pwait sys_epoll_pwait
320 i386 utimensat sys_utimensat compat_sys_utimensat
321 i386 signalfd sys_signalfd compat_sys_signalfd
322 i386 timerfd_create sys_timerfd_create
323 i386 eventfd sys_eventfd
324 i386 fallocate sys_fallocate sys32_fallocate
325 i386 timerfd_settime sys_timerfd_settime compat_sys_timerfd_settime
326 i386 timerfd_gettime sys_timerfd_gettime compat_sys_timerfd_gettime
327 i386 signalfd4 sys_signalfd4 compat_sys_signalfd4
328 i386 eventfd2 sys_eventfd2
329 i386 epoll_create1 sys_epoll_create1
330 i386 dup3 sys_dup3
331 i386 pipe2 sys_pipe2
332 i386 inotify_init1 sys_inotify_init1
333 i386 preadv sys_preadv compat_sys_preadv
334 i386 pwritev sys_pwritev compat_sys_pwritev
335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo
336 i386 perf_event_open sys_perf_event_open
337 i386 recvmmsg sys_recvmmsg compat_sys_recvmmsg
338 i386 fanotify_init sys_fanotify_init
2013-03-05 20:10:59 -05:00
339 i386 fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark
2011-11-11 15:55:49 -08:00
340 i386 prlimit64 sys_prlimit64
341 i386 name_to_handle_at sys_name_to_handle_at
342 i386 open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at
343 i386 clock_adjtime sys_clock_adjtime compat_sys_clock_adjtime
344 i386 syncfs sys_syncfs
345 i386 sendmmsg sys_sendmmsg compat_sys_sendmmsg
346 i386 setns sys_setns
347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv
348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev
2012-05-31 16:26:44 -07:00
349 i386 kcmp sys_kcmp
module: add syscall to load module from fd
As part of the effort to create a stronger boundary between root and
kernel, Chrome OS wants to be able to enforce that kernel modules are
being loaded only from our read-only crypto-hash verified (dm_verity)
root filesystem. Since the init_module syscall hands the kernel a module
as a memory blob, no reasoning about the origin of the blob can be made.
Earlier proposals for appending signatures to kernel modules would not be
useful in Chrome OS, since it would involve adding an additional set of
keys to our kernel and builds for no good reason: we already trust the
contents of our root filesystem. We don't need to verify those kernel
modules a second time. Having to do signature checking on module loading
would slow us down and be redundant. All we need to know is where a
module is coming from so we can say yes/no to loading it.
If a file descriptor is used as the source of a kernel module, many more
things can be reasoned about. In Chrome OS's case, we could enforce that
the module lives on the filesystem we expect it to live on. In the case
of IMA (or other LSMs), it would be possible, for example, to examine
extended attributes that may contain signatures over the contents of
the module.
This introduces a new syscall (on x86), similar to init_module, that has
only two arguments. The first argument is used as a file descriptor to
the module and the second argument is a pointer to the NULL terminated
string of module arguments.
Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (merge fixes)
2012-10-16 07:31:07 +10:30
350 i386 finit_module sys_finit_module
sched: Add new scheduler syscalls to support an extended scheduling parameters ABI
Add the syscalls needed for supporting scheduling algorithms
with extended scheduling parameters (e.g., SCHED_DEADLINE).
In general, it makes it possible to specify a periodic/sporadic task
that executes for a given amount of runtime at each instance, and is
scheduled according to the urgency of its own timing constraints,
i.e.:
- a (maximum/typical) instance execution time,
- a minimum interval between consecutive instances,
- a time constraint by which each instance must be completed.
Thus, both the data structure that holds the scheduling parameters of
the tasks and the system calls dealing with it must be extended.
Unfortunately, modifying the existing struct sched_param would break
the ABI and result in potentially serious compatibility issues with
legacy binaries.
For these reasons, this patch:
- defines the new struct sched_attr, containing all the fields
that are necessary for specifying a task in the computational
model described above;
- defines and implements the new scheduling related syscalls that
manipulate it, i.e., sched_setattr() and sched_getattr().
Syscalls are introduced for x86 (32 and 64 bits) and ARM only, as a
proof of concept and for developing and testing purposes. Making them
available on other architectures is straightforward.
Since no "user" for these new parameters is introduced in this patch,
the implementation of the new system calls is just identical to their
already existing counterpart. Future patches that implement scheduling
policies able to exploit the new data structure must also take care of
modifying the sched_*attr() calls accordingly with their own purposes.
Signed-off-by: Dario Faggioli <raistlin@linux.it>
[ Rewrote to use sched_attr. ]
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
[ Removed sched_setscheduler2() for now. ]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-3-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-11-07 14:43:36 +01:00
351 i386 sched_setattr sys_sched_setattr
352 i386 sched_getattr sys_sched_getattr
2014-04-11 12:25:37 +02:00
353 i386 renameat2 sys_renameat2
2014-06-25 16:08:24 -07:00
354 i386 seccomp sys_seccomp
random: introduce getrandom(2) system call
The getrandom(2) system call was requested by the LibreSSL Portable
developers. It is analogous to the getentropy(2) system call in
OpenBSD.
The rationale of this system call is to provide resilience against
file descriptor exhaustion attacks, where the attacker consumes all
available file descriptors, forcing the use of the fallback code where
/dev/[u]random is not available. Since the fallback code is often not
well-tested, it is better to eliminate this potential failure mode
entirely.
The other feature provided by this new system call is the ability to
request randomness from the /dev/urandom entropy pool, but to block
until at least 128 bits of entropy has been accumulated in the
/dev/urandom entropy pool. Historically, the emphasis in the
/dev/urandom development has been to ensure that urandom pool is
initialized as quickly as possible after system boot, and preferably
before the init scripts start execution.
This is because changing /dev/urandom reads to block represents an
interface change that could potentially break userspace which is not
acceptable. In practice, on most x86 desktop and server systems, in
general the entropy pool can be initialized before it is needed (and
in modern kernels, we will printk a warning message if not). However,
on an embedded system, this may not be the case. And so with this new
interface, we can provide the functionality of blocking until the
urandom pool has been initialized. Any userspace program which uses
this new functionality must take care to assure that if it is used
during the boot process, that it will not cause the init scripts or
other portions of the system startup to hang indefinitely.
SYNOPSIS
#include <linux/random.h>
int getrandom(void *buf, size_t buflen, unsigned int flags);
DESCRIPTION
The system call getrandom() fills the buffer pointed to by buf
with up to buflen random bytes which can be used to seed user
space random number generators (i.e., DRBG's) or for other
cryptographic uses. It should not be used for Monte Carlo
simulations or other programs/algorithms which are doing
probabilistic sampling.
If the GRND_RANDOM flags bit is set, then draw from the
/dev/random pool instead of the /dev/urandom pool. The
/dev/random pool is limited based on the entropy that can be
obtained from environmental noise, so if there is insufficient
entropy, the requested number of bytes may not be returned.
If there is no entropy available at all, getrandom(2) will
either block, or return an error with errno set to EAGAIN if
the GRND_NONBLOCK bit is set in flags.
If the GRND_RANDOM bit is not set, then the /dev/urandom pool
will be used. Unlike using read(2) to fetch data from
/dev/urandom, if the urandom pool has not been sufficiently
initialized, getrandom(2) will block (or return -1 with the
errno set to EAGAIN if the GRND_NONBLOCK bit is set in flags).
The getentropy(2) system call in OpenBSD can be emulated using
the following function:
int getentropy(void *buf, size_t buflen)
{
int ret;
if (buflen > 256)
goto failure;
ret = getrandom(buf, buflen, 0);
if (ret < 0)
return ret;
if (ret == buflen)
return 0;
failure:
errno = EIO;
return -1;
}
RETURN VALUE
On success, the number of bytes that was filled in the buf is
returned. This may not be all the bytes requested by the
caller via buflen if insufficient entropy was present in the
/dev/random pool, or if the system call was interrupted by a
signal.
On error, -1 is returned, and errno is set appropriately.
ERRORS
EINVAL An invalid flag was passed to getrandom(2)
EFAULT buf is outside the accessible address space.
EAGAIN The requested entropy was not available, and
getrandom(2) would have blocked if the
GRND_NONBLOCK flag was not set.
EINTR While blocked waiting for entropy, the call was
interrupted by a signal handler; see the description
of how interrupted read(2) calls on "slow" devices
are handled with and without the SA_RESTART flag
in the signal(7) man page.
NOTES
For small requests (buflen <= 256) getrandom(2) will not
return EINTR when reading from the urandom pool once the
entropy pool has been initialized, and it will return all of
the bytes that have been requested. This is the recommended
way to use getrandom(2), and is designed for compatibility
with OpenBSD's getentropy() system call.
However, if you are using GRND_RANDOM, then getrandom(2) may
block until the entropy accounting determines that sufficient
environmental noise has been gathered such that getrandom(2)
will be operating as a NRBG instead of a DRBG for those people
who are working in the NIST SP 800-90 regime. Since it may
block for a long time, these guarantees do *not* apply. The
user may want to interrupt a hanging process using a signal,
so blocking until all of the requested bytes are returned
would be unfriendly.
For this reason, the user of getrandom(2) MUST always check
the return value, in case it returns some error, or if fewer
bytes than requested were returned. In the case of
!GRND_RANDOM and small request, the latter should never
happen, but the careful userspace code (and all crypto code
should be careful) should check for this anyway!
Finally, unless you are doing long-term key generation (and
perhaps not even then), you probably shouldn't be using
GRND_RANDOM. The cryptographic algorithms used for
/dev/urandom are quite conservative, and so should be
sufficient for all purposes. The disadvantage of GRND_RANDOM
is that it can block, and the increased complexity required to
deal with partially fulfilled getrandom(2) requests.
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Zach Brown <zab@zabbo.net>
2014-07-17 04:13:05 -04:00
355 i386 getrandom sys_getrandom
shm: add memfd_create() syscall
memfd_create() is similar to mmap(MAP_ANON), but returns a file-descriptor
that you can pass to mmap(). It can support sealing and avoids any
connection to user-visible mount-points. Thus, it's not subject to quotas
on mounted file-systems, but can be used like malloc()'ed memory, but with
a file-descriptor to it.
memfd_create() returns the raw shmem file, so calls like ftruncate() can
be used to modify the underlying inode. Also calls like fstat() will
return proper information and mark the file as regular file. If you want
sealing, you can specify MFD_ALLOW_SEALING. Otherwise, sealing is not
supported (like on all other regular files).
Compared to O_TMPFILE, it does not require a tmpfs mount-point and is not
subject to a filesystem size limit. It is still properly accounted to
memcg limits, though, and to the same overcommit or no-overcommit
accounting as all user memory.
Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Ryan Lortie <desrt@desrt.ca>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Daniel Mack <zonque@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-08 14:25:29 -07:00
356 i386 memfd_create sys_memfd_create
2014-09-26 00:16:58 -07:00
357 i386 bpf sys_bpf
2015-10-05 17:48:05 -07:00
358 i386 execveat sys_execveat compat_sys_execveat
2015-07-14 15:24:24 -07:00
359 i386 socket sys_socket
360 i386 socketpair sys_socketpair
361 i386 bind sys_bind
362 i386 connect sys_connect
363 i386 listen sys_listen
364 i386 accept4 sys_accept4
365 i386 getsockopt sys_getsockopt compat_sys_getsockopt
366 i386 setsockopt sys_setsockopt compat_sys_setsockopt
367 i386 getsockname sys_getsockname
368 i386 getpeername sys_getpeername
369 i386 sendto sys_sendto
370 i386 sendmsg sys_sendmsg compat_sys_sendmsg
371 i386 recvfrom sys_recvfrom compat_sys_recvfrom
372 i386 recvmsg sys_recvmsg compat_sys_recvmsg
373 i386 shutdown sys_shutdown
2015-09-04 15:46:58 -07:00
374 i386 userfaultfd sys_userfaultfd
sys_membarrier(): system-wide memory barrier (generic, x86)
Here is an implementation of a new system call, sys_membarrier(), which
executes a memory barrier on all threads running on the system. It is
implemented by calling synchronize_sched(). It can be used to
distribute the cost of user-space memory barriers asymmetrically by
transforming pairs of memory barriers into pairs consisting of
sys_membarrier() and a compiler barrier. For synchronization primitives
that distinguish between read-side and write-side (e.g. userspace RCU
[1], rwlocks), the read-side can be accelerated significantly by moving
the bulk of the memory barrier overhead to the write-side.
The existing applications of which I am aware that would be improved by
this system call are as follows:
* Through Userspace RCU library (http://urcu.so)
- DNS server (Knot DNS) https://www.knot-dns.cz/
- Network sniffer (http://netsniff-ng.org/)
- Distributed object storage (https://sheepdog.github.io/sheepdog/)
- User-space tracing (http://lttng.org)
- Network storage system (https://www.gluster.org/)
- Virtual routers (https://events.linuxfoundation.org/sites/events/files/slides/DPDK_RCU_0MQ.pdf)
- Financial software (https://lkml.org/lkml/2015/3/23/189)
Those projects use RCU in userspace to increase read-side speed and
scalability compared to locking. Especially in the case of RCU used by
libraries, sys_membarrier can speed up the read-side by moving the bulk of
the memory barrier cost to synchronize_rcu().
* Direct users of sys_membarrier
- core dotnet garbage collector (https://github.com/dotnet/coreclr/issues/198)
Microsoft core dotnet GC developers are planning to use the mprotect()
side-effect of issuing memory barriers through IPIs as a way to implement
Windows FlushProcessWriteBuffers() on Linux. They are referring to
sys_membarrier in their github thread, specifically stating that
sys_membarrier() is what they are looking for.
To explain the benefit of this scheme, let's introduce two example threads:
Thread A (non-frequent, e.g. executing liburcu synchronize_rcu())
Thread B (frequent, e.g. executing liburcu
rcu_read_lock()/rcu_read_unlock())
In a scheme where all smp_mb() in thread A are ordering memory accesses
with respect to smp_mb() present in Thread B, we can change each
smp_mb() within Thread A into calls to sys_membarrier() and each
smp_mb() within Thread B into compiler barriers "barrier()".
Before the change, we had, for each smp_mb() pairs:
Thread A Thread B
previous mem accesses previous mem accesses
smp_mb() smp_mb()
following mem accesses following mem accesses
After the change, these pairs become:
Thread A Thread B
prev mem accesses prev mem accesses
sys_membarrier() barrier()
follow mem accesses follow mem accesses
As we can see, there are two possible scenarios: either Thread B memory
accesses do not happen concurrently with Thread A accesses (1), or they
do (2).
1) Non-concurrent Thread A vs Thread B accesses:
Thread A Thread B
prev mem accesses
sys_membarrier()
follow mem accesses
prev mem accesses
barrier()
follow mem accesses
In this case, thread B accesses will be weakly ordered. This is OK,
because at that point, thread A is not particularly interested in
ordering them with respect to its own accesses.
2) Concurrent Thread A vs Thread B accesses
Thread A Thread B
prev mem accesses prev mem accesses
sys_membarrier() barrier()
follow mem accesses follow mem accesses
In this case, thread B accesses, which are ensured to be in program
order thanks to the compiler barrier, will be "upgraded" to full
smp_mb() by synchronize_sched().
* Benchmarks
On Intel Xeon E5405 (8 cores)
(one thread is calling sys_membarrier, the other 7 threads are busy
looping)
1000 non-expedited sys_membarrier calls in 33s = 33 milliseconds/call.
* User-space user of this system call: Userspace RCU library
Both the signal-based and the sys_membarrier userspace RCU schemes
permit us to remove the memory barrier from the userspace RCU
rcu_read_lock() and rcu_read_unlock() primitives, thus significantly
accelerating them. These memory barriers are replaced by compiler
barriers on the read-side, and all matching memory barriers on the
write-side are turned into an invocation of a memory barrier on all
active threads in the process. By letting the kernel perform this
synchronization rather than dumbly sending a signal to every thread of the
process (as we currently do), we diminish the number of unnecessary wake
ups and only issue the memory barriers on active threads. Non-running
threads do not need to execute such barrier anyway, because these are
implied by the scheduler context switches.
Results in liburcu:
Operations in 10s, 6 readers, 2 writers:
memory barriers in reader: 1701557485 reads, 2202847 writes
signal-based scheme: 9830061167 reads, 6700 writes
sys_membarrier: 9952759104 reads, 425 writes
sys_membarrier (dyn. check): 7970328887 reads, 425 writes
The dynamic sys_membarrier availability check adds some overhead to
the read-side compared to the signal-based scheme, but besides that,
sys_membarrier slightly outperforms the signal-based scheme. However,
this non-expedited sys_membarrier implementation has a much slower grace
period than signal and memory barrier schemes.
Besides diminishing the number of wake-ups, one major advantage of the
membarrier system call over the signal-based scheme is that it does not
need to reserve a signal. This plays much more nicely with libraries,
and with processes injected into for tracing purposes, for which we
cannot expect that signals will be unused by the application.
An expedited version of this system call can be added later on to speed
up the grace period. Its implementation will likely depend on reading
the cpu_curr()->mm without holding each CPU's rq lock.
This patch adds the system call to x86 and to asm-generic.
[1] http://urcu.so
membarrier(2) man page:
MEMBARRIER(2) Linux Programmer's Manual MEMBARRIER(2)
NAME
membarrier - issue memory barriers on a set of threads
SYNOPSIS
#include <linux/membarrier.h>
int membarrier(int cmd, int flags);
DESCRIPTION
The cmd argument is one of the following:
MEMBARRIER_CMD_QUERY
Query the set of supported commands. It returns a bitmask of
supported commands.
MEMBARRIER_CMD_SHARED
Execute a memory barrier on all threads running on the system.
Upon return from system call, the caller thread is ensured that
all running threads have passed through a state where all memory
accesses to user-space addresses match program order between
entry to and return from the system call (non-running threads
are de facto in such a state). This covers threads from all
processes running on the system. This command returns 0.
The flags argument needs to be 0. For future extensions.
All memory accesses performed in program order from each targeted
thread are guaranteed to be ordered with respect to sys_membarrier(). If
we use the semantic "barrier()" to represent a compiler barrier forcing
memory accesses to be performed in program order across the barrier,
and smp_mb() to represent explicit memory barriers forcing full memory
ordering across the barrier, we have the following ordering table for
each pair of barrier(), sys_membarrier() and smp_mb():
The pair ordering is detailed as (O: ordered, X: not ordered):
barrier() smp_mb() sys_membarrier()
barrier() X X O
smp_mb() X O O
sys_membarrier() O O O
RETURN VALUE
On success, these system calls return zero. On error, -1 is returned,
and errno is set appropriately. For a given command, with flags
argument set to 0, this system call is guaranteed to always return the
same value until reboot.
ERRORS
ENOSYS System call is not implemented.
EINVAL Invalid arguments.
Linux 2015-04-15 MEMBARRIER(2)
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nicholas Miell <nmiell@comcast.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Alan Cox <gnomes@lxorguk.ukuu.org.uk>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Pranith Kumar <bobby.prani@gmail.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-11 13:07:39 -07:00
375 i386 membarrier sys_membarrier