From 6ae930d9dbf2d093157be33428538c91966d8a9f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 27 Mar 2023 20:22:51 +0200 Subject: [PATCH 1/3] pid: add pidfd_prepare() Add a new helper that allows to reserve a pidfd and allocates a new pidfd file that stashes the provided struct pid. This will allow us to remove places that either open code this function or that call pidfd_create() but then have to call close_fd() because there are still failure points after pidfd_create() has been called. Reviewed-by: Jan Kara Message-Id: <20230327-pidfd-file-api-v1-1-5c0e9a3158e4@kernel.org> Signed-off-by: Christian Brauner --- include/linux/pid.h | 1 + kernel/fork.c | 85 +++++++++++++++++++++++++++++++++++++++++++++ kernel/pid.c | 19 ++++------ 3 files changed, 93 insertions(+), 12 deletions(-) diff --git a/include/linux/pid.h b/include/linux/pid.h index 343abf22092e..b75de288a8c2 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -80,6 +80,7 @@ extern struct pid *pidfd_pid(const struct file *file); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags); struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags); int pidfd_create(struct pid *pid, unsigned int flags); +int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret); static inline struct pid *get_pid(struct pid *pid) { diff --git a/kernel/fork.c b/kernel/fork.c index c0257cbee093..a34395bd708a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1951,6 +1951,91 @@ const struct file_operations pidfd_fops = { #endif }; +/** + * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd + * @pid: the struct pid for which to create a pidfd + * @flags: flags of the new @pidfd + * @pidfd: the pidfd to return + * + * Allocate a new file that stashes @pid and reserve a new pidfd number in the + * caller's file descriptor table. The pidfd is reserved but not installed yet. + + * The helper doesn't perform checks on @pid which makes it useful for pidfds + * created via CLONE_PIDFD where @pid has no task attached when the pidfd and + * pidfd file are prepared. + * + * If this function returns successfully the caller is responsible to either + * call fd_install() passing the returned pidfd and pidfd file as arguments in + * order to install the pidfd into its file descriptor table or they must use + * put_unused_fd() and fput() on the returned pidfd and pidfd file + * respectively. + * + * This function is useful when a pidfd must already be reserved but there + * might still be points of failure afterwards and the caller wants to ensure + * that no pidfd is leaked into its file descriptor table. + * + * Return: On success, a reserved pidfd is returned from the function and a new + * pidfd file is returned in the last argument to the function. On + * error, a negative error code is returned from the function and the + * last argument remains unchanged. + */ +static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) +{ + int pidfd; + struct file *pidfd_file; + + if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC)) + return -EINVAL; + + pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + if (pidfd < 0) + return pidfd; + + pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, + flags | O_RDWR | O_CLOEXEC); + if (IS_ERR(pidfd_file)) { + put_unused_fd(pidfd); + return PTR_ERR(pidfd_file); + } + get_pid(pid); /* held by pidfd_file now */ + *ret = pidfd_file; + return pidfd; +} + +/** + * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd + * @pid: the struct pid for which to create a pidfd + * @flags: flags of the new @pidfd + * @pidfd: the pidfd to return + * + * Allocate a new file that stashes @pid and reserve a new pidfd number in the + * caller's file descriptor table. The pidfd is reserved but not installed yet. + * + * The helper verifies that @pid is used as a thread group leader. + * + * If this function returns successfully the caller is responsible to either + * call fd_install() passing the returned pidfd and pidfd file as arguments in + * order to install the pidfd into its file descriptor table or they must use + * put_unused_fd() and fput() on the returned pidfd and pidfd file + * respectively. + * + * This function is useful when a pidfd must already be reserved but there + * might still be points of failure afterwards and the caller wants to ensure + * that no pidfd is leaked into its file descriptor table. + * + * Return: On success, a reserved pidfd is returned from the function and a new + * pidfd file is returned in the last argument to the function. On + * error, a negative error code is returned from the function and the + * last argument remains unchanged. + */ +int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) +{ + if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) + return -EINVAL; + + return __pidfd_prepare(pid, flags, ret); +} + static void __delayed_free_task(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); diff --git a/kernel/pid.c b/kernel/pid.c index 3fbc5e46b721..f93954a0384d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -594,20 +594,15 @@ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags) */ int pidfd_create(struct pid *pid, unsigned int flags) { - int fd; + int pidfd; + struct file *pidfd_file; - if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) - return -EINVAL; + pidfd = pidfd_prepare(pid, flags, &pidfd_file); + if (pidfd < 0) + return pidfd; - if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC)) - return -EINVAL; - - fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), - flags | O_RDWR | O_CLOEXEC); - if (fd < 0) - put_pid(pid); - - return fd; + fd_install(pidfd, pidfd_file); + return pidfd; } /** From ca7707f5430ad6b1c9cb7cee0a7f67d69328bb2d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 27 Mar 2023 20:22:52 +0200 Subject: [PATCH 2/3] fork: use pidfd_prepare() Stop open-coding get_unused_fd_flags() and anon_inode_getfile(). That's brittle just for keeping the flags between both calls in sync. Use the dedicated helper. Message-Id: <20230327-pidfd-file-api-v1-2-5c0e9a3158e4@kernel.org> Signed-off-by: Christian Brauner --- kernel/fork.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index a34395bd708a..67bec6f1cb7a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2376,21 +2376,12 @@ static __latent_entropy struct task_struct *copy_process( * if the fd table isn't shared). */ if (clone_flags & CLONE_PIDFD) { - retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + /* Note that no task has been attached to @pid yet. */ + retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile); if (retval < 0) goto bad_fork_free_pid; - pidfd = retval; - pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, - O_RDWR | O_CLOEXEC); - if (IS_ERR(pidfile)) { - put_unused_fd(pidfd); - retval = PTR_ERR(pidfile); - goto bad_fork_free_pid; - } - get_pid(pid); /* held by pidfile now */ - retval = put_user(pidfd, args->pidfd); if (retval) goto bad_fork_put_pidfd; From eee3a0e93924f2aab8ebaa7f2e26fd0f3b33f9e7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 27 Mar 2023 20:22:53 +0200 Subject: [PATCH 3/3] fanotify: use pidfd_prepare() We generally try to avoid installing a file descriptor into the caller's file descriptor table just to close it again via close_fd() in case an error occurs. Instead we reserve a file descriptor but don't install it into the caller's file descriptor table yet. If we fail for other, unrelated reasons we can just close the reserved file descriptor and if we make it past all meaningful error paths we just install it. Fanotify gets this right already for one fd type but not for pidfds. Use the new pidfd_prepare() helper to reserve a pidfd and a pidfd file and switch to the more common fd allocation and installation pattern. Acked-by: Jan Kara Message-Id: <20230327-pidfd-file-api-v1-3-5c0e9a3158e4@kernel.org> Signed-off-by: Christian Brauner --- fs/notify/fanotify/fanotify_user.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 8f430bfad487..22fb1cf7e1fc 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -663,7 +663,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fanotify_info *info = fanotify_event_info(event); unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; - struct file *f = NULL; + struct file *f = NULL, *pidfd_file = NULL; int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD; pr_debug("%s: group=%p event=%p\n", __func__, group, event); @@ -718,7 +718,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, !pid_has_task(event->pid, PIDTYPE_TGID)) { pidfd = FAN_NOPIDFD; } else { - pidfd = pidfd_create(event->pid, 0); + pidfd = pidfd_prepare(event->pid, 0, &pidfd_file); if (pidfd < 0) pidfd = FAN_EPIDFD; } @@ -751,6 +751,9 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, if (f) fd_install(fd, f); + if (pidfd_file) + fd_install(pidfd, pidfd_file); + return metadata.event_len; out_close_fd: @@ -759,8 +762,10 @@ out_close_fd: fput(f); } - if (pidfd >= 0) - close_fd(pidfd); + if (pidfd >= 0) { + put_unused_fd(pidfd); + fput(pidfd_file); + } return ret; }