2024-02-12 18:00:50 +03:00
// SPDX-License-Identifier: GPL-2.0
2024-02-12 18:32:38 +03:00
# include <linux/anon_inodes.h>
2024-02-12 18:00:50 +03:00
# include <linux/file.h>
# include <linux/fs.h>
# include <linux/magic.h>
# include <linux/mount.h>
# include <linux/pid.h>
2024-02-12 18:32:38 +03:00
# include <linux/pidfs.h>
2024-02-12 18:00:50 +03:00
# include <linux/pid_namespace.h>
# include <linux/poll.h>
# include <linux/proc_fs.h>
# include <linux/proc_ns.h>
# include <linux/pseudo_fs.h>
2024-06-27 17:11:42 +03:00
# include <linux/ptrace.h>
2024-02-12 18:00:50 +03:00
# include <linux/seq_file.h>
# include <uapi/linux/pidfd.h>
2024-06-27 17:11:42 +03:00
# include <linux/ipc_namespace.h>
# include <linux/time_namespace.h>
# include <linux/utsname.h>
# include <net/net_namespace.h>
2024-02-12 18:00:50 +03:00
2024-02-19 18:30:57 +03:00
# include "internal.h"
2024-06-27 17:11:42 +03:00
# include "mount.h"
2024-02-19 18:30:57 +03:00
2024-02-12 18:00:50 +03:00
# ifdef CONFIG_PROC_FS
/**
* pidfd_show_fdinfo - print information about a pidfd
* @ m : proc fdinfo file
* @ f : file referencing a pidfd
*
* Pid :
* This function will print the pid that a given pidfd refers to in the
* pid namespace of the procfs instance .
* If the pid namespace of the process is not a descendant of the pid
* namespace of the procfs instance 0 will be shown as its pid . This is
* similar to calling getppid ( ) on a process whose parent is outside of
* its pid namespace .
*
* NSpid :
* If pid namespaces are supported then this function will also print
* the pid of a given pidfd refers to for all descendant pid namespaces
* starting from the current pid namespace of the instance , i . e . the
* Pid field and the first entry in the NSpid field will be identical .
* If the pid namespace of the process is not a descendant of the pid
* namespace of the procfs instance 0 will be shown as its first NSpid
* entry and no others will be shown .
* Note that this differs from the Pid and NSpid fields in
* / proc / < pid > / status where Pid and NSpid are always shown relative to
* the pid namespace of the procfs instance . The difference becomes
* obvious when sending around a pidfd between pid namespaces from a
* different branch of the tree , i . e . where no ancestral relation is
* present between the pid namespaces :
* - create two new pid namespaces ns1 and ns2 in the initial pid
* namespace ( also take care to create new mount namespaces in the
* new pid namespace and mount procfs )
* - create a process with a pidfd in ns1
* - send pidfd from ns1 to ns2
* - read / proc / self / fdinfo / < pidfd > and observe that both Pid and NSpid
* have exactly one entry , which is 0
*/
static void pidfd_show_fdinfo ( struct seq_file * m , struct file * f )
{
2024-02-12 18:32:38 +03:00
struct pid * pid = pidfd_pid ( f ) ;
2024-02-12 18:00:50 +03:00
struct pid_namespace * ns ;
pid_t nr = - 1 ;
if ( likely ( pid_has_task ( pid , PIDTYPE_PID ) ) ) {
ns = proc_pid_ns ( file_inode ( m - > file ) - > i_sb ) ;
nr = pid_nr_ns ( pid , ns ) ;
}
seq_put_decimal_ll ( m , " Pid: \t " , nr ) ;
# ifdef CONFIG_PID_NS
seq_put_decimal_ll ( m , " \n NSpid: \t " , nr ) ;
if ( nr > 0 ) {
int i ;
/* If nr is non-zero it means that 'pid' is valid and that
* ns , i . e . the pid namespace associated with the procfs
* instance , is in the pid namespace hierarchy of pid .
* Start at one below the already printed level .
*/
for ( i = ns - > level + 1 ; i < = pid - > level ; i + + )
seq_put_decimal_ll ( m , " \t " , pid - > numbers [ i ] . nr ) ;
}
# endif
seq_putc ( m , ' \n ' ) ;
}
# endif
/*
* Poll support for process exit notification .
*/
static __poll_t pidfd_poll ( struct file * file , struct poll_table_struct * pts )
{
2024-02-12 18:32:38 +03:00
struct pid * pid = pidfd_pid ( file ) ;
2024-02-12 18:00:50 +03:00
bool thread = file - > f_flags & PIDFD_THREAD ;
struct task_struct * task ;
__poll_t poll_flags = 0 ;
poll_wait ( file , & pid - > wait_pidfd , pts ) ;
/*
* Depending on PIDFD_THREAD , inform pollers when the thread
* or the whole thread - group exits .
*/
guard ( rcu ) ( ) ;
task = pid_task ( pid , PIDTYPE_PID ) ;
if ( ! task )
poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP ;
else if ( task - > exit_state & & ( thread | | thread_group_empty ( task ) ) )
poll_flags = EPOLLIN | EPOLLRDNORM ;
return poll_flags ;
}
2024-06-27 17:11:42 +03:00
static long pidfd_ioctl ( struct file * file , unsigned int cmd , unsigned long arg )
{
struct task_struct * task __free ( put_task ) = NULL ;
struct nsproxy * nsp __free ( put_nsproxy ) = NULL ;
struct pid * pid = pidfd_pid ( file ) ;
2024-07-22 16:13:54 +03:00
struct ns_common * ns_common = NULL ;
2024-06-27 17:11:42 +03:00
if ( arg )
return - EINVAL ;
task = get_pid_task ( pid , PIDTYPE_PID ) ;
if ( ! task )
return - ESRCH ;
scoped_guard ( task_lock , task ) {
nsp = task - > nsproxy ;
if ( nsp )
get_nsproxy ( nsp ) ;
}
if ( ! nsp )
return - ESRCH ; /* just pretend it didn't exist */
/*
* We ' re trying to open a file descriptor to the namespace so perform a
* filesystem cred ptrace check . Also , we mirror nsfs behavior .
*/
if ( ! ptrace_may_access ( task , PTRACE_MODE_READ_FSCREDS ) )
return - EACCES ;
switch ( cmd ) {
/* Namespaces that hang of nsproxy. */
case PIDFD_GET_CGROUP_NAMESPACE :
2024-07-22 16:13:54 +03:00
if ( IS_ENABLED ( CONFIG_CGROUPS ) ) {
get_cgroup_ns ( nsp - > cgroup_ns ) ;
ns_common = to_ns_common ( nsp - > cgroup_ns ) ;
}
2024-06-27 17:11:42 +03:00
break ;
case PIDFD_GET_IPC_NAMESPACE :
2024-07-22 16:13:54 +03:00
if ( IS_ENABLED ( CONFIG_IPC_NS ) ) {
get_ipc_ns ( nsp - > ipc_ns ) ;
ns_common = to_ns_common ( nsp - > ipc_ns ) ;
}
2024-06-27 17:11:42 +03:00
break ;
case PIDFD_GET_MNT_NAMESPACE :
get_mnt_ns ( nsp - > mnt_ns ) ;
ns_common = to_ns_common ( nsp - > mnt_ns ) ;
break ;
case PIDFD_GET_NET_NAMESPACE :
2024-07-22 16:13:54 +03:00
if ( IS_ENABLED ( CONFIG_NET_NS ) ) {
ns_common = to_ns_common ( nsp - > net_ns ) ;
get_net_ns ( ns_common ) ;
}
2024-06-27 17:11:42 +03:00
break ;
case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE :
2024-07-22 16:13:54 +03:00
if ( IS_ENABLED ( CONFIG_PID_NS ) ) {
get_pid_ns ( nsp - > pid_ns_for_children ) ;
ns_common = to_ns_common ( nsp - > pid_ns_for_children ) ;
}
2024-06-27 17:11:42 +03:00
break ;
case PIDFD_GET_TIME_NAMESPACE :
2024-07-22 16:13:54 +03:00
if ( IS_ENABLED ( CONFIG_TIME_NS ) ) {
get_time_ns ( nsp - > time_ns ) ;
ns_common = to_ns_common ( nsp - > time_ns ) ;
}
2024-06-27 17:11:42 +03:00
break ;
case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE :
2024-07-22 16:13:54 +03:00
if ( IS_ENABLED ( CONFIG_TIME_NS ) ) {
get_time_ns ( nsp - > time_ns_for_children ) ;
ns_common = to_ns_common ( nsp - > time_ns_for_children ) ;
}
2024-06-27 17:11:42 +03:00
break ;
case PIDFD_GET_UTS_NAMESPACE :
2024-07-22 16:13:54 +03:00
if ( IS_ENABLED ( CONFIG_UTS_NS ) ) {
get_uts_ns ( nsp - > uts_ns ) ;
ns_common = to_ns_common ( nsp - > uts_ns ) ;
}
2024-06-27 17:11:42 +03:00
break ;
/* Namespaces that don't hang of nsproxy. */
case PIDFD_GET_USER_NAMESPACE :
2024-07-22 16:13:54 +03:00
if ( IS_ENABLED ( CONFIG_USER_NS ) ) {
rcu_read_lock ( ) ;
ns_common = to_ns_common ( get_user_ns ( task_cred_xxx ( task , user_ns ) ) ) ;
rcu_read_unlock ( ) ;
}
2024-06-27 17:11:42 +03:00
break ;
case PIDFD_GET_PID_NAMESPACE :
2024-07-22 16:13:54 +03:00
if ( IS_ENABLED ( CONFIG_PID_NS ) ) {
rcu_read_lock ( ) ;
ns_common = to_ns_common ( get_pid_ns ( task_active_pid_ns ( task ) ) ) ;
rcu_read_unlock ( ) ;
}
2024-06-27 17:11:42 +03:00
break ;
default :
return - ENOIOCTLCMD ;
}
2024-07-22 16:13:54 +03:00
if ( ! ns_common )
return - EOPNOTSUPP ;
2024-06-27 17:11:42 +03:00
/* open_namespace() unconditionally consumes the reference */
return open_namespace ( ns_common ) ;
}
2024-02-12 18:32:38 +03:00
static const struct file_operations pidfs_file_operations = {
2024-02-12 18:00:50 +03:00
. poll = pidfd_poll ,
# ifdef CONFIG_PROC_FS
. show_fdinfo = pidfd_show_fdinfo ,
# endif
2024-06-27 17:11:42 +03:00
. unlocked_ioctl = pidfd_ioctl ,
. compat_ioctl = compat_ptr_ioctl ,
2024-02-12 18:00:50 +03:00
} ;
2024-02-12 18:32:38 +03:00
struct pid * pidfd_pid ( const struct file * file )
{
if ( file - > f_op ! = & pidfs_file_operations )
return ERR_PTR ( - EBADF ) ;
return file_inode ( file ) - > i_private ;
}
static struct vfsmount * pidfs_mnt __ro_after_init ;
2024-03-12 12:39:44 +03:00
# if BITS_PER_LONG == 32
/*
* Provide a fallback mechanism for 32 - bit systems so processes remain
* reliably comparable by inode number even on those systems .
*/
static DEFINE_IDA ( pidfd_inum_ida ) ;
static int pidfs_inum ( struct pid * pid , unsigned long * ino )
{
int ret ;
ret = ida_alloc_range ( & pidfd_inum_ida , RESERVED_PIDS + 1 ,
UINT_MAX , GFP_ATOMIC ) ;
if ( ret < 0 )
return - ENOSPC ;
* ino = ret ;
return 0 ;
}
static inline void pidfs_free_inum ( unsigned long ino )
{
if ( ino > 0 )
ida_free ( & pidfd_inum_ida , ino ) ;
}
# else
static inline int pidfs_inum ( struct pid * pid , unsigned long * ino )
{
* ino = pid - > ino ;
return 0 ;
}
# define pidfs_free_inum(ino) ((void)(ino))
# endif
2024-02-12 18:32:38 +03:00
/*
* The vfs falls back to simple_setattr ( ) if i_op - > setattr ( ) isn ' t
* implemented . Let ' s reject it completely until we have a clean
* permission concept for pidfds .
*/
static int pidfs_setattr ( struct mnt_idmap * idmap , struct dentry * dentry ,
struct iattr * attr )
{
return - EOPNOTSUPP ;
}
2024-05-21 15:34:43 +03:00
/*
* User space expects pidfs inodes to have no file type in st_mode .
*
* In particular , ' lsof ' has this legacy logic :
*
* type = s - > st_mode & S_IFMT ;
* switch ( type ) {
* . . .
* case 0 :
* if ( ! strcmp ( p , " anon_inode " ) )
* Lf - > ntype = Ntype = N_ANON_INODE ;
*
* to detect our old anon_inode logic .
*
* Rather than mess with our internal sane inode data , just fix it
* up here in getattr ( ) by masking off the format bits .
*/
2024-02-12 18:32:38 +03:00
static int pidfs_getattr ( struct mnt_idmap * idmap , const struct path * path ,
struct kstat * stat , u32 request_mask ,
unsigned int query_flags )
{
struct inode * inode = d_inode ( path - > dentry ) ;
generic_fillattr ( & nop_mnt_idmap , request_mask , inode , stat ) ;
2024-05-21 15:34:43 +03:00
stat - > mode & = ~ S_IFMT ;
2024-02-12 18:32:38 +03:00
return 0 ;
}
static const struct inode_operations pidfs_inode_operations = {
. getattr = pidfs_getattr ,
. setattr = pidfs_setattr ,
} ;
static void pidfs_evict_inode ( struct inode * inode )
{
struct pid * pid = inode - > i_private ;
clear_inode ( inode ) ;
put_pid ( pid ) ;
2024-03-12 12:39:44 +03:00
pidfs_free_inum ( inode - > i_ino ) ;
2024-02-12 18:32:38 +03:00
}
static const struct super_operations pidfs_sops = {
. drop_inode = generic_delete_inode ,
. evict_inode = pidfs_evict_inode ,
. statfs = simple_statfs ,
} ;
2024-05-21 15:34:43 +03:00
/*
* ' lsof ' has knowledge of out historical anon_inode use , and expects
* the pidfs dentry name to start with ' anon_inode ' .
*/
2024-02-12 18:32:38 +03:00
static char * pidfs_dname ( struct dentry * dentry , char * buffer , int buflen )
{
2024-05-21 15:34:43 +03:00
return dynamic_dname ( buffer , buflen , " anon_inode:[pidfd] " ) ;
2024-02-12 18:32:38 +03:00
}
static const struct dentry_operations pidfs_dentry_operations = {
. d_delete = always_delete_dentry ,
. d_dname = pidfs_dname ,
2024-02-21 11:59:51 +03:00
. d_prune = stashed_dentry_prune ,
2024-02-12 18:32:38 +03:00
} ;
2024-03-12 12:39:44 +03:00
static int pidfs_init_inode ( struct inode * inode , void * data )
2024-03-01 12:26:03 +03:00
{
inode - > i_private = data ;
inode - > i_flags | = S_PRIVATE ;
inode - > i_mode | = S_IRWXU ;
inode - > i_op = & pidfs_inode_operations ;
inode - > i_fop = & pidfs_file_operations ;
2024-03-12 12:39:44 +03:00
/*
* Inode numbering for pidfs start at RESERVED_PIDS + 1. This
* avoids collisions with the root inode which is 1 for pseudo
* filesystems .
*/
return pidfs_inum ( data , & inode - > i_ino ) ;
2024-03-01 12:26:03 +03:00
}
static void pidfs_put_data ( void * data )
{
struct pid * pid = data ;
put_pid ( pid ) ;
}
static const struct stashed_operations pidfs_stashed_ops = {
. init_inode = pidfs_init_inode ,
. put_data = pidfs_put_data ,
} ;
2024-02-12 18:32:38 +03:00
static int pidfs_init_fs_context ( struct fs_context * fc )
{
struct pseudo_fs_context * ctx ;
ctx = init_pseudo ( fc , PID_FS_MAGIC ) ;
if ( ! ctx )
return - ENOMEM ;
ctx - > ops = & pidfs_sops ;
ctx - > dops = & pidfs_dentry_operations ;
2024-03-01 12:26:03 +03:00
fc - > s_fs_info = ( void * ) & pidfs_stashed_ops ;
2024-02-12 18:32:38 +03:00
return 0 ;
}
static struct file_system_type pidfs_type = {
. name = " pidfs " ,
. init_fs_context = pidfs_init_fs_context ,
. kill_sb = kill_anon_super ,
} ;
struct file * pidfs_alloc_file ( struct pid * pid , unsigned int flags )
{
struct file * pidfd_file ;
2024-02-19 18:30:57 +03:00
struct path path ;
int ret ;
2024-02-12 18:32:38 +03:00
2024-03-12 12:39:44 +03:00
ret = path_from_stashed ( & pid - > stashed , pidfs_mnt , get_pid ( pid ) , & path ) ;
2024-02-19 18:30:57 +03:00
if ( ret < 0 )
return ERR_PTR ( ret ) ;
pidfd_file = dentry_open ( & path , flags , current_cred ( ) ) ;
path_put ( & path ) ;
2024-02-12 18:32:38 +03:00
return pidfd_file ;
}
void __init pidfs_init ( void )
{
pidfs_mnt = kern_mount ( & pidfs_type ) ;
if ( IS_ERR ( pidfs_mnt ) )
panic ( " Failed to mount pidfs pseudo filesystem " ) ;
}