2024-02-12 16:00:50 +01:00
// SPDX-License-Identifier: GPL-2.0
2024-02-12 16:32:38 +01:00
# include <linux/anon_inodes.h>
2024-02-12 16:00:50 +01:00
# include <linux/file.h>
# include <linux/fs.h>
# include <linux/magic.h>
# include <linux/mount.h>
# include <linux/pid.h>
2024-02-12 16:32:38 +01:00
# include <linux/pidfs.h>
2024-02-12 16:00:50 +01:00
# include <linux/pid_namespace.h>
# include <linux/poll.h>
# include <linux/proc_fs.h>
# include <linux/proc_ns.h>
# include <linux/pseudo_fs.h>
# include <linux/seq_file.h>
# include <uapi/linux/pidfd.h>
2024-02-19 16:30:57 +01:00
# include "internal.h"
2024-02-12 16:00:50 +01:00
# ifdef CONFIG_PROC_FS
/**
* pidfd_show_fdinfo - print information about a pidfd
* @ m : proc fdinfo file
* @ f : file referencing a pidfd
*
* Pid :
* This function will print the pid that a given pidfd refers to in the
* pid namespace of the procfs instance .
* If the pid namespace of the process is not a descendant of the pid
* namespace of the procfs instance 0 will be shown as its pid . This is
* similar to calling getppid ( ) on a process whose parent is outside of
* its pid namespace .
*
* NSpid :
* If pid namespaces are supported then this function will also print
* the pid of a given pidfd refers to for all descendant pid namespaces
* starting from the current pid namespace of the instance , i . e . the
* Pid field and the first entry in the NSpid field will be identical .
* If the pid namespace of the process is not a descendant of the pid
* namespace of the procfs instance 0 will be shown as its first NSpid
* entry and no others will be shown .
* Note that this differs from the Pid and NSpid fields in
* / proc / < pid > / status where Pid and NSpid are always shown relative to
* the pid namespace of the procfs instance . The difference becomes
* obvious when sending around a pidfd between pid namespaces from a
* different branch of the tree , i . e . where no ancestral relation is
* present between the pid namespaces :
* - create two new pid namespaces ns1 and ns2 in the initial pid
* namespace ( also take care to create new mount namespaces in the
* new pid namespace and mount procfs )
* - create a process with a pidfd in ns1
* - send pidfd from ns1 to ns2
* - read / proc / self / fdinfo / < pidfd > and observe that both Pid and NSpid
* have exactly one entry , which is 0
*/
static void pidfd_show_fdinfo ( struct seq_file * m , struct file * f )
{
2024-02-12 16:32:38 +01:00
struct pid * pid = pidfd_pid ( f ) ;
2024-02-12 16:00:50 +01:00
struct pid_namespace * ns ;
pid_t nr = - 1 ;
if ( likely ( pid_has_task ( pid , PIDTYPE_PID ) ) ) {
ns = proc_pid_ns ( file_inode ( m - > file ) - > i_sb ) ;
nr = pid_nr_ns ( pid , ns ) ;
}
seq_put_decimal_ll ( m , " Pid: \t " , nr ) ;
# ifdef CONFIG_PID_NS
seq_put_decimal_ll ( m , " \n NSpid: \t " , nr ) ;
if ( nr > 0 ) {
int i ;
/* If nr is non-zero it means that 'pid' is valid and that
* ns , i . e . the pid namespace associated with the procfs
* instance , is in the pid namespace hierarchy of pid .
* Start at one below the already printed level .
*/
for ( i = ns - > level + 1 ; i < = pid - > level ; i + + )
seq_put_decimal_ll ( m , " \t " , pid - > numbers [ i ] . nr ) ;
}
# endif
seq_putc ( m , ' \n ' ) ;
}
# endif
/*
* Poll support for process exit notification .
*/
static __poll_t pidfd_poll ( struct file * file , struct poll_table_struct * pts )
{
2024-02-12 16:32:38 +01:00
struct pid * pid = pidfd_pid ( file ) ;
2024-02-12 16:00:50 +01:00
bool thread = file - > f_flags & PIDFD_THREAD ;
struct task_struct * task ;
__poll_t poll_flags = 0 ;
poll_wait ( file , & pid - > wait_pidfd , pts ) ;
/*
* Depending on PIDFD_THREAD , inform pollers when the thread
* or the whole thread - group exits .
*/
guard ( rcu ) ( ) ;
task = pid_task ( pid , PIDTYPE_PID ) ;
if ( ! task )
poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP ;
else if ( task - > exit_state & & ( thread | | thread_group_empty ( task ) ) )
poll_flags = EPOLLIN | EPOLLRDNORM ;
return poll_flags ;
}
2024-02-12 16:32:38 +01:00
static const struct file_operations pidfs_file_operations = {
2024-02-12 16:00:50 +01:00
. poll = pidfd_poll ,
# ifdef CONFIG_PROC_FS
. show_fdinfo = pidfd_show_fdinfo ,
# endif
} ;
2024-02-12 16:32:38 +01:00
struct pid * pidfd_pid ( const struct file * file )
{
if ( file - > f_op ! = & pidfs_file_operations )
return ERR_PTR ( - EBADF ) ;
return file_inode ( file ) - > i_private ;
}
/*
 * Mount of the pidfs pseudo filesystem. Assigned in pidfs_init() from
 * kern_mount() and handed to path_from_stashed() by pidfs_alloc_file().
 */
static struct vfsmount * pidfs_mnt __ro_after_init ;
2024-03-12 10:39:44 +01:00
# if BITS_PER_LONG == 32
/*
 * Provide a fallback mechanism for 32-bit systems so processes remain
 * reliably comparable by inode number even on those systems.
 */
static DEFINE_IDA(pidfd_inum_ida);

/*
 * Allocate an inode number from the ida in the range
 * [RESERVED_PIDS + 1, UINT_MAX] and store it in @ino.
 *
 * Returns 0 on success. Any allocation failure is reported as -ENOSPC
 * and leaves @ino untouched. @pid is unused in this variant.
 */
static int pidfs_inum(struct pid *pid, unsigned long *ino)
{
	int id = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1,
				 UINT_MAX, GFP_ATOMIC);

	if (id >= 0) {
		*ino = id;
		return 0;
	}

	return -ENOSPC;
}
/*
 * Give an inode number back to the ida. An inode number of 0 is
 * ignored and nothing is freed for it.
 */
static inline void pidfs_free_inum(unsigned long ino)
{
	if (ino == 0)
		return;

	ida_free(&pidfd_inum_ida, ino);
}
# else
static inline int pidfs_inum ( struct pid * pid , unsigned long * ino )
{
* ino = pid - > ino ;
return 0 ;
}
# define pidfs_free_inum(ino) ((void)(ino))
# endif
2024-02-12 16:32:38 +01:00
/*
 * Without an i_op->setattr() implementation the vfs would fall back to
 * simple_setattr(). Refuse every attribute change instead until there
 * is a clean permission concept for pidfds.
 *
 * Always returns -EOPNOTSUPP; none of the arguments are looked at.
 */
static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			 struct iattr *attr)
{
	return -EOPNOTSUPP;
}
2024-05-21 14:34:43 +02:00
/*
* User space expects pidfs inodes to have no file type in st_mode .
*
* In particular , ' lsof ' has this legacy logic :
*
* type = s - > st_mode & S_IFMT ;
* switch ( type ) {
* . . .
* case 0 :
* if ( ! strcmp ( p , " anon_inode " ) )
* Lf - > ntype = Ntype = N_ANON_INODE ;
*
* to detect our old anon_inode logic .
*
* Rather than mess with our internal sane inode data , just fix it
* up here in getattr ( ) by masking off the format bits .
*/
2024-02-12 16:32:38 +01:00
static int pidfs_getattr ( struct mnt_idmap * idmap , const struct path * path ,
struct kstat * stat , u32 request_mask ,
unsigned int query_flags )
{
struct inode * inode = d_inode ( path - > dentry ) ;
generic_fillattr ( & nop_mnt_idmap , request_mask , inode , stat ) ;
2024-05-21 14:34:43 +02:00
stat - > mode & = ~ S_IFMT ;
2024-02-12 16:32:38 +01:00
return 0 ;
}
/*
 * Inode operations installed on pidfs inodes by pidfs_init_inode():
 * attribute changes are rejected by pidfs_setattr(), attribute reads go
 * through pidfs_getattr().
 */
static const struct inode_operations pidfs_inode_operations = {
	.setattr	= pidfs_setattr,
	.getattr	= pidfs_getattr,
};
static void pidfs_evict_inode ( struct inode * inode )
{
struct pid * pid = inode - > i_private ;
clear_inode ( inode ) ;
put_pid ( pid ) ;
2024-03-12 10:39:44 +01:00
pidfs_free_inum ( inode - > i_ino ) ;
2024-02-12 16:32:38 +01:00
}
/* Superblock operations, installed by pidfs_init_fs_context(). */
static const struct super_operations pidfs_sops = {
	.statfs		= simple_statfs,
	.evict_inode	= pidfs_evict_inode,
	.drop_inode	= generic_delete_inode,
};
2024-05-21 14:34:43 +02:00
/*
 * 'lsof' has knowledge of our historical anon_inode use, and expects
 * the pidfs dentry name to start with 'anon_inode'.
 *
 * The name is the same fixed string for every pidfs dentry; @dentry is
 * not consulted.
 */
static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	char *name = dynamic_dname(buffer, buflen, "anon_inode:[pidfd]");

	return name;
}
static const struct dentry_operations pidfs_dentry_operations = {
. d_delete = always_delete_dentry ,
. d_dname = pidfs_dname ,
2024-02-21 09:59:51 +01:00
. d_prune = stashed_dentry_prune ,
2024-02-12 16:32:38 +01:00
} ;
2024-03-12 10:39:44 +01:00
static int pidfs_init_inode ( struct inode * inode , void * data )
2024-03-01 10:26:03 +01:00
{
inode - > i_private = data ;
inode - > i_flags | = S_PRIVATE ;
inode - > i_mode | = S_IRWXU ;
inode - > i_op = & pidfs_inode_operations ;
inode - > i_fop = & pidfs_file_operations ;
2024-03-12 10:39:44 +01:00
/*
* Inode numbering for pidfs start at RESERVED_PIDS + 1. This
* avoids collisions with the root inode which is 1 for pseudo
* filesystems .
*/
return pidfs_inum ( data , & inode - > i_ino ) ;
2024-03-01 10:26:03 +01:00
}
/* Stashed-ops callback: @data is a struct pid, drop one reference on it. */
static void pidfs_put_data(void *data)
{
	put_pid(data);
}
/*
 * Stashed operations for pidfs; pidfs_init_fs_context() publishes them
 * through fc->s_fs_info.
 */
static const struct stashed_operations pidfs_stashed_ops = {
	.put_data	= pidfs_put_data,
	.init_inode	= pidfs_init_inode,
};
2024-02-12 16:32:38 +01:00
static int pidfs_init_fs_context ( struct fs_context * fc )
{
struct pseudo_fs_context * ctx ;
ctx = init_pseudo ( fc , PID_FS_MAGIC ) ;
if ( ! ctx )
return - ENOMEM ;
ctx - > ops = & pidfs_sops ;
ctx - > dops = & pidfs_dentry_operations ;
2024-03-01 10:26:03 +01:00
fc - > s_fs_info = ( void * ) & pidfs_stashed_ops ;
2024-02-12 16:32:38 +01:00
return 0 ;
}
/* Filesystem type of pidfs; mounted by pidfs_init() via kern_mount(). */
static struct file_system_type pidfs_type = {
	.kill_sb		= kill_anon_super,
	.init_fs_context	= pidfs_init_fs_context,
	.name			= "pidfs",
};
struct file * pidfs_alloc_file ( struct pid * pid , unsigned int flags )
{
struct file * pidfd_file ;
2024-02-19 16:30:57 +01:00
struct path path ;
int ret ;
2024-02-12 16:32:38 +01:00
2024-03-12 10:39:44 +01:00
ret = path_from_stashed ( & pid - > stashed , pidfs_mnt , get_pid ( pid ) , & path ) ;
2024-02-19 16:30:57 +01:00
if ( ret < 0 )
return ERR_PTR ( ret ) ;
pidfd_file = dentry_open ( & path , flags , current_cred ( ) ) ;
path_put ( & path ) ;
2024-02-12 16:32:38 +01:00
return pidfd_file ;
}
/**
 * pidfs_init - mount the pidfs pseudo filesystem
 *
 * Stores the result of kern_mount() in pidfs_mnt and panics if the
 * mount failed.
 */
void __init pidfs_init(void)
{
	pidfs_mnt = kern_mount(&pidfs_type);
	if (!IS_ERR(pidfs_mnt))
		return;

	panic("Failed to mount pidfs pseudo filesystem");
}