2005-04-17 02:20:36 +04:00
/*
* linux / fs / file_table . c
*
* Copyright ( C ) 1991 , 1992 Linus Torvalds
* Copyright ( C ) 1997 David S . Miller ( davem @ caip . rutgers . edu )
*/
# include <linux/string.h>
# include <linux/slab.h>
# include <linux/file.h>
2008-04-24 15:44:08 +04:00
# include <linux/fdtable.h>
2005-04-17 02:20:36 +04:00
# include <linux/init.h>
# include <linux/module.h>
# include <linux/fs.h>
# include <linux/security.h>
2009-02-04 17:06:57 +03:00
# include <linux/ima.h>
2005-04-17 02:20:36 +04:00
# include <linux/eventpoll.h>
2005-09-10 00:04:13 +04:00
# include <linux/rcupdate.h>
2005-04-17 02:20:36 +04:00
# include <linux/mount.h>
2006-01-11 23:17:46 +03:00
# include <linux/capability.h>
2005-04-17 02:20:36 +04:00
# include <linux/cdev.h>
[PATCH] inotify
inotify is intended to correct the deficiencies of dnotify, particularly
its inability to scale and its terrible user interface:
* dnotify requires the opening of one fd per each directory
that you intend to watch. This quickly results in too many
open files and pins removable media, preventing unmount.
* dnotify is directory-based. You only learn about changes to
directories. Sure, a change to a file in a directory affects
the directory, but you are then forced to keep a cache of
stat structures.
* dnotify's interface to user-space is awful. Signals?
inotify provides a more usable, simple, powerful solution to file change
notification:
* inotify's interface is a system call that returns a fd, not SIGIO.
You get a single fd, which is select()-able.
* inotify has an event that says "the filesystem that the item
you were watching is on was unmounted."
* inotify can watch directories or files.
Inotify is currently used by Beagle (a desktop search infrastructure),
Gamin (a FAM replacement), and other projects.
See Documentation/filesystems/inotify.txt.
Signed-off-by: Robert Love <rml@novell.com>
Cc: John McCutchan <ttb@tentacle.dhs.org>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-07-13 01:06:03 +04:00
# include <linux/fsnotify.h>
2006-03-08 08:55:35 +03:00
# include <linux/sysctl.h>
# include <linux/percpu_counter.h>
# include <asm/atomic.h>
2005-04-17 02:20:36 +04:00
/*
 * sysctl tunables: system-wide open-file statistics and limit.
 * max_files starts out as NR_FILE; files_init() may raise it, and
 * proc_nr_files() refreshes the nr_files member before reporting it.
 */
struct files_stat_struct files_stat = {
. max_files = NR_FILE
} ;
/*
 * Public (non-static) spinlock. Not pretty!
 * NOTE(review): presumably the lock behind file_list_lock() /
 * file_list_unlock() -- confirm against the header that defines them.
 */
2006-03-08 08:55:35 +03:00
__cacheline_aligned_in_smp DEFINE_SPINLOCK ( files_lock ) ;
2005-04-17 02:20:36 +04:00
2008-12-10 20:35:45 +03:00
/* SLAB cache for file structures; created in files_init(). */
static struct kmem_cache * filp_cachep __read_mostly ;
2006-03-08 08:55:35 +03:00
/*
 * Per-cpu count of allocated file structures: incremented in
 * get_empty_filp(), decremented in file_free().
 */
static struct percpu_counter nr_files __cacheline_aligned_in_smp ;
2005-04-17 02:20:36 +04:00
2006-03-08 08:55:35 +03:00
static inline void file_free_rcu ( struct rcu_head * head )
2005-04-17 02:20:36 +04:00
{
2008-11-14 02:39:25 +03:00
struct file * f = container_of ( head , struct file , f_u . fu_rcuhead ) ;
put_cred ( f - > f_cred ) ;
2006-03-08 08:55:35 +03:00
kmem_cache_free ( filp_cachep , f ) ;
2005-04-17 02:20:36 +04:00
}
2006-03-08 08:55:35 +03:00
static inline void file_free ( struct file * f )
2005-04-17 02:20:36 +04:00
{
2006-03-08 08:55:35 +03:00
percpu_counter_dec ( & nr_files ) ;
2008-02-16 01:38:01 +03:00
file_check_state ( f ) ;
2006-03-08 08:55:35 +03:00
call_rcu ( & f - > f_u . fu_rcuhead , file_free_rcu ) ;
2005-04-17 02:20:36 +04:00
}
2006-03-08 08:55:35 +03:00
/*
* Return the total number of open files in the system
*/
static int get_nr_files ( void )
2005-04-17 02:20:36 +04:00
{
2006-03-08 08:55:35 +03:00
return percpu_counter_read_positive ( & nr_files ) ;
2005-04-17 02:20:36 +04:00
}
2006-03-08 08:55:35 +03:00
/*
* Return the maximum number of open files in the system
*/
int get_max_files ( void )
2005-09-10 00:04:13 +04:00
{
2006-03-08 08:55:35 +03:00
return files_stat . max_files ;
2005-09-10 00:04:13 +04:00
}
2006-03-08 08:55:35 +03:00
EXPORT_SYMBOL_GPL ( get_max_files ) ;
/*
 * Handle the nr_files sysctl.
 *
 * With both CONFIG_SYSCTL and CONFIG_PROC_FS enabled, refresh
 * files_stat.nr_files from the per-cpu counter and let proc_dointvec()
 * do the transfer; otherwise the handler just reports -ENOSYS.
 */
int proc_nr_files(ctl_table *table, int write, struct file *filp,
		  void __user *buffer, size_t *lenp, loff_t *ppos)
{
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
	files_stat.nr_files = get_nr_files();
	return proc_dointvec(table, write, filp, buffer, lenp, ppos);
#else
	return -ENOSYS;
#endif
}
2005-09-10 00:04:13 +04:00
2005-04-17 02:20:36 +04:00
/* Find an unused file structure and return a pointer to it.
* Returns NULL , if there are no more free file structures or
* we run out of memory .
2008-02-16 01:37:26 +03:00
*
* Be very careful using this . You are responsible for
* getting write access to any mount that you might assign
* to this filp , if it is opened for write . If this is not
* done , you will imbalance int the mount ' s writer count
* and a warning at __fput ( ) time .
2005-04-17 02:20:36 +04:00
*/
struct file * get_empty_filp ( void )
{
2008-11-14 02:39:18 +03:00
const struct cred * cred = current_cred ( ) ;
2005-06-23 11:09:50 +04:00
static int old_max ;
2005-04-17 02:20:36 +04:00
struct file * f ;
/*
* Privileged users can go above max_files
*/
2006-03-08 08:55:35 +03:00
if ( get_nr_files ( ) > = files_stat . max_files & & ! capable ( CAP_SYS_ADMIN ) ) {
/*
* percpu_counters are inaccurate . Do an expensive check before
* we go and fail .
*/
2007-10-17 10:25:44 +04:00
if ( percpu_counter_sum_positive ( & nr_files ) > = files_stat . max_files )
2006-03-08 08:55:35 +03:00
goto over ;
}
2005-06-23 11:09:50 +04:00
2007-10-17 10:26:19 +04:00
f = kmem_cache_zalloc ( filp_cachep , GFP_KERNEL ) ;
2005-06-23 11:09:50 +04:00
if ( f = = NULL )
goto fail ;
2006-03-08 08:55:35 +03:00
percpu_counter_inc ( & nr_files ) ;
2005-06-23 11:09:50 +04:00
if ( security_file_alloc ( f ) )
goto fail_sec ;
2005-04-17 02:20:36 +04:00
2006-03-23 14:01:03 +03:00
INIT_LIST_HEAD ( & f - > f_u . fu_list ) ;
2008-07-26 08:39:17 +04:00
atomic_long_set ( & f - > f_count , 1 ) ;
2005-06-23 11:09:50 +04:00
rwlock_init ( & f - > f_owner . lock ) ;
2008-11-14 02:39:25 +03:00
f - > f_cred = get_cred ( cred ) ;
2009-02-06 23:52:43 +03:00
spin_lock_init ( & f - > f_lock ) ;
2006-03-23 14:01:03 +03:00
eventpoll_init_file ( f ) ;
2005-06-23 11:09:50 +04:00
/* f->f_version: 0 */
return f ;
over :
2005-04-17 02:20:36 +04:00
/* Ran out of filps - report that */
2006-03-08 08:55:35 +03:00
if ( get_nr_files ( ) > old_max ) {
2005-04-17 02:20:36 +04:00
printk ( KERN_INFO " VFS: file-max limit %d reached \n " ,
2006-03-08 08:55:35 +03:00
get_max_files ( ) ) ;
old_max = get_nr_files ( ) ;
2005-04-17 02:20:36 +04:00
}
2005-06-23 11:09:50 +04:00
goto fail ;
fail_sec :
file_free ( f ) ;
2005-04-17 02:20:36 +04:00
fail :
return NULL ;
}
EXPORT_SYMBOL ( get_empty_filp ) ;
2007-10-17 10:31:13 +04:00
/**
* alloc_file - allocate and initialize a ' struct file '
* @ mnt : the vfsmount on which the file will reside
* @ dentry : the dentry representing the new file
* @ mode : the mode with which the new file will be opened
* @ fop : the ' struct file_operations ' for the new file
*
* Use this instead of get_empty_filp ( ) to get a new
* ' struct file ' . Do so because of the same initialization
* pitfalls reasons listed for init_file ( ) . This is a
* preferred interface to using init_file ( ) .
*
* If all the callers of init_file ( ) are eliminated , its
* code should be moved into this function .
*/
struct file * alloc_file ( struct vfsmount * mnt , struct dentry * dentry ,
2008-09-02 23:28:45 +04:00
fmode_t mode , const struct file_operations * fop )
2007-10-17 10:31:13 +04:00
{
struct file * file ;
file = get_empty_filp ( ) ;
if ( ! file )
return NULL ;
init_file ( file , mnt , dentry , mode , fop ) ;
return file ;
}
EXPORT_SYMBOL ( alloc_file ) ;
/**
* init_file - initialize a ' struct file '
* @ file : the already allocated ' struct file ' to initialized
* @ mnt : the vfsmount on which the file resides
* @ dentry : the dentry representing this file
* @ mode : the mode the file is opened with
* @ fop : the ' struct file_operations ' for this file
*
* Use this instead of setting the members directly . Doing so
* avoids making mistakes like forgetting the mntget ( ) or
* forgetting to take a write on the mnt .
*
* Note : This is a crappy interface . It is here to make
* merging with the existing users of get_empty_filp ( )
* who have complex failure logic easier . All users
* of this should be moving to alloc_file ( ) .
*/
int init_file ( struct file * file , struct vfsmount * mnt , struct dentry * dentry ,
2008-09-02 23:28:45 +04:00
fmode_t mode , const struct file_operations * fop )
2007-10-17 10:31:13 +04:00
{
int error = 0 ;
file - > f_path . dentry = dentry ;
file - > f_path . mnt = mntget ( mnt ) ;
file - > f_mapping = dentry - > d_inode - > i_mapping ;
file - > f_mode = mode ;
file - > f_op = fop ;
2008-02-16 01:37:48 +03:00
/*
* These mounts don ' t really matter in practice
* for r / o bind mounts . They aren ' t userspace -
* visible . We do this for consistency , and so
* that we can do debugging checks at __fput ( )
*/
if ( ( mode & FMODE_WRITE ) & & ! special_file ( dentry - > d_inode - > i_mode ) ) {
2008-02-16 01:38:01 +03:00
file_take_write ( file ) ;
2009-04-26 14:25:55 +04:00
error = mnt_clone_write ( mnt ) ;
2008-02-16 01:37:48 +03:00
WARN_ON ( error ) ;
}
2007-10-17 10:31:13 +04:00
return error ;
}
EXPORT_SYMBOL ( init_file ) ;
2008-02-08 15:19:52 +03:00
void fput ( struct file * file )
2005-04-17 02:20:36 +04:00
{
2008-07-26 08:39:17 +04:00
if ( atomic_long_dec_and_test ( & file - > f_count ) )
2005-04-17 02:20:36 +04:00
__fput ( file ) ;
}
EXPORT_SYMBOL ( fput ) ;
2008-02-16 01:37:31 +03:00
/**
* drop_file_write_access - give up ability to write to a file
* @ file : the file to which we will stop writing
*
* This is a central place which will give up the ability
* to write to @ file , along with access to write through
* its vfsmount .
*/
void drop_file_write_access ( struct file * file )
{
2008-02-16 01:37:48 +03:00
struct vfsmount * mnt = file - > f_path . mnt ;
2008-02-16 01:37:31 +03:00
struct dentry * dentry = file - > f_path . dentry ;
struct inode * inode = dentry - > d_inode ;
put_write_access ( inode ) ;
2008-02-16 01:38:01 +03:00
if ( special_file ( inode - > i_mode ) )
return ;
if ( file_check_writeable ( file ) ! = 0 )
return ;
mnt_drop_write ( mnt ) ;
file_release_write ( file ) ;
2008-02-16 01:37:31 +03:00
}
EXPORT_SYMBOL_GPL ( drop_file_write_access ) ;
2005-04-17 02:20:36 +04:00
/* __fput is called from task context when aio completion releases the last
* last use of a struct file * . Do not use otherwise .
*/
2008-02-08 15:19:52 +03:00
void __fput ( struct file * file )
2005-04-17 02:20:36 +04:00
{
2006-12-08 13:36:35 +03:00
struct dentry * dentry = file - > f_path . dentry ;
struct vfsmount * mnt = file - > f_path . mnt ;
2005-04-17 02:20:36 +04:00
struct inode * inode = dentry - > d_inode ;
might_sleep ( ) ;
[PATCH] inotify
inotify is intended to correct the deficiencies of dnotify, particularly
its inability to scale and its terrible user interface:
* dnotify requires the opening of one fd per each directory
that you intend to watch. This quickly results in too many
open files and pins removable media, preventing unmount.
* dnotify is directory-based. You only learn about changes to
directories. Sure, a change to a file in a directory affects
the directory, but you are then forced to keep a cache of
stat structures.
* dnotify's interface to user-space is awful. Signals?
inotify provides a more usable, simple, powerful solution to file change
notification:
* inotify's interface is a system call that returns a fd, not SIGIO.
You get a single fd, which is select()-able.
* inotify has an event that says "the filesystem that the item
you were watching is on was unmounted."
* inotify can watch directories or files.
Inotify is currently used by Beagle (a desktop search infrastructure),
Gamin (a FAM replacement), and other projects.
See Documentation/filesystems/inotify.txt.
Signed-off-by: Robert Love <rml@novell.com>
Cc: John McCutchan <ttb@tentacle.dhs.org>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-07-13 01:06:03 +04:00
fsnotify_close ( file ) ;
2005-04-17 02:20:36 +04:00
/*
* The function eventpoll_release ( ) should be the first called
* in the file cleanup chain .
*/
eventpoll_release ( file ) ;
locks_remove_flock ( file ) ;
2008-11-01 02:28:30 +03:00
if ( unlikely ( file - > f_flags & FASYNC ) ) {
if ( file - > f_op & & file - > f_op - > fasync )
file - > f_op - > fasync ( - 1 , file , 0 ) ;
}
2005-04-17 02:20:36 +04:00
if ( file - > f_op & & file - > f_op - > release )
file - > f_op - > release ( inode , file ) ;
security_file_free ( file ) ;
2009-02-04 17:06:57 +03:00
ima_file_free ( file ) ;
2006-09-27 12:50:49 +04:00
if ( unlikely ( S_ISCHR ( inode - > i_mode ) & & inode - > i_cdev ! = NULL ) )
2005-04-17 02:20:36 +04:00
cdev_put ( inode - > i_cdev ) ;
fops_put ( file - > f_op ) ;
2006-10-02 13:17:15 +04:00
put_pid ( file - > f_owner . pid ) ;
2005-04-17 02:20:36 +04:00
file_kill ( file ) ;
2008-02-16 01:37:31 +03:00
if ( file - > f_mode & FMODE_WRITE )
drop_file_write_access ( file ) ;
2006-12-08 13:36:35 +03:00
file - > f_path . dentry = NULL ;
file - > f_path . mnt = NULL ;
2005-04-17 02:20:36 +04:00
file_free ( file ) ;
dput ( dentry ) ;
mntput ( mnt ) ;
}
2008-02-08 15:19:52 +03:00
struct file * fget ( unsigned int fd )
2005-04-17 02:20:36 +04:00
{
struct file * file ;
struct files_struct * files = current - > files ;
2005-09-10 00:04:13 +04:00
rcu_read_lock ( ) ;
2005-04-17 02:20:36 +04:00
file = fcheck_files ( files , fd ) ;
2005-09-10 00:04:13 +04:00
if ( file ) {
2008-07-26 08:39:17 +04:00
if ( ! atomic_long_inc_not_zero ( & file - > f_count ) ) {
2005-09-10 00:04:13 +04:00
/* File object ref couldn't be taken */
rcu_read_unlock ( ) ;
return NULL ;
}
}
rcu_read_unlock ( ) ;
2005-04-17 02:20:36 +04:00
return file ;
}
EXPORT_SYMBOL ( fget ) ;
/*
* Lightweight file lookup - no refcnt increment if fd table isn ' t shared .
* You can use this only if it is guranteed that the current task already
* holds a refcnt to that file . That check has to be done at fget ( ) only
* and a flag is returned to be passed to the corresponding fput_light ( ) .
* There must not be a cloning between an fget_light / fput_light pair .
*/
2008-02-08 15:19:52 +03:00
struct file * fget_light ( unsigned int fd , int * fput_needed )
2005-04-17 02:20:36 +04:00
{
struct file * file ;
struct files_struct * files = current - > files ;
* fput_needed = 0 ;
if ( likely ( ( atomic_read ( & files - > count ) = = 1 ) ) ) {
file = fcheck_files ( files , fd ) ;
} else {
2005-09-10 00:04:13 +04:00
rcu_read_lock ( ) ;
2005-04-17 02:20:36 +04:00
file = fcheck_files ( files , fd ) ;
if ( file ) {
2008-07-26 08:39:17 +04:00
if ( atomic_long_inc_not_zero ( & file - > f_count ) )
2005-09-10 00:04:13 +04:00
* fput_needed = 1 ;
else
/* Didn't get the reference, someone's freed */
file = NULL ;
2005-04-17 02:20:36 +04:00
}
2005-09-10 00:04:13 +04:00
rcu_read_unlock ( ) ;
2005-04-17 02:20:36 +04:00
}
2005-09-10 00:04:13 +04:00
2005-04-17 02:20:36 +04:00
return file ;
}
void put_filp ( struct file * file )
{
2008-07-26 08:39:17 +04:00
if ( atomic_long_dec_and_test ( & file - > f_count ) ) {
2005-04-17 02:20:36 +04:00
security_file_free ( file ) ;
file_kill ( file ) ;
file_free ( file ) ;
}
}
void file_move ( struct file * file , struct list_head * list )
{
if ( ! list )
return ;
file_list_lock ( ) ;
2005-10-31 02:02:16 +03:00
list_move ( & file - > f_u . fu_list , list ) ;
2005-04-17 02:20:36 +04:00
file_list_unlock ( ) ;
}
void file_kill ( struct file * file )
{
2005-10-31 02:02:16 +03:00
if ( ! list_empty ( & file - > f_u . fu_list ) ) {
2005-04-17 02:20:36 +04:00
file_list_lock ( ) ;
2005-10-31 02:02:16 +03:00
list_del_init ( & file - > f_u . fu_list ) ;
2005-04-17 02:20:36 +04:00
file_list_unlock ( ) ;
}
}
int fs_may_remount_ro ( struct super_block * sb )
{
2007-10-19 10:39:56 +04:00
struct file * file ;
2005-04-17 02:20:36 +04:00
/* Check that no files are currently opened for writing. */
file_list_lock ( ) ;
2007-10-19 10:39:56 +04:00
list_for_each_entry ( file , & sb - > s_files , f_u . fu_list ) {
2006-12-08 13:36:35 +03:00
struct inode * inode = file - > f_path . dentry - > d_inode ;
2005-04-17 02:20:36 +04:00
/* File with pending delete? */
if ( inode - > i_nlink = = 0 )
goto too_bad ;
/* Writeable file? */
if ( S_ISREG ( inode - > i_mode ) & & ( file - > f_mode & FMODE_WRITE ) )
goto too_bad ;
}
file_list_unlock ( ) ;
return 1 ; /* Tis' cool bro. */
too_bad :
file_list_unlock ( ) ;
return 0 ;
}
2009-04-26 14:25:56 +04:00
/**
* mark_files_ro - mark all files read - only
* @ sb : superblock in question
*
* All files are marked read - only . We don ' t care about pending
* delete files so this should be used in ' force ' mode only .
*/
void mark_files_ro ( struct super_block * sb )
{
struct file * f ;
retry :
file_list_lock ( ) ;
list_for_each_entry ( f , & sb - > s_files , f_u . fu_list ) {
struct vfsmount * mnt ;
if ( ! S_ISREG ( f - > f_path . dentry - > d_inode - > i_mode ) )
continue ;
if ( ! file_count ( f ) )
continue ;
if ( ! ( f - > f_mode & FMODE_WRITE ) )
continue ;
f - > f_mode & = ~ FMODE_WRITE ;
if ( file_check_writeable ( f ) ! = 0 )
continue ;
file_release_write ( f ) ;
mnt = mntget ( f - > f_path . mnt ) ;
file_list_unlock ( ) ;
/*
* This can sleep , so we can ' t hold
* the file_list_lock ( ) spinlock .
*/
mnt_drop_write ( mnt ) ;
mntput ( mnt ) ;
goto retry ;
}
file_list_unlock ( ) ;
}
2005-04-17 02:20:36 +04:00
/*
 * Boot-time setup for the file table: create the "filp" slab cache,
 * size files_stat.max_files from @mempages, run files_defer_init() and
 * initialize the nr_files per-cpu counter to zero.
 */
void __init files_init(unsigned long mempages)
{
	int file_limit;

	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
					SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

	/*
	 * One file with associated inode and dcache is very roughly 1K.
	 * Per default don't use more than 10% of our memory for files.
	 * NR_FILE remains the lower bound.
	 */
	file_limit = (mempages * (PAGE_SIZE / 1024)) / 10;
	files_stat.max_files = file_limit;
	if (files_stat.max_files < NR_FILE)
		files_stat.max_files = NR_FILE;

	files_defer_init();
	percpu_counter_init(&nr_files, 0);
}