2005-09-09 13:10:28 -07:00
/*
FUSE : Filesystem in Userspace
2008-11-26 12:03:54 +01:00
Copyright ( C ) 2001 - 2008 Miklos Szeredi < miklos @ szeredi . hu >
2005-09-09 13:10:28 -07:00
This program can be distributed under the terms of the GNU GPL .
See the file COPYING .
*/
# include "fuse_i.h"
# include <linux/pagemap.h>
# include <linux/file.h>
# include <linux/sched.h>
# include <linux/namei.h>
2010-12-07 20:16:56 +01:00
# include <linux/slab.h>
2016-08-29 08:46:36 -05:00
# include <linux/xattr.h>
2016-08-29 08:46:37 -05:00
# include <linux/posix_acl.h>
2005-09-09 13:10:28 -07:00
2013-05-18 03:03:58 -04:00
static bool fuse_use_readdirplus ( struct inode * dir , struct dir_context * ctx )
2013-01-15 11:23:28 +08:00
{
struct fuse_conn * fc = get_fuse_conn ( dir ) ;
struct fuse_inode * fi = get_fuse_inode ( dir ) ;
if ( ! fc - > do_readdirplus )
return false ;
2013-02-06 22:29:01 +00:00
if ( ! fc - > readdirplus_auto )
return true ;
2013-01-15 11:23:28 +08:00
if ( test_and_clear_bit ( FUSE_I_ADVISE_RDPLUS , & fi - > state ) )
return true ;
2013-05-18 03:03:58 -04:00
if ( ctx - > pos = = 0 )
2013-01-15 11:23:28 +08:00
return true ;
return false ;
}
static void fuse_advise_use_readdirplus ( struct inode * dir )
{
struct fuse_inode * fi = get_fuse_inode ( dir ) ;
set_bit ( FUSE_I_ADVISE_RDPLUS , & fi - > state ) ;
}
2016-10-01 07:32:32 +02:00
/*
 * Private per-dentry data, allocated by fuse_dentry_init() and stored in
 * dentry->d_fsdata.
 */
union fuse_dentry {
/* Entry validity time, accessed via fuse_dentry_time() / fuse_dentry_settime() */
u64 time ;
/* Passed to kfree_rcu() by fuse_dentry_release() */
struct rcu_head rcu ;
} ;
2006-07-30 03:04:10 -07:00
/* Store @time in the union fuse_dentry hanging off @entry->d_fsdata. */
static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
{
	union fuse_dentry *fd = entry->d_fsdata;

	fd->time = time;
}
static inline u64 fuse_dentry_time ( struct dentry * entry )
{
2016-10-01 07:32:32 +02:00
return ( ( union fuse_dentry * ) entry - > d_fsdata ) - > time ;
2006-07-30 03:04:10 -07:00
}
2006-01-06 00:19:39 -08:00
/*
 * FUSE caches dentries and attributes with separate timeouts.  The
 * time in jiffies until the dentry/attributes are valid is stored in
 * dentry->d_fsdata and fuse_inode->i_time respectively.
 */
/*
* Calculate the time in jiffies until a dentry / attributes are valid
*/
2016-10-01 07:32:32 +02:00
static u64 time_to_jiffies ( u64 sec , u32 nsec )
2005-09-09 13:10:28 -07:00
{
2006-07-30 03:04:08 -07:00
if ( sec | | nsec ) {
2016-10-01 07:32:32 +02:00
struct timespec64 ts = {
sec ,
2017-01-13 15:58:30 +00:00
min_t ( u32 , nsec , NSEC_PER_SEC - 1 )
2016-10-01 07:32:32 +02:00
} ;
return get_jiffies_64 ( ) + timespec64_to_jiffies ( & ts ) ;
2006-07-30 03:04:08 -07:00
} else
2006-07-30 03:04:10 -07:00
return 0 ;
2005-09-09 13:10:28 -07:00
}
2006-01-06 00:19:39 -08:00
/*
* Set dentry and possibly attribute timeouts from the lookup / mk *
* replies
*/
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
static void fuse_change_entry_timeout ( struct dentry * entry ,
struct fuse_entry_out * o )
2006-01-06 00:19:34 -08:00
{
2006-07-30 03:04:10 -07:00
fuse_dentry_settime ( entry ,
time_to_jiffies ( o - > entry_valid , o - > entry_valid_nsec ) ) ;
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
}
static u64 attr_timeout ( struct fuse_attr_out * o )
{
return time_to_jiffies ( o - > attr_valid , o - > attr_valid_nsec ) ;
}
static u64 entry_attr_timeout ( struct fuse_entry_out * o )
{
return time_to_jiffies ( o - > attr_valid , o - > attr_valid_nsec ) ;
2006-01-06 00:19:38 -08:00
}
2006-01-06 00:19:39 -08:00
/*
* Mark the attributes as stale , so that at the next call to
* - > getattr ( ) they will be fetched from userspace
*/
2006-01-06 00:19:38 -08:00
void fuse_invalidate_attr ( struct inode * inode )
{
2006-07-30 03:04:10 -07:00
get_fuse_inode ( inode ) - > i_time = 0 ;
2006-01-06 00:19:38 -08:00
}
2013-11-05 03:55:43 -08:00
/*
 * Invalidate the cached attributes of @inode because its atime changed.
 * Nothing is done when IS_RDONLY(inode) is true.
 */
void fuse_invalidate_atime(struct inode *inode)
{
	if (IS_RDONLY(inode))
		return;

	fuse_invalidate_attr(inode);
}
2006-01-06 00:19:39 -08:00
/*
 * Reset the dentry's validity time to 0 so that the next attempt to look
 * the name up results in a fresh lookup call to userspace.
 *
 * Used when a dentry is about to become negative and no timeout is known
 * (unlink, rmdir, rename and, in some cases, lookup).
 */
void fuse_invalidate_entry_cache(struct dentry *entry)
{
	fuse_dentry_settime(entry, 0);
}
2006-01-06 00:19:39 -08:00
/*
 * Like fuse_invalidate_entry_cache(), but first calls d_invalidate() to
 * try to remove the dentry from the hash as well.
 */
static void fuse_invalidate_entry(struct dentry *entry)
{
	d_invalidate(entry);
	fuse_invalidate_entry_cache(entry);
}
2014-12-12 09:49:05 +01:00
static void fuse_lookup_init ( struct fuse_conn * fc , struct fuse_args * args ,
2016-07-20 22:34:44 -04:00
u64 nodeid , const struct qstr * name ,
2005-09-09 13:10:28 -07:00
struct fuse_entry_out * outarg )
{
2007-10-18 03:07:05 -07:00
memset ( outarg , 0 , sizeof ( struct fuse_entry_out ) ) ;
2014-12-12 09:49:05 +01:00
args - > in . h . opcode = FUSE_LOOKUP ;
args - > in . h . nodeid = nodeid ;
args - > in . numargs = 1 ;
args - > in . args [ 0 ] . size = name - > len + 1 ;
args - > in . args [ 0 ] . value = name - > name ;
args - > out . numargs = 1 ;
2015-01-06 10:45:35 +01:00
args - > out . args [ 0 ] . size = sizeof ( struct fuse_entry_out ) ;
2014-12-12 09:49:05 +01:00
args - > out . args [ 0 ] . value = outarg ;
2005-09-09 13:10:28 -07:00
}
2008-04-30 00:54:43 -07:00
u64 fuse_get_attr_version ( struct fuse_conn * fc )
2007-11-28 16:21:59 -08:00
{
u64 curr_version ;
/*
* The spin lock isn ' t actually needed on 64 bit archs , but we
* don ' t yet care too much about such optimizations .
*/
spin_lock ( & fc - > lock ) ;
curr_version = fc - > attr_version ;
spin_unlock ( & fc - > lock ) ;
return curr_version ;
}
2006-01-06 00:19:39 -08:00
/*
* Check whether the dentry is still valid
*
* If the entry validity timeout has expired and the dentry is
* positive , try to redo the lookup . If the lookup results in a
* different inode , then let the VFS invalidate the dentry and redo
* the lookup once more . If the lookup results in the same inode ,
* then refresh the attributes , timeouts and mark the dentry valid .
*/
2012-06-10 16:03:43 -04:00
static int fuse_dentry_revalidate ( struct dentry * entry , unsigned int flags )
2005-09-09 13:10:28 -07:00
{
2011-01-07 17:49:57 +11:00
struct inode * inode ;
2013-06-03 14:40:22 +02:00
struct dentry * parent ;
struct fuse_conn * fc ;
2013-10-01 16:41:22 +02:00
struct fuse_inode * fi ;
2013-09-05 11:44:43 +02:00
int ret ;
2006-01-06 00:19:38 -08:00
2015-03-17 22:25:59 +00:00
inode = d_inode_rcu ( entry ) ;
2006-01-06 00:19:38 -08:00
if ( inode & & is_bad_inode ( inode ) )
2013-09-05 11:44:43 +02:00
goto invalid ;
2014-06-26 20:21:57 -04:00
else if ( time_before64 ( fuse_dentry_time ( entry ) , get_jiffies_64 ( ) ) | |
( flags & LOOKUP_REVAL ) ) {
2005-09-09 13:10:28 -07:00
struct fuse_entry_out outarg ;
2014-12-12 09:49:05 +01:00
FUSE_ARGS ( args ) ;
2010-12-07 20:16:56 +01:00
struct fuse_forget_link * forget ;
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
u64 attr_version ;
2006-01-06 00:19:38 -08:00
2006-02-28 16:59:03 -08:00
/* For negative dentries, always do a fresh lookup */
2006-01-06 00:19:38 -08:00
if ( ! inode )
2013-09-05 11:44:43 +02:00
goto invalid ;
2006-01-06 00:19:38 -08:00
2013-09-05 11:44:43 +02:00
ret = - ECHILD ;
2012-06-10 16:03:43 -04:00
if ( flags & LOOKUP_RCU )
2013-09-05 11:44:43 +02:00
goto out ;
2011-03-21 13:58:06 +01:00
2006-01-06 00:19:38 -08:00
fc = get_fuse_conn ( inode ) ;
2005-09-09 13:10:28 -07:00
2010-12-07 20:16:56 +01:00
forget = fuse_alloc_forget ( ) ;
2014-12-12 09:49:05 +01:00
ret = - ENOMEM ;
if ( ! forget )
2013-09-05 11:44:43 +02:00
goto out ;
2006-11-25 11:09:20 -08:00
2007-11-28 16:21:59 -08:00
attr_version = fuse_get_attr_version ( fc ) ;
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
2006-10-17 00:10:12 -07:00
parent = dget_parent ( entry ) ;
2015-03-17 22:25:59 +00:00
fuse_lookup_init ( fc , & args , get_node_id ( d_inode ( parent ) ) ,
2008-07-25 01:49:01 -07:00
& entry - > d_name , & outarg ) ;
2014-12-12 09:49:05 +01:00
ret = fuse_simple_request ( fc , & args ) ;
2006-10-17 00:10:12 -07:00
dput ( parent ) ;
2006-02-28 16:59:03 -08:00
/* Zero nodeid is same as -ENOENT */
2014-12-12 09:49:05 +01:00
if ( ! ret & & ! outarg . nodeid )
ret = - ENOENT ;
if ( ! ret ) {
2013-10-01 16:41:22 +02:00
fi = get_fuse_inode ( inode ) ;
2005-09-09 13:10:29 -07:00
if ( outarg . nodeid ! = get_node_id ( inode ) ) {
2010-12-07 20:16:56 +01:00
fuse_queue_forget ( fc , forget , outarg . nodeid , 1 ) ;
2013-09-05 11:44:43 +02:00
goto invalid ;
2005-09-09 13:10:29 -07:00
}
2006-10-17 00:10:08 -07:00
spin_lock ( & fc - > lock ) ;
2008-11-26 12:03:54 +01:00
fi - > nlookup + + ;
2006-10-17 00:10:08 -07:00
spin_unlock ( & fc - > lock ) ;
2005-09-09 13:10:29 -07:00
}
2010-12-07 20:16:56 +01:00
kfree ( forget ) ;
2014-12-12 09:49:05 +01:00
if ( ret = = - ENOMEM )
goto out ;
if ( ret | | ( outarg . attr . mode ^ inode - > i_mode ) & S_IFMT )
2013-09-05 11:44:43 +02:00
goto invalid ;
2005-09-09 13:10:28 -07:00
2016-08-29 08:46:37 -05:00
forget_all_cached_acls ( inode ) ;
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
fuse_change_attributes ( inode , & outarg . attr ,
entry_attr_timeout ( & outarg ) ,
attr_version ) ;
fuse_change_entry_timeout ( entry , & outarg ) ;
2013-06-03 14:40:22 +02:00
} else if ( inode ) {
2013-10-01 16:41:22 +02:00
fi = get_fuse_inode ( inode ) ;
if ( flags & LOOKUP_RCU ) {
if ( test_bit ( FUSE_I_INIT_RDPLUS , & fi - > state ) )
return - ECHILD ;
} else if ( test_and_clear_bit ( FUSE_I_INIT_RDPLUS , & fi - > state ) ) {
2013-06-03 14:40:22 +02:00
parent = dget_parent ( entry ) ;
2015-03-17 22:25:59 +00:00
fuse_advise_use_readdirplus ( d_inode ( parent ) ) ;
2013-06-03 14:40:22 +02:00
dput ( parent ) ;
}
2005-09-09 13:10:28 -07:00
}
2013-09-05 11:44:43 +02:00
ret = 1 ;
out :
return ret ;
invalid :
ret = 0 ;
goto out ;
2005-09-09 13:10:28 -07:00
}
2006-01-16 22:14:28 -08:00
/* Non-zero when @nodeid is 0 or FUSE_ROOT_ID, zero otherwise. */
static int invalid_nodeid(u64 nodeid)
{
	if (nodeid == 0)
		return 1;

	return nodeid == FUSE_ROOT_ID;
}
2016-10-01 07:32:32 +02:00
static int fuse_dentry_init ( struct dentry * dentry )
{
dentry - > d_fsdata = kzalloc ( sizeof ( union fuse_dentry ) , GFP_KERNEL ) ;
return dentry - > d_fsdata ? 0 : - ENOMEM ;
}
static void fuse_dentry_release ( struct dentry * dentry )
{
union fuse_dentry * fd = dentry - > d_fsdata ;
kfree_rcu ( fd , rcu ) ;
}
2009-02-20 05:59:13 +00:00
const struct dentry_operations fuse_dentry_operations = {
2005-09-09 13:10:28 -07:00
. d_revalidate = fuse_dentry_revalidate ,
2016-10-01 07:32:32 +02:00
. d_init = fuse_dentry_init ,
. d_release = fuse_dentry_release ,
2005-09-09 13:10:28 -07:00
} ;
2016-10-18 15:36:48 +02:00
/*
 * Dentry operations without ->d_revalidate: only the per-dentry data
 * init/release hooks are provided.
 */
const struct dentry_operations fuse_root_dentry_operations = {
	.d_release	= fuse_dentry_release,
	.d_init		= fuse_dentry_init,
};
2007-04-08 16:04:00 -07:00
/*
 * Return 1 if mode @m describes a regular file, directory, symlink,
 * character device, block device, FIFO or socket; 0 otherwise.
 */
int fuse_valid_type(int m)
{
	if (S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m))
		return 1;

	if (S_ISCHR(m) || S_ISBLK(m))
		return 1;

	return S_ISFIFO(m) || S_ISSOCK(m);
}
2016-07-20 22:34:44 -04:00
/*
 * Send a FUSE_LOOKUP for @name under directory @nodeid and instantiate
 * the resulting inode.
 *
 * @sb:     super block the lookup is performed on
 * @nodeid: node id of the parent directory
 * @name:   name to look up; longer than FUSE_NAME_MAX gives -ENAMETOOLONG
 * @outarg: filled with the reply (zeroed by fuse_lookup_init() first)
 * @inode:  set to the inode from fuse_iget(), or NULL
 *
 * Returns 0 on success.  A reply with a zero nodeid also returns 0 with
 * *@inode left NULL.  Otherwise returns the request error, -EIO for an
 * invalid file type in the reply, or -ENOMEM on allocation failure.
 */
int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name,
		     struct fuse_entry_out *outarg, struct inode **inode)
{
	struct fuse_conn *fc = get_fuse_conn_super(sb);
	FUSE_ARGS(args);
	struct fuse_forget_link *forget;
	u64 attr_version;
	int err;

	*inode = NULL;
	err = -ENAMETOOLONG;
	if (name->len > FUSE_NAME_MAX)
		goto out;

	forget = fuse_alloc_forget();
	err = -ENOMEM;
	if (!forget)
		goto out;

	/* Sampled before the request is sent */
	attr_version = fuse_get_attr_version(fc);

	fuse_lookup_init(fc, &args, nodeid, name, outarg);
	err = fuse_simple_request(fc, &args);
	/* Zero nodeid is same as -ENOENT, but with valid timeout */
	if (err || !outarg->nodeid)
		goto out_put_forget;

	/*
	 * outarg->nodeid is known to be non-zero from here on, so only
	 * the file type still needs validating.
	 */
	err = -EIO;
	if (!fuse_valid_type(outarg->attr.mode))
		goto out_put_forget;

	*inode = fuse_iget(sb, outarg->nodeid, outarg->generation,
			   &outarg->attr, entry_attr_timeout(outarg),
			   attr_version);
	err = -ENOMEM;
	if (!*inode) {
		/* forget is handed over here, so skip the kfree() */
		fuse_queue_forget(fc, forget, outarg->nodeid, 1);
		goto out;
	}
	err = 0;

 out_put_forget:
	kfree(forget);
 out:
	return err;
}
static struct dentry * fuse_lookup ( struct inode * dir , struct dentry * entry ,
2012-06-10 17:13:09 -04:00
unsigned int flags )
2008-07-25 01:49:01 -07:00
{
int err ;
struct fuse_entry_out outarg ;
struct inode * inode ;
struct dentry * newent ;
bool outarg_valid = true ;
2016-06-30 13:10:49 +02:00
fuse_lock_inode ( dir ) ;
2008-07-25 01:49:01 -07:00
err = fuse_lookup_name ( dir - > i_sb , get_node_id ( dir ) , & entry - > d_name ,
& outarg , & inode ) ;
2016-06-30 13:10:49 +02:00
fuse_unlock_inode ( dir ) ;
2008-07-25 01:49:01 -07:00
if ( err = = - ENOENT ) {
outarg_valid = false ;
err = 0 ;
}
if ( err )
goto out_err ;
err = - EIO ;
if ( inode & & get_node_id ( inode ) = = FUSE_ROOT_ID )
goto out_iput ;
2005-09-09 13:10:28 -07:00
2014-10-12 22:24:21 -04:00
newent = d_splice_alias ( inode , entry ) ;
2013-09-05 11:44:42 +02:00
err = PTR_ERR ( newent ) ;
if ( IS_ERR ( newent ) )
goto out_err ;
2006-10-17 00:10:11 -07:00
2008-07-25 01:48:59 -07:00
entry = newent ? newent : entry ;
2008-07-25 01:49:01 -07:00
if ( outarg_valid )
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
fuse_change_entry_timeout ( entry , & outarg ) ;
2006-01-06 00:19:38 -08:00
else
fuse_invalidate_entry_cache ( entry ) ;
2008-07-25 01:49:01 -07:00
2013-01-15 11:23:28 +08:00
fuse_advise_use_readdirplus ( dir ) ;
2008-07-25 01:48:59 -07:00
return newent ;
2008-07-25 01:49:01 -07:00
out_iput :
iput ( inode ) ;
out_err :
return ERR_PTR ( err ) ;
2005-09-09 13:10:28 -07:00
}
2006-01-06 00:19:39 -08:00
/*
* Atomic create + open operation
*
* If the filesystem doesn ' t support this , then fall back to separate
* ' mknod ' + ' open ' requests .
*/
2012-06-22 12:39:14 +04:00
static int fuse_create_open ( struct inode * dir , struct dentry * entry ,
2012-06-22 12:40:19 +04:00
struct file * file , unsigned flags ,
2012-06-22 12:39:14 +04:00
umode_t mode , int * opened )
2005-11-07 00:59:51 -08:00
{
int err ;
struct inode * inode ;
struct fuse_conn * fc = get_fuse_conn ( dir ) ;
2014-12-12 09:49:05 +01:00
FUSE_ARGS ( args ) ;
2010-12-07 20:16:56 +01:00
struct fuse_forget_link * forget ;
2009-06-30 20:12:23 +02:00
struct fuse_create_in inarg ;
2005-11-07 00:59:51 -08:00
struct fuse_open_out outopen ;
struct fuse_entry_out outentry ;
struct fuse_file * ff ;
2012-08-15 13:01:24 +02:00
/* Userspace expects S_IFREG in create mode */
BUG_ON ( ( mode & S_IFMT ) ! = S_IFREG ) ;
2010-12-07 20:16:56 +01:00
forget = fuse_alloc_forget ( ) ;
2012-06-05 15:10:22 +02:00
err = - ENOMEM ;
2010-12-07 20:16:56 +01:00
if ( ! forget )
2012-06-05 15:10:22 +02:00
goto out_err ;
2006-06-25 05:48:50 -07:00
2006-04-10 22:54:58 -07:00
err = - ENOMEM ;
2008-11-26 12:03:55 +01:00
ff = fuse_file_alloc ( fc ) ;
2005-11-07 00:59:51 -08:00
if ( ! ff )
2014-12-12 09:49:05 +01:00
goto out_put_forget_req ;
2005-11-07 00:59:51 -08:00
2009-06-30 20:12:23 +02:00
if ( ! fc - > dont_mask )
mode & = ~ current_umask ( ) ;
2005-11-07 00:59:51 -08:00
flags & = ~ O_NOCTTY ;
memset ( & inarg , 0 , sizeof ( inarg ) ) ;
2007-10-18 03:07:05 -07:00
memset ( & outentry , 0 , sizeof ( outentry ) ) ;
2005-11-07 00:59:51 -08:00
inarg . flags = flags ;
inarg . mode = mode ;
2009-06-30 20:12:23 +02:00
inarg . umask = current_umask ( ) ;
2014-12-12 09:49:05 +01:00
args . in . h . opcode = FUSE_CREATE ;
args . in . h . nodeid = get_node_id ( dir ) ;
args . in . numargs = 2 ;
2015-01-06 10:45:35 +01:00
args . in . args [ 0 ] . size = sizeof ( inarg ) ;
2014-12-12 09:49:05 +01:00
args . in . args [ 0 ] . value = & inarg ;
args . in . args [ 1 ] . size = entry - > d_name . len + 1 ;
args . in . args [ 1 ] . value = entry - > d_name . name ;
args . out . numargs = 2 ;
2015-01-06 10:45:35 +01:00
args . out . args [ 0 ] . size = sizeof ( outentry ) ;
2014-12-12 09:49:05 +01:00
args . out . args [ 0 ] . value = & outentry ;
args . out . args [ 1 ] . size = sizeof ( outopen ) ;
args . out . args [ 1 ] . value = & outopen ;
err = fuse_simple_request ( fc , & args ) ;
2012-06-05 15:10:22 +02:00
if ( err )
2005-11-07 00:59:51 -08:00
goto out_free_ff ;
err = - EIO ;
2005-11-28 13:44:16 -08:00
if ( ! S_ISREG ( outentry . attr . mode ) | | invalid_nodeid ( outentry . nodeid ) )
2005-11-07 00:59:51 -08:00
goto out_free_ff ;
2009-04-28 16:56:37 +02:00
ff - > fh = outopen . fh ;
ff - > nodeid = outentry . nodeid ;
ff - > open_flags = outopen . open_flags ;
2005-11-07 00:59:51 -08:00
inode = fuse_iget ( dir - > i_sb , outentry . nodeid , outentry . generation ,
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
& outentry . attr , entry_attr_timeout ( & outentry ) , 0 ) ;
2005-11-07 00:59:51 -08:00
if ( ! inode ) {
flags & = ~ ( O_CREAT | O_EXCL | O_TRUNC ) ;
2009-04-28 16:56:39 +02:00
fuse_sync_release ( ff , flags ) ;
2010-12-07 20:16:56 +01:00
fuse_queue_forget ( fc , forget , outentry . nodeid , 1 ) ;
2012-06-05 15:10:22 +02:00
err = - ENOMEM ;
goto out_err ;
2005-11-07 00:59:51 -08:00
}
2010-12-07 20:16:56 +01:00
kfree ( forget ) ;
2005-11-07 00:59:51 -08:00
d_instantiate ( entry , inode ) ;
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
fuse_change_entry_timeout ( entry , & outentry ) ;
2008-02-06 01:38:38 -08:00
fuse_invalidate_attr ( dir ) ;
2012-06-22 12:40:19 +04:00
err = finish_open ( file , entry , generic_file_open , opened ) ;
if ( err ) {
2009-04-28 16:56:39 +02:00
fuse_sync_release ( ff , flags ) ;
2012-06-05 15:10:22 +02:00
} else {
file - > private_data = fuse_file_get ( ff ) ;
fuse_finish_open ( inode , file ) ;
2005-11-07 00:59:51 -08:00
}
2012-06-22 12:39:14 +04:00
return err ;
2005-11-07 00:59:51 -08:00
2012-06-05 15:10:22 +02:00
out_free_ff :
2005-11-07 00:59:51 -08:00
fuse_file_free ( ff ) ;
2012-06-05 15:10:22 +02:00
out_put_forget_req :
2010-12-07 20:16:56 +01:00
kfree ( forget ) ;
2012-06-05 15:10:22 +02:00
out_err :
2012-06-22 12:39:14 +04:00
return err ;
2012-06-05 15:10:22 +02:00
}
/* Forward declaration: fuse_atomic_open() below calls fuse_mknod() as its fallback. */
static int fuse_mknod ( struct inode * , struct dentry * , umode_t , dev_t ) ;
2012-06-22 12:39:14 +04:00
static int fuse_atomic_open ( struct inode * dir , struct dentry * entry ,
2012-06-22 12:40:19 +04:00
struct file * file , unsigned flags ,
2012-06-22 12:39:14 +04:00
umode_t mode , int * opened )
2012-06-05 15:10:22 +02:00
{
int err ;
struct fuse_conn * fc = get_fuse_conn ( dir ) ;
struct dentry * res = NULL ;
2016-07-05 09:44:53 -04:00
if ( d_in_lookup ( entry ) ) {
2012-06-10 17:13:09 -04:00
res = fuse_lookup ( dir , entry , 0 ) ;
2012-06-05 15:10:22 +02:00
if ( IS_ERR ( res ) )
2012-06-22 12:39:14 +04:00
return PTR_ERR ( res ) ;
2012-06-05 15:10:22 +02:00
if ( res )
entry = res ;
}
2015-03-17 22:25:59 +00:00
if ( ! ( flags & O_CREAT ) | | d_really_is_positive ( entry ) )
2012-06-05 15:10:22 +02:00
goto no_open ;
/* Only creates */
2012-06-10 05:01:45 -04:00
* opened | = FILE_CREATED ;
2012-06-05 15:10:22 +02:00
if ( fc - > no_create )
goto mknod ;
2012-06-22 12:40:19 +04:00
err = fuse_create_open ( dir , entry , file , flags , mode , opened ) ;
2012-06-22 12:39:14 +04:00
if ( err = = - ENOSYS ) {
2012-06-05 15:10:22 +02:00
fc - > no_create = 1 ;
goto mknod ;
}
out_dput :
dput ( res ) ;
2012-06-22 12:39:14 +04:00
return err ;
2012-06-05 15:10:22 +02:00
mknod :
err = fuse_mknod ( dir , entry , mode , 0 ) ;
2012-06-22 12:39:14 +04:00
if ( err )
2012-06-05 15:10:22 +02:00
goto out_dput ;
no_open :
2012-06-10 06:48:09 -04:00
return finish_no_open ( file , res ) ;
2005-11-07 00:59:51 -08:00
}
2006-01-06 00:19:39 -08:00
/*
* Code shared between mknod , mkdir , symlink and link
*/
2014-12-12 09:49:05 +01:00
static int create_new_entry ( struct fuse_conn * fc , struct fuse_args * args ,
2005-09-09 13:10:29 -07:00
struct inode * dir , struct dentry * entry ,
2011-07-26 03:17:33 -04:00
umode_t mode )
2005-09-09 13:10:29 -07:00
{
struct fuse_entry_out outarg ;
struct inode * inode ;
int err ;
2010-12-07 20:16:56 +01:00
struct fuse_forget_link * forget ;
2006-11-25 11:09:20 -08:00
2010-12-07 20:16:56 +01:00
forget = fuse_alloc_forget ( ) ;
2014-12-12 09:49:05 +01:00
if ( ! forget )
2010-12-07 20:16:56 +01:00
return - ENOMEM ;
2005-09-09 13:10:29 -07:00
2007-10-18 03:07:05 -07:00
memset ( & outarg , 0 , sizeof ( outarg ) ) ;
2014-12-12 09:49:05 +01:00
args - > in . h . nodeid = get_node_id ( dir ) ;
args - > out . numargs = 1 ;
2015-01-06 10:45:35 +01:00
args - > out . args [ 0 ] . size = sizeof ( outarg ) ;
2014-12-12 09:49:05 +01:00
args - > out . args [ 0 ] . value = & outarg ;
err = fuse_simple_request ( fc , args ) ;
2006-11-25 11:09:20 -08:00
if ( err )
goto out_put_forget_req ;
2006-01-06 00:19:43 -08:00
err = - EIO ;
if ( invalid_nodeid ( outarg . nodeid ) )
2006-11-25 11:09:20 -08:00
goto out_put_forget_req ;
2006-01-06 00:19:43 -08:00
if ( ( outarg . attr . mode ^ mode ) & S_IFMT )
2006-11-25 11:09:20 -08:00
goto out_put_forget_req ;
2006-01-06 00:19:43 -08:00
2005-09-09 13:10:29 -07:00
inode = fuse_iget ( dir - > i_sb , outarg . nodeid , outarg . generation ,
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
& outarg . attr , entry_attr_timeout ( & outarg ) , 0 ) ;
2005-09-09 13:10:29 -07:00
if ( ! inode ) {
2010-12-07 20:16:56 +01:00
fuse_queue_forget ( fc , forget , outarg . nodeid , 1 ) ;
2005-09-09 13:10:29 -07:00
return - ENOMEM ;
}
2010-12-07 20:16:56 +01:00
kfree ( forget ) ;
2005-09-09 13:10:29 -07:00
2013-10-01 16:44:54 +02:00
err = d_instantiate_no_diralias ( entry , inode ) ;
if ( err )
return err ;
2005-09-09 13:10:29 -07:00
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
fuse_change_entry_timeout ( entry , & outarg ) ;
2005-09-09 13:10:29 -07:00
fuse_invalidate_attr ( dir ) ;
return 0 ;
2006-01-06 00:19:43 -08:00
2006-11-25 11:09:20 -08:00
out_put_forget_req :
2010-12-07 20:16:56 +01:00
kfree ( forget ) ;
2006-01-06 00:19:43 -08:00
return err ;
2005-09-09 13:10:29 -07:00
}
2011-07-26 01:52:52 -04:00
static int fuse_mknod ( struct inode * dir , struct dentry * entry , umode_t mode ,
2005-09-09 13:10:29 -07:00
dev_t rdev )
{
struct fuse_mknod_in inarg ;
struct fuse_conn * fc = get_fuse_conn ( dir ) ;
2014-12-12 09:49:05 +01:00
FUSE_ARGS ( args ) ;
2005-09-09 13:10:29 -07:00
2009-06-30 20:12:23 +02:00
if ( ! fc - > dont_mask )
mode & = ~ current_umask ( ) ;
2005-09-09 13:10:29 -07:00
memset ( & inarg , 0 , sizeof ( inarg ) ) ;
inarg . mode = mode ;
inarg . rdev = new_encode_dev ( rdev ) ;
2009-06-30 20:12:23 +02:00
inarg . umask = current_umask ( ) ;
2014-12-12 09:49:05 +01:00
args . in . h . opcode = FUSE_MKNOD ;
args . in . numargs = 2 ;
2015-01-06 10:45:35 +01:00
args . in . args [ 0 ] . size = sizeof ( inarg ) ;
2014-12-12 09:49:05 +01:00
args . in . args [ 0 ] . value = & inarg ;
args . in . args [ 1 ] . size = entry - > d_name . len + 1 ;
args . in . args [ 1 ] . value = entry - > d_name . name ;
return create_new_entry ( fc , & args , dir , entry , mode ) ;
2005-09-09 13:10:29 -07:00
}
2011-07-26 01:42:34 -04:00
static int fuse_create ( struct inode * dir , struct dentry * entry , umode_t mode ,
2012-06-10 18:05:36 -04:00
bool excl )
2005-09-09 13:10:29 -07:00
{
return fuse_mknod ( dir , entry , mode , 0 ) ;
}
2011-07-26 01:41:39 -04:00
/*
 * Create a directory named by 'entry' in 'dir' with a FUSE_MKDIR request.
 *
 * Unless the connection has dont_mask set, the current umask is applied to
 * 'mode' before it is sent.  The umask itself is put into the request in
 * either case.  Returns the result of create_new_entry().
 */
static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
{
	struct fuse_conn *fc = get_fuse_conn(dir);
	struct fuse_mkdir_in req;
	FUSE_ARGS(args);

	if (!fc->dont_mask)
		mode &= ~current_umask();

	memset(&req, 0, sizeof(req));
	req.umask = current_umask();
	req.mode = mode;

	args.in.h.opcode = FUSE_MKDIR;
	args.in.numargs = 2;
	args.in.args[0].value = &req;
	args.in.args[0].size = sizeof(req);
	/* d_name.len + 1 bytes: presumably the name plus its terminating NUL */
	args.in.args[1].value = entry->d_name.name;
	args.in.args[1].size = entry->d_name.len + 1;

	/* The reply must describe a directory */
	return create_new_entry(fc, &args, dir, entry, S_IFDIR);
}
static int fuse_symlink ( struct inode * dir , struct dentry * entry ,
const char * link )
{
struct fuse_conn * fc = get_fuse_conn ( dir ) ;
unsigned len = strlen ( link ) + 1 ;
2014-12-12 09:49:05 +01:00
FUSE_ARGS ( args ) ;
2005-09-09 13:10:29 -07:00
2014-12-12 09:49:05 +01:00
args . in . h . opcode = FUSE_SYMLINK ;
args . in . numargs = 2 ;
args . in . args [ 0 ] . size = entry - > d_name . len + 1 ;
args . in . args [ 0 ] . value = entry - > d_name . name ;
args . in . args [ 1 ] . size = len ;
args . in . args [ 1 ] . value = link ;
return create_new_entry ( fc , & args , dir , entry , S_IFLNK ) ;
2005-09-09 13:10:29 -07:00
}
2016-08-29 08:46:36 -05:00
void fuse_update_ctime ( struct inode * inode )
2014-04-28 14:19:24 +02:00
{
if ( ! IS_NOCMTIME ( inode ) ) {
2016-09-14 07:48:06 -07:00
inode - > i_ctime = current_time ( inode ) ;
2014-04-28 14:19:24 +02:00
mark_inode_dirty_sync ( inode ) ;
}
}
2005-09-09 13:10:29 -07:00
static int fuse_unlink ( struct inode * dir , struct dentry * entry )
{
int err ;
struct fuse_conn * fc = get_fuse_conn ( dir ) ;
2014-12-12 09:49:05 +01:00
FUSE_ARGS ( args ) ;
args . in . h . opcode = FUSE_UNLINK ;
args . in . h . nodeid = get_node_id ( dir ) ;
args . in . numargs = 1 ;
args . in . args [ 0 ] . size = entry - > d_name . len + 1 ;
args . in . args [ 0 ] . value = entry - > d_name . name ;
err = fuse_simple_request ( fc , & args ) ;
2005-09-09 13:10:29 -07:00
if ( ! err ) {
2015-03-17 22:25:59 +00:00
struct inode * inode = d_inode ( entry ) ;
2012-03-05 15:48:11 +01:00
struct fuse_inode * fi = get_fuse_inode ( inode ) ;
2005-09-09 13:10:29 -07:00
2012-03-05 15:48:11 +01:00
spin_lock ( & fc - > lock ) ;
fi - > attr_version = + + fc - > attr_version ;
2013-02-04 15:57:42 +01:00
/*
* If i_nlink = = 0 then unlink doesn ' t make sense , yet this can
* happen if userspace filesystem is careless . It would be
* difficult to enforce correct nlink usage so just ignore this
* condition here
*/
if ( inode - > i_nlink > 0 )
drop_nlink ( inode ) ;
2012-03-05 15:48:11 +01:00
spin_unlock ( & fc - > lock ) ;
2005-09-09 13:10:29 -07:00
fuse_invalidate_attr ( inode ) ;
fuse_invalidate_attr ( dir ) ;
2006-01-06 00:19:38 -08:00
fuse_invalidate_entry_cache ( entry ) ;
2014-04-28 14:19:24 +02:00
fuse_update_ctime ( inode ) ;
2005-09-09 13:10:29 -07:00
} else if ( err = = - EINTR )
fuse_invalidate_entry ( entry ) ;
return err ;
}
static int fuse_rmdir ( struct inode * dir , struct dentry * entry )
{
int err ;
struct fuse_conn * fc = get_fuse_conn ( dir ) ;
2014-12-12 09:49:05 +01:00
FUSE_ARGS ( args ) ;
args . in . h . opcode = FUSE_RMDIR ;
args . in . h . nodeid = get_node_id ( dir ) ;
args . in . numargs = 1 ;
args . in . args [ 0 ] . size = entry - > d_name . len + 1 ;
args . in . args [ 0 ] . value = entry - > d_name . name ;
err = fuse_simple_request ( fc , & args ) ;
2005-09-09 13:10:29 -07:00
if ( ! err ) {
2015-03-17 22:25:59 +00:00
clear_nlink ( d_inode ( entry ) ) ;
2005-09-09 13:10:29 -07:00
fuse_invalidate_attr ( dir ) ;
2006-01-06 00:19:38 -08:00
fuse_invalidate_entry_cache ( entry ) ;
2005-09-09 13:10:29 -07:00
} else if ( err = = - EINTR )
fuse_invalidate_entry ( entry ) ;
return err ;
}
2014-04-28 16:43:44 +02:00
/*
 * Send a rename request and update the caches according to its outcome.
 *
 * 'opcode' selects the request and 'argsize' is the number of bytes of the
 * struct fuse_rename2_in argument that are cleared and sent; fuse_rename2()
 * passes the size of either struct fuse_rename_in or struct fuse_rename2_in.
 *
 * Returns 0 on success or the error of the request.
 */
static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
			      struct inode *newdir, struct dentry *newent,
			      unsigned int flags, int opcode, size_t argsize)
{
	struct fuse_conn *fc = get_fuse_conn(olddir);
	struct fuse_rename2_in req;
	int err;
	FUSE_ARGS(args);

	memset(&req, 0, argsize);
	req.newdir = get_node_id(newdir);
	req.flags = flags;

	args.in.h.opcode = opcode;
	args.in.h.nodeid = get_node_id(olddir);
	args.in.numargs = 3;
	args.in.args[0].value = &req;
	args.in.args[0].size = argsize;
	args.in.args[1].value = oldent->d_name.name;
	args.in.args[1].size = oldent->d_name.len + 1;
	args.in.args[2].value = newent->d_name.name;
	args.in.args[2].size = newent->d_name.len + 1;

	err = fuse_simple_request(fc, &args);
	if (err == -EINTR) {
		/*
		 * After an interrupted request it is unknown whether the
		 * rename took place.  If the invalidation fails (e.g. some
		 * process has its CWD under the renamed directory), the
		 * dcache and the real filesystem can become inconsistent.
		 */
		fuse_invalidate_entry(oldent);
		if (d_really_is_positive(newent))
			fuse_invalidate_entry(newent);
		return err;
	}
	if (err)
		return err;

	/* ctime of the renamed inode changes */
	fuse_invalidate_attr(d_inode(oldent));
	fuse_update_ctime(d_inode(oldent));

	/* On an exchange the same applies to the other inode */
	if (flags & RENAME_EXCHANGE) {
		fuse_invalidate_attr(d_inode(newent));
		fuse_update_ctime(d_inode(newent));
	}

	fuse_invalidate_attr(olddir);
	if (olddir != newdir)
		fuse_invalidate_attr(newdir);

	/* Without an exchange, an existing target ends up negative */
	if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) {
		fuse_invalidate_attr(d_inode(newent));
		fuse_invalidate_entry_cache(newent);
		fuse_update_ctime(d_inode(newent));
	}

	return 0;
}
2014-04-28 16:43:44 +02:00
static int fuse_rename2 ( struct inode * olddir , struct dentry * oldent ,
struct inode * newdir , struct dentry * newent ,
unsigned int flags )
{
struct fuse_conn * fc = get_fuse_conn ( olddir ) ;
int err ;
if ( flags & ~ ( RENAME_NOREPLACE | RENAME_EXCHANGE ) )
return - EINVAL ;
2014-07-10 10:50:19 +02:00
if ( flags ) {
if ( fc - > no_rename2 | | fc - > minor < 23 )
return - EINVAL ;
2014-04-28 16:43:44 +02:00
2014-07-10 10:50:19 +02:00
err = fuse_rename_common ( olddir , oldent , newdir , newent , flags ,
FUSE_RENAME2 ,
sizeof ( struct fuse_rename2_in ) ) ;
if ( err = = - ENOSYS ) {
fc - > no_rename2 = 1 ;
err = - EINVAL ;
}
} else {
err = fuse_rename_common ( olddir , oldent , newdir , newent , 0 ,
FUSE_RENAME ,
sizeof ( struct fuse_rename_in ) ) ;
2014-04-28 16:43:44 +02:00
}
2014-07-10 10:50:19 +02:00
2014-04-28 16:43:44 +02:00
return err ;
2014-07-10 10:50:19 +02:00
}
2014-04-28 16:43:44 +02:00
2005-09-09 13:10:29 -07:00
static int fuse_link ( struct dentry * entry , struct inode * newdir ,
struct dentry * newent )
{
int err ;
struct fuse_link_in inarg ;
2015-03-17 22:25:59 +00:00
struct inode * inode = d_inode ( entry ) ;
2005-09-09 13:10:29 -07:00
struct fuse_conn * fc = get_fuse_conn ( inode ) ;
2014-12-12 09:49:05 +01:00
FUSE_ARGS ( args ) ;
2005-09-09 13:10:29 -07:00
memset ( & inarg , 0 , sizeof ( inarg ) ) ;
inarg . oldnodeid = get_node_id ( inode ) ;
2014-12-12 09:49:05 +01:00
args . in . h . opcode = FUSE_LINK ;
args . in . numargs = 2 ;
args . in . args [ 0 ] . size = sizeof ( inarg ) ;
args . in . args [ 0 ] . value = & inarg ;
args . in . args [ 1 ] . size = newent - > d_name . len + 1 ;
args . in . args [ 1 ] . value = newent - > d_name . name ;
err = create_new_entry ( fc , & args , newdir , newent , inode - > i_mode ) ;
2005-09-09 13:10:29 -07:00
/* Contrary to "normal" filesystems it can happen that link
makes two " logical " inodes point to the same " physical "
inode . We invalidate the attributes of the old one , so it
will reflect changes in the backing inode ( link count ,
etc . )
*/
2012-03-05 15:48:11 +01:00
if ( ! err ) {
struct fuse_inode * fi = get_fuse_inode ( inode ) ;
spin_lock ( & fc - > lock ) ;
fi - > attr_version = + + fc - > attr_version ;
inc_nlink ( inode ) ;
spin_unlock ( & fc - > lock ) ;
2005-09-09 13:10:29 -07:00
fuse_invalidate_attr ( inode ) ;
2014-04-28 14:19:24 +02:00
fuse_update_ctime ( inode ) ;
2012-03-05 15:48:11 +01:00
} else if ( err = = - EINTR ) {
fuse_invalidate_attr ( inode ) ;
}
2005-09-09 13:10:29 -07:00
return err ;
}
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
static void fuse_fillattr ( struct inode * inode , struct fuse_attr * attr ,
struct kstat * stat )
{
2012-05-10 19:49:38 +04:00
unsigned int blkbits ;
2013-10-10 17:10:46 +04:00
struct fuse_conn * fc = get_fuse_conn ( inode ) ;
/* see the comment in fuse_change_attributes() */
2013-12-26 19:51:11 +04:00
if ( fc - > writeback_cache & & S_ISREG ( inode - > i_mode ) ) {
2013-10-10 17:10:46 +04:00
attr - > size = i_size_read ( inode ) ;
2013-12-26 19:51:11 +04:00
attr - > mtime = inode - > i_mtime . tv_sec ;
attr - > mtimensec = inode - > i_mtime . tv_nsec ;
2014-04-28 14:19:24 +02:00
attr - > ctime = inode - > i_ctime . tv_sec ;
attr - > ctimensec = inode - > i_ctime . tv_nsec ;
2013-12-26 19:51:11 +04:00
}
2012-05-10 19:49:38 +04:00
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
stat - > dev = inode - > i_sb - > s_dev ;
stat - > ino = attr - > ino ;
stat - > mode = ( inode - > i_mode & S_IFMT ) | ( attr - > mode & 07777 ) ;
stat - > nlink = attr - > nlink ;
2012-02-07 16:26:03 -08:00
stat - > uid = make_kuid ( & init_user_ns , attr - > uid ) ;
stat - > gid = make_kgid ( & init_user_ns , attr - > gid ) ;
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
stat - > rdev = inode - > i_rdev ;
stat - > atime . tv_sec = attr - > atime ;
stat - > atime . tv_nsec = attr - > atimensec ;
stat - > mtime . tv_sec = attr - > mtime ;
stat - > mtime . tv_nsec = attr - > mtimensec ;
stat - > ctime . tv_sec = attr - > ctime ;
stat - > ctime . tv_nsec = attr - > ctimensec ;
stat - > size = attr - > size ;
stat - > blocks = attr - > blocks ;
2012-05-10 19:49:38 +04:00
if ( attr - > blksize ! = 0 )
blkbits = ilog2 ( attr - > blksize ) ;
else
blkbits = inode - > i_sb - > s_blocksize_bits ;
stat - > blksize = 1 < < blkbits ;
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
}
2007-10-18 03:06:59 -07:00
static int fuse_do_getattr ( struct inode * inode , struct kstat * stat ,
struct file * file )
2005-09-09 13:10:28 -07:00
{
int err ;
2007-10-18 03:06:59 -07:00
struct fuse_getattr_in inarg ;
struct fuse_attr_out outarg ;
2005-09-09 13:10:28 -07:00
struct fuse_conn * fc = get_fuse_conn ( inode ) ;
2014-12-12 09:49:05 +01:00
FUSE_ARGS ( args ) ;
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
u64 attr_version ;
2007-11-28 16:21:59 -08:00
attr_version = fuse_get_attr_version ( fc ) ;
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
2007-10-18 03:06:59 -07:00
memset ( & inarg , 0 , sizeof ( inarg ) ) ;
2007-10-18 03:07:05 -07:00
memset ( & outarg , 0 , sizeof ( outarg ) ) ;
2007-10-18 03:06:59 -07:00
/* Directories have separate file-handle space */
if ( file & & S_ISREG ( inode - > i_mode ) ) {
struct fuse_file * ff = file - > private_data ;
inarg . getattr_flags | = FUSE_GETATTR_FH ;
inarg . fh = ff - > fh ;
}
2014-12-12 09:49:05 +01:00
args . in . h . opcode = FUSE_GETATTR ;
args . in . h . nodeid = get_node_id ( inode ) ;
args . in . numargs = 1 ;
args . in . args [ 0 ] . size = sizeof ( inarg ) ;
args . in . args [ 0 ] . value = & inarg ;
args . out . numargs = 1 ;
2015-01-06 10:45:35 +01:00
args . out . args [ 0 ] . size = sizeof ( outarg ) ;
2014-12-12 09:49:05 +01:00
args . out . args [ 0 ] . value = & outarg ;
err = fuse_simple_request ( fc , & args ) ;
2005-09-09 13:10:28 -07:00
if ( ! err ) {
2007-10-18 03:06:59 -07:00
if ( ( inode - > i_mode ^ outarg . attr . mode ) & S_IFMT ) {
2005-09-09 13:10:28 -07:00
make_bad_inode ( inode ) ;
err = - EIO ;
} else {
2007-10-18 03:06:59 -07:00
fuse_change_attributes ( inode , & outarg . attr ,
attr_timeout ( & outarg ) ,
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
attr_version ) ;
if ( stat )
2007-10-18 03:06:59 -07:00
fuse_fillattr ( inode , & outarg . attr , stat ) ;
2005-09-09 13:10:28 -07:00
}
}
return err ;
}
2007-11-28 16:21:59 -08:00
/*
 * Make sure the attributes of 'inode' are up to date.
 *
 * If fi->i_time lies before the current jiffies value, the cached ACLs are
 * dropped and the attributes are fetched with fuse_do_getattr().  Otherwise
 * no request is sent and 'stat', if not NULL, is filled from the inode, with
 * mode and ino taken from the values saved in the fuse inode.
 *
 * If 'refreshed' is not NULL it is set to whether a getattr was attempted.
 * Returns 0 or the error of fuse_do_getattr().
 */
int fuse_update_attributes(struct inode *inode, struct kstat *stat,
			   struct file *file, bool *refreshed)
{
	struct fuse_inode *fi = get_fuse_inode(inode);
	bool stale = time_before64(fi->i_time, get_jiffies_64());
	int err = 0;

	if (stale) {
		forget_all_cached_acls(inode);
		err = fuse_do_getattr(inode, stat, file);
	} else if (stat) {
		generic_fillattr(inode, stat);
		stat->mode = fi->orig_i_mode;
		stat->ino = fi->orig_ino;
	}

	if (refreshed != NULL)
		*refreshed = stale;

	return err;
}
2009-05-31 11:13:57 -04:00
/*
 * Invalidate the dentry called 'name' below the directory with node id
 * 'parent_nodeid', and optionally delete it.
 *
 * The attributes of the parent and the entry itself are always invalidated
 * once the entry has been found.  If 'child_nodeid' is not 0 and the entry
 * is positive, the entry is additionally deleted from the dcache, provided
 * that its node id matches 'child_nodeid', it is not a mountpoint and, for
 * a directory, it is empty.
 *
 * 'name->hash' is recomputed here, so 'name' is modified.
 *
 * Returns 0 on success, -ENOENT if the parent, its alias or the entry is
 * not found or the node id does not match, -ENOTDIR if the parent is not a
 * directory, -EBUSY for a mountpoint and -ENOTEMPTY for a non-empty
 * directory.
 */
int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
			     u64 child_nodeid, struct qstr *name)
{
	struct inode *parent;
	struct dentry *dir;
	struct dentry *entry;
	int err;

	parent = ilookup5(sb, parent_nodeid, fuse_inode_eq, &parent_nodeid);
	if (!parent)
		return -ENOENT;

	inode_lock(parent);

	err = -ENOTDIR;
	if (!S_ISDIR(parent->i_mode))
		goto unlock;

	err = -ENOENT;
	dir = d_find_alias(parent);
	if (!dir)
		goto unlock;

	name->hash = full_name_hash(dir, name->name, name->len);
	entry = d_lookup(dir, name);
	dput(dir);
	if (!entry)
		goto unlock;

	fuse_invalidate_attr(parent);
	fuse_invalidate_entry(entry);

	/* Invalidation only: no child given or nothing to delete */
	err = 0;
	if (child_nodeid == 0 || !d_really_is_positive(entry))
		goto put_entry;

	inode_lock(d_inode(entry));

	err = -ENOENT;
	if (get_node_id(d_inode(entry)) != child_nodeid)
		goto unlock_child;

	err = -EBUSY;
	if (d_mountpoint(entry))
		goto unlock_child;

	if (d_is_dir(entry)) {
		shrink_dcache_parent(entry);
		err = -ENOTEMPTY;
		if (!simple_empty(entry))
			goto unlock_child;
		d_inode(entry)->i_flags |= S_DEAD;
	}
	dont_mount(entry);
	clear_nlink(d_inode(entry));
	err = 0;

unlock_child:
	inode_unlock(d_inode(entry));
	/* The dentry is deleted only after the child inode is unlocked */
	if (!err)
		d_delete(entry);
put_entry:
	dput(entry);
unlock:
	inode_unlock(parent);
	iput(parent);
	return err;
}
2005-09-09 13:10:34 -07:00
/*
* Calling into a user - controlled filesystem gives the filesystem
2013-01-14 22:30:00 -08:00
* daemon ptrace - like capabilities over the current process . This
2005-09-09 13:10:34 -07:00
* means , that the filesystem daemon is able to record the exact
* filesystem operations performed , and can also control the behavior
* of the requester process in otherwise impossible ways . For example
* it can delay the operation for arbitrary length of time allowing
* DoS against the requester .
*
* For this reason only those processes can call into the filesystem ,
* for which the owner of the mount has ptrace privilege . This
* excludes processes started by other users , suid or sgid processes .
*/
2013-01-14 22:30:00 -08:00
int fuse_allow_current_process ( struct fuse_conn * fc )
2005-09-09 13:10:34 -07:00
{
2008-11-14 10:39:19 +11:00
const struct cred * cred ;
2005-09-09 13:10:34 -07:00
2016-10-01 07:32:32 +02:00
if ( fc - > allow_other )
2005-09-09 13:10:34 -07:00
return 1 ;
2013-01-14 22:30:00 -08:00
cred = current_cred ( ) ;
2012-02-07 16:26:03 -08:00
if ( uid_eq ( cred - > euid , fc - > user_id ) & &
uid_eq ( cred - > suid , fc - > user_id ) & &
uid_eq ( cred - > uid , fc - > user_id ) & &
gid_eq ( cred - > egid , fc - > group_id ) & &
gid_eq ( cred - > sgid , fc - > group_id ) & &
gid_eq ( cred - > gid , fc - > group_id ) )
2013-01-14 22:30:00 -08:00
return 1 ;
2008-11-14 10:39:19 +11:00
2013-01-14 22:30:00 -08:00
return 0 ;
2005-09-09 13:10:34 -07:00
}
2005-11-07 00:59:50 -08:00
static int fuse_access ( struct inode * inode , int mask )
{
struct fuse_conn * fc = get_fuse_conn ( inode ) ;
2014-12-12 09:49:05 +01:00
FUSE_ARGS ( args ) ;
2005-11-07 00:59:50 -08:00
struct fuse_access_in inarg ;
int err ;
2013-10-01 16:41:23 +02:00
BUG_ON ( mask & MAY_NOT_BLOCK ) ;
2005-11-07 00:59:50 -08:00
if ( fc - > no_access )
return 0 ;
memset ( & inarg , 0 , sizeof ( inarg ) ) ;
2008-07-15 21:03:57 -04:00
inarg . mask = mask & ( MAY_READ | MAY_WRITE | MAY_EXEC ) ;
2014-12-12 09:49:05 +01:00
args . in . h . opcode = FUSE_ACCESS ;
args . in . h . nodeid = get_node_id ( inode ) ;
args . in . numargs = 1 ;
args . in . args [ 0 ] . size = sizeof ( inarg ) ;
args . in . args [ 0 ] . value = & inarg ;
err = fuse_simple_request ( fc , & args ) ;
2005-11-07 00:59:50 -08:00
if ( err = = - ENOSYS ) {
fc - > no_access = 1 ;
err = 0 ;
}
return err ;
}
2011-06-20 19:28:19 -04:00
static int fuse_perm_getattr ( struct inode * inode , int mask )
2011-03-21 13:58:06 +01:00
{
2011-06-20 19:28:19 -04:00
if ( mask & MAY_NOT_BLOCK )
2011-03-21 13:58:06 +01:00
return - ECHILD ;
2016-08-29 08:46:37 -05:00
forget_all_cached_acls ( inode ) ;
2011-03-21 13:58:06 +01:00
return fuse_do_getattr ( inode , NULL , NULL ) ;
}
2006-01-06 00:19:39 -08:00
/*
* Check permission . The two basic access models of FUSE are :
*
* 1 ) Local access checking ( ' default_permissions ' mount option ) based
* on file mode . This is the plain old disk filesystem permission
* model .
*
* 2 ) " Remote " access checking , where server is responsible for
* checking permission in each inode operation . An exception to this
* is if - > permission ( ) was invoked from sys_access ( ) in which case an
* access request is sent . Execute permission is still checked
* locally based on file mode .
*/
2011-06-20 19:28:19 -04:00
static int fuse_permission ( struct inode * inode , int mask )
2005-09-09 13:10:28 -07:00
{
struct fuse_conn * fc = get_fuse_conn ( inode ) ;
2007-10-16 23:31:02 -07:00
bool refreshed = false ;
int err = 0 ;
2005-09-09 13:10:28 -07:00
2013-01-14 22:30:00 -08:00
if ( ! fuse_allow_current_process ( fc ) )
2005-09-09 13:10:28 -07:00
return - EACCES ;
2007-10-16 23:31:02 -07:00
/*
2007-10-16 23:31:06 -07:00
* If attributes are needed , refresh them before proceeding
2007-10-16 23:31:02 -07:00
*/
2016-10-01 07:32:32 +02:00
if ( fc - > default_permissions | |
2007-10-16 23:31:06 -07:00
( ( mask & MAY_EXEC ) & & S_ISREG ( inode - > i_mode ) ) ) {
2011-03-21 13:58:06 +01:00
struct fuse_inode * fi = get_fuse_inode ( inode ) ;
2014-07-07 15:28:50 +02:00
if ( time_before64 ( fi - > i_time , get_jiffies_64 ( ) ) ) {
2011-03-21 13:58:06 +01:00
refreshed = true ;
2011-06-20 19:28:19 -04:00
err = fuse_perm_getattr ( inode , mask ) ;
2011-03-21 13:58:06 +01:00
if ( err )
return err ;
}
2007-10-16 23:31:02 -07:00
}
2016-10-01 07:32:32 +02:00
if ( fc - > default_permissions ) {
2011-06-20 19:16:29 -04:00
err = generic_permission ( inode , mask ) ;
2005-09-09 13:10:31 -07:00
/* If permission is denied, try to refresh file
attributes . This is also needed , because the root
node will at first have no permissions */
2007-10-16 23:31:02 -07:00
if ( err = = - EACCES & & ! refreshed ) {
2011-06-20 19:28:19 -04:00
err = fuse_perm_getattr ( inode , mask ) ;
2005-09-09 13:10:31 -07:00
if ( ! err )
2011-06-20 19:16:29 -04:00
err = generic_permission ( inode , mask ) ;
2005-09-09 13:10:31 -07:00
}
2006-01-06 00:19:39 -08:00
/* Note: the opposite of the above test does not
exist . So if permissions are revoked this won ' t be
noticed immediately , only after the attribute
timeout has expired */
2010-07-23 11:43:51 -04:00
} else if ( mask & ( MAY_ACCESS | MAY_CHDIR ) ) {
2007-10-16 23:31:06 -07:00
err = fuse_access ( inode , mask ) ;
} else if ( ( mask & MAY_EXEC ) & & S_ISREG ( inode - > i_mode ) ) {
if ( ! ( inode - > i_mode & S_IXUGO ) ) {
if ( refreshed )
return - EACCES ;
2011-06-20 19:28:19 -04:00
err = fuse_perm_getattr ( inode , mask ) ;
2007-10-16 23:31:06 -07:00
if ( ! err & & ! ( inode - > i_mode & S_IXUGO ) )
return - EACCES ;
}
2005-09-09 13:10:28 -07:00
}
2007-10-16 23:31:02 -07:00
return err ;
2005-09-09 13:10:28 -07:00
}
static int parse_dirfile ( char * buf , size_t nbytes , struct file * file ,
2013-05-18 03:03:58 -04:00
struct dir_context * ctx )
2005-09-09 13:10:28 -07:00
{
while ( nbytes > = FUSE_NAME_OFFSET ) {
struct fuse_dirent * dirent = ( struct fuse_dirent * ) buf ;
size_t reclen = FUSE_DIRENT_SIZE ( dirent ) ;
if ( ! dirent - > namelen | | dirent - > namelen > FUSE_NAME_MAX )
return - EIO ;
if ( reclen > nbytes )
break ;
2013-09-03 14:28:38 +02:00
if ( memchr ( dirent - > name , ' / ' , dirent - > namelen ) ! = NULL )
return - EIO ;
2005-09-09 13:10:28 -07:00
2013-05-18 03:03:58 -04:00
if ( ! dir_emit ( ctx , dirent - > name , dirent - > namelen ,
dirent - > ino , dirent - > type ) )
2005-09-09 13:10:28 -07:00
break ;
buf + = reclen ;
nbytes - = reclen ;
2013-05-18 03:03:58 -04:00
ctx - > pos = dirent - > off ;
2005-09-09 13:10:28 -07:00
}
return 0 ;
}
2012-08-19 08:53:23 -04:00
/*
 * Instantiate or refresh the dentry/inode described by one READDIRPLUS
 * record.
 *
 * Returns 0 when the entry was linked or intentionally ignored (zero nodeid,
 * "." or ".."), -EIO for an invalid nodeid, an invalid file type or a bad
 * inode, or the error from dentry allocation / splicing.
 */
static int fuse_direntplus_link(struct file *file,
				struct fuse_direntplus *direntplus,
				u64 attr_version)
{
	struct fuse_entry_out *eo = &direntplus->entry_out;
	struct fuse_dirent *dirent = &direntplus->dirent;
	struct dentry *parent = file->f_path.dentry;
	struct qstr name = QSTR_INIT(dirent->name, dirent->namelen);
	struct inode *dir = d_inode(parent);
	struct dentry *dentry;
	struct dentry *spliced;
	struct fuse_conn *fc;
	struct inode *inode;
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

	/*
	 * Unlike in the case of fuse_lookup, zero nodeid does not mean
	 * ENOENT.  Instead, it only means the userspace filesystem did
	 * not want to return attributes/handle for this entry.
	 *
	 * So do nothing.
	 */
	if (!eo->nodeid)
		return 0;

	/*
	 * Skip "." and "..".  We could potentially refresh the attributes
	 * of the directory and its parent?
	 */
	if (name.name[0] == '.' &&
	    (name.len == 1 || (name.name[1] == '.' && name.len == 2)))
		return 0;

	if (invalid_nodeid(eo->nodeid))
		return -EIO;
	if (!fuse_valid_type(eo->attr.mode))
		return -EIO;

	fc = get_fuse_conn(dir);

	name.hash = full_name_hash(parent, name.name, name.len);
	dentry = d_lookup(parent, &name);
	if (!dentry) {
retry:
		dentry = d_alloc_parallel(parent, &name, &wq);
		if (IS_ERR(dentry))
			return PTR_ERR(dentry);
	}

	if (!d_in_lookup(dentry)) {
		struct fuse_inode *fi;

		inode = d_inode(dentry);
		/* Negative, different node or different file type: start over */
		if (!inode ||
		    get_node_id(inode) != eo->nodeid ||
		    ((eo->attr.mode ^ inode->i_mode) & S_IFMT)) {
			d_invalidate(dentry);
			dput(dentry);
			goto retry;
		}
		if (is_bad_inode(inode)) {
			dput(dentry);
			return -EIO;
		}

		/*
		 * Account for this lookup by hand; the other branch comes
		 * via fuse_iget() which bumps nlookup inside.
		 */
		fi = get_fuse_inode(inode);
		spin_lock(&fc->lock);
		fi->nlookup++;
		spin_unlock(&fc->lock);

		forget_all_cached_acls(inode);
		fuse_change_attributes(inode, &eo->attr,
				       entry_attr_timeout(eo),
				       attr_version);
	} else {
		inode = fuse_iget(dir->i_sb, eo->nodeid, eo->generation,
				  &eo->attr, entry_attr_timeout(eo),
				  attr_version);
		if (!inode)
			inode = ERR_PTR(-ENOMEM);

		spliced = d_splice_alias(inode, dentry);
		d_lookup_done(dentry);
		if (spliced) {
			dput(dentry);
			dentry = spliced;
		}
		if (IS_ERR(dentry))
			return PTR_ERR(dentry);
	}

	if (fc->readdirplus_auto)
		set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state);
	fuse_change_entry_timeout(dentry, eo);

	dput(dentry);
	return 0;
}
/*
 * Walk the fuse_direntplus records in @buf (@nbytes long): emit each one to
 * the caller's buffer through dir_emit() and link it into the dcache via
 * fuse_direntplus_link().
 *
 * Returns -EIO if a record has a zero or over-long name, or a name
 * containing '/'; otherwise returns 0.  A truncated trailing record ends
 * the walk quietly.  Failures of fuse_direntplus_link() are not propagated;
 * a forget is forced for the affected nodeid instead.
 */
static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
			     struct dir_context *ctx, u64 attr_version)
{
	struct fuse_direntplus *direntplus;
	struct fuse_dirent *dirent;
	size_t reclen;
	int over = 0;
	int ret;

	while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) {
		direntplus = (struct fuse_direntplus *)buf;
		dirent = &direntplus->dirent;
		reclen = FUSE_DIRENTPLUS_SIZE(direntplus);

		if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
			return -EIO;
		if (reclen > nbytes)
			break;
		if (memchr(dirent->name, '/', dirent->namelen) != NULL)
			return -EIO;

		if (!over) {
			/* We fill entries into dstbuf only as much as
			   it can hold. But we still continue iterating
			   over remaining entries to link them. If not,
			   we need to send a FORGET for each of those
			   which we did not link.
			*/
			over = !dir_emit(ctx, dirent->name, dirent->namelen,
					 dirent->ino, dirent->type);
			/*
			 * Only advance the position past entries that were
			 * actually emitted; otherwise the entry that did not
			 * fit would be skipped by the next readdir call.
			 */
			if (!over)
				ctx->pos = dirent->off;
		}

		buf += reclen;
		nbytes -= reclen;

		ret = fuse_direntplus_link(file, direntplus, attr_version);
		if (ret)
			fuse_force_forget(file, direntplus->entry_out.nodeid);
	}

	return 0;
}
2013-05-18 03:03:58 -04:00
static int fuse_readdir ( struct file * file , struct dir_context * ctx )
2005-09-09 13:10:28 -07:00
{
2013-01-15 11:23:28 +08:00
int plus , err ;
2005-09-09 13:10:36 -07:00
size_t nbytes ;
struct page * page ;
2013-01-23 17:07:38 -05:00
struct inode * inode = file_inode ( file ) ;
2005-09-09 13:10:28 -07:00
struct fuse_conn * fc = get_fuse_conn ( inode ) ;
2006-01-06 00:19:39 -08:00
struct fuse_req * req ;
2012-08-19 08:53:23 -04:00
u64 attr_version = 0 ;
2006-01-06 00:19:39 -08:00
if ( is_bad_inode ( inode ) )
return - EIO ;
2012-10-26 19:48:30 +04:00
req = fuse_get_req ( fc , 1 ) ;
2006-04-10 22:54:58 -07:00
if ( IS_ERR ( req ) )
return PTR_ERR ( req ) ;
2005-09-09 13:10:28 -07:00
2005-09-09 13:10:36 -07:00
page = alloc_page ( GFP_KERNEL ) ;
if ( ! page ) {
fuse_put_request ( fc , req ) ;
return - ENOMEM ;
}
2013-01-15 11:23:28 +08:00
2013-05-18 03:03:58 -04:00
plus = fuse_use_readdirplus ( inode , ctx ) ;
2009-04-02 14:25:34 +02:00
req - > out . argpages = 1 ;
2005-09-09 13:10:36 -07:00
req - > num_pages = 1 ;
req - > pages [ 0 ] = page ;
2012-10-26 19:49:33 +04:00
req - > page_descs [ 0 ] . length = PAGE_SIZE ;
2013-01-15 11:23:28 +08:00
if ( plus ) {
2012-08-19 08:53:23 -04:00
attr_version = fuse_get_attr_version ( fc ) ;
2013-05-18 03:03:58 -04:00
fuse_read_fill ( req , file , ctx - > pos , PAGE_SIZE ,
2012-08-19 08:53:23 -04:00
FUSE_READDIRPLUS ) ;
} else {
2013-05-18 03:03:58 -04:00
fuse_read_fill ( req , file , ctx - > pos , PAGE_SIZE ,
2012-08-19 08:53:23 -04:00
FUSE_READDIR ) ;
}
2016-06-30 13:10:49 +02:00
fuse_lock_inode ( inode ) ;
2008-11-26 12:03:55 +01:00
fuse_request_send ( fc , req ) ;
2016-06-30 13:10:49 +02:00
fuse_unlock_inode ( inode ) ;
2006-01-16 22:14:45 -08:00
nbytes = req - > out . args [ 0 ] . size ;
2005-09-09 13:10:28 -07:00
err = req - > out . h . error ;
fuse_put_request ( fc , req ) ;
2012-08-19 08:53:23 -04:00
if ( ! err ) {
2013-01-15 11:23:28 +08:00
if ( plus ) {
2012-08-19 08:53:23 -04:00
err = parse_dirplusfile ( page_address ( page ) , nbytes ,
2013-05-18 03:03:58 -04:00
file , ctx ,
2012-08-19 08:53:23 -04:00
attr_version ) ;
} else {
err = parse_dirfile ( page_address ( page ) , nbytes , file ,
2013-05-18 03:03:58 -04:00
ctx ) ;
2012-08-19 08:53:23 -04:00
}
}
2005-09-09 13:10:28 -07:00
2005-09-09 13:10:36 -07:00
__free_page ( page ) ;
2013-11-05 03:55:43 -08:00
fuse_invalidate_atime ( inode ) ;
2005-09-09 13:10:36 -07:00
return err ;
2005-09-09 13:10:28 -07:00
}
2015-11-17 10:20:54 -05:00
static const char * fuse_get_link ( struct dentry * dentry ,
2015-12-29 15:58:39 -05:00
struct inode * inode ,
struct delayed_call * done )
2005-09-09 13:10:28 -07:00
{
struct fuse_conn * fc = get_fuse_conn ( inode ) ;
2014-12-12 09:49:05 +01:00
FUSE_ARGS ( args ) ;
2005-09-09 13:10:28 -07:00
char * link ;
2014-12-12 09:49:05 +01:00
ssize_t ret ;
2005-09-09 13:10:28 -07:00
2015-11-17 10:20:54 -05:00
if ( ! dentry )
return ERR_PTR ( - ECHILD ) ;
2015-12-29 16:03:53 -05:00
link = kmalloc ( PAGE_SIZE , GFP_KERNEL ) ;
2014-12-12 09:49:05 +01:00
if ( ! link )
return ERR_PTR ( - ENOMEM ) ;
args . in . h . opcode = FUSE_READLINK ;
args . in . h . nodeid = get_node_id ( inode ) ;
args . out . argvar = 1 ;
args . out . numargs = 1 ;
args . out . args [ 0 ] . size = PAGE_SIZE - 1 ;
args . out . args [ 0 ] . value = link ;
ret = fuse_simple_request ( fc , & args ) ;
if ( ret < 0 ) {
2015-12-29 16:03:53 -05:00
kfree ( link ) ;
2014-12-12 09:49:05 +01:00
link = ERR_PTR ( ret ) ;
} else {
link [ ret ] = ' \0 ' ;
2015-12-29 15:58:39 -05:00
set_delayed_call ( done , kfree_link , link ) ;
2014-12-12 09:49:05 +01:00
}
2013-11-05 03:55:43 -08:00
fuse_invalidate_atime ( inode ) ;
2005-09-09 13:10:28 -07:00
return link ;
}
/* Open a directory: fuse_open_common() with its third argument set to true */
static int fuse_dir_open(struct inode *inode, struct file *file)
{
	int err = fuse_open_common(inode, file, true);

	return err;
}
static int fuse_dir_release ( struct inode * inode , struct file * file )
{
2009-04-28 16:56:39 +02:00
fuse_release_common ( file , FUSE_RELEASEDIR ) ;
return 0 ;
2005-09-09 13:10:28 -07:00
}
2011-07-16 20:44:56 -04:00
/* fsync on a directory: fuse_fsync_common() with its last argument set to 1 */
static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end,
			  int datasync)
{
	int err = fuse_fsync_common(file, start, end, datasync, 1);

	return err;
}
2011-12-13 11:58:49 +01:00
static long fuse_dir_ioctl ( struct file * file , unsigned int cmd ,
unsigned long arg )
{
struct fuse_conn * fc = get_fuse_conn ( file - > f_mapping - > host ) ;
/* FUSE_IOCTL_DIR only supported for API version >= 7.18 */
if ( fc - > minor < 18 )
return - ENOTTY ;
return fuse_ioctl_common ( file , cmd , arg , FUSE_IOCTL_DIR ) ;
}
static long fuse_dir_compat_ioctl ( struct file * file , unsigned int cmd ,
unsigned long arg )
{
struct fuse_conn * fc = get_fuse_conn ( file - > f_mapping - > host ) ;
if ( fc - > minor < 18 )
return - ENOTTY ;
return fuse_ioctl_common ( file , cmd , arg ,
FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR ) ;
}
2013-12-26 19:51:11 +04:00
static bool update_mtime ( unsigned ivalid , bool trust_local_mtime )
2007-10-18 03:07:01 -07:00
{
/* Always update if mtime is explicitly set */
if ( ivalid & ATTR_MTIME_SET )
return true ;
2013-12-26 19:51:11 +04:00
/* Or if kernel i_mtime is the official one */
if ( trust_local_mtime )
return true ;
2007-10-18 03:07:01 -07:00
/* If it's an open(O_TRUNC) or an ftruncate(), don't update */
if ( ( ivalid & ATTR_SIZE ) & & ( ivalid & ( ATTR_OPEN | ATTR_FILE ) ) )
return false ;
/* In all other cases update */
return true ;
}
2013-12-26 19:51:11 +04:00
static void iattr_to_fattr ( struct iattr * iattr , struct fuse_setattr_in * arg ,
2014-04-28 14:19:25 +02:00
bool trust_local_cmtime )
2005-09-09 13:10:29 -07:00
{
unsigned ivalid = iattr - > ia_valid ;
if ( ivalid & ATTR_MODE )
2005-11-07 00:59:52 -08:00
arg - > valid | = FATTR_MODE , arg - > mode = iattr - > ia_mode ;
2005-09-09 13:10:29 -07:00
if ( ivalid & ATTR_UID )
2012-02-07 16:26:03 -08:00
arg - > valid | = FATTR_UID , arg - > uid = from_kuid ( & init_user_ns , iattr - > ia_uid ) ;
2005-09-09 13:10:29 -07:00
if ( ivalid & ATTR_GID )
2012-02-07 16:26:03 -08:00
arg - > valid | = FATTR_GID , arg - > gid = from_kgid ( & init_user_ns , iattr - > ia_gid ) ;
2005-09-09 13:10:29 -07:00
if ( ivalid & ATTR_SIZE )
2005-11-07 00:59:52 -08:00
arg - > valid | = FATTR_SIZE , arg - > size = iattr - > ia_size ;
2007-10-18 03:07:01 -07:00
if ( ivalid & ATTR_ATIME ) {
arg - > valid | = FATTR_ATIME ;
2005-11-07 00:59:52 -08:00
arg - > atime = iattr - > ia_atime . tv_sec ;
2007-10-18 03:07:01 -07:00
arg - > atimensec = iattr - > ia_atime . tv_nsec ;
if ( ! ( ivalid & ATTR_ATIME_SET ) )
arg - > valid | = FATTR_ATIME_NOW ;
}
2014-04-28 14:19:25 +02:00
if ( ( ivalid & ATTR_MTIME ) & & update_mtime ( ivalid , trust_local_cmtime ) ) {
2007-10-18 03:07:01 -07:00
arg - > valid | = FATTR_MTIME ;
2005-11-07 00:59:52 -08:00
arg - > mtime = iattr - > ia_mtime . tv_sec ;
2007-10-18 03:07:01 -07:00
arg - > mtimensec = iattr - > ia_mtime . tv_nsec ;
2014-04-28 14:19:25 +02:00
if ( ! ( ivalid & ATTR_MTIME_SET ) & & ! trust_local_cmtime )
2007-10-18 03:07:01 -07:00
arg - > valid | = FATTR_MTIME_NOW ;
2005-11-07 00:59:52 -08:00
}
2014-04-28 14:19:25 +02:00
if ( ( ivalid & ATTR_CTIME ) & & trust_local_cmtime ) {
arg - > valid | = FATTR_CTIME ;
arg - > ctime = iattr - > ia_ctime . tv_sec ;
arg - > ctimensec = iattr - > ia_ctime . tv_nsec ;
}
2005-09-09 13:10:29 -07:00
}
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
/*
* Prevent concurrent writepages on inode
*
* This is done by adding a negative bias to the inode write counter
* and waiting for all pending writes to finish .
*/
void fuse_set_nowrite ( struct inode * inode )
{
struct fuse_conn * fc = get_fuse_conn ( inode ) ;
struct fuse_inode * fi = get_fuse_inode ( inode ) ;
2016-01-22 15:40:57 -05:00
BUG_ON ( ! inode_is_locked ( inode ) ) ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safetly measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
spin_lock ( & fc - > lock ) ;
BUG_ON ( fi - > writectr < 0 ) ;
fi - > writectr + = FUSE_NOWRITE ;
spin_unlock ( & fc - > lock ) ;
wait_event ( fi - > page_waitq , fi - > writectr = = FUSE_NOWRITE ) ;
}
/*
* Allow writepages on inode
*
* Remove the bias from the writecounter and send any queued
* writepages .
*/
static void __fuse_release_nowrite ( struct inode * inode )
{
struct fuse_inode * fi = get_fuse_inode ( inode ) ;
BUG_ON ( fi - > writectr ! = FUSE_NOWRITE ) ;
fi - > writectr = 0 ;
fuse_flush_writepages ( inode ) ;
}
void fuse_release_nowrite ( struct inode * inode )
{
struct fuse_conn * fc = get_fuse_conn ( inode ) ;
spin_lock ( & fc - > lock ) ;
__fuse_release_nowrite ( inode ) ;
spin_unlock ( & fc - > lock ) ;
}
2014-12-12 09:49:05 +01:00
static void fuse_setattr_fill ( struct fuse_conn * fc , struct fuse_args * args ,
2013-12-26 19:51:11 +04:00
struct inode * inode ,
struct fuse_setattr_in * inarg_p ,
struct fuse_attr_out * outarg_p )
{
2014-12-12 09:49:05 +01:00
args - > in . h . opcode = FUSE_SETATTR ;
args - > in . h . nodeid = get_node_id ( inode ) ;
args - > in . numargs = 1 ;
args - > in . args [ 0 ] . size = sizeof ( * inarg_p ) ;
args - > in . args [ 0 ] . value = inarg_p ;
args - > out . numargs = 1 ;
2015-01-06 10:45:35 +01:00
args - > out . args [ 0 ] . size = sizeof ( * outarg_p ) ;
2014-12-12 09:49:05 +01:00
args - > out . args [ 0 ] . value = outarg_p ;
2013-12-26 19:51:11 +04:00
}
/*
* Flush inode - > i_mtime to the server
*/
2014-04-28 14:19:24 +02:00
int fuse_flush_times ( struct inode * inode , struct fuse_file * ff )
2013-12-26 19:51:11 +04:00
{
struct fuse_conn * fc = get_fuse_conn ( inode ) ;
2014-12-12 09:49:05 +01:00
FUSE_ARGS ( args ) ;
2013-12-26 19:51:11 +04:00
struct fuse_setattr_in inarg ;
struct fuse_attr_out outarg ;
memset ( & inarg , 0 , sizeof ( inarg ) ) ;
memset ( & outarg , 0 , sizeof ( outarg ) ) ;
2014-04-28 14:19:24 +02:00
inarg . valid = FATTR_MTIME ;
2013-12-26 19:51:11 +04:00
inarg . mtime = inode - > i_mtime . tv_sec ;
inarg . mtimensec = inode - > i_mtime . tv_nsec ;
2014-04-28 14:19:24 +02:00
if ( fc - > minor > = 23 ) {
inarg . valid | = FATTR_CTIME ;
inarg . ctime = inode - > i_ctime . tv_sec ;
inarg . ctimensec = inode - > i_ctime . tv_nsec ;
}
2014-04-28 14:19:23 +02:00
if ( ff ) {
inarg . valid | = FATTR_FH ;
inarg . fh = ff - > fh ;
}
2014-12-12 09:49:05 +01:00
fuse_setattr_fill ( fc , & args , inode , & inarg , & outarg ) ;
2013-12-26 19:51:11 +04:00
2014-12-12 09:49:05 +01:00
return fuse_simple_request ( fc , & args ) ;
2013-12-26 19:51:11 +04:00
}
2006-01-06 00:19:39 -08:00
/*
* Set attributes , and at the same time refresh them .
*
* Truncation is slightly complicated , because the ' truncate ' request
* may fail , in which case we don ' t want to touch the mapping .
2006-10-17 00:10:06 -07:00
* vmtruncate ( ) doesn ' t allow for this case , so do the rlimit checking
* and the actual truncation by hand .
2006-01-06 00:19:39 -08:00
*/
2016-05-26 17:12:41 +02:00
int fuse_do_setattr ( struct dentry * dentry , struct iattr * attr ,
2012-12-18 14:05:08 +04:00
struct file * file )
2005-09-09 13:10:29 -07:00
{
2016-05-26 17:12:41 +02:00
struct inode * inode = d_inode ( dentry ) ;
2005-09-09 13:10:29 -07:00
struct fuse_conn * fc = get_fuse_conn ( inode ) ;
fuse: hotfix truncate_pagecache() issue
The way how fuse calls truncate_pagecache() from fuse_change_attributes()
is completely wrong. Because, w/o i_mutex held, we never sure whether
'oldsize' and 'attr->size' are valid by the time of execution of
truncate_pagecache(inode, oldsize, attr->size). In fact, as soon as we
released fc->lock in the middle of fuse_change_attributes(), we completely
loose control of actions which may happen with given inode until we reach
truncate_pagecache. The list of potentially dangerous actions includes
mmap-ed reads and writes, ftruncate(2) and write(2) extending file size.
The typical outcome of doing truncate_pagecache() with outdated arguments
is data corruption from user point of view. This is (in some sense)
acceptable in cases when the issue is triggered by a change of the file on
the server (i.e. externally wrt fuse operation), but it is absolutely
intolerable in scenarios when a single fuse client modifies a file without
any external intervention. A real life case I discovered by fsx-linux
looked like this:
1. Shrinking ftruncate(2) comes to fuse_do_setattr(). The latter sends
FUSE_SETATTR to the server synchronously, but before getting fc->lock ...
2. fuse_dentry_revalidate() is asynchronously called. It sends FUSE_LOOKUP
to the server synchronously, then calls fuse_change_attributes(). The
latter updates i_size, releases fc->lock, but before comparing oldsize vs
attr->size..
3. fuse_do_setattr() from the first step proceeds by acquiring fc->lock and
updating attributes and i_size, but now oldsize is equal to
outarg.attr.size because i_size has just been updated (step 2). Hence,
fuse_do_setattr() returns w/o calling truncate_pagecache().
4. As soon as ftruncate(2) completes, the user extends file size by
write(2) making a hole in the middle of file, then reads data from the hole
either by read(2) or mmap-ed read. The user expects to get zero data from
the hole, but gets stale data because truncate_pagecache() is not executed
yet.
The scenario above illustrates one side of the problem: not truncating the
page cache even though we should. Another side corresponds to truncating
page cache too late, when the state of inode changed significantly.
Theoretically, the following is possible:
1. As in the previous scenario fuse_dentry_revalidate() discovered that
i_size changed (due to our own fuse_do_setattr()) and is going to call
truncate_pagecache() for some 'new_size' it believes valid right now. But
by the time that particular truncate_pagecache() is called ...
2. fuse_do_setattr() returns (either having called truncate_pagecache() or
not -- it doesn't matter).
3. The file is extended either by write(2) or ftruncate(2) or fallocate(2).
4. mmap-ed write makes a page in the extended region dirty.
The result will be the lost of data user wrote on the fourth step.
The patch is a hotfix resolving the issue in a simplistic way: let's skip
dangerous i_size update and truncate_pagecache if an operation changing
file size is in progress. This simplistic approach looks correct for the
cases w/o external changes. And to handle them properly, more sophisticated
and intrusive techniques (e.g. NFS-like one) would be required. I'd like to
postpone it until the issue is well discussed on the mailing list(s).
Changed in v2:
- improved patch description to cover both sides of the issue.
Signed-off-by: Maxim Patlasov <mpatlasov@parallels.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: stable@vger.kernel.org
2013-08-30 17:06:04 +04:00
struct fuse_inode * fi = get_fuse_inode ( inode ) ;
2014-12-12 09:49:05 +01:00
FUSE_ARGS ( args ) ;
2005-09-09 13:10:29 -07:00
struct fuse_setattr_in inarg ;
struct fuse_attr_out outarg ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
bool is_truncate = false ;
2013-10-10 17:10:46 +04:00
bool is_wb = fc - > writeback_cache ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
loff_t oldsize ;
2005-09-09 13:10:29 -07:00
int err ;
2014-04-28 14:19:25 +02:00
bool trust_local_cmtime = is_wb & & S_ISREG ( inode - > i_mode ) ;
2005-09-09 13:10:29 -07:00
2016-10-01 07:32:32 +02:00
if ( ! fc - > default_permissions )
2010-06-04 11:30:03 +02:00
attr - > ia_valid | = ATTR_FORCE ;
2016-05-26 16:55:18 +02:00
err = setattr_prepare ( dentry , attr ) ;
2010-06-04 11:30:03 +02:00
if ( err )
return err ;
2005-09-09 13:10:31 -07:00
2011-02-25 14:44:58 +01:00
if ( attr - > ia_valid & ATTR_OPEN ) {
if ( fc - > atomic_o_trunc )
return 0 ;
file = NULL ;
}
2007-10-18 03:07:02 -07:00
2010-06-04 11:30:04 +02:00
if ( attr - > ia_valid & ATTR_SIZE )
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
is_truncate = true ;
2005-09-09 13:10:29 -07:00
fuse: hotfix truncate_pagecache() issue
The way how fuse calls truncate_pagecache() from fuse_change_attributes()
is completely wrong. Because, w/o i_mutex held, we are never sure whether
'oldsize' and 'attr->size' are valid by the time of execution of
truncate_pagecache(inode, oldsize, attr->size). In fact, as soon as we
released fc->lock in the middle of fuse_change_attributes(), we completely
loose control of actions which may happen with given inode until we reach
truncate_pagecache. The list of potentially dangerous actions includes
mmap-ed reads and writes, ftruncate(2) and write(2) extending file size.
The typical outcome of doing truncate_pagecache() with outdated arguments
is data corruption from user point of view. This is (in some sense)
acceptable in cases when the issue is triggered by a change of the file on
the server (i.e. externally wrt fuse operation), but it is absolutely
intolerable in scenarios when a single fuse client modifies a file without
any external intervention. A real life case I discovered by fsx-linux
looked like this:
1. Shrinking ftruncate(2) comes to fuse_do_setattr(). The latter sends
FUSE_SETATTR to the server synchronously, but before getting fc->lock ...
2. fuse_dentry_revalidate() is asynchronously called. It sends FUSE_LOOKUP
to the server synchronously, then calls fuse_change_attributes(). The
latter updates i_size, releases fc->lock, but before comparing oldsize vs
attr->size..
3. fuse_do_setattr() from the first step proceeds by acquiring fc->lock and
updating attributes and i_size, but now oldsize is equal to
outarg.attr.size because i_size has just been updated (step 2). Hence,
fuse_do_setattr() returns w/o calling truncate_pagecache().
4. As soon as ftruncate(2) completes, the user extends file size by
write(2) making a hole in the middle of file, then reads data from the hole
either by read(2) or mmap-ed read. The user expects to get zero data from
the hole, but gets stale data because truncate_pagecache() is not executed
yet.
The scenario above illustrates one side of the problem: not truncating the
page cache even though we should. Another side corresponds to truncating
page cache too late, when the state of inode changed significantly.
Theoretically, the following is possible:
1. As in the previous scenario fuse_dentry_revalidate() discovered that
i_size changed (due to our own fuse_do_setattr()) and is going to call
truncate_pagecache() for some 'new_size' it believes valid right now. But
by the time that particular truncate_pagecache() is called ...
2. fuse_do_setattr() returns (either having called truncate_pagecache() or
not -- it doesn't matter).
3. The file is extended either by write(2) or ftruncate(2) or fallocate(2).
4. mmap-ed write makes a page in the extended region dirty.
The result will be the loss of the data the user wrote in the fourth step.
The patch is a hotfix resolving the issue in a simplistic way: let's skip
dangerous i_size update and truncate_pagecache if an operation changing
file size is in progress. This simplistic approach looks correct for the
cases w/o external changes. And to handle them properly, more sophisticated
and intrusive techniques (e.g. NFS-like one) would be required. I'd like to
postpone it until the issue is well discussed on the mailing list(s).
Changed in v2:
- improved patch description to cover both sides of the issue.
Signed-off-by: Maxim Patlasov <mpatlasov@parallels.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: stable@vger.kernel.org
2013-08-30 17:06:04 +04:00
if ( is_truncate ) {
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
fuse_set_nowrite ( inode ) ;
fuse: hotfix truncate_pagecache() issue
The way how fuse calls truncate_pagecache() from fuse_change_attributes()
is completely wrong. Because, w/o i_mutex held, we are never sure whether
'oldsize' and 'attr->size' are valid by the time of execution of
truncate_pagecache(inode, oldsize, attr->size). In fact, as soon as we
released fc->lock in the middle of fuse_change_attributes(), we completely
lose control of actions which may happen with given inode until we reach
truncate_pagecache. The list of potentially dangerous actions includes
mmap-ed reads and writes, ftruncate(2) and write(2) extending file size.
The typical outcome of doing truncate_pagecache() with outdated arguments
is data corruption from user point of view. This is (in some sense)
acceptable in cases when the issue is triggered by a change of the file on
the server (i.e. externally wrt fuse operation), but it is absolutely
intolerable in scenarios when a single fuse client modifies a file without
any external intervention. A real life case I discovered by fsx-linux
looked like this:
1. Shrinking ftruncate(2) comes to fuse_do_setattr(). The latter sends
FUSE_SETATTR to the server synchronously, but before getting fc->lock ...
2. fuse_dentry_revalidate() is asynchronously called. It sends FUSE_LOOKUP
to the server synchronously, then calls fuse_change_attributes(). The
latter updates i_size, releases fc->lock, but before comparing oldsize vs
attr->size..
3. fuse_do_setattr() from the first step proceeds by acquiring fc->lock and
updating attributes and i_size, but now oldsize is equal to
outarg.attr.size because i_size has just been updated (step 2). Hence,
fuse_do_setattr() returns w/o calling truncate_pagecache().
4. As soon as ftruncate(2) completes, the user extends file size by
write(2) making a hole in the middle of file, then reads data from the hole
either by read(2) or mmap-ed read. The user expects to get zero data from
the hole, but gets stale data because truncate_pagecache() is not executed
yet.
The scenario above illustrates one side of the problem: not truncating the
page cache even though we should. Another side corresponds to truncating
page cache too late, when the state of inode changed significantly.
Theoretically, the following is possible:
1. As in the previous scenario fuse_dentry_revalidate() discovered that
i_size changed (due to our own fuse_do_setattr()) and is going to call
truncate_pagecache() for some 'new_size' it believes valid right now. But
by the time that particular truncate_pagecache() is called ...
2. fuse_do_setattr() returns (either having called truncate_pagecache() or
not -- it doesn't matter).
3. The file is extended either by write(2) or ftruncate(2) or fallocate(2).
4. mmap-ed write makes a page in the extended region dirty.
The result will be the loss of the data the user wrote in the fourth step.
The patch is a hotfix resolving the issue in a simplistic way: let's skip
dangerous i_size update and truncate_pagecache if an operation changing
file size is in progress. This simplistic approach looks correct for the
cases w/o external changes. And to handle them properly, more sophisticated
and intrusive techniques (e.g. NFS-like one) would be required. I'd like to
postpone it until the issue is well discussed on the mailing list(s).
Changed in v2:
- improved patch description to cover both sides of the issue.
Signed-off-by: Maxim Patlasov <mpatlasov@parallels.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: stable@vger.kernel.org
2013-08-30 17:06:04 +04:00
set_bit ( FUSE_I_SIZE_UNSTABLE , & fi - > state ) ;
2014-04-28 14:19:25 +02:00
if ( trust_local_cmtime & & attr - > ia_size ! = inode - > i_size )
attr - > ia_valid | = ATTR_MTIME | ATTR_CTIME ;
fuse: hotfix truncate_pagecache() issue
The way how fuse calls truncate_pagecache() from fuse_change_attributes()
is completely wrong. Because, w/o i_mutex held, we are never sure whether
'oldsize' and 'attr->size' are valid by the time of execution of
truncate_pagecache(inode, oldsize, attr->size). In fact, as soon as we
released fc->lock in the middle of fuse_change_attributes(), we completely
lose control of actions which may happen with given inode until we reach
truncate_pagecache. The list of potentially dangerous actions includes
mmap-ed reads and writes, ftruncate(2) and write(2) extending file size.
The typical outcome of doing truncate_pagecache() with outdated arguments
is data corruption from user point of view. This is (in some sense)
acceptable in cases when the issue is triggered by a change of the file on
the server (i.e. externally wrt fuse operation), but it is absolutely
intolerable in scenarios when a single fuse client modifies a file without
any external intervention. A real life case I discovered by fsx-linux
looked like this:
1. Shrinking ftruncate(2) comes to fuse_do_setattr(). The latter sends
FUSE_SETATTR to the server synchronously, but before getting fc->lock ...
2. fuse_dentry_revalidate() is asynchronously called. It sends FUSE_LOOKUP
to the server synchronously, then calls fuse_change_attributes(). The
latter updates i_size, releases fc->lock, but before comparing oldsize vs
attr->size..
3. fuse_do_setattr() from the first step proceeds by acquiring fc->lock and
updating attributes and i_size, but now oldsize is equal to
outarg.attr.size because i_size has just been updated (step 2). Hence,
fuse_do_setattr() returns w/o calling truncate_pagecache().
4. As soon as ftruncate(2) completes, the user extends file size by
write(2) making a hole in the middle of file, then reads data from the hole
either by read(2) or mmap-ed read. The user expects to get zero data from
the hole, but gets stale data because truncate_pagecache() is not executed
yet.
The scenario above illustrates one side of the problem: not truncating the
page cache even though we should. Another side corresponds to truncating
page cache too late, when the state of inode changed significantly.
Theoretically, the following is possible:
1. As in the previous scenario fuse_dentry_revalidate() discovered that
i_size changed (due to our own fuse_do_setattr()) and is going to call
truncate_pagecache() for some 'new_size' it believes valid right now. But
by the time that particular truncate_pagecache() is called ...
2. fuse_do_setattr() returns (either having called truncate_pagecache() or
not -- it doesn't matter).
3. The file is extended either by write(2) or ftruncate(2) or fallocate(2).
4. mmap-ed write makes a page in the extended region dirty.
The result will be the loss of the data the user wrote in the fourth step.
The patch is a hotfix resolving the issue in a simplistic way: let's skip
dangerous i_size update and truncate_pagecache if an operation changing
file size is in progress. This simplistic approach looks correct for the
cases w/o external changes. And to handle them properly, more sophisticated
and intrusive techniques (e.g. NFS-like one) would be required. I'd like to
postpone it until the issue is well discussed on the mailing list(s).
Changed in v2:
- improved patch description to cover both sides of the issue.
Signed-off-by: Maxim Patlasov <mpatlasov@parallels.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: stable@vger.kernel.org
2013-08-30 17:06:04 +04:00
}
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
2005-09-09 13:10:29 -07:00
memset ( & inarg , 0 , sizeof ( inarg ) ) ;
2007-10-18 03:07:05 -07:00
memset ( & outarg , 0 , sizeof ( outarg ) ) ;
2014-04-28 14:19:25 +02:00
iattr_to_fattr ( attr , & inarg , trust_local_cmtime ) ;
2007-10-18 03:07:00 -07:00
if ( file ) {
struct fuse_file * ff = file - > private_data ;
inarg . valid | = FATTR_FH ;
inarg . fh = ff - > fh ;
}
2007-10-18 03:07:04 -07:00
if ( attr - > ia_valid & ATTR_SIZE ) {
/* For mandatory locking in truncate */
inarg . valid | = FATTR_LOCKOWNER ;
inarg . lock_owner = fuse_lock_owner_id ( fc , current - > files ) ;
}
2014-12-12 09:49:05 +01:00
fuse_setattr_fill ( fc , & args , inode , & inarg , & outarg ) ;
err = fuse_simple_request ( fc , & args ) ;
2007-10-16 23:31:01 -07:00
if ( err ) {
if ( err = = - EINTR )
fuse_invalidate_attr ( inode ) ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
goto error ;
2007-10-16 23:31:01 -07:00
}
2005-09-09 13:10:29 -07:00
2007-10-16 23:31:01 -07:00
if ( ( inode - > i_mode ^ outarg . attr . mode ) & S_IFMT ) {
make_bad_inode ( inode ) ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
err = - EIO ;
goto error ;
}
spin_lock ( & fc - > lock ) ;
2013-12-26 19:51:11 +04:00
/* the kernel maintains i_mtime locally */
2014-04-28 14:19:25 +02:00
if ( trust_local_cmtime ) {
if ( attr - > ia_valid & ATTR_MTIME )
inode - > i_mtime = attr - > ia_mtime ;
if ( attr - > ia_valid & ATTR_CTIME )
inode - > i_ctime = attr - > ia_ctime ;
2014-04-28 14:19:23 +02:00
/* FIXME: clear I_DIRTY_SYNC? */
2013-12-26 19:51:11 +04:00
}
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
fuse_change_attributes_common ( inode , & outarg . attr ,
attr_timeout ( & outarg ) ) ;
oldsize = inode - > i_size ;
2013-10-10 17:10:46 +04:00
/* see the comment in fuse_change_attributes() */
if ( ! is_wb | | is_truncate | | ! S_ISREG ( inode - > i_mode ) )
i_size_write ( inode , outarg . attr . size ) ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
if ( is_truncate ) {
/* NOTE: this may release/reacquire fc->lock */
__fuse_release_nowrite ( inode ) ;
}
spin_unlock ( & fc - > lock ) ;
/*
* Only call invalidate_inode_pages2 ( ) after removing
* FUSE_NOWRITE , otherwise fuse_launder_page ( ) would deadlock .
*/
2013-10-10 17:10:46 +04:00
if ( ( is_truncate | | ! is_wb ) & &
S_ISREG ( inode - > i_mode ) & & oldsize ! = outarg . attr . size ) {
2013-09-12 15:13:56 -07:00
truncate_pagecache ( inode , outarg . attr . size ) ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
invalidate_inode_pages2 ( inode - > i_mapping ) ;
2007-10-16 23:31:01 -07:00
}
fuse: hotfix truncate_pagecache() issue
The way fuse calls truncate_pagecache() from fuse_change_attributes()
is completely wrong: without i_mutex held, we can never be sure whether
'oldsize' and 'attr->size' are still valid by the time
truncate_pagecache(inode, oldsize, attr->size) executes. In fact, as soon as we
release fc->lock in the middle of fuse_change_attributes(), we completely
lose control of the actions which may happen to the given inode until we reach
truncate_pagecache(). The list of potentially dangerous actions includes
mmap-ed reads and writes, ftruncate(2) and write(2) extending the file size.
The typical outcome of doing truncate_pagecache() with outdated arguments
is data corruption from user point of view. This is (in some sense)
acceptable in cases when the issue is triggered by a change of the file on
the server (i.e. externally wrt fuse operation), but it is absolutely
intolerable in scenarios when a single fuse client modifies a file without
any external intervention. A real life case I discovered by fsx-linux
looked like this:
1. Shrinking ftruncate(2) comes to fuse_do_setattr(). The latter sends
FUSE_SETATTR to the server synchronously, but before getting fc->lock ...
2. fuse_dentry_revalidate() is asynchronously called. It sends FUSE_LOOKUP
to the server synchronously, then calls fuse_change_attributes(). The
latter updates i_size, releases fc->lock, but before comparing oldsize vs
attr->size..
3. fuse_do_setattr() from the first step proceeds by acquiring fc->lock and
updating attributes and i_size, but now oldsize is equal to
outarg.attr.size because i_size has just been updated (step 2). Hence,
fuse_do_setattr() returns w/o calling truncate_pagecache().
4. As soon as ftruncate(2) completes, the user extends file size by
write(2) making a hole in the middle of file, then reads data from the hole
either by read(2) or mmap-ed read. The user expects to get zero data from
the hole, but gets stale data because truncate_pagecache() is not executed
yet.
The scenario above illustrates one side of the problem: not truncating the
page cache even though we should. Another side corresponds to truncating
page cache too late, when the state of inode changed significantly.
Theoretically, the following is possible:
1. As in the previous scenario fuse_dentry_revalidate() discovered that
i_size changed (due to our own fuse_do_setattr()) and is going to call
truncate_pagecache() for some 'new_size' it believes valid right now. But
by the time that particular truncate_pagecache() is called ...
2. fuse_do_setattr() returns (either having called truncate_pagecache() or
not -- it doesn't matter).
3. The file is extended either by write(2) or ftruncate(2) or fallocate(2).
4. mmap-ed write makes a page in the extended region dirty.
The result will be the loss of the data the user wrote in the fourth step.
The patch is a hotfix resolving the issue in a simplistic way: let's skip
dangerous i_size update and truncate_pagecache if an operation changing
file size is in progress. This simplistic approach looks correct for the
cases w/o external changes. And to handle them properly, more sophisticated
and intrusive techniques (e.g. NFS-like one) would be required. I'd like to
postpone it until the issue is well discussed on the mailing list(s).
Changed in v2:
- improved patch description to cover both sides of the issue.
Signed-off-by: Maxim Patlasov <mpatlasov@parallels.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: stable@vger.kernel.org
2013-08-30 17:06:04 +04:00
clear_bit ( FUSE_I_SIZE_UNSTABLE , & fi - > state ) ;
2007-10-16 23:31:01 -07:00
return 0 ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
error :
if ( is_truncate )
fuse_release_nowrite ( inode ) ;
fuse: hotfix truncate_pagecache() issue
The way fuse calls truncate_pagecache() from fuse_change_attributes()
is completely wrong: without i_mutex held, we can never be sure whether
'oldsize' and 'attr->size' are still valid by the time
truncate_pagecache(inode, oldsize, attr->size) executes. In fact, as soon as we
release fc->lock in the middle of fuse_change_attributes(), we completely
lose control of the actions which may happen to the given inode until we reach
truncate_pagecache(). The list of potentially dangerous actions includes
mmap-ed reads and writes, ftruncate(2) and write(2) extending the file size.
The typical outcome of doing truncate_pagecache() with outdated arguments
is data corruption from user point of view. This is (in some sense)
acceptable in cases when the issue is triggered by a change of the file on
the server (i.e. externally wrt fuse operation), but it is absolutely
intolerable in scenarios when a single fuse client modifies a file without
any external intervention. A real life case I discovered by fsx-linux
looked like this:
1. Shrinking ftruncate(2) comes to fuse_do_setattr(). The latter sends
FUSE_SETATTR to the server synchronously, but before getting fc->lock ...
2. fuse_dentry_revalidate() is asynchronously called. It sends FUSE_LOOKUP
to the server synchronously, then calls fuse_change_attributes(). The
latter updates i_size, releases fc->lock, but before comparing oldsize vs
attr->size..
3. fuse_do_setattr() from the first step proceeds by acquiring fc->lock and
updating attributes and i_size, but now oldsize is equal to
outarg.attr.size because i_size has just been updated (step 2). Hence,
fuse_do_setattr() returns w/o calling truncate_pagecache().
4. As soon as ftruncate(2) completes, the user extends file size by
write(2) making a hole in the middle of file, then reads data from the hole
either by read(2) or mmap-ed read. The user expects to get zero data from
the hole, but gets stale data because truncate_pagecache() is not executed
yet.
The scenario above illustrates one side of the problem: not truncating the
page cache even though we should. Another side corresponds to truncating
page cache too late, when the state of inode changed significantly.
Theoretically, the following is possible:
1. As in the previous scenario fuse_dentry_revalidate() discovered that
i_size changed (due to our own fuse_do_setattr()) and is going to call
truncate_pagecache() for some 'new_size' it believes valid right now. But
by the time that particular truncate_pagecache() is called ...
2. fuse_do_setattr() returns (either having called truncate_pagecache() or
not -- it doesn't matter).
3. The file is extended either by write(2) or ftruncate(2) or fallocate(2).
4. mmap-ed write makes a page in the extended region dirty.
The result will be the loss of the data the user wrote in the fourth step.
The patch is a hotfix resolving the issue in a simplistic way: let's skip
dangerous i_size update and truncate_pagecache if an operation changing
file size is in progress. This simplistic approach looks correct for the
cases w/o external changes. And to handle them properly, more sophisticated
and intrusive techniques (e.g. NFS-like one) would be required. I'd like to
postpone it until the issue is well discussed on the mailing list(s).
Changed in v2:
- improved patch description to cover both sides of the issue.
Signed-off-by: Maxim Patlasov <mpatlasov@parallels.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: stable@vger.kernel.org
2013-08-30 17:06:04 +04:00
clear_bit ( FUSE_I_SIZE_UNSTABLE , & fi - > state ) ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
return err ;
2005-09-09 13:10:29 -07:00
}
2007-10-18 03:07:00 -07:00
static int fuse_setattr ( struct dentry * entry , struct iattr * attr )
{
2015-03-17 22:25:59 +00:00
struct inode * inode = d_inode ( entry ) ;
2016-10-01 07:32:32 +02:00
struct fuse_conn * fc = get_fuse_conn ( inode ) ;
2016-10-01 07:32:32 +02:00
struct file * file = ( attr - > ia_valid & ATTR_FILE ) ? attr - > ia_file : NULL ;
2016-10-01 07:32:32 +02:00
int ret ;
2012-12-18 14:05:08 +04:00
if ( ! fuse_allow_current_process ( get_fuse_conn ( inode ) ) )
return - EACCES ;
2016-10-01 07:32:32 +02:00
if ( attr - > ia_valid & ( ATTR_KILL_SUID | ATTR_KILL_SGID ) ) {
attr - > ia_valid & = ~ ( ATTR_KILL_SUID | ATTR_KILL_SGID |
ATTR_MODE ) ;
2016-10-01 07:32:32 +02:00
2016-10-01 07:32:32 +02:00
/*
2016-10-01 07:32:32 +02:00
* The only sane way to reliably kill suid / sgid is to do it in
* the userspace filesystem
*
* This should be done on write ( ) , truncate ( ) and chown ( ) .
2016-10-01 07:32:32 +02:00
*/
2016-10-01 07:32:32 +02:00
if ( ! fc - > handle_killpriv ) {
/*
* ia_mode calculation may have used stale i_mode .
* Refresh and recalculate .
*/
ret = fuse_do_getattr ( inode , NULL , file ) ;
if ( ret )
return ret ;
attr - > ia_mode = inode - > i_mode ;
fuse: fix clearing suid, sgid for chown()
Basically, the pjdfstests set the ownership of a file to 06555, and then
chowns it (as root) to a new uid/gid. Prior to commit a09f99eddef4 ("fuse:
fix killing s[ug]id in setattr"), fuse would send down a setattr with both
the uid/gid change and a new mode. Now, it just sends down the uid/gid
change.
Technically this is NOTABUG, since POSIX doesn't _require_ that we clear
these bits for a privileged process, but Linux (wisely) has done that and I
think we don't want to change that behavior here.
This is caused by the use of should_remove_suid(), which will always return
0 when the process has CAP_FSETID.
In fact we really don't need to be calling should_remove_suid() at all,
since we've already been indicated that we should remove the suid, we just
don't want to use a (very) stale mode for that.
This patch should fix the above as well as simplify the logic.
Reported-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Fixes: a09f99eddef4 ("fuse: fix killing s[ug]id in setattr")
Cc: <stable@vger.kernel.org>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
2016-12-06 16:18:45 +01:00
if ( inode - > i_mode & S_ISUID ) {
2016-10-01 07:32:32 +02:00
attr - > ia_valid | = ATTR_MODE ;
attr - > ia_mode & = ~ S_ISUID ;
}
fuse: fix clearing suid, sgid for chown()
Basically, the pjdfstests set the ownership of a file to 06555, and then
chowns it (as root) to a new uid/gid. Prior to commit a09f99eddef4 ("fuse:
fix killing s[ug]id in setattr"), fuse would send down a setattr with both
the uid/gid change and a new mode. Now, it just sends down the uid/gid
change.
Technically this is NOTABUG, since POSIX doesn't _require_ that we clear
these bits for a privileged process, but Linux (wisely) has done that and I
think we don't want to change that behavior here.
This is caused by the use of should_remove_suid(), which will always return
0 when the process has CAP_FSETID.
In fact we really don't need to be calling should_remove_suid() at all,
since we've already been indicated that we should remove the suid, we just
don't want to use a (very) stale mode for that.
This patch should fix the above as well as simplify the logic.
Reported-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Fixes: a09f99eddef4 ("fuse: fix killing s[ug]id in setattr")
Cc: <stable@vger.kernel.org>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
2016-12-06 16:18:45 +01:00
if ( ( inode - > i_mode & ( S_ISGID | S_IXGRP ) ) = = ( S_ISGID | S_IXGRP ) ) {
2016-10-01 07:32:32 +02:00
attr - > ia_valid | = ATTR_MODE ;
attr - > ia_mode & = ~ S_ISGID ;
}
2016-10-01 07:32:32 +02:00
}
}
if ( ! attr - > ia_valid )
return 0 ;
2016-10-01 07:32:32 +02:00
2016-10-10 13:04:49 -07:00
ret = fuse_do_setattr ( entry , attr , file ) ;
2016-10-01 07:32:32 +02:00
if ( ! ret ) {
2016-08-29 08:46:37 -05:00
/*
* If filesystem supports acls it may have updated acl xattrs in
* the filesystem , so forget cached acls for the inode .
*/
if ( fc - > posix_acl )
forget_all_cached_acls ( inode ) ;
2016-10-01 07:32:32 +02:00
/* Directory mode changed, may need to revalidate access */
if ( d_is_dir ( entry ) & & ( attr - > ia_valid & ATTR_MODE ) )
fuse_invalidate_entry_cache ( entry ) ;
}
return ret ;
2007-10-18 03:07:00 -07:00
}
2005-09-09 13:10:28 -07:00
static int fuse_getattr ( struct vfsmount * mnt , struct dentry * entry ,
struct kstat * stat )
{
2015-03-17 22:25:59 +00:00
struct inode * inode = d_inode ( entry ) ;
2007-10-16 23:31:02 -07:00
struct fuse_conn * fc = get_fuse_conn ( inode ) ;
2013-01-14 22:30:00 -08:00
if ( ! fuse_allow_current_process ( fc ) )
2007-10-16 23:31:02 -07:00
return - EACCES ;
2007-11-28 16:21:59 -08:00
return fuse_update_attributes ( inode , stat , NULL , NULL ) ;
2005-09-09 13:10:28 -07:00
}
2007-02-12 00:55:38 -08:00
static const struct inode_operations fuse_dir_inode_operations = {
2005-09-09 13:10:28 -07:00
. lookup = fuse_lookup ,
2005-09-09 13:10:29 -07:00
. mkdir = fuse_mkdir ,
. symlink = fuse_symlink ,
. unlink = fuse_unlink ,
. rmdir = fuse_rmdir ,
2016-09-27 11:03:58 +02:00
. rename = fuse_rename2 ,
2005-09-09 13:10:29 -07:00
. link = fuse_link ,
. setattr = fuse_setattr ,
. create = fuse_create ,
2012-06-05 15:10:22 +02:00
. atomic_open = fuse_atomic_open ,
2005-09-09 13:10:29 -07:00
. mknod = fuse_mknod ,
2005-09-09 13:10:28 -07:00
. permission = fuse_permission ,
. getattr = fuse_getattr ,
2005-09-09 13:10:31 -07:00
. listxattr = fuse_listxattr ,
2016-08-29 08:46:37 -05:00
. get_acl = fuse_get_acl ,
. set_acl = fuse_set_acl ,
2005-09-09 13:10:28 -07:00
} ;
2006-03-28 01:56:42 -08:00
static const struct file_operations fuse_dir_operations = {
2005-09-09 13:10:30 -07:00
. llseek = generic_file_llseek ,
2005-09-09 13:10:28 -07:00
. read = generic_read_dir ,
2016-04-20 17:30:32 -04:00
. iterate_shared = fuse_readdir ,
2005-09-09 13:10:28 -07:00
. open = fuse_dir_open ,
. release = fuse_dir_release ,
2005-09-09 13:10:38 -07:00
. fsync = fuse_dir_fsync ,
2011-12-13 11:58:49 +01:00
. unlocked_ioctl = fuse_dir_ioctl ,
. compat_ioctl = fuse_dir_compat_ioctl ,
2005-09-09 13:10:28 -07:00
} ;
2007-02-12 00:55:38 -08:00
static const struct inode_operations fuse_common_inode_operations = {
2005-09-09 13:10:29 -07:00
. setattr = fuse_setattr ,
2005-09-09 13:10:28 -07:00
. permission = fuse_permission ,
. getattr = fuse_getattr ,
2005-09-09 13:10:31 -07:00
. listxattr = fuse_listxattr ,
2016-08-29 08:46:37 -05:00
. get_acl = fuse_get_acl ,
. set_acl = fuse_set_acl ,
2005-09-09 13:10:28 -07:00
} ;
2007-02-12 00:55:38 -08:00
static const struct inode_operations fuse_symlink_inode_operations = {
2005-09-09 13:10:29 -07:00
. setattr = fuse_setattr ,
2015-11-17 10:20:54 -05:00
. get_link = fuse_get_link ,
2005-09-09 13:10:28 -07:00
. getattr = fuse_getattr ,
2005-09-09 13:10:31 -07:00
. listxattr = fuse_listxattr ,
2005-09-09 13:10:28 -07:00
} ;
void fuse_init_common ( struct inode * inode )
{
inode - > i_op = & fuse_common_inode_operations ;
}
void fuse_init_dir ( struct inode * inode )
{
inode - > i_op = & fuse_dir_inode_operations ;
inode - > i_fop = & fuse_dir_operations ;
}
void fuse_init_symlink ( struct inode * inode )
{
inode - > i_op = & fuse_symlink_inode_operations ;
}