2005-09-09 13:10:28 -07:00
/*
FUSE : Filesystem in Userspace
2008-11-26 12:03:54 +01:00
Copyright ( C ) 2001 - 2008 Miklos Szeredi < miklos @ szeredi . hu >
2005-09-09 13:10:28 -07:00
This program can be distributed under the terms of the GNU GPL .
See the file COPYING .
*/
# include "fuse_i.h"
# include <linux/pagemap.h>
# include <linux/file.h>
2020-04-21 14:47:15 +02:00
# include <linux/fs_context.h>
2022-07-11 10:48:08 -07:00
# include <linux/moduleparam.h>
2005-09-09 13:10:28 -07:00
# include <linux/sched.h>
# include <linux/namei.h>
2010-12-07 20:16:56 +01:00
# include <linux/slab.h>
2016-08-29 08:46:36 -05:00
# include <linux/xattr.h>
2018-10-01 10:07:05 +02:00
# include <linux/iversion.h>
2016-08-29 08:46:37 -05:00
# include <linux/posix_acl.h>
fuse: send security context of inode on file
When a new inode is created, send its security context to server along with
creation request (FUSE_CREAT, FUSE_MKNOD, FUSE_MKDIR and FUSE_SYMLINK).
This gives server an opportunity to create new file and set security
context (possibly atomically). In all the configurations it might not be
possible to set context atomically.
Like nfs and ceph, use security_dentry_init_security() to determine security
context of inode and send it with create, mkdir, mknod, and symlink
requests.
Following is the information sent to server.
fuse_secctx_header, fuse_secctx, xattr_name, security_context
- struct fuse_secctx_header
This contains total number of security contexts being sent and total
size of all the security contexts (including size of
fuse_secctx_header).
- struct fuse_secctx
This contains size of security context which follows this structure.
There is one fuse_secctx instance per security context.
- xattr name string
This string represents name of xattr which should be used while setting
security context.
- security context
This is the actual security context whose size is specified in
fuse_secctx struct.
Also add the FUSE_SECURITY_CTX flag for the `flags` field of the
fuse_init_out struct. When this flag is set the kernel will append the
security context for a newly created inode to the request (create, mkdir,
mknod, and symlink). The server is responsible for ensuring that the inode
appears atomically (preferably) with the requested security context.
For example, If the server is using SELinux and backed by a "real" linux
file system that supports extended attributes it can write the security
context value to /proc/thread-self/attr/fscreate before making the syscall
to create the inode.
This patch is based on patch from Chirantan Ekbote <chirantan@chromium.org>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2021-11-11 09:32:49 -05:00
# include <linux/security.h>
# include <linux/types.h>
# include <linux/kernel.h>
2005-09-09 13:10:28 -07:00
2022-07-11 10:48:08 -07:00
/*
 * Boolean module parameter, registered with mode 0644.  It has no explicit
 * initializer, so it starts out false.  Per the description string below,
 * setting it lets users with CAP_SYS_ADMIN in the initial userns bypass the
 * allow_other access check.
 */
static bool __read_mostly allow_sys_admin_access ;
module_param ( allow_sys_admin_access , bool , 0644 ) ;
MODULE_PARM_DESC ( allow_sys_admin_access ,
" Allow users with CAP_SYS_ADMIN in initial userns to bypass allow_other access check " ) ;
2013-01-15 11:23:28 +08:00
static void fuse_advise_use_readdirplus ( struct inode * dir )
{
struct fuse_inode * fi = get_fuse_inode ( dir ) ;
set_bit ( FUSE_I_ADVISE_RDPLUS , & fi - > state ) ;
}
2019-09-16 16:56:41 -07:00
# if BITS_PER_LONG >= 64
/*
 * BITS_PER_LONG >= 64 variant: the time value is stored directly in the
 * d_fsdata pointer, so no separate allocation is involved.
 */
static inline void __fuse_dentry_settime(struct dentry *entry, u64 time)
{
	void *packed = (void *) time;

	entry->d_fsdata = packed;
}
static inline u64 fuse_dentry_time ( const struct dentry * entry )
{
return ( u64 ) entry - > d_fsdata ;
}
# else
2016-10-01 07:32:32 +02:00
/*
 * Object hung off dentry - > d_fsdata when BITS_PER_LONG < 64 ( this is the
 * # else branch of that test ) .  It is allocated in fuse_dentry_init ( ) and
 * freed with kfree_rcu ( ) in fuse_dentry_release ( ) .
 */
union fuse_dentry {
/* value written by __fuse_dentry_settime ( ) , read by fuse_dentry_time ( ) */
u64 time ;
/* rcu head handed to kfree_rcu ( ) when the dentry is released */
struct rcu_head rcu ;
} ;
2019-09-16 16:56:41 -07:00
/*
 * BITS_PER_LONG < 64 variant: store the time value in the union fuse_dentry
 * that d_fsdata points to.
 */
static inline void __fuse_dentry_settime(struct dentry *dentry, u64 time)
{
	union fuse_dentry *fd = dentry->d_fsdata;

	fd->time = time;
}
static inline u64 fuse_dentry_time ( const struct dentry * entry )
{
return ( ( union fuse_dentry * ) entry - > d_fsdata ) - > time ;
}
# endif
2018-08-15 17:42:34 +02:00
/*
 * Record @time as the dentry's validity time and keep DCACHE_OP_DELETE in
 * step with it: the flag is wanted only when @time is zero and the
 * connection has delete_stale set.
 */
static void fuse_dentry_settime(struct dentry *dentry, u64 time)
{
	struct fuse_conn *fc = get_fuse_conn_super(dentry->d_sb);
	bool want_delete = !time && fc->delete_stale;
	bool has_delete = dentry->d_flags & DCACHE_OP_DELETE;

	/*
	 * Mess with DCACHE_OP_DELETE because dput() will be faster without it.
	 * The flag is sampled without the lock on purpose: races don't
	 * matter, either way it's just an optimization.
	 */
	if (want_delete != has_delete) {
		spin_lock(&dentry->d_lock);
		if (want_delete)
			dentry->d_flags |= DCACHE_OP_DELETE;
		else
			dentry->d_flags &= ~DCACHE_OP_DELETE;
		spin_unlock(&dentry->d_lock);
	}

	__fuse_dentry_settime(dentry, time);
}
2006-01-06 00:19:39 -08:00
/*
* FUSE caches dentries and attributes with separate timeout . The
* time in jiffies until the dentry / attributes are valid is stored in
2016-10-01 07:32:32 +02:00
* dentry - > d_fsdata and fuse_inode - > i_time respectively .
2006-01-06 00:19:39 -08:00
*/
/*
* Calculate the time in jiffies until a dentry / attributes are valid
*/
2016-10-01 07:32:32 +02:00
static u64 time_to_jiffies ( u64 sec , u32 nsec )
2005-09-09 13:10:28 -07:00
{
2006-07-30 03:04:08 -07:00
if ( sec | | nsec ) {
2016-10-01 07:32:32 +02:00
struct timespec64 ts = {
sec ,
2017-01-13 15:58:30 +00:00
min_t ( u32 , nsec , NSEC_PER_SEC - 1 )
2016-10-01 07:32:32 +02:00
} ;
return get_jiffies_64 ( ) + timespec64_to_jiffies ( & ts ) ;
2006-07-30 03:04:08 -07:00
} else
2006-07-30 03:04:10 -07:00
return 0 ;
2005-09-09 13:10:28 -07:00
}
2006-01-06 00:19:39 -08:00
/*
* Set dentry and possibly attribute timeouts from the lookup / mk *
* replies
*/
2018-09-28 16:43:23 +02:00
void fuse_change_entry_timeout ( struct dentry * entry , struct fuse_entry_out * o )
2006-01-06 00:19:34 -08:00
{
2006-07-30 03:04:10 -07:00
fuse_dentry_settime ( entry ,
time_to_jiffies ( o - > entry_valid , o - > entry_valid_nsec ) ) ;
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
}
static u64 attr_timeout ( struct fuse_attr_out * o )
{
return time_to_jiffies ( o - > attr_valid , o - > attr_valid_nsec ) ;
}
2018-09-28 16:43:23 +02:00
u64 entry_attr_timeout ( struct fuse_entry_out * o )
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
{
return time_to_jiffies ( o - > attr_valid , o - > attr_valid_nsec ) ;
2006-01-06 00:19:38 -08:00
}
2021-10-22 17:03:02 +02:00
/* Add the bits in @mask to the inval_mask of the FUSE inode behind @inode. */
void fuse_invalidate_attr_mask(struct inode *inode, u32 mask)
{
	struct fuse_inode *fi = get_fuse_inode(inode);

	/* clear nothing (0), set @mask */
	set_mask_bits(&fi->inval_mask, 0, mask);
}
2006-01-06 00:19:39 -08:00
/*
* Mark the attributes as stale , so that at the next call to
* - > getattr ( ) they will be fetched from userspace
*/
2006-01-06 00:19:38 -08:00
void fuse_invalidate_attr ( struct inode * inode )
{
2018-10-15 15:43:06 +02:00
fuse_invalidate_attr_mask ( inode , STATX_BASIC_STATS ) ;
2006-01-06 00:19:38 -08:00
}
2018-10-01 10:07:05 +02:00
/*
 * Bookkeeping for a modified directory: mark its cached attributes stale,
 * then call inode_maybe_inc_iversion() on it (force == false).
 */
static void fuse_dir_changed(struct inode *dir)
{
	fuse_invalidate_attr(dir);

	inode_maybe_inc_iversion(dir, false);
}
2023-01-08 17:00:23 -08:00
/*
2013-11-05 03:55:43 -08:00
* Mark the attributes as stale due to an atime change . Avoid the invalidate if
* atime is not used .
*/
void fuse_invalidate_atime ( struct inode * inode )
{
if ( ! IS_RDONLY ( inode ) )
2018-10-15 15:43:06 +02:00
fuse_invalidate_attr_mask ( inode , STATX_ATIME ) ;
2013-11-05 03:55:43 -08:00
}
2006-01-06 00:19:39 -08:00
/*
* Just mark the entry as stale , so that a next attempt to look it up
* will result in a new lookup call to userspace
*
* This is called when a dentry is about to become negative and the
* timeout is unknown ( unlink , rmdir , rename and in some cases
* lookup )
*/
2008-07-25 01:49:00 -07:00
void fuse_invalidate_entry_cache ( struct dentry * entry )
2006-01-06 00:19:38 -08:00
{
2006-07-30 03:04:10 -07:00
fuse_dentry_settime ( entry , 0 ) ;
2006-01-06 00:19:38 -08:00
}
2006-01-06 00:19:39 -08:00
/*
 * Like fuse_invalidate_entry_cache(), but first asks the dcache to drop
 * the dentry via d_invalidate().
 */
static void fuse_invalidate_entry(struct dentry *entry)
{
	/* Try to remove the dentry from the hash ... */
	d_invalidate(entry);

	/* ... and mark it stale. */
	fuse_invalidate_entry_cache(entry);
}
2014-12-12 09:49:05 +01:00
static void fuse_lookup_init ( struct fuse_conn * fc , struct fuse_args * args ,
2016-07-20 22:34:44 -04:00
u64 nodeid , const struct qstr * name ,
2005-09-09 13:10:28 -07:00
struct fuse_entry_out * outarg )
{
2007-10-18 03:07:05 -07:00
memset ( outarg , 0 , sizeof ( struct fuse_entry_out ) ) ;
2019-09-10 15:04:08 +02:00
args - > opcode = FUSE_LOOKUP ;
args - > nodeid = nodeid ;
args - > in_numargs = 1 ;
args - > in_args [ 0 ] . size = name - > len + 1 ;
args - > in_args [ 0 ] . value = name - > name ;
args - > out_numargs = 1 ;
args - > out_args [ 0 ] . size = sizeof ( struct fuse_entry_out ) ;
args - > out_args [ 0 ] . value = outarg ;
2005-09-09 13:10:28 -07:00
}
2006-01-06 00:19:39 -08:00
/*
* Check whether the dentry is still valid
*
* If the entry validity timeout has expired and the dentry is
* positive , try to redo the lookup . If the lookup results in a
* different inode , then let the VFS invalidate the dentry and redo
* the lookup once more . If the lookup results in the same inode ,
* then refresh the attributes , timeouts and mark the dentry valid .
*/
2012-06-10 16:03:43 -04:00
static int fuse_dentry_revalidate ( struct dentry * entry , unsigned int flags )
2005-09-09 13:10:28 -07:00
{
2011-01-07 17:49:57 +11:00
struct inode * inode ;
2013-06-03 14:40:22 +02:00
struct dentry * parent ;
2020-05-06 17:44:12 +02:00
struct fuse_mount * fm ;
2013-10-01 16:41:22 +02:00
struct fuse_inode * fi ;
2013-09-05 11:44:43 +02:00
int ret ;
2006-01-06 00:19:38 -08:00
2015-03-17 22:25:59 +00:00
inode = d_inode_rcu ( entry ) ;
2020-12-10 15:33:14 +01:00
if ( inode & & fuse_is_bad ( inode ) )
2013-09-05 11:44:43 +02:00
goto invalid ;
2014-06-26 20:21:57 -04:00
else if ( time_before64 ( fuse_dentry_time ( entry ) , get_jiffies_64 ( ) ) | |
2022-09-28 20:19:34 +08:00
( flags & ( LOOKUP_EXCL | LOOKUP_REVAL | LOOKUP_RENAME_TARGET ) ) ) {
2005-09-09 13:10:28 -07:00
struct fuse_entry_out outarg ;
2014-12-12 09:49:05 +01:00
FUSE_ARGS ( args ) ;
2010-12-07 20:16:56 +01:00
struct fuse_forget_link * forget ;
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
u64 attr_version ;
2006-01-06 00:19:38 -08:00
2006-02-28 16:59:03 -08:00
/* For negative dentries, always do a fresh lookup */
2006-01-06 00:19:38 -08:00
if ( ! inode )
2013-09-05 11:44:43 +02:00
goto invalid ;
2006-01-06 00:19:38 -08:00
2013-09-05 11:44:43 +02:00
ret = - ECHILD ;
2012-06-10 16:03:43 -04:00
if ( flags & LOOKUP_RCU )
2013-09-05 11:44:43 +02:00
goto out ;
2011-03-21 13:58:06 +01:00
2020-05-06 17:44:12 +02:00
fm = get_fuse_mount ( inode ) ;
2005-09-09 13:10:28 -07:00
2010-12-07 20:16:56 +01:00
forget = fuse_alloc_forget ( ) ;
2014-12-12 09:49:05 +01:00
ret = - ENOMEM ;
if ( ! forget )
2013-09-05 11:44:43 +02:00
goto out ;
2006-11-25 11:09:20 -08:00
2020-05-06 17:44:12 +02:00
attr_version = fuse_get_attr_version ( fm - > fc ) ;
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
2006-10-17 00:10:12 -07:00
parent = dget_parent ( entry ) ;
2020-05-06 17:44:12 +02:00
fuse_lookup_init ( fm - > fc , & args , get_node_id ( d_inode ( parent ) ) ,
2008-07-25 01:49:01 -07:00
& entry - > d_name , & outarg ) ;
2020-05-06 17:44:12 +02:00
ret = fuse_simple_request ( fm , & args ) ;
2006-10-17 00:10:12 -07:00
dput ( parent ) ;
2006-02-28 16:59:03 -08:00
/* Zero nodeid is same as -ENOENT */
2014-12-12 09:49:05 +01:00
if ( ! ret & & ! outarg . nodeid )
ret = - ENOENT ;
if ( ! ret ) {
2013-10-01 16:41:22 +02:00
fi = get_fuse_inode ( inode ) ;
2020-04-21 14:47:15 +02:00
if ( outarg . nodeid ! = get_node_id ( inode ) | |
( bool ) IS_AUTOMOUNT ( inode ) ! = ( bool ) ( outarg . attr . flags & FUSE_ATTR_SUBMOUNT ) ) {
2020-05-06 17:44:12 +02:00
fuse_queue_forget ( fm - > fc , forget ,
outarg . nodeid , 1 ) ;
2013-09-05 11:44:43 +02:00
goto invalid ;
2005-09-09 13:10:29 -07:00
}
2018-11-09 13:33:27 +03:00
spin_lock ( & fi - > lock ) ;
2008-11-26 12:03:54 +01:00
fi - > nlookup + + ;
2018-11-09 13:33:27 +03:00
spin_unlock ( & fi - > lock ) ;
2005-09-09 13:10:29 -07:00
}
2010-12-07 20:16:56 +01:00
kfree ( forget ) ;
2014-12-12 09:49:05 +01:00
if ( ret = = - ENOMEM )
goto out ;
2019-11-12 11:49:04 +01:00
if ( ret | | fuse_invalid_attr ( & outarg . attr ) | |
2021-06-21 14:03:53 +03:00
fuse_stale_inode ( inode , outarg . generation , & outarg . attr ) )
2013-09-05 11:44:43 +02:00
goto invalid ;
2005-09-09 13:10:28 -07:00
2016-08-29 08:46:37 -05:00
forget_all_cached_acls ( inode ) ;
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
fuse_change_attributes ( inode , & outarg . attr ,
entry_attr_timeout ( & outarg ) ,
attr_version ) ;
fuse_change_entry_timeout ( entry , & outarg ) ;
2013-06-03 14:40:22 +02:00
} else if ( inode ) {
2013-10-01 16:41:22 +02:00
fi = get_fuse_inode ( inode ) ;
if ( flags & LOOKUP_RCU ) {
if ( test_bit ( FUSE_I_INIT_RDPLUS , & fi - > state ) )
return - ECHILD ;
} else if ( test_and_clear_bit ( FUSE_I_INIT_RDPLUS , & fi - > state ) ) {
2013-06-03 14:40:22 +02:00
parent = dget_parent ( entry ) ;
2015-03-17 22:25:59 +00:00
fuse_advise_use_readdirplus ( d_inode ( parent ) ) ;
2013-06-03 14:40:22 +02:00
dput ( parent ) ;
}
2005-09-09 13:10:28 -07:00
}
2013-09-05 11:44:43 +02:00
ret = 1 ;
out :
return ret ;
invalid :
ret = 0 ;
goto out ;
2005-09-09 13:10:28 -07:00
}
2019-09-16 16:56:41 -07:00
# if BITS_PER_LONG < 64
2016-10-01 07:32:32 +02:00
static int fuse_dentry_init ( struct dentry * dentry )
{
2019-09-17 12:35:33 -07:00
dentry - > d_fsdata = kzalloc ( sizeof ( union fuse_dentry ) ,
GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE ) ;
2016-10-01 07:32:32 +02:00
return dentry - > d_fsdata ? 0 : - ENOMEM ;
}
static void fuse_dentry_release ( struct dentry * dentry )
{
union fuse_dentry * fd = dentry - > d_fsdata ;
kfree_rcu ( fd , rcu ) ;
}
2019-09-16 16:56:41 -07:00
# endif
2016-10-01 07:32:32 +02:00
2018-08-15 17:42:34 +02:00
static int fuse_dentry_delete ( const struct dentry * dentry )
{
return time_before64 ( fuse_dentry_time ( dentry ) , get_jiffies_64 ( ) ) ;
}
2020-04-21 14:47:15 +02:00
/*
* Create a fuse_mount object with a new superblock ( with path - > dentry
* as the root ) , and return that mount so it can be auto - mounted on
* @ path .
*/
static struct vfsmount * fuse_dentry_automount ( struct path * path )
{
struct fs_context * fsc ;
struct vfsmount * mnt ;
struct fuse_inode * mp_fi = get_fuse_inode ( d_inode ( path - > dentry ) ) ;
fsc = fs_context_for_submount ( path - > mnt - > mnt_sb - > s_type , path - > dentry ) ;
2021-06-04 18:11:55 +02:00
if ( IS_ERR ( fsc ) )
return ERR_CAST ( fsc ) ;
2020-04-21 14:47:15 +02:00
2021-06-04 18:11:54 +02:00
/* Pass the FUSE inode of the mount for fuse_get_tree_submount() */
fsc - > fs_private = mp_fi ;
2020-04-21 14:47:15 +02:00
/* Create the submount */
2021-06-04 18:11:55 +02:00
mnt = fc_mount ( fsc ) ;
if ( ! IS_ERR ( mnt ) )
mntget ( mnt ) ;
2020-04-21 14:47:15 +02:00
put_fs_context ( fsc ) ;
2021-06-04 18:11:55 +02:00
return mnt ;
2020-04-21 14:47:15 +02:00
}
2009-02-20 05:59:13 +00:00
const struct dentry_operations fuse_dentry_operations = {
2005-09-09 13:10:28 -07:00
. d_revalidate = fuse_dentry_revalidate ,
2018-08-15 17:42:34 +02:00
. d_delete = fuse_dentry_delete ,
2019-09-16 16:56:41 -07:00
# if BITS_PER_LONG < 64
2016-10-01 07:32:32 +02:00
. d_init = fuse_dentry_init ,
. d_release = fuse_dentry_release ,
2019-09-16 16:56:41 -07:00
# endif
2020-04-21 14:47:15 +02:00
. d_automount = fuse_dentry_automount ,
2005-09-09 13:10:28 -07:00
} ;
2016-10-18 15:36:48 +02:00
const struct dentry_operations fuse_root_dentry_operations = {
2019-09-16 16:56:41 -07:00
# if BITS_PER_LONG < 64
2016-10-18 15:36:48 +02:00
. d_init = fuse_dentry_init ,
. d_release = fuse_dentry_release ,
2019-09-16 16:56:41 -07:00
# endif
2016-10-18 15:36:48 +02:00
} ;
2007-04-08 16:04:00 -07:00
/*
 * Return 1 if the file type encoded in mode @m is one of: regular file,
 * directory, symlink, character device, block device, FIFO or socket.
 * Return 0 for anything else.
 */
int fuse_valid_type(int m)
{
	if (S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m))
		return 1;

	if (S_ISCHR(m) || S_ISBLK(m))
		return 1;

	return S_ISFIFO(m) || S_ISSOCK(m);
}
2019-11-12 11:49:04 +01:00
bool fuse_invalid_attr ( struct fuse_attr * attr )
{
return ! fuse_valid_type ( attr - > mode ) | |
attr - > size > LLONG_MAX ;
}
2016-07-20 22:34:44 -04:00
/*
 * Send a FUSE_LOOKUP for @name under the node @nodeid and instantiate the
 * resulting inode.
 *
 * @outarg receives the reply.  *@inode is set to the inode obtained from
 * fuse_iget(), or left NULL when no inode was instantiated.
 *
 * Returns 0 on success.  A successful reply with a zero nodeid also
 * returns 0, with *@inode == NULL: it means "no such entry", but @outarg
 * still carries a valid timeout.  Otherwise returns -ENAMETOOLONG if the
 * name exceeds FUSE_NAME_MAX, -ENOMEM on allocation failure, -EIO if the
 * reply carries invalid attributes, or the error from the request itself.
 */
int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name,
		     struct fuse_entry_out *outarg, struct inode **inode)
{
	struct fuse_mount *fm = get_fuse_mount_super(sb);
	FUSE_ARGS(args);
	struct fuse_forget_link *forget;
	u64 attr_version;
	int err;

	*inode = NULL;
	err = -ENAMETOOLONG;
	if (name->len > FUSE_NAME_MAX)
		goto out;

	/* Allocated up front so a forget can be queued if fuse_iget() fails */
	forget = fuse_alloc_forget();
	err = -ENOMEM;
	if (!forget)
		goto out;

	/* Sampled before the request, handed to fuse_iget() with the reply */
	attr_version = fuse_get_attr_version(fm->fc);

	fuse_lookup_init(fm->fc, &args, nodeid, name, outarg);
	err = fuse_simple_request(fm, &args);
	/* Zero nodeid is same as -ENOENT, but with valid timeout */
	if (err || !outarg->nodeid)
		goto out_put_forget;

	/* From here on nodeid is known to be non-zero */
	err = -EIO;
	if (fuse_invalid_attr(&outarg->attr))
		goto out_put_forget;

	*inode = fuse_iget(sb, outarg->nodeid, outarg->generation,
			   &outarg->attr, entry_attr_timeout(outarg),
			   attr_version);
	err = -ENOMEM;
	if (!*inode) {
		/* Hand the forget link over to the queue; it is not freed here */
		fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1);
		goto out;
	}
	err = 0;

out_put_forget:
	kfree(forget);
out:
	return err;
}
static struct dentry * fuse_lookup ( struct inode * dir , struct dentry * entry ,
2012-06-10 17:13:09 -04:00
unsigned int flags )
2008-07-25 01:49:01 -07:00
{
int err ;
struct fuse_entry_out outarg ;
struct inode * inode ;
struct dentry * newent ;
bool outarg_valid = true ;
2018-07-26 16:13:11 +02:00
bool locked ;
2008-07-25 01:49:01 -07:00
2020-12-10 15:33:14 +01:00
if ( fuse_is_bad ( dir ) )
return ERR_PTR ( - EIO ) ;
2018-07-26 16:13:11 +02:00
locked = fuse_lock_inode ( dir ) ;
2008-07-25 01:49:01 -07:00
err = fuse_lookup_name ( dir - > i_sb , get_node_id ( dir ) , & entry - > d_name ,
& outarg , & inode ) ;
2018-07-26 16:13:11 +02:00
fuse_unlock_inode ( dir , locked ) ;
2008-07-25 01:49:01 -07:00
if ( err = = - ENOENT ) {
outarg_valid = false ;
err = 0 ;
}
if ( err )
goto out_err ;
err = - EIO ;
if ( inode & & get_node_id ( inode ) = = FUSE_ROOT_ID )
goto out_iput ;
2005-09-09 13:10:28 -07:00
2014-10-12 22:24:21 -04:00
newent = d_splice_alias ( inode , entry ) ;
2013-09-05 11:44:42 +02:00
err = PTR_ERR ( newent ) ;
if ( IS_ERR ( newent ) )
goto out_err ;
2006-10-17 00:10:11 -07:00
2008-07-25 01:48:59 -07:00
entry = newent ? newent : entry ;
2008-07-25 01:49:01 -07:00
if ( outarg_valid )
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
fuse_change_entry_timeout ( entry , & outarg ) ;
2006-01-06 00:19:38 -08:00
else
fuse_invalidate_entry_cache ( entry ) ;
2008-07-25 01:49:01 -07:00
2019-10-21 15:57:07 +02:00
if ( inode )
fuse_advise_use_readdirplus ( dir ) ;
2008-07-25 01:48:59 -07:00
return newent ;
2008-07-25 01:49:01 -07:00
out_iput :
iput ( inode ) ;
out_err :
return ERR_PTR ( err ) ;
2005-09-09 13:10:28 -07:00
}
fuse: send security context of inode on file
When a new inode is created, send its security context to server along with
creation request (FUSE_CREAT, FUSE_MKNOD, FUSE_MKDIR and FUSE_SYMLINK).
This gives server an opportunity to create new file and set security
context (possibly atomically). In all the configurations it might not be
possible to set context atomically.
Like nfs and ceph, use security_dentry_init_security() to determine security
context of inode and send it with create, mkdir, mknod, and symlink
requests.
Following is the information sent to server.
fuse_secctx_header, fuse_secctx, xattr_name, security_context
- struct fuse_secctx_header
This contains total number of security contexts being sent and total
size of all the security contexts (including size of
fuse_secctx_header).
- struct fuse_secctx
This contains size of security context which follows this structure.
There is one fuse_secctx instance per security context.
- xattr name string
This string represents name of xattr which should be used while setting
security context.
- security context
This is the actual security context whose size is specified in
fuse_secctx struct.
Also add the FUSE_SECURITY_CTX flag for the `flags` field of the
fuse_init_out struct. When this flag is set the kernel will append the
security context for a newly created inode to the request (create, mkdir,
mknod, and symlink). The server is responsible for ensuring that the inode
appears atomically (preferably) with the requested security context.
For example, If the server is using SELinux and backed by a "real" linux
file system that supports extended attributes it can write the security
context value to /proc/thread-self/attr/fscreate before making the syscall
to create the inode.
This patch is based on patch from Chirantan Ekbote <chirantan@chromium.org>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2021-11-11 09:32:49 -05:00
static int get_security_context ( struct dentry * entry , umode_t mode ,
2022-11-10 15:46:33 +01:00
struct fuse_in_arg * ext )
fuse: send security context of inode on file
When a new inode is created, send its security context to server along with
creation request (FUSE_CREAT, FUSE_MKNOD, FUSE_MKDIR and FUSE_SYMLINK).
This gives server an opportunity to create new file and set security
context (possibly atomically). In all the configurations it might not be
possible to set context atomically.
Like nfs and ceph, use security_dentry_init_security() to dermine security
context of inode and send it with create, mkdir, mknod, and symlink
requests.
Following is the information sent to server.
fuse_sectx_header, fuse_secctx, xattr_name, security_context
- struct fuse_secctx_header
This contains total number of security contexts being sent and total
size of all the security contexts (including size of
fuse_secctx_header).
- struct fuse_secctx
This contains size of security context which follows this structure.
There is one fuse_secctx instance per security context.
- xattr name string
This string represents name of xattr which should be used while setting
security context.
- security context
This is the actual security context whose size is specified in
fuse_secctx struct.
Also add the FUSE_SECURITY_CTX flag for the `flags` field of the
fuse_init_out struct. When this flag is set the kernel will append the
security context for a newly created inode to the request (create, mkdir,
mknod, and symlink). The server is responsible for ensuring that the inode
appears atomically (preferrably) with the requested security context.
For example, If the server is using SELinux and backed by a "real" linux
file system that supports extended attributes it can write the security
context value to /proc/thread-self/attr/fscreate before making the syscall
to create the inode.
This patch is based on patch from Chirantan Ekbote <chirantan@chromium.org>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2021-11-11 09:32:49 -05:00
{
struct fuse_secctx * fctx ;
struct fuse_secctx_header * header ;
void * ctx = NULL , * ptr ;
u32 ctxlen , total_len = sizeof ( * header ) ;
int err , nr_ctx = 0 ;
const char * name ;
size_t namelen ;
err = security_dentry_init_security ( entry , mode , & entry - > d_name ,
& name , & ctx , & ctxlen ) ;
if ( err ) {
if ( err ! = - EOPNOTSUPP )
goto out_err ;
/* No LSM is supporting this security hook. Ignore error */
ctxlen = 0 ;
ctx = NULL ;
}
if ( ctxlen ) {
nr_ctx = 1 ;
namelen = strlen ( name ) + 1 ;
err = - EIO ;
if ( WARN_ON ( namelen > XATTR_NAME_MAX + 1 | | ctxlen > S32_MAX ) )
goto out_err ;
total_len + = FUSE_REC_ALIGN ( sizeof ( * fctx ) + namelen + ctxlen ) ;
}
err = - ENOMEM ;
header = ptr = kzalloc ( total_len , GFP_KERNEL ) ;
if ( ! ptr )
goto out_err ;
header - > nr_secctx = nr_ctx ;
header - > size = total_len ;
ptr + = sizeof ( * header ) ;
if ( nr_ctx ) {
fctx = ptr ;
fctx - > size = ctxlen ;
ptr + = sizeof ( * fctx ) ;
strcpy ( ptr , name ) ;
ptr + = namelen ;
memcpy ( ptr , ctx , ctxlen ) ;
}
2022-11-10 15:46:33 +01:00
ext - > size = total_len ;
ext - > value = header ;
fuse: send security context of inode on file
When a new inode is created, send its security context to server along with
creation request (FUSE_CREAT, FUSE_MKNOD, FUSE_MKDIR and FUSE_SYMLINK).
This gives server an opportunity to create new file and set security
context (possibly atomically). In all the configurations it might not be
possible to set context atomically.
Like nfs and ceph, use security_dentry_init_security() to determine security
context of inode and send it with create, mkdir, mknod, and symlink
requests.
Following is the information sent to server.
fuse_secctx_header, fuse_secctx, xattr_name, security_context
- struct fuse_secctx_header
This contains total number of security contexts being sent and total
size of all the security contexts (including size of
fuse_secctx_header).
- struct fuse_secctx
This contains size of security context which follows this structure.
There is one fuse_secctx instance per security context.
- xattr name string
This string represents name of xattr which should be used while setting
security context.
- security context
This is the actual security context whose size is specified in
fuse_secctx struct.
Also add the FUSE_SECURITY_CTX flag for the `flags` field of the
fuse_init_out struct. When this flag is set the kernel will append the
security context for a newly created inode to the request (create, mkdir,
mknod, and symlink). The server is responsible for ensuring that the inode
appears atomically (preferably) with the requested security context.
For example, if the server is using SELinux and backed by a "real" linux
file system that supports extended attributes it can write the security
context value to /proc/thread-self/attr/fscreate before making the syscall
to create the inode.
This patch is based on patch from Chirantan Ekbote <chirantan@chromium.org>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2021-11-11 09:32:49 -05:00
err = 0 ;
out_err :
kfree ( ctx ) ;
return err ;
}
2022-11-10 15:46:33 +01:00
/*
 * Grow the buffer described by @buf by @bytes and zero the added tail.
 *
 * Returns a pointer to the first byte of the newly added region, or NULL
 * when krealloc() fails.  On failure the previous buffer is freed and @buf
 * is reset to { .size = 0, .value = NULL }, so the caller has nothing left
 * to release.
 */
static void *extend_arg(struct fuse_in_arg *buf, u32 bytes)
{
	u32 grown = buf->size + bytes;
	void *area = krealloc(buf->value, grown, GFP_KERNEL);

	if (area) {
		/* Only the freshly appended tail needs clearing. */
		memset(area + buf->size, 0, bytes);
		buf->value = area;
		buf->size = grown;
		return area + grown - bytes;
	}

	/* krealloc() left the old allocation in place; drop it ourselves. */
	kfree(buf->value);
	buf->size = 0;
	buf->value = NULL;
	return NULL;
}
/*
 * Size of one extension record carrying @size bytes of payload: the
 * struct fuse_ext_header plus the payload, passed through FUSE_REC_ALIGN().
 */
static u32 fuse_ext_size(size_t size)
{
	size_t with_header = sizeof(struct fuse_ext_header) + size;

	return FUSE_REC_ALIGN(with_header);
}
/*
* This adds just a single supplementary group that matches the parent ' s group .
*/
static int get_create_supp_group ( struct inode * dir , struct fuse_in_arg * ext )
{
struct fuse_conn * fc = get_fuse_conn ( dir ) ;
struct fuse_ext_header * xh ;
struct fuse_supp_groups * sg ;
kgid_t kgid = dir - > i_gid ;
gid_t parent_gid = from_kgid ( fc - > user_ns , kgid ) ;
u32 sg_len = fuse_ext_size ( sizeof ( * sg ) + sizeof ( sg - > groups [ 0 ] ) ) ;
if ( parent_gid = = ( gid_t ) - 1 | | gid_eq ( kgid , current_fsgid ( ) ) | |
! in_group_p ( kgid ) )
return 0 ;
xh = extend_arg ( ext , sg_len ) ;
if ( ! xh )
return - ENOMEM ;
xh - > size = sg_len ;
xh - > type = FUSE_EXT_GROUPS ;
sg = ( struct fuse_supp_groups * ) & xh [ 1 ] ;
sg - > nr_groups = 1 ;
sg - > groups [ 0 ] = parent_gid ;
return 0 ;
}
static int get_create_ext ( struct fuse_args * args ,
struct inode * dir , struct dentry * dentry ,
2022-11-10 15:46:33 +01:00
umode_t mode )
{
struct fuse_conn * fc = get_fuse_conn_super ( dentry - > d_sb ) ;
struct fuse_in_arg ext = { . size = 0 , . value = NULL } ;
int err = 0 ;
if ( fc - > init_security )
err = get_security_context ( dentry , mode , & ext ) ;
2022-11-10 15:46:33 +01:00
if ( ! err & & fc - > create_supp_group )
err = get_create_supp_group ( dir , & ext ) ;
2022-11-10 15:46:33 +01:00
if ( ! err & & ext . size ) {
WARN_ON ( args - > in_numargs > = ARRAY_SIZE ( args - > in_args ) ) ;
args - > is_ext = true ;
args - > ext_idx = args - > in_numargs + + ;
args - > in_args [ args - > ext_idx ] = ext ;
} else {
kfree ( ext . value ) ;
}
return err ;
}
static void free_ext_value ( struct fuse_args * args )
{
if ( args - > is_ext )
kfree ( args - > in_args [ args - > ext_idx ] . value ) ;
}
2006-01-06 00:19:39 -08:00
/*
* Atomic create + open operation
*
* If the filesystem doesn ' t support this , then fall back to separate
* ' mknod ' + ' open ' requests .
*/
2012-06-22 12:39:14 +04:00
static int fuse_create_open ( struct inode * dir , struct dentry * entry ,
2021-04-07 14:36:45 +02:00
struct file * file , unsigned int flags ,
2022-09-24 07:00:00 +02:00
umode_t mode , u32 opcode )
2005-11-07 00:59:51 -08:00
{
int err ;
struct inode * inode ;
2020-05-06 17:44:12 +02:00
struct fuse_mount * fm = get_fuse_mount ( dir ) ;
2014-12-12 09:49:05 +01:00
FUSE_ARGS ( args ) ;
2010-12-07 20:16:56 +01:00
struct fuse_forget_link * forget ;
2009-06-30 20:12:23 +02:00
struct fuse_create_in inarg ;
2005-11-07 00:59:51 -08:00
struct fuse_open_out outopen ;
struct fuse_entry_out outentry ;
2018-11-09 13:33:11 +03:00
struct fuse_inode * fi ;
2005-11-07 00:59:51 -08:00
struct fuse_file * ff ;
fuse: fix deadlock between atomic O_TRUNC and page invalidation
fuse_finish_open() will be called with FUSE_NOWRITE set in case of atomic
O_TRUNC open(), so commit 76224355db75 ("fuse: truncate pagecache on
atomic_o_trunc") replaced invalidate_inode_pages2() by truncate_pagecache()
in such a case to avoid the A-A deadlock. However, we found another A-B-B-A
deadlock related to the case above, which will cause the xfstests
generic/464 testcase hung in our virtio-fs test environment.
For example, consider two processes concurrently open one same file, one
with O_TRUNC and another without O_TRUNC. The deadlock case is described
below, if open(O_TRUNC) is already set_nowrite(acquired A), and is trying
to lock a page (acquiring B), open() could have held the page lock
(acquired B), and waiting on the page writeback (acquiring A). This would
lead to deadlocks.
open(O_TRUNC)
----------------------------------------------------------------
fuse_open_common
inode_lock [C acquire]
fuse_set_nowrite [A acquire]
fuse_finish_open
truncate_pagecache
lock_page [B acquire]
truncate_inode_page
unlock_page [B release]
fuse_release_nowrite [A release]
inode_unlock [C release]
----------------------------------------------------------------
open()
----------------------------------------------------------------
fuse_open_common
fuse_finish_open
invalidate_inode_pages2
lock_page [B acquire]
fuse_launder_page
fuse_wait_on_page_writeback [A acquire & release]
unlock_page [B release]
----------------------------------------------------------------
Besides this case, all calls of invalidate_inode_pages2() and
invalidate_inode_pages2_range() in fuse code also can deadlock with
open(O_TRUNC).
Fix by moving the truncate_pagecache() call outside the nowrite protected
region. The nowrite protection is only for delayed writeback
(writeback_cache) case, where inode lock does not protect against
truncation racing with writes on the server. Write syscalls racing with
page cache truncation still get the inode lock protection.
This patch also changes the order of filemap_invalidate_lock()
vs. fuse_set_nowrite() in fuse_open_common(). This new order matches the
order found in fuse_file_fallocate() and fuse_do_setattr().
Reported-by: Jiachen Zhang <zhangjiachen.jaycee@bytedance.com>
Tested-by: Jiachen Zhang <zhangjiachen.jaycee@bytedance.com>
Fixes: e4648309b85a ("fuse: truncate pending writes on O_TRUNC")
Cc: <stable@vger.kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2022-04-22 15:48:53 +02:00
bool trunc = flags & O_TRUNC ;
2005-11-07 00:59:51 -08:00
2012-08-15 13:01:24 +02:00
/* Userspace expects S_IFREG in create mode */
BUG_ON ( ( mode & S_IFMT ) ! = S_IFREG ) ;
2010-12-07 20:16:56 +01:00
forget = fuse_alloc_forget ( ) ;
2012-06-05 15:10:22 +02:00
err = - ENOMEM ;
2010-12-07 20:16:56 +01:00
if ( ! forget )
2012-06-05 15:10:22 +02:00
goto out_err ;
2006-06-25 05:48:50 -07:00
2006-04-10 22:54:58 -07:00
err = - ENOMEM ;
2020-05-06 17:44:12 +02:00
ff = fuse_file_alloc ( fm ) ;
2005-11-07 00:59:51 -08:00
if ( ! ff )
2014-12-12 09:49:05 +01:00
goto out_put_forget_req ;
2005-11-07 00:59:51 -08:00
2020-05-06 17:44:12 +02:00
if ( ! fm - > fc - > dont_mask )
2009-06-30 20:12:23 +02:00
mode & = ~ current_umask ( ) ;
2005-11-07 00:59:51 -08:00
flags & = ~ O_NOCTTY ;
memset ( & inarg , 0 , sizeof ( inarg ) ) ;
2007-10-18 03:07:05 -07:00
memset ( & outentry , 0 , sizeof ( outentry ) ) ;
2005-11-07 00:59:51 -08:00
inarg . flags = flags ;
inarg . mode = mode ;
2009-06-30 20:12:23 +02:00
inarg . umask = current_umask ( ) ;
2020-10-09 14:15:11 -04:00
fuse: fix deadlock between atomic O_TRUNC and page invalidation
fuse_finish_open() will be called with FUSE_NOWRITE set in case of atomic
O_TRUNC open(), so commit 76224355db75 ("fuse: truncate pagecache on
atomic_o_trunc") replaced invalidate_inode_pages2() by truncate_pagecache()
in such a case to avoid the A-A deadlock. However, we found another A-B-B-A
deadlock related to the case above, which will cause the xfstests
generic/464 testcase hung in our virtio-fs test environment.
For example, consider two processes concurrently open one same file, one
with O_TRUNC and another without O_TRUNC. The deadlock case is described
below, if open(O_TRUNC) is already set_nowrite(acquired A), and is trying
to lock a page (acquiring B), open() could have held the page lock
(acquired B), and waiting on the page writeback (acquiring A). This would
lead to deadlocks.
open(O_TRUNC)
----------------------------------------------------------------
fuse_open_common
inode_lock [C acquire]
fuse_set_nowrite [A acquire]
fuse_finish_open
truncate_pagecache
lock_page [B acquire]
truncate_inode_page
unlock_page [B release]
fuse_release_nowrite [A release]
inode_unlock [C release]
----------------------------------------------------------------
open()
----------------------------------------------------------------
fuse_open_common
fuse_finish_open
invalidate_inode_pages2
lock_page [B acquire]
fuse_launder_page
fuse_wait_on_page_writeback [A acquire & release]
unlock_page [B release]
----------------------------------------------------------------
Besides this case, all calls of invalidate_inode_pages2() and
invalidate_inode_pages2_range() in fuse code also can deadlock with
open(O_TRUNC).
Fix by moving the truncate_pagecache() call outside the nowrite protected
region. The nowrite protection is only for delayed writeback
(writeback_cache) case, where inode lock does not protect against
truncation racing with writes on the server. Write syscalls racing with
page cache truncation still get the inode lock protection.
This patch also changes the order of filemap_invalidate_lock()
vs. fuse_set_nowrite() in fuse_open_common(). This new order matches the
order found in fuse_file_fallocate() and fuse_do_setattr().
Reported-by: Jiachen Zhang <zhangjiachen.jaycee@bytedance.com>
Tested-by: Jiachen Zhang <zhangjiachen.jaycee@bytedance.com>
Fixes: e4648309b85a ("fuse: truncate pending writes on O_TRUNC")
Cc: <stable@vger.kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2022-04-22 15:48:53 +02:00
if ( fm - > fc - > handle_killpriv_v2 & & trunc & &
2020-10-09 14:15:11 -04:00
! ( flags & O_EXCL ) & & ! capable ( CAP_FSETID ) ) {
inarg . open_flags | = FUSE_OPEN_KILL_SUIDGID ;
}
2022-09-24 07:00:00 +02:00
args . opcode = opcode ;
2019-09-10 15:04:08 +02:00
args . nodeid = get_node_id ( dir ) ;
args . in_numargs = 2 ;
args . in_args [ 0 ] . size = sizeof ( inarg ) ;
args . in_args [ 0 ] . value = & inarg ;
args . in_args [ 1 ] . size = entry - > d_name . len + 1 ;
args . in_args [ 1 ] . value = entry - > d_name . name ;
args . out_numargs = 2 ;
args . out_args [ 0 ] . size = sizeof ( outentry ) ;
args . out_args [ 0 ] . value = & outentry ;
args . out_args [ 1 ] . size = sizeof ( outopen ) ;
args . out_args [ 1 ] . value = & outopen ;
fuse: send security context of inode on file
When a new inode is created, send its security context to server along with
creation request (FUSE_CREAT, FUSE_MKNOD, FUSE_MKDIR and FUSE_SYMLINK).
This gives server an opportunity to create new file and set security
context (possibly atomically). In all the configurations it might not be
possible to set context atomically.
Like nfs and ceph, use security_dentry_init_security() to dermine security
context of inode and send it with create, mkdir, mknod, and symlink
requests.
Following is the information sent to server.
fuse_sectx_header, fuse_secctx, xattr_name, security_context
- struct fuse_secctx_header
This contains total number of security contexts being sent and total
size of all the security contexts (including size of
fuse_secctx_header).
- struct fuse_secctx
This contains size of security context which follows this structure.
There is one fuse_secctx instance per security context.
- xattr name string
This string represents name of xattr which should be used while setting
security context.
- security context
This is the actual security context whose size is specified in
fuse_secctx struct.
Also add the FUSE_SECURITY_CTX flag for the `flags` field of the
fuse_init_out struct. When this flag is set the kernel will append the
security context for a newly created inode to the request (create, mkdir,
mknod, and symlink). The server is responsible for ensuring that the inode
appears atomically (preferrably) with the requested security context.
For example, If the server is using SELinux and backed by a "real" linux
file system that supports extended attributes it can write the security
context value to /proc/thread-self/attr/fscreate before making the syscall
to create the inode.
This patch is based on patch from Chirantan Ekbote <chirantan@chromium.org>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2021-11-11 09:32:49 -05:00
2022-11-10 15:46:33 +01:00
err = get_create_ext ( & args , dir , entry , mode ) ;
2022-11-10 15:46:33 +01:00
if ( err )
goto out_put_forget_req ;
fuse: send security context of inode on file
When a new inode is created, send its security context to server along with
creation request (FUSE_CREAT, FUSE_MKNOD, FUSE_MKDIR and FUSE_SYMLINK).
This gives server an opportunity to create new file and set security
context (possibly atomically). In all the configurations it might not be
possible to set context atomically.
Like nfs and ceph, use security_dentry_init_security() to dermine security
context of inode and send it with create, mkdir, mknod, and symlink
requests.
Following is the information sent to server.
fuse_sectx_header, fuse_secctx, xattr_name, security_context
- struct fuse_secctx_header
This contains total number of security contexts being sent and total
size of all the security contexts (including size of
fuse_secctx_header).
- struct fuse_secctx
This contains size of security context which follows this structure.
There is one fuse_secctx instance per security context.
- xattr name string
This string represents name of xattr which should be used while setting
security context.
- security context
This is the actual security context whose size is specified in
fuse_secctx struct.
Also add the FUSE_SECURITY_CTX flag for the `flags` field of the
fuse_init_out struct. When this flag is set the kernel will append the
security context for a newly created inode to the request (create, mkdir,
mknod, and symlink). The server is responsible for ensuring that the inode
appears atomically (preferrably) with the requested security context.
For example, If the server is using SELinux and backed by a "real" linux
file system that supports extended attributes it can write the security
context value to /proc/thread-self/attr/fscreate before making the syscall
to create the inode.
This patch is based on patch from Chirantan Ekbote <chirantan@chromium.org>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2021-11-11 09:32:49 -05:00
2020-05-06 17:44:12 +02:00
err = fuse_simple_request ( fm , & args ) ;
2022-11-10 15:46:33 +01:00
free_ext_value ( & args ) ;
2012-06-05 15:10:22 +02:00
if ( err )
2005-11-07 00:59:51 -08:00
goto out_free_ff ;
err = - EIO ;
2019-11-12 11:49:04 +01:00
if ( ! S_ISREG ( outentry . attr . mode ) | | invalid_nodeid ( outentry . nodeid ) | |
fuse_invalid_attr ( & outentry . attr ) )
2005-11-07 00:59:51 -08:00
goto out_free_ff ;
2009-04-28 16:56:37 +02:00
ff - > fh = outopen . fh ;
ff - > nodeid = outentry . nodeid ;
ff - > open_flags = outopen . open_flags ;
2005-11-07 00:59:51 -08:00
inode = fuse_iget ( dir - > i_sb , outentry . nodeid , outentry . generation ,
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
& outentry . attr , entry_attr_timeout ( & outentry ) , 0 ) ;
2005-11-07 00:59:51 -08:00
if ( ! inode ) {
flags & = ~ ( O_CREAT | O_EXCL | O_TRUNC ) ;
2018-11-09 13:33:11 +03:00
fuse_sync_release ( NULL , ff , flags ) ;
2020-05-06 17:44:12 +02:00
fuse_queue_forget ( fm - > fc , forget , outentry . nodeid , 1 ) ;
2012-06-05 15:10:22 +02:00
err = - ENOMEM ;
goto out_err ;
2005-11-07 00:59:51 -08:00
}
2010-12-07 20:16:56 +01:00
kfree ( forget ) ;
2005-11-07 00:59:51 -08:00
d_instantiate ( entry , inode ) ;
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
fuse_change_entry_timeout ( entry , & outentry ) ;
2018-10-01 10:07:05 +02:00
fuse_dir_changed ( dir ) ;
2018-06-08 11:44:56 -04:00
err = finish_open ( file , entry , generic_file_open ) ;
2012-06-22 12:40:19 +04:00
if ( err ) {
2018-11-09 13:33:11 +03:00
fi = get_fuse_inode ( inode ) ;
fuse_sync_release ( fi , ff , flags ) ;
2012-06-05 15:10:22 +02:00
} else {
2017-02-22 20:08:25 +01:00
file - > private_data = ff ;
2012-06-05 15:10:22 +02:00
fuse_finish_open ( inode , file ) ;
fuse: fix deadlock between atomic O_TRUNC and page invalidation
fuse_finish_open() will be called with FUSE_NOWRITE set in case of atomic
O_TRUNC open(), so commit 76224355db75 ("fuse: truncate pagecache on
atomic_o_trunc") replaced invalidate_inode_pages2() by truncate_pagecache()
in such a case to avoid the A-A deadlock. However, we found another A-B-B-A
deadlock related to the case above, which will cause the xfstests
generic/464 testcase hung in our virtio-fs test environment.
For example, consider two processes concurrently open one same file, one
with O_TRUNC and another without O_TRUNC. The deadlock case is described
below, if open(O_TRUNC) is already set_nowrite(acquired A), and is trying
to lock a page (acquiring B), open() could have held the page lock
(acquired B), and waiting on the page writeback (acquiring A). This would
lead to deadlocks.
open(O_TRUNC)
----------------------------------------------------------------
fuse_open_common
inode_lock [C acquire]
fuse_set_nowrite [A acquire]
fuse_finish_open
truncate_pagecache
lock_page [B acquire]
truncate_inode_page
unlock_page [B release]
fuse_release_nowrite [A release]
inode_unlock [C release]
----------------------------------------------------------------
open()
----------------------------------------------------------------
fuse_open_common
fuse_finish_open
invalidate_inode_pages2
lock_page [B acquire]
fuse_launder_page
fuse_wait_on_page_writeback [A acquire & release]
unlock_page [B release]
----------------------------------------------------------------
Besides this case, all calls of invalidate_inode_pages2() and
invalidate_inode_pages2_range() in fuse code also can deadlock with
open(O_TRUNC).
Fix by moving the truncate_pagecache() call outside the nowrite protected
region. The nowrite protection is only for delayed writeback
(writeback_cache) case, where inode lock does not protect against
truncation racing with writes on the server. Write syscalls racing with
page cache truncation still get the inode lock protection.
This patch also changes the order of filemap_invalidate_lock()
vs. fuse_set_nowrite() in fuse_open_common(). This new order matches the
order found in fuse_file_fallocate() and fuse_do_setattr().
Reported-by: Jiachen Zhang <zhangjiachen.jaycee@bytedance.com>
Tested-by: Jiachen Zhang <zhangjiachen.jaycee@bytedance.com>
Fixes: e4648309b85a ("fuse: truncate pending writes on O_TRUNC")
Cc: <stable@vger.kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2022-04-22 15:48:53 +02:00
if ( fm - > fc - > atomic_o_trunc & & trunc )
truncate_pagecache ( inode , 0 ) ;
else if ( ! ( ff - > open_flags & FOPEN_KEEP_CACHE ) )
invalidate_inode_pages2 ( inode - > i_mapping ) ;
2005-11-07 00:59:51 -08:00
}
2012-06-22 12:39:14 +04:00
return err ;
2005-11-07 00:59:51 -08:00
2012-06-05 15:10:22 +02:00
out_free_ff :
2005-11-07 00:59:51 -08:00
fuse_file_free ( ff ) ;
2012-06-05 15:10:22 +02:00
out_put_forget_req :
2010-12-07 20:16:56 +01:00
kfree ( forget ) ;
2012-06-05 15:10:22 +02:00
out_err :
2012-06-22 12:39:14 +04:00
return err ;
2012-06-05 15:10:22 +02:00
}
2023-01-13 12:49:16 +01:00
static int fuse_mknod ( struct mnt_idmap * , struct inode * , struct dentry * ,
2021-01-21 14:19:43 +01:00
umode_t , dev_t ) ;
2012-06-22 12:39:14 +04:00
static int fuse_atomic_open ( struct inode * dir , struct dentry * entry ,
2012-06-22 12:40:19 +04:00
struct file * file , unsigned flags ,
2018-06-08 13:32:02 -04:00
umode_t mode )
2012-06-05 15:10:22 +02:00
{
int err ;
struct fuse_conn * fc = get_fuse_conn ( dir ) ;
struct dentry * res = NULL ;
2020-12-10 15:33:14 +01:00
if ( fuse_is_bad ( dir ) )
return - EIO ;
2016-07-05 09:44:53 -04:00
if ( d_in_lookup ( entry ) ) {
2012-06-10 17:13:09 -04:00
res = fuse_lookup ( dir , entry , 0 ) ;
2012-06-05 15:10:22 +02:00
if ( IS_ERR ( res ) )
2012-06-22 12:39:14 +04:00
return PTR_ERR ( res ) ;
2012-06-05 15:10:22 +02:00
if ( res )
entry = res ;
}
2015-03-17 22:25:59 +00:00
if ( ! ( flags & O_CREAT ) | | d_really_is_positive ( entry ) )
2012-06-05 15:10:22 +02:00
goto no_open ;
/* Only creates */
2018-06-08 13:22:02 -04:00
file - > f_mode | = FMODE_CREATED ;
2012-06-05 15:10:22 +02:00
if ( fc - > no_create )
goto mknod ;
2022-09-24 07:00:00 +02:00
err = fuse_create_open ( dir , entry , file , flags , mode , FUSE_CREATE ) ;
2012-06-22 12:39:14 +04:00
if ( err = = - ENOSYS ) {
2012-06-05 15:10:22 +02:00
fc - > no_create = 1 ;
goto mknod ;
}
out_dput :
dput ( res ) ;
2012-06-22 12:39:14 +04:00
return err ;
2012-06-05 15:10:22 +02:00
mknod :
2023-01-13 12:49:16 +01:00
err = fuse_mknod ( & nop_mnt_idmap , dir , entry , mode , 0 ) ;
2012-06-22 12:39:14 +04:00
if ( err )
2012-06-05 15:10:22 +02:00
goto out_dput ;
no_open :
2012-06-10 06:48:09 -04:00
return finish_no_open ( file , res ) ;
2005-11-07 00:59:51 -08:00
}
2006-01-06 00:19:39 -08:00
/*
* Code shared between mknod , mkdir , symlink and link
*/
2020-05-06 17:44:12 +02:00
static int create_new_entry ( struct fuse_mount * fm , struct fuse_args * args ,
2005-09-09 13:10:29 -07:00
struct inode * dir , struct dentry * entry ,
2011-07-26 03:17:33 -04:00
umode_t mode )
2005-09-09 13:10:29 -07:00
{
struct fuse_entry_out outarg ;
struct inode * inode ;
2018-05-28 18:27:19 -04:00
struct dentry * d ;
2005-09-09 13:10:29 -07:00
int err ;
2010-12-07 20:16:56 +01:00
struct fuse_forget_link * forget ;
2006-11-25 11:09:20 -08:00
2020-12-10 15:33:14 +01:00
if ( fuse_is_bad ( dir ) )
return - EIO ;
2010-12-07 20:16:56 +01:00
forget = fuse_alloc_forget ( ) ;
2014-12-12 09:49:05 +01:00
if ( ! forget )
2010-12-07 20:16:56 +01:00
return - ENOMEM ;
2005-09-09 13:10:29 -07:00
2007-10-18 03:07:05 -07:00
memset ( & outarg , 0 , sizeof ( outarg ) ) ;
2019-09-10 15:04:08 +02:00
args - > nodeid = get_node_id ( dir ) ;
args - > out_numargs = 1 ;
args - > out_args [ 0 ] . size = sizeof ( outarg ) ;
args - > out_args [ 0 ] . value = & outarg ;
fuse: send security context of inode on file
When a new inode is created, send its security context to server along with
creation request (FUSE_CREAT, FUSE_MKNOD, FUSE_MKDIR and FUSE_SYMLINK).
This gives server an opportunity to create new file and set security
context (possibly atomically). In all the configurations it might not be
possible to set context atomically.
Like nfs and ceph, use security_dentry_init_security() to dermine security
context of inode and send it with create, mkdir, mknod, and symlink
requests.
Following is the information sent to server.
fuse_sectx_header, fuse_secctx, xattr_name, security_context
- struct fuse_secctx_header
This contains total number of security contexts being sent and total
size of all the security contexts (including size of
fuse_secctx_header).
- struct fuse_secctx
This contains size of security context which follows this structure.
There is one fuse_secctx instance per security context.
- xattr name string
This string represents name of xattr which should be used while setting
security context.
- security context
This is the actual security context whose size is specified in
fuse_secctx struct.
Also add the FUSE_SECURITY_CTX flag for the `flags` field of the
fuse_init_out struct. When this flag is set the kernel will append the
security context for a newly created inode to the request (create, mkdir,
mknod, and symlink). The server is responsible for ensuring that the inode
appears atomically (preferrably) with the requested security context.
For example, If the server is using SELinux and backed by a "real" linux
file system that supports extended attributes it can write the security
context value to /proc/thread-self/attr/fscreate before making the syscall
to create the inode.
This patch is based on patch from Chirantan Ekbote <chirantan@chromium.org>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2021-11-11 09:32:49 -05:00
2022-11-10 15:46:33 +01:00
if ( args - > opcode ! = FUSE_LINK ) {
2022-11-10 15:46:33 +01:00
err = get_create_ext ( args , dir , entry , mode ) ;
fuse: send security context of inode on file
When a new inode is created, send its security context to server along with
creation request (FUSE_CREAT, FUSE_MKNOD, FUSE_MKDIR and FUSE_SYMLINK).
This gives server an opportunity to create new file and set security
context (possibly atomically). In all the configurations it might not be
possible to set context atomically.
Like nfs and ceph, use security_dentry_init_security() to dermine security
context of inode and send it with create, mkdir, mknod, and symlink
requests.
Following is the information sent to server.
fuse_sectx_header, fuse_secctx, xattr_name, security_context
- struct fuse_secctx_header
This contains total number of security contexts being sent and total
size of all the security contexts (including size of
fuse_secctx_header).
- struct fuse_secctx
This contains size of security context which follows this structure.
There is one fuse_secctx instance per security context.
- xattr name string
This string represents name of xattr which should be used while setting
security context.
- security context
This is the actual security context whose size is specified in
fuse_secctx struct.
Also add the FUSE_SECURITY_CTX flag for the `flags` field of the
fuse_init_out struct. When this flag is set the kernel will append the
security context for a newly created inode to the request (create, mkdir,
mknod, and symlink). The server is responsible for ensuring that the inode
appears atomically (preferrably) with the requested security context.
For example, If the server is using SELinux and backed by a "real" linux
file system that supports extended attributes it can write the security
context value to /proc/thread-self/attr/fscreate before making the syscall
to create the inode.
This patch is based on patch from Chirantan Ekbote <chirantan@chromium.org>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2021-11-11 09:32:49 -05:00
if ( err )
goto out_put_forget_req ;
}
2020-05-06 17:44:12 +02:00
err = fuse_simple_request ( fm , args ) ;
2022-11-10 15:46:33 +01:00
free_ext_value ( args ) ;
2006-11-25 11:09:20 -08:00
if ( err )
goto out_put_forget_req ;
2006-01-06 00:19:43 -08:00
err = - EIO ;
2019-11-12 11:49:04 +01:00
if ( invalid_nodeid ( outarg . nodeid ) | | fuse_invalid_attr ( & outarg . attr ) )
2006-11-25 11:09:20 -08:00
goto out_put_forget_req ;
2006-01-06 00:19:43 -08:00
if ( ( outarg . attr . mode ^ mode ) & S_IFMT )
2006-11-25 11:09:20 -08:00
goto out_put_forget_req ;
2006-01-06 00:19:43 -08:00
2005-09-09 13:10:29 -07:00
inode = fuse_iget ( dir - > i_sb , outarg . nodeid , outarg . generation ,
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
& outarg . attr , entry_attr_timeout ( & outarg ) , 0 ) ;
2005-09-09 13:10:29 -07:00
if ( ! inode ) {
2020-05-06 17:44:12 +02:00
fuse_queue_forget ( fm - > fc , forget , outarg . nodeid , 1 ) ;
2005-09-09 13:10:29 -07:00
return - ENOMEM ;
}
2010-12-07 20:16:56 +01:00
kfree ( forget ) ;
2005-09-09 13:10:29 -07:00
2018-05-28 18:27:19 -04:00
d_drop ( entry ) ;
d = d_splice_alias ( inode , entry ) ;
if ( IS_ERR ( d ) )
return PTR_ERR ( d ) ;
2005-09-09 13:10:29 -07:00
2018-05-28 18:27:19 -04:00
if ( d ) {
fuse_change_entry_timeout ( d , & outarg ) ;
dput ( d ) ;
} else {
fuse_change_entry_timeout ( entry , & outarg ) ;
}
2018-10-01 10:07:05 +02:00
fuse_dir_changed ( dir ) ;
2005-09-09 13:10:29 -07:00
return 0 ;
2006-01-06 00:19:43 -08:00
2006-11-25 11:09:20 -08:00
out_put_forget_req :
2010-12-07 20:16:56 +01:00
kfree ( forget ) ;
2006-01-06 00:19:43 -08:00
return err ;
2005-09-09 13:10:29 -07:00
}
2023-01-13 12:49:16 +01:00
/*
 * Send a FUSE_MKNOD request for @entry in directory @dir.
 *
 * Unless the connection has dont_mask set, the current umask bits are
 * cleared from @mode before it is placed in the request.  The umask itself
 * is always sent along, and @rdev is encoded with new_encode_dev().
 * @idmap is not used.
 *
 * Returns the result of create_new_entry().
 */
static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir,
		      struct dentry *entry, umode_t mode, dev_t rdev)
{
	struct fuse_mount *fm = get_fuse_mount(dir);
	struct fuse_mknod_in req;
	FUSE_ARGS(args);

	if (!fm->fc->dont_mask)
		mode &= ~current_umask();

	/* Zero the whole request first so no stale stack bytes are sent. */
	memset(&req, 0, sizeof(req));
	req.mode = mode;
	req.rdev = new_encode_dev(rdev);
	req.umask = current_umask();

	/* Argument 0 is the fixed header, argument 1 the NUL-terminated name. */
	args.opcode = FUSE_MKNOD;
	args.in_numargs = 2;
	args.in_args[0].size = sizeof(req);
	args.in_args[0].value = &req;
	args.in_args[1].size = entry->d_name.len + 1;
	args.in_args[1].value = entry->d_name.name;

	return create_new_entry(fm, &args, dir, entry, mode);
}
2023-01-13 12:49:13 +01:00
/*
 * Implement create in terms of fuse_mknod() with a device number of 0.
 *
 * @idmap and @excl are not consulted; nop_mnt_idmap is always passed down.
 */
static int fuse_create(struct mnt_idmap *idmap, struct inode *dir,
		       struct dentry *entry, umode_t mode, bool excl)
{
	const dev_t no_device = 0;

	return fuse_mknod(&nop_mnt_idmap, dir, entry, mode, no_device);
}
2023-01-13 12:49:18 +01:00
/*
 * Create an unnamed temporary file through fuse_create_open() using the
 * FUSE_TMPFILE opcode.
 *
 * If the request comes back with -ENOSYS, that fact is recorded in
 * fc->no_tmpfile so that later calls fail straight away, and the error is
 * reported to the caller as -EOPNOTSUPP.  @idmap is not used.
 */
static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
			struct file *file, umode_t mode)
{
	struct fuse_conn *fc = get_fuse_conn(dir);
	int err;

	if (fc->no_tmpfile)
		return -EOPNOTSUPP;

	err = fuse_create_open(dir, file->f_path.dentry, file, file->f_flags,
			       mode, FUSE_TMPFILE);
	if (err != -ENOSYS)
		return err;

	/* Remember the -ENOSYS answer; do not ask again on this connection. */
	fc->no_tmpfile = 1;
	return -EOPNOTSUPP;
}
2023-01-13 12:49:15 +01:00
/*
 * Send a FUSE_MKDIR request for @entry in directory @dir.
 *
 * Unless the connection has dont_mask set, the current umask bits are
 * cleared from @mode before it is placed in the request; the umask itself
 * is always sent along.  @idmap is not used.
 *
 * Returns the result of create_new_entry(), which is called with S_IFDIR
 * as the mode argument.
 */
static int fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir,
		      struct dentry *entry, umode_t mode)
{
	struct fuse_mount *fm = get_fuse_mount(dir);
	struct fuse_mkdir_in req;
	FUSE_ARGS(args);

	if (!fm->fc->dont_mask)
		mode &= ~current_umask();

	memset(&req, 0, sizeof(req));
	req.mode = mode;
	req.umask = current_umask();

	/* Argument 0 is the fixed header, argument 1 the NUL-terminated name. */
	args.opcode = FUSE_MKDIR;
	args.in_numargs = 2;
	args.in_args[0].size = sizeof(req);
	args.in_args[0].value = &req;
	args.in_args[1].size = entry->d_name.len + 1;
	args.in_args[1].value = entry->d_name.name;

	return create_new_entry(fm, &args, dir, entry, S_IFDIR);
}
2023-01-13 12:49:14 +01:00
static int fuse_symlink ( struct mnt_idmap * idmap , struct inode * dir ,
2021-01-21 14:19:43 +01:00
struct dentry * entry , const char * link )
2005-09-09 13:10:29 -07:00
{
2020-05-06 17:44:12 +02:00
struct fuse_mount * fm = get_fuse_mount ( dir ) ;
2005-09-09 13:10:29 -07:00
unsigned len = strlen ( link ) + 1 ;
2014-12-12 09:49:05 +01:00
FUSE_ARGS ( args ) ;
2005-09-09 13:10:29 -07:00
2019-09-10 15:04:08 +02:00
args . opcode = FUSE_SYMLINK ;
args . in_numargs = 2 ;
args . in_args [ 0 ] . size = entry - > d_name . len + 1 ;
args . in_args [ 0 ] . value = entry - > d_name . name ;
args . in_args [ 1 ] . size = len ;
args . in_args [ 1 ] . value = link ;
2020-05-06 17:44:12 +02:00
return create_new_entry ( fm , & args , dir , entry , S_IFLNK ) ;
2005-09-09 13:10:29 -07:00
}
2021-10-22 17:03:01 +02:00
void fuse_flush_time_update ( struct inode * inode )
{
int err = sync_inode_metadata ( inode , 1 ) ;
mapping_set_error ( inode - > i_mapping , err ) ;
}
2021-10-22 17:03:02 +02:00
static void fuse_update_ctime_in_cache ( struct inode * inode )
2014-04-28 14:19:24 +02:00
{
if ( ! IS_NOCMTIME ( inode ) ) {
2016-09-14 07:48:06 -07:00
inode - > i_ctime = current_time ( inode ) ;
2014-04-28 14:19:24 +02:00
mark_inode_dirty_sync ( inode ) ;
2021-10-22 17:03:01 +02:00
fuse_flush_time_update ( inode ) ;
2014-04-28 14:19:24 +02:00
}
}
2021-10-22 17:03:02 +02:00
void fuse_update_ctime ( struct inode * inode )
{
2021-10-22 17:03:02 +02:00
fuse_invalidate_attr_mask ( inode , STATX_CTIME ) ;
2021-10-22 17:03:02 +02:00
fuse_update_ctime_in_cache ( inode ) ;
}
2021-10-22 17:03:02 +02:00
static void fuse_entry_unlinked ( struct dentry * entry )
{
struct inode * inode = d_inode ( entry ) ;
struct fuse_conn * fc = get_fuse_conn ( inode ) ;
struct fuse_inode * fi = get_fuse_inode ( inode ) ;
spin_lock ( & fi - > lock ) ;
fi - > attr_version = atomic64_inc_return ( & fc - > attr_version ) ;
/*
* If i_nlink = = 0 then unlink doesn ' t make sense , yet this can
* happen if userspace filesystem is careless . It would be
* difficult to enforce correct nlink usage so just ignore this
* condition here
*/
if ( S_ISDIR ( inode - > i_mode ) )
clear_nlink ( inode ) ;
else if ( inode - > i_nlink > 0 )
drop_nlink ( inode ) ;
spin_unlock ( & fi - > lock ) ;
fuse_invalidate_entry_cache ( entry ) ;
fuse_update_ctime ( inode ) ;
}
2005-09-09 13:10:29 -07:00
static int fuse_unlink ( struct inode * dir , struct dentry * entry )
{
int err ;
2020-05-06 17:44:12 +02:00
struct fuse_mount * fm = get_fuse_mount ( dir ) ;
2014-12-12 09:49:05 +01:00
FUSE_ARGS ( args ) ;
2020-12-10 15:33:14 +01:00
if ( fuse_is_bad ( dir ) )
return - EIO ;
2019-09-10 15:04:08 +02:00
args . opcode = FUSE_UNLINK ;
args . nodeid = get_node_id ( dir ) ;
args . in_numargs = 1 ;
args . in_args [ 0 ] . size = entry - > d_name . len + 1 ;
args . in_args [ 0 ] . value = entry - > d_name . name ;
2020-05-06 17:44:12 +02:00
err = fuse_simple_request ( fm , & args ) ;
2005-09-09 13:10:29 -07:00
if ( ! err ) {
2018-10-01 10:07:05 +02:00
fuse_dir_changed ( dir ) ;
2021-10-22 17:03:02 +02:00
fuse_entry_unlinked ( entry ) ;
2005-09-09 13:10:29 -07:00
} else if ( err = = - EINTR )
fuse_invalidate_entry ( entry ) ;
return err ;
}
static int fuse_rmdir ( struct inode * dir , struct dentry * entry )
{
int err ;
2020-05-06 17:44:12 +02:00
struct fuse_mount * fm = get_fuse_mount ( dir ) ;
2014-12-12 09:49:05 +01:00
FUSE_ARGS ( args ) ;
2020-12-10 15:33:14 +01:00
if ( fuse_is_bad ( dir ) )
return - EIO ;
2019-09-10 15:04:08 +02:00
args . opcode = FUSE_RMDIR ;
args . nodeid = get_node_id ( dir ) ;
args . in_numargs = 1 ;
args . in_args [ 0 ] . size = entry - > d_name . len + 1 ;
args . in_args [ 0 ] . value = entry - > d_name . name ;
2020-05-06 17:44:12 +02:00
err = fuse_simple_request ( fm , & args ) ;
2005-09-09 13:10:29 -07:00
if ( ! err ) {
2018-10-01 10:07:05 +02:00
fuse_dir_changed ( dir ) ;
2021-10-22 17:03:02 +02:00
fuse_entry_unlinked ( entry ) ;
2005-09-09 13:10:29 -07:00
} else if ( err = = - EINTR )
fuse_invalidate_entry ( entry ) ;
return err ;
}
2014-04-28 16:43:44 +02:00
/*
 * Shared implementation of the rename requests.
 *
 * @opcode selects the request type and @argsize the number of bytes of the
 * fixed header that are zeroed and sent.  The header is declared as
 * struct fuse_rename2_in, so the caller picks the wire layout by passing
 * the matching size.  The old and new names follow as NUL-terminated
 * strings.
 *
 * On success the ctime of the moved inode (and of the other inode for
 * RENAME_EXCHANGE) is updated, the affected directories are marked
 * changed, and a replaced target is treated as unlinked.
 *
 * Returns the result of the request.
 */
static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
			      struct inode *newdir, struct dentry *newent,
			      unsigned int flags, int opcode, size_t argsize)
{
	struct fuse_mount *fm = get_fuse_mount(olddir);
	struct fuse_rename2_in req;
	FUSE_ARGS(args);
	int err;

	memset(&req, 0, argsize);
	req.newdir = get_node_id(newdir);
	req.flags = flags;

	args.opcode = opcode;
	args.nodeid = get_node_id(olddir);
	args.in_numargs = 3;
	args.in_args[0].size = argsize;
	args.in_args[0].value = &req;
	args.in_args[1].size = oldent->d_name.len + 1;
	args.in_args[1].value = oldent->d_name.name;
	args.in_args[2].size = newent->d_name.len + 1;
	args.in_args[2].value = newent->d_name.name;

	err = fuse_simple_request(fm, &args);
	if (err == -EINTR) {
		/*
		 * The request was interrupted, so there is no way to know
		 * whether the rename actually happened.  If the invalidation
		 * fails (e.g. some process has its CWD under the renamed
		 * directory), the dcache and the real filesystem may end up
		 * inconsistent; nothing more can be done about that here.
		 */
		fuse_invalidate_entry(oldent);
		if (d_really_is_positive(newent))
			fuse_invalidate_entry(newent);
		return err;
	}
	if (err)
		return err;

	/* A rename changes the ctime of the inode(s) involved. */
	fuse_update_ctime(d_inode(oldent));
	if (flags & RENAME_EXCHANGE)
		fuse_update_ctime(d_inode(newent));

	fuse_dir_changed(olddir);
	if (olddir != newdir)
		fuse_dir_changed(newdir);

	/* Without an exchange, a positive newent will end up negative. */
	if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent))
		fuse_entry_unlinked(newent);

	return 0;
}
2023-01-13 12:49:17 +01:00
static int fuse_rename2 ( struct mnt_idmap * idmap , struct inode * olddir ,
2021-01-21 14:19:43 +01:00
struct dentry * oldent , struct inode * newdir ,
struct dentry * newent , unsigned int flags )
2014-04-28 16:43:44 +02:00
{
struct fuse_conn * fc = get_fuse_conn ( olddir ) ;
int err ;
2020-12-10 15:33:14 +01:00
if ( fuse_is_bad ( olddir ) )
return - EIO ;
2020-02-05 08:15:46 -05:00
if ( flags & ~ ( RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT ) )
2014-04-28 16:43:44 +02:00
return - EINVAL ;
2014-07-10 10:50:19 +02:00
if ( flags ) {
if ( fc - > no_rename2 | | fc - > minor < 23 )
return - EINVAL ;
2014-04-28 16:43:44 +02:00
2014-07-10 10:50:19 +02:00
err = fuse_rename_common ( olddir , oldent , newdir , newent , flags ,
FUSE_RENAME2 ,
sizeof ( struct fuse_rename2_in ) ) ;
if ( err = = - ENOSYS ) {
fc - > no_rename2 = 1 ;
err = - EINVAL ;
}
} else {
err = fuse_rename_common ( olddir , oldent , newdir , newent , 0 ,
FUSE_RENAME ,
sizeof ( struct fuse_rename_in ) ) ;
2014-04-28 16:43:44 +02:00
}
2014-07-10 10:50:19 +02:00
2014-04-28 16:43:44 +02:00
return err ;
2014-07-10 10:50:19 +02:00
}
2014-04-28 16:43:44 +02:00
2005-09-09 13:10:29 -07:00
static int fuse_link ( struct dentry * entry , struct inode * newdir ,
struct dentry * newent )
{
int err ;
struct fuse_link_in inarg ;
2015-03-17 22:25:59 +00:00
struct inode * inode = d_inode ( entry ) ;
2020-05-06 17:44:12 +02:00
struct fuse_mount * fm = get_fuse_mount ( inode ) ;
2014-12-12 09:49:05 +01:00
FUSE_ARGS ( args ) ;
2005-09-09 13:10:29 -07:00
memset ( & inarg , 0 , sizeof ( inarg ) ) ;
inarg . oldnodeid = get_node_id ( inode ) ;
2019-09-10 15:04:08 +02:00
args . opcode = FUSE_LINK ;
args . in_numargs = 2 ;
args . in_args [ 0 ] . size = sizeof ( inarg ) ;
args . in_args [ 0 ] . value = & inarg ;
args . in_args [ 1 ] . size = newent - > d_name . len + 1 ;
args . in_args [ 1 ] . value = newent - > d_name . name ;
2020-05-06 17:44:12 +02:00
err = create_new_entry ( fm , & args , newdir , newent , inode - > i_mode ) ;
2021-10-22 17:03:02 +02:00
if ( ! err )
fuse_update_ctime_in_cache ( inode ) ;
else if ( err = = - EINTR )
2012-03-05 15:48:11 +01:00
fuse_invalidate_attr ( inode ) ;
2021-10-22 17:03:02 +02:00
2005-09-09 13:10:29 -07:00
return err ;
}
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
static void fuse_fillattr ( struct inode * inode , struct fuse_attr * attr ,
struct kstat * stat )
{
2012-05-10 19:49:38 +04:00
unsigned int blkbits ;
2013-10-10 17:10:46 +04:00
struct fuse_conn * fc = get_fuse_conn ( inode ) ;
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
stat - > dev = inode - > i_sb - > s_dev ;
stat - > ino = attr - > ino ;
stat - > mode = ( inode - > i_mode & S_IFMT ) | ( attr - > mode & 07777 ) ;
stat - > nlink = attr - > nlink ;
2018-02-21 11:18:07 -06:00
stat - > uid = make_kuid ( fc - > user_ns , attr - > uid ) ;
stat - > gid = make_kgid ( fc - > user_ns , attr - > gid ) ;
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
stat - > rdev = inode - > i_rdev ;
stat - > atime . tv_sec = attr - > atime ;
stat - > atime . tv_nsec = attr - > atimensec ;
stat - > mtime . tv_sec = attr - > mtime ;
stat - > mtime . tv_nsec = attr - > mtimensec ;
stat - > ctime . tv_sec = attr - > ctime ;
stat - > ctime . tv_nsec = attr - > ctimensec ;
stat - > size = attr - > size ;
stat - > blocks = attr - > blocks ;
2012-05-10 19:49:38 +04:00
if ( attr - > blksize ! = 0 )
blkbits = ilog2 ( attr - > blksize ) ;
else
blkbits = inode - > i_sb - > s_blocksize_bits ;
stat - > blksize = 1 < < blkbits ;
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
}
2007-10-18 03:06:59 -07:00
static int fuse_do_getattr ( struct inode * inode , struct kstat * stat ,
struct file * file )
2005-09-09 13:10:28 -07:00
{
int err ;
2007-10-18 03:06:59 -07:00
struct fuse_getattr_in inarg ;
struct fuse_attr_out outarg ;
2020-05-06 17:44:12 +02:00
struct fuse_mount * fm = get_fuse_mount ( inode ) ;
2014-12-12 09:49:05 +01:00
FUSE_ARGS ( args ) ;
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
u64 attr_version ;
2020-05-06 17:44:12 +02:00
attr_version = fuse_get_attr_version ( fm - > fc ) ;
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
2007-10-18 03:06:59 -07:00
memset ( & inarg , 0 , sizeof ( inarg ) ) ;
2007-10-18 03:07:05 -07:00
memset ( & outarg , 0 , sizeof ( outarg ) ) ;
2007-10-18 03:06:59 -07:00
/* Directories have separate file-handle space */
if ( file & & S_ISREG ( inode - > i_mode ) ) {
struct fuse_file * ff = file - > private_data ;
inarg . getattr_flags | = FUSE_GETATTR_FH ;
inarg . fh = ff - > fh ;
}
2019-09-10 15:04:08 +02:00
args . opcode = FUSE_GETATTR ;
args . nodeid = get_node_id ( inode ) ;
args . in_numargs = 1 ;
args . in_args [ 0 ] . size = sizeof ( inarg ) ;
args . in_args [ 0 ] . value = & inarg ;
args . out_numargs = 1 ;
args . out_args [ 0 ] . size = sizeof ( outarg ) ;
args . out_args [ 0 ] . value = & outarg ;
2020-05-06 17:44:12 +02:00
err = fuse_simple_request ( fm , & args ) ;
2005-09-09 13:10:28 -07:00
if ( ! err ) {
2019-11-12 11:49:04 +01:00
if ( fuse_invalid_attr ( & outarg . attr ) | |
2021-03-01 20:37:10 -05:00
inode_wrong_type ( inode , outarg . attr . mode ) ) {
2020-12-10 15:33:14 +01:00
fuse_make_bad ( inode ) ;
2005-09-09 13:10:28 -07:00
err = - EIO ;
} else {
2007-10-18 03:06:59 -07:00
fuse_change_attributes ( inode , & outarg . attr ,
attr_timeout ( & outarg ) ,
fuse: fix race between getattr and write
Getattr and lookup operations can be running in parallel to attribute changing
operations, such as write and setattr.
This means, that if for example getattr was slower than a write, the cached
size attribute could be set to a stale value.
To prevent this race, introduce a per-filesystem attribute version counter.
This counter is incremented whenever cached attributes are modified, and the
incremented value stored in the inode.
Before storing new attributes in the cache, getattr and lookup check, using
the version number, whether the attributes have been modified during the
request's lifetime. If so, the returned attributes are not cached, because
they might be stale.
Thanks to Jakub Bogusz for the bug report and test program.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jakub Bogusz <jakub.bogusz@gemius.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 03:06:58 -07:00
attr_version ) ;
if ( stat )
2007-10-18 03:06:59 -07:00
fuse_fillattr ( inode , & outarg . attr , stat ) ;
2005-09-09 13:10:28 -07:00
}
}
return err ;
}
2017-09-12 16:57:54 +02:00
/*
 * Refresh the attributes of @inode from the server if needed and
 * optionally fill @stat.
 *
 * A refresh is done when:
 *  - @flags has AT_STATX_FORCE_SYNC, or
 *  - AT_STATX_DONT_SYNC is not set and either a requested attribute is
 *    marked invalid and not covered by the cache mask, or fi->i_time lies
 *    before the current jiffies value.
 *
 * When refreshing, cached ACLs are forgotten and fuse_do_getattr() does
 * the work (filling @stat if non-NULL).  Otherwise, if @stat is non-NULL,
 * it is filled from the cached inode with mode and ino replaced by the
 * values saved in the fuse inode (orig_i_mode, orig_ino).
 *
 * Returns 0 or the error from fuse_do_getattr().
 */
static int fuse_update_get_attr(struct inode *inode, struct file *file,
				struct kstat *stat, u32 request_mask,
				unsigned int flags)
{
	struct fuse_inode *fi = get_fuse_inode(inode);
	u32 inval_mask = READ_ONCE(fi->inval_mask);
	u32 cache_mask = fuse_get_cache_mask(inode);
	bool sync;

	if (flags & AT_STATX_FORCE_SYNC)
		sync = true;
	else if (flags & AT_STATX_DONT_SYNC)
		sync = false;
	else
		sync = (request_mask & inval_mask & ~cache_mask) ||
		       time_before64(fi->i_time, get_jiffies_64());

	if (sync) {
		forget_all_cached_acls(inode);
		return fuse_do_getattr(inode, stat, file);
	}

	if (stat) {
		generic_fillattr(&nop_mnt_idmap, inode, stat);
		stat->mode = fi->orig_i_mode;
		stat->ino = fi->orig_ino;
	}

	return 0;
}
2021-10-22 17:03:03 +02:00
/*
 * Make sure the attributes selected by @mask are up to date for @inode.
 *
 * Thin wrapper around fuse_update_get_attr() that passes no stat buffer
 * and no flags.
 */
int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask)
{
	struct kstat *no_stat = NULL;

	return fuse_update_get_attr(inode, file, no_stat, mask, 0);
}
2020-05-06 17:44:12 +02:00
/*
 * Invalidate the dentry called @name below the directory whose node id is
 * @parent_nodeid.
 *
 * The parent inode is looked up and locked, the name is hashed and looked
 * up in the dcache, and the entry found is invalidated (d_invalidate() is
 * skipped when @flags contains FUSE_EXPIRE_ONLY) and has its entry cache
 * invalidated.
 *
 * If @child_nodeid is non-zero and the entry is positive, the entry is
 * additionally deleted from the dcache, provided that its node id matches
 * @child_nodeid, it is not a mountpoint and, for directories, it is empty
 * after its children have been shrunk.
 *
 * Returns 0 on success, -ENOENT if the parent, its alias, the entry or a
 * matching child is not found, -ENOTDIR if the parent is not a directory,
 * -EBUSY if the child is a mountpoint, -ENOTEMPTY if it is a non-empty
 * directory.
 */
int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
			     u64 child_nodeid, struct qstr *name, u32 flags)
{
	struct inode *parent;
	struct dentry *parent_dentry;
	struct dentry *entry;
	int err;

	parent = fuse_ilookup(fc, parent_nodeid, NULL);
	if (!parent)
		return -ENOENT;

	inode_lock_nested(parent, I_MUTEX_PARENT);

	err = -ENOTDIR;
	if (!S_ISDIR(parent->i_mode))
		goto unlock;

	err = -ENOENT;
	parent_dentry = d_find_alias(parent);
	if (!parent_dentry)
		goto unlock;

	name->hash = full_name_hash(parent_dentry, name->name, name->len);
	entry = d_lookup(parent_dentry, name);
	dput(parent_dentry);
	if (!entry)
		goto unlock;

	fuse_dir_changed(parent);
	if (!(flags & FUSE_EXPIRE_ONLY))
		d_invalidate(entry);
	fuse_invalidate_entry_cache(entry);

	/* From here on the default outcome is success. */
	err = 0;
	if (child_nodeid == 0 || !d_really_is_positive(entry))
		goto put_entry;

	inode_lock(d_inode(entry));
	if (get_node_id(d_inode(entry)) != child_nodeid) {
		err = -ENOENT;
		goto badentry;
	}
	if (d_mountpoint(entry)) {
		err = -EBUSY;
		goto badentry;
	}
	if (d_is_dir(entry)) {
		shrink_dcache_parent(entry);
		if (!simple_empty(entry)) {
			err = -ENOTEMPTY;
			goto badentry;
		}
		d_inode(entry)->i_flags |= S_DEAD;
	}
	dont_mount(entry);
	clear_nlink(d_inode(entry));

badentry:
	inode_unlock(d_inode(entry));
	/* Delete only after the child's inode lock has been dropped. */
	if (!err)
		d_delete(entry);

put_entry:
	dput(entry);

unlock:
	inode_unlock(parent);
	iput(parent);
	return err;
}
fuse: Rearrange fuse_allow_current_process checks
This is a followup to a previous commit of mine [0], which added the
allow_sys_admin_access && capable(CAP_SYS_ADMIN) check. This patch
rearranges the order of checks in fuse_allow_current_process without
changing functionality.
Commit 9ccf47b26b73 ("fuse: Add module param for CAP_SYS_ADMIN access
bypassing allow_other") added allow_sys_admin_access &&
capable(CAP_SYS_ADMIN) check to the beginning of the function, with the
reasoning that allow_sys_admin_access should be an 'escape hatch' for users
with CAP_SYS_ADMIN, allowing them to skip any subsequent checks.
However, placing this new check first results in many capable() calls when
allow_sys_admin_access is set, where another check would've also returned
1. This can be problematic when a BPF program is tracing capable() calls.
At Meta we ran into such a scenario recently. On a host where
allow_sys_admin_access is set but most of the FUSE access is from processes
which would pass other checks - i.e. they don't need CAP_SYS_ADMIN 'escape
hatch' - this results in an unnecessary capable() call for each fs op. We
also have a daemon tracing capable() with BPF and doing some data
collection, so tracing these extraneous capable() calls has the potential
to regress performance for an application doing many FUSE ops.
So rearrange the order of these checks such that CAP_SYS_ADMIN 'escape
hatch' is checked last. Add a small helper, fuse_permissible_uidgid, to
make the logic easier to understand. Previously, if allow_other is set on
the fuse_conn, uid/gid checking doesn't happen as current_in_userns result
is returned. These semantics are maintained here: fuse_permissible_uidgid
check only happens if allow_other is not set.
Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2022-10-25 09:10:17 -07:00
static inline bool fuse_permissible_uidgid ( struct fuse_conn * fc )
{
const struct cred * cred = current_cred ( ) ;
return ( uid_eq ( cred - > euid , fc - > user_id ) & &
uid_eq ( cred - > suid , fc - > user_id ) & &
uid_eq ( cred - > uid , fc - > user_id ) & &
gid_eq ( cred - > egid , fc - > group_id ) & &
gid_eq ( cred - > sgid , fc - > group_id ) & &
gid_eq ( cred - > gid , fc - > group_id ) ) ;
}
2005-09-09 13:10:34 -07:00
/*
* Calling into a user - controlled filesystem gives the filesystem
2013-01-14 22:30:00 -08:00
* daemon ptrace - like capabilities over the current process . This
2005-09-09 13:10:34 -07:00
* means , that the filesystem daemon is able to record the exact
* filesystem operations performed , and can also control the behavior
* of the requester process in otherwise impossible ways . For example
* it can delay the operation for arbitrary length of time allowing
* DoS against the requester .
*
* For this reason only those processes can call into the filesystem ,
* for which the owner of the mount has ptrace privilege . This
* excludes processes started by other users , suid or sgid processes .
*/
fuse: Rearrange fuse_allow_current_process checks
This is a followup to a previous commit of mine [0], which added the
allow_sys_admin_access && capable(CAP_SYS_ADMIN) check. This patch
rearranges the order of checks in fuse_allow_current_process without
changing functionality.
Commit 9ccf47b26b73 ("fuse: Add module param for CAP_SYS_ADMIN access
bypassing allow_other") added allow_sys_admin_access &&
capable(CAP_SYS_ADMIN) check to the beginning of the function, with the
reasoning that allow_sys_admin_access should be an 'escape hatch' for users
with CAP_SYS_ADMIN, allowing them to skip any subsequent checks.
However, placing this new check first results in many capable() calls when
allow_sys_admin_access is set, where another check would've also returned
1. This can be problematic when a BPF program is tracing capable() calls.
At Meta we ran into such a scenario recently. On a host where
allow_sys_admin_access is set but most of the FUSE access is from processes
which would pass other checks - i.e. they don't need CAP_SYS_ADMIN 'escape
hatch' - this results in an unnecessary capable() call for each fs op. We
also have a daemon tracing capable() with BPF and doing some data
collection, so tracing these extraneous capable() calls has the potential
to regress performance for an application doing many FUSE ops.
So rearrange the order of these checks such that CAP_SYS_ADMIN 'escape
hatch' is checked last. Add a small helper, fuse_permissible_uidgid, to
make the logic easier to understand. Previously, if allow_other is set on
the fuse_conn, uid/gid checking doesn't happen as current_in_userns result
is returned. These semantics are maintained here: fuse_permissible_uidgid
check only happens if allow_other is not set.
Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2022-10-25 09:10:17 -07:00
bool fuse_allow_current_process ( struct fuse_conn * fc )
2005-09-09 13:10:34 -07:00
{
fuse: Rearrange fuse_allow_current_process checks
This is a followup to a previous commit of mine [0], which added the
allow_sys_admin_access && capable(CAP_SYS_ADMIN) check. This patch
rearranges the order of checks in fuse_allow_current_process without
changing functionality.
Commit 9ccf47b26b73 ("fuse: Add module param for CAP_SYS_ADMIN access
bypassing allow_other") added allow_sys_admin_access &&
capable(CAP_SYS_ADMIN) check to the beginning of the function, with the
reasoning that allow_sys_admin_access should be an 'escape hatch' for users
with CAP_SYS_ADMIN, allowing them to skip any subsequent checks.
However, placing this new check first results in many capable() calls when
allow_sys_admin_access is set, where another check would've also returned
1. This can be problematic when a BPF program is tracing capable() calls.
At Meta we ran into such a scenario recently. On a host where
allow_sys_admin_access is set but most of the FUSE access is from processes
which would pass other checks - i.e. they don't need CAP_SYS_ADMIN 'escape
hatch' - this results in an unnecessary capable() call for each fs op. We
also have a daemon tracing capable() with BPF and doing some data
collection, so tracing these extraneous capable() calls has the potential
to regress performance for an application doing many FUSE ops.
So rearrange the order of these checks such that CAP_SYS_ADMIN 'escape
hatch' is checked last. Add a small helper, fuse_permissible_uidgid, to
make the logic easier to understand. Previously, if allow_other is set on
the fuse_conn, uid/git checking doesn't happen as current_in_userns result
is returned. These semantics are maintained here: fuse_permissible_uidgid
check only happens if allow_other is not set.
Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2022-10-25 09:10:17 -07:00
bool allow ;
2022-07-11 10:48:08 -07:00
2016-10-01 07:32:32 +02:00
if ( fc - > allow_other )
fuse: Rearrange fuse_allow_current_process checks
This is a followup to a previous commit of mine [0], which added the
allow_sys_admin_access && capable(CAP_SYS_ADMIN) check. This patch
rearranges the order of checks in fuse_allow_current_process without
changing functionality.
Commit 9ccf47b26b73 ("fuse: Add module param for CAP_SYS_ADMIN access
bypassing allow_other") added allow_sys_admin_access &&
capable(CAP_SYS_ADMIN) check to the beginning of the function, with the
reasoning that allow_sys_admin_access should be an 'escape hatch' for users
with CAP_SYS_ADMIN, allowing them to skip any subsequent checks.
However, placing this new check first results in many capable() calls when
allow_sys_admin_access is set, where another check would've also returned
1. This can be problematic when a BPF program is tracing capable() calls.
At Meta we ran into such a scenario recently. On a host where
allow_sys_admin_access is set but most of the FUSE access is from processes
which would pass other checks - i.e. they don't need CAP_SYS_ADMIN 'escape
hatch' - this results in an unnecessary capable() call for each fs op. We
also have a daemon tracing capable() with BPF and doing some data
collection, so tracing these extraneous capable() calls has the potential
to regress performance for an application doing many FUSE ops.
So rearrange the order of these checks such that CAP_SYS_ADMIN 'escape
hatch' is checked last. Add a small helper, fuse_permissible_uidgid, to
make the logic easier to understand. Previously, if allow_other is set on
the fuse_conn, uid/git checking doesn't happen as current_in_userns result
is returned. These semantics are maintained here: fuse_permissible_uidgid
check only happens if allow_other is not set.
Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2022-10-25 09:10:17 -07:00
allow = current_in_userns ( fc - > user_ns ) ;
else
allow = fuse_permissible_uidgid ( fc ) ;
2005-09-09 13:10:34 -07:00
fuse: Rearrange fuse_allow_current_process checks
This is a followup to a previous commit of mine [0], which added the
allow_sys_admin_access && capable(CAP_SYS_ADMIN) check. This patch
rearranges the order of checks in fuse_allow_current_process without
changing functionality.
Commit 9ccf47b26b73 ("fuse: Add module param for CAP_SYS_ADMIN access
bypassing allow_other") added allow_sys_admin_access &&
capable(CAP_SYS_ADMIN) check to the beginning of the function, with the
reasoning that allow_sys_admin_access should be an 'escape hatch' for users
with CAP_SYS_ADMIN, allowing them to skip any subsequent checks.
However, placing this new check first results in many capable() calls when
allow_sys_admin_access is set, where another check would've also returned
1. This can be problematic when a BPF program is tracing capable() calls.
At Meta we ran into such a scenario recently. On a host where
allow_sys_admin_access is set but most of the FUSE access is from processes
which would pass other checks - i.e. they don't need CAP_SYS_ADMIN 'escape
hatch' - this results in an unnecessary capable() call for each fs op. We
also have a daemon tracing capable() with BPF and doing some data
collection, so tracing these extraneous capable() calls has the potential
to regress performance for an application doing many FUSE ops.
So rearrange the order of these checks such that CAP_SYS_ADMIN 'escape
hatch' is checked last. Add a small helper, fuse_permissible_uidgid, to
make the logic easier to understand. Previously, if allow_other is set on
the fuse_conn, uid/git checking doesn't happen as current_in_userns result
is returned. These semantics are maintained here: fuse_permissible_uidgid
check only happens if allow_other is not set.
Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2022-10-25 09:10:17 -07:00
if ( ! allow & & allow_sys_admin_access & & capable ( CAP_SYS_ADMIN ) )
allow = true ;
2008-11-14 10:39:19 +11:00
fuse: Rearrange fuse_allow_current_process checks
This is a followup to a previous commit of mine [0], which added the
allow_sys_admin_access && capable(CAP_SYS_ADMIN) check. This patch
rearranges the order of checks in fuse_allow_current_process without
changing functionality.
Commit 9ccf47b26b73 ("fuse: Add module param for CAP_SYS_ADMIN access
bypassing allow_other") added allow_sys_admin_access &&
capable(CAP_SYS_ADMIN) check to the beginning of the function, with the
reasoning that allow_sys_admin_access should be an 'escape hatch' for users
with CAP_SYS_ADMIN, allowing them to skip any subsequent checks.
However, placing this new check first results in many capable() calls when
allow_sys_admin_access is set, where another check would've also returned
1. This can be problematic when a BPF program is tracing capable() calls.
At Meta we ran into such a scenario recently. On a host where
allow_sys_admin_access is set but most of the FUSE access is from processes
which would pass other checks - i.e. they don't need CAP_SYS_ADMIN 'escape
hatch' - this results in an unnecessary capable() call for each fs op. We
also have a daemon tracing capable() with BPF and doing some data
collection, so tracing these extraneous capable() calls has the potential
to regress performance for an application doing many FUSE ops.
So rearrange the order of these checks such that CAP_SYS_ADMIN 'escape
hatch' is checked last. Add a small helper, fuse_permissible_uidgid, to
make the logic easier to understand. Previously, if allow_other is set on
the fuse_conn, uid/git checking doesn't happen as current_in_userns result
is returned. These semantics are maintained here: fuse_permissible_uidgid
check only happens if allow_other is not set.
Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2022-10-25 09:10:17 -07:00
return allow ;
2005-09-09 13:10:34 -07:00
}
2005-11-07 00:59:50 -08:00
static int fuse_access ( struct inode * inode , int mask )
{
2020-05-06 17:44:12 +02:00
struct fuse_mount * fm = get_fuse_mount ( inode ) ;
2014-12-12 09:49:05 +01:00
FUSE_ARGS ( args ) ;
2005-11-07 00:59:50 -08:00
struct fuse_access_in inarg ;
int err ;
2013-10-01 16:41:23 +02:00
BUG_ON ( mask & MAY_NOT_BLOCK ) ;
2020-05-06 17:44:12 +02:00
if ( fm - > fc - > no_access )
2005-11-07 00:59:50 -08:00
return 0 ;
memset ( & inarg , 0 , sizeof ( inarg ) ) ;
2008-07-15 21:03:57 -04:00
inarg . mask = mask & ( MAY_READ | MAY_WRITE | MAY_EXEC ) ;
2019-09-10 15:04:08 +02:00
args . opcode = FUSE_ACCESS ;
args . nodeid = get_node_id ( inode ) ;
args . in_numargs = 1 ;
args . in_args [ 0 ] . size = sizeof ( inarg ) ;
args . in_args [ 0 ] . value = & inarg ;
2020-05-06 17:44:12 +02:00
err = fuse_simple_request ( fm , & args ) ;
2005-11-07 00:59:50 -08:00
if ( err = = - ENOSYS ) {
2020-05-06 17:44:12 +02:00
fm - > fc - > no_access = 1 ;
2005-11-07 00:59:50 -08:00
err = 0 ;
}
return err ;
}
2011-06-20 19:28:19 -04:00
static int fuse_perm_getattr ( struct inode * inode , int mask )
2011-03-21 13:58:06 +01:00
{
2011-06-20 19:28:19 -04:00
if ( mask & MAY_NOT_BLOCK )
2011-03-21 13:58:06 +01:00
return - ECHILD ;
2016-08-29 08:46:37 -05:00
forget_all_cached_acls ( inode ) ;
2011-03-21 13:58:06 +01:00
return fuse_do_getattr ( inode , NULL , NULL ) ;
}
2006-01-06 00:19:39 -08:00
/*
* Check permission . The two basic access models of FUSE are :
*
* 1 ) Local access checking ( ' default_permissions ' mount option ) based
* on file mode . This is the plain old disk filesystem permission
* model .
*
* 2 ) " Remote " access checking , where server is responsible for
* checking permission in each inode operation . An exception to this
* is if - > permission ( ) was invoked from sys_access ( ) in which case an
* access request is sent . Execute permission is still checked
* locally based on file mode .
*/
2023-01-13 12:49:22 +01:00
static int fuse_permission ( struct mnt_idmap * idmap ,
2021-01-21 14:19:43 +01:00
struct inode * inode , int mask )
2005-09-09 13:10:28 -07:00
{
struct fuse_conn * fc = get_fuse_conn ( inode ) ;
2007-10-16 23:31:02 -07:00
bool refreshed = false ;
int err = 0 ;
2005-09-09 13:10:28 -07:00
2020-12-10 15:33:14 +01:00
if ( fuse_is_bad ( inode ) )
return - EIO ;
2013-01-14 22:30:00 -08:00
if ( ! fuse_allow_current_process ( fc ) )
2005-09-09 13:10:28 -07:00
return - EACCES ;
2007-10-16 23:31:02 -07:00
/*
2007-10-16 23:31:06 -07:00
* If attributes are needed , refresh them before proceeding
2007-10-16 23:31:02 -07:00
*/
2016-10-01 07:32:32 +02:00
if ( fc - > default_permissions | |
2007-10-16 23:31:06 -07:00
( ( mask & MAY_EXEC ) & & S_ISREG ( inode - > i_mode ) ) ) {
2011-03-21 13:58:06 +01:00
struct fuse_inode * fi = get_fuse_inode ( inode ) ;
2018-12-03 10:14:43 +01:00
u32 perm_mask = STATX_MODE | STATX_UID | STATX_GID ;
2011-03-21 13:58:06 +01:00
2018-12-03 10:14:43 +01:00
if ( perm_mask & READ_ONCE ( fi - > inval_mask ) | |
time_before64 ( fi - > i_time , get_jiffies_64 ( ) ) ) {
2011-03-21 13:58:06 +01:00
refreshed = true ;
2011-06-20 19:28:19 -04:00
err = fuse_perm_getattr ( inode , mask ) ;
2011-03-21 13:58:06 +01:00
if ( err )
return err ;
}
2007-10-16 23:31:02 -07:00
}
2016-10-01 07:32:32 +02:00
if ( fc - > default_permissions ) {
2023-01-13 12:49:22 +01:00
err = generic_permission ( & nop_mnt_idmap , inode , mask ) ;
2005-09-09 13:10:31 -07:00
/* If permission is denied, try to refresh file
attributes . This is also needed , because the root
node will at first have no permissions */
2007-10-16 23:31:02 -07:00
if ( err = = - EACCES & & ! refreshed ) {
2011-06-20 19:28:19 -04:00
err = fuse_perm_getattr ( inode , mask ) ;
2005-09-09 13:10:31 -07:00
if ( ! err )
2023-01-13 12:49:22 +01:00
err = generic_permission ( & nop_mnt_idmap ,
2021-01-21 14:19:24 +01:00
inode , mask ) ;
2005-09-09 13:10:31 -07:00
}
2006-01-06 00:19:39 -08:00
/* Note: the opposite of the above test does not
exist . So if permissions are revoked this won ' t be
noticed immediately , only after the attribute
timeout has expired */
2010-07-23 11:43:51 -04:00
} else if ( mask & ( MAY_ACCESS | MAY_CHDIR ) ) {
2007-10-16 23:31:06 -07:00
err = fuse_access ( inode , mask ) ;
} else if ( ( mask & MAY_EXEC ) & & S_ISREG ( inode - > i_mode ) ) {
if ( ! ( inode - > i_mode & S_IXUGO ) ) {
if ( refreshed )
return - EACCES ;
2011-06-20 19:28:19 -04:00
err = fuse_perm_getattr ( inode , mask ) ;
2007-10-16 23:31:06 -07:00
if ( ! err & & ! ( inode - > i_mode & S_IXUGO ) )
return - EACCES ;
}
2005-09-09 13:10:28 -07:00
}
2007-10-16 23:31:02 -07:00
return err ;
2005-09-09 13:10:28 -07:00
}
2018-10-11 08:17:00 -07:00
static int fuse_readlink_page ( struct inode * inode , struct page * page )
2005-09-09 13:10:28 -07:00
{
2020-05-06 17:44:12 +02:00
struct fuse_mount * fm = get_fuse_mount ( inode ) ;
2019-09-10 15:04:09 +02:00
struct fuse_page_desc desc = { . length = PAGE_SIZE - 1 } ;
struct fuse_args_pages ap = {
. num_pages = 1 ,
. pages = & page ,
. descs = & desc ,
} ;
char * link ;
ssize_t res ;
ap . args . opcode = FUSE_READLINK ;
ap . args . nodeid = get_node_id ( inode ) ;
ap . args . out_pages = true ;
ap . args . out_argvar = true ;
ap . args . page_zeroing = true ;
ap . args . out_numargs = 1 ;
ap . args . out_args [ 0 ] . size = desc . length ;
2020-05-06 17:44:12 +02:00
res = fuse_simple_request ( fm , & ap . args ) ;
2005-09-09 13:10:28 -07:00
2019-09-10 15:04:09 +02:00
fuse_invalidate_atime ( inode ) ;
2015-11-17 10:20:54 -05:00
2019-09-10 15:04:09 +02:00
if ( res < 0 )
return res ;
2014-12-12 09:49:05 +01:00
2019-09-10 15:04:09 +02:00
if ( WARN_ON ( res > = PAGE_SIZE ) )
return - EIO ;
2018-10-11 08:17:00 -07:00
2019-09-10 15:04:09 +02:00
link = page_address ( page ) ;
link [ res ] = ' \0 ' ;
2018-10-11 08:17:00 -07:00
2019-09-10 15:04:09 +02:00
return 0 ;
2018-10-11 08:17:00 -07:00
}
static const char * fuse_get_link ( struct dentry * dentry , struct inode * inode ,
struct delayed_call * callback )
{
struct fuse_conn * fc = get_fuse_conn ( inode ) ;
struct page * page ;
int err ;
err = - EIO ;
2020-12-10 15:33:14 +01:00
if ( fuse_is_bad ( inode ) )
2018-10-11 08:17:00 -07:00
goto out_err ;
if ( fc - > cache_symlinks )
return page_get_link ( dentry , inode , callback ) ;
err = - ECHILD ;
if ( ! dentry )
goto out_err ;
page = alloc_page ( GFP_KERNEL ) ;
err = - ENOMEM ;
if ( ! page )
goto out_err ;
err = fuse_readlink_page ( inode , page ) ;
if ( err ) {
__free_page ( page ) ;
goto out_err ;
}
set_delayed_call ( callback , page_put_link , page ) ;
return page_address ( page ) ;
out_err :
return ERR_PTR ( err ) ;
2005-09-09 13:10:28 -07:00
}
static int fuse_dir_open ( struct inode * inode , struct file * file )
{
2009-04-28 16:56:37 +02:00
return fuse_open_common ( inode , file , true ) ;
2005-09-09 13:10:28 -07:00
}
static int fuse_dir_release ( struct inode * inode , struct file * file )
{
2018-12-10 10:54:52 -08:00
fuse_release_common ( file , true ) ;
2009-04-28 16:56:39 +02:00
return 0 ;
2005-09-09 13:10:28 -07:00
}
2011-07-16 20:44:56 -04:00
/*
 * fsync for directories: issues FUSE_FSYNCDIR through fuse_fsync_common()
 * while holding the inode lock.
 *
 * Returns -EIO for a bad inode and 0 when the connection has no_fsyncdir
 * set.  An -ENOSYS result sets no_fsyncdir and is reported as success.
 */
static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end,
			  int datasync)
{
	struct inode *dir = file->f_mapping->host;
	struct fuse_conn *conn = get_fuse_conn(dir);
	int ret;

	if (fuse_is_bad(dir))
		return -EIO;

	if (conn->no_fsyncdir)
		return 0;

	inode_lock(dir);

	ret = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNCDIR);
	if (ret == -ENOSYS) {
		/* Record the answer so later calls return early. */
		conn->no_fsyncdir = 1;
		ret = 0;
	}

	inode_unlock(dir);

	return ret;
}
2011-12-13 11:58:49 +01:00
static long fuse_dir_ioctl ( struct file * file , unsigned int cmd ,
unsigned long arg )
{
struct fuse_conn * fc = get_fuse_conn ( file - > f_mapping - > host ) ;
/* FUSE_IOCTL_DIR only supported for API version >= 7.18 */
if ( fc - > minor < 18 )
return - ENOTTY ;
return fuse_ioctl_common ( file , cmd , arg , FUSE_IOCTL_DIR ) ;
}
static long fuse_dir_compat_ioctl ( struct file * file , unsigned int cmd ,
unsigned long arg )
{
struct fuse_conn * fc = get_fuse_conn ( file - > f_mapping - > host ) ;
if ( fc - > minor < 18 )
return - ENOTTY ;
return fuse_ioctl_common ( file , cmd , arg ,
FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR ) ;
}
2013-12-26 19:51:11 +04:00
/*
 * Decide whether mtime should be updated for the given ia_valid bits.
 *
 * mtime is always updated when it is explicitly set (ATTR_MTIME_SET) or
 * when the kernel's i_mtime is the official one (@trust_local_mtime).
 * Otherwise it is updated in every case except an open(O_TRUNC) or an
 * ftruncate(), i.e. ATTR_SIZE combined with ATTR_OPEN or ATTR_FILE.
 */
static bool update_mtime(unsigned ivalid, bool trust_local_mtime)
{
	const bool truncating = (ivalid & ATTR_SIZE) &&
				(ivalid & (ATTR_OPEN | ATTR_FILE));

	return (ivalid & ATTR_MTIME_SET) || trust_local_mtime || !truncating;
}
2018-02-21 11:18:07 -06:00
/*
 * Translate the attributes flagged in @iattr->ia_valid into @arg, setting
 * the matching FATTR_* bit in @arg->valid for each one copied.
 *
 * uid and gid are converted with from_kuid()/from_kgid() using the
 * connection's user namespace.  @trust_local_cmtime influences whether
 * mtime is sent (see update_mtime()), suppresses FATTR_MTIME_NOW, and is
 * required for ctime to be sent at all.
 */
static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr,
			   struct fuse_setattr_in *arg, bool trust_local_cmtime)
{
	unsigned ivalid = iattr->ia_valid;

	if (ivalid & ATTR_MODE) {
		arg->valid |= FATTR_MODE;
		arg->mode = iattr->ia_mode;
	}

	if (ivalid & ATTR_UID) {
		arg->valid |= FATTR_UID;
		arg->uid = from_kuid(fc->user_ns, iattr->ia_uid);
	}

	if (ivalid & ATTR_GID) {
		arg->valid |= FATTR_GID;
		arg->gid = from_kgid(fc->user_ns, iattr->ia_gid);
	}

	if (ivalid & ATTR_SIZE) {
		arg->valid |= FATTR_SIZE;
		arg->size = iattr->ia_size;
	}

	if (ivalid & ATTR_ATIME) {
		arg->valid |= FATTR_ATIME;
		arg->atime = iattr->ia_atime.tv_sec;
		arg->atimensec = iattr->ia_atime.tv_nsec;
		/* Without an explicit time, also flag it as "now". */
		if (!(ivalid & ATTR_ATIME_SET))
			arg->valid |= FATTR_ATIME_NOW;
	}

	if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) {
		arg->valid |= FATTR_MTIME;
		arg->mtime = iattr->ia_mtime.tv_sec;
		arg->mtimensec = iattr->ia_mtime.tv_nsec;
		/* "now" is only flagged when the local mtime is not trusted. */
		if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime)
			arg->valid |= FATTR_MTIME_NOW;
	}

	if ((ivalid & ATTR_CTIME) && trust_local_cmtime) {
		arg->valid |= FATTR_CTIME;
		arg->ctime = iattr->ia_ctime.tv_sec;
		arg->ctimensec = iattr->ia_ctime.tv_nsec;
	}
}
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
/*
* Prevent concurrent writepages on inode
*
* This is done by adding a negative bias to the inode write counter
* and waiting for all pending writes to finish .
*/
void fuse_set_nowrite ( struct inode * inode )
{
struct fuse_inode * fi = get_fuse_inode ( inode ) ;
2016-01-22 15:40:57 -05:00
BUG_ON ( ! inode_is_locked ( inode ) ) ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safetly measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
2018-11-09 13:33:22 +03:00
spin_lock ( & fi - > lock ) ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safetly measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
BUG_ON ( fi - > writectr < 0 ) ;
fi - > writectr + = FUSE_NOWRITE ;
2018-11-09 13:33:22 +03:00
spin_unlock ( & fi - > lock ) ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safetly measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
wait_event ( fi - > page_waitq , fi - > writectr = = FUSE_NOWRITE ) ;
}
/*
 * Allow writepages on inode
 *
 * Remove the FUSE_NOWRITE bias from fi->writectr and send any queued
 * writepages via fuse_flush_writepages().
 *
 * fi->writectr must equal FUSE_NOWRITE on entry; any other value trips
 * the BUG_ON() below.  fuse_release_nowrite() calls this helper with
 * fi->lock held.
 */
static void __fuse_release_nowrite ( struct inode * inode )
{
struct fuse_inode * fi = get_fuse_inode ( inode ) ;
BUG_ON ( fi - > writectr ! = FUSE_NOWRITE ) ;
/* The counter is reset first, then the queued writepages are flushed */
fi - > writectr = 0 ;
fuse_flush_writepages ( inode ) ;
}
/*
 * Locked wrapper around __fuse_release_nowrite(): takes fi->lock, lets
 * the helper reset fi->writectr from FUSE_NOWRITE and flush the queued
 * writepages, then drops the lock again.
 */
void fuse_release_nowrite ( struct inode * inode )
{
2018-11-09 13:33:22 +03:00
struct fuse_inode * fi = get_fuse_inode ( inode ) ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safetly measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
2018-11-09 13:33:22 +03:00
spin_lock ( & fi - > lock ) ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safetly measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
__fuse_release_nowrite ( inode ) ;
2018-11-09 13:33:22 +03:00
spin_unlock ( & fi - > lock ) ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safetly measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
}
2014-12-12 09:49:05 +01:00
/*
 * Fill @args for a FUSE_SETATTR request on @inode.
 *
 * The request carries exactly one input argument (*inarg_p) and expects
 * exactly one output argument (*outarg_p); both are passed by pointer with
 * their sizeof() as the size, so the caller must keep them alive until the
 * request has been processed.  @fc is not referenced in this body.
 */
static void fuse_setattr_fill ( struct fuse_conn * fc , struct fuse_args * args ,
2013-12-26 19:51:11 +04:00
struct inode * inode ,
struct fuse_setattr_in * inarg_p ,
struct fuse_attr_out * outarg_p )
{
2019-09-10 15:04:08 +02:00
args - > opcode = FUSE_SETATTR ;
args - > nodeid = get_node_id ( inode ) ;
/* Single input argument: the setattr request structure */
args - > in_numargs = 1 ;
args - > in_args [ 0 ] . size = sizeof ( * inarg_p ) ;
args - > in_args [ 0 ] . value = inarg_p ;
/* Single output argument: the attribute reply structure */
args - > out_numargs = 1 ;
args - > out_args [ 0 ] . size = sizeof ( * outarg_p ) ;
args - > out_args [ 0 ] . value = outarg_p ;
2013-12-26 19:51:11 +04:00
}
/*
 * Flush inode->i_mtime to the server
 *
 * Builds a FUSE_SETATTR request that always carries the inode's mtime
 * (FATTR_MTIME).  When fm->fc->minor is at least 23 the inode's ctime is
 * sent as well (FATTR_CTIME).  If @ff is non-NULL its file handle is
 * attached (FATTR_FH).
 *
 * Returns whatever fuse_simple_request() returns for the request.
 */
2014-04-28 14:19:24 +02:00
int fuse_flush_times ( struct inode * inode , struct fuse_file * ff )
2013-12-26 19:51:11 +04:00
{
2020-05-06 17:44:12 +02:00
struct fuse_mount * fm = get_fuse_mount ( inode ) ;
2014-12-12 09:49:05 +01:00
FUSE_ARGS ( args ) ;
2013-12-26 19:51:11 +04:00
struct fuse_setattr_in inarg ;
struct fuse_attr_out outarg ;
/* Start from zeroed request/reply structures; only selected fields are set */
memset ( & inarg , 0 , sizeof ( inarg ) ) ;
memset ( & outarg , 0 , sizeof ( outarg ) ) ;
2014-04-28 14:19:24 +02:00
inarg . valid = FATTR_MTIME ;
2013-12-26 19:51:11 +04:00
inarg . mtime = inode - > i_mtime . tv_sec ;
inarg . mtimensec = inode - > i_mtime . tv_nsec ;
2020-05-06 17:44:12 +02:00
/* ctime is only included when the connection's minor version is >= 23 */
if ( fm - > fc - > minor > = 23 ) {
2014-04-28 14:19:24 +02:00
inarg . valid | = FATTR_CTIME ;
inarg . ctime = inode - > i_ctime . tv_sec ;
inarg . ctimensec = inode - > i_ctime . tv_nsec ;
}
2014-04-28 14:19:23 +02:00
/* Attach the open file's handle when the caller supplied one */
if ( ff ) {
inarg . valid | = FATTR_FH ;
inarg . fh = ff - > fh ;
}
2020-05-06 17:44:12 +02:00
fuse_setattr_fill ( fm - > fc , & args , inode , & inarg , & outarg ) ;
2013-12-26 19:51:11 +04:00
2020-05-06 17:44:12 +02:00
return fuse_simple_request ( fm , & args ) ;
2013-12-26 19:51:11 +04:00
}
2006-01-06 00:19:39 -08:00
/*
* Set attributes , and at the same time refresh them .
*
* Truncation is slightly complicated , because the ' truncate ' request
* may fail , in which case we don ' t want to touch the mapping .
2006-10-17 00:10:06 -07:00
* vmtruncate ( ) doesn ' t allow for this case , so do the rlimit checking
* and the actual truncation by hand .
2006-01-06 00:19:39 -08:00
*/
2016-05-26 17:12:41 +02:00
int fuse_do_setattr ( struct dentry * dentry , struct iattr * attr ,
2012-12-18 14:05:08 +04:00
struct file * file )
2005-09-09 13:10:29 -07:00
{
2016-05-26 17:12:41 +02:00
struct inode * inode = d_inode ( dentry ) ;
2020-05-06 17:44:12 +02:00
struct fuse_mount * fm = get_fuse_mount ( inode ) ;
struct fuse_conn * fc = fm - > fc ;
fuse: hotfix truncate_pagecache() issue
The way how fuse calls truncate_pagecache() from fuse_change_attributes()
is completely wrong. Because, w/o i_mutex held, we are never sure whether
'oldsize' and 'attr->size' are valid by the time of execution of
truncate_pagecache(inode, oldsize, attr->size). In fact, as soon as we
released fc->lock in the middle of fuse_change_attributes(), we completely
lose control of actions which may happen with given inode until we reach
truncate_pagecache. The list of potentially dangerous actions includes
mmap-ed reads and writes, ftruncate(2) and write(2) extending file size.
The typical outcome of doing truncate_pagecache() with outdated arguments
is data corruption from user point of view. This is (in some sense)
acceptable in cases when the issue is triggered by a change of the file on
the server (i.e. externally wrt fuse operation), but it is absolutely
intolerable in scenarios when a single fuse client modifies a file without
any external intervention. A real life case I discovered by fsx-linux
looked like this:
1. Shrinking ftruncate(2) comes to fuse_do_setattr(). The latter sends
FUSE_SETATTR to the server synchronously, but before getting fc->lock ...
2. fuse_dentry_revalidate() is asynchronously called. It sends FUSE_LOOKUP
to the server synchronously, then calls fuse_change_attributes(). The
latter updates i_size, releases fc->lock, but before comparing oldsize vs
attr->size..
3. fuse_do_setattr() from the first step proceeds by acquiring fc->lock and
updating attributes and i_size, but now oldsize is equal to
outarg.attr.size because i_size has just been updated (step 2). Hence,
fuse_do_setattr() returns w/o calling truncate_pagecache().
4. As soon as ftruncate(2) completes, the user extends file size by
write(2) making a hole in the middle of file, then reads data from the hole
either by read(2) or mmap-ed read. The user expects to get zero data from
the hole, but gets stale data because truncate_pagecache() is not executed
yet.
The scenario above illustrates one side of the problem: not truncating the
page cache even though we should. Another side corresponds to truncating
page cache too late, when the state of inode changed significantly.
Theoretically, the following is possible:
1. As in the previous scenario fuse_dentry_revalidate() discovered that
i_size changed (due to our own fuse_do_setattr()) and is going to call
truncate_pagecache() for some 'new_size' it believes valid right now. But
by the time that particular truncate_pagecache() is called ...
2. fuse_do_setattr() returns (either having called truncate_pagecache() or
not -- it doesn't matter).
3. The file is extended either by write(2) or ftruncate(2) or fallocate(2).
4. mmap-ed write makes a page in the extended region dirty.
The result will be the loss of the data the user wrote in the fourth step.
The patch is a hotfix resolving the issue in a simplistic way: let's skip
dangerous i_size update and truncate_pagecache if an operation changing
file size is in progress. This simplistic approach looks correct for the
cases w/o external changes. And to handle them properly, more sophisticated
and intrusive techniques (e.g. NFS-like one) would be required. I'd like to
postpone it until the issue is well discussed on the mailing list(s).
Changed in v2:
- improved patch description to cover both sides of the issue.
Signed-off-by: Maxim Patlasov <mpatlasov@parallels.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: stable@vger.kernel.org
2013-08-30 17:06:04 +04:00
struct fuse_inode * fi = get_fuse_inode ( inode ) ;
2021-04-21 17:18:39 +02:00
struct address_space * mapping = inode - > i_mapping ;
2014-12-12 09:49:05 +01:00
FUSE_ARGS ( args ) ;
2005-09-09 13:10:29 -07:00
struct fuse_setattr_in inarg ;
struct fuse_attr_out outarg ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
bool is_truncate = false ;
2021-10-22 17:03:03 +02:00
bool is_wb = fc - > writeback_cache & & S_ISREG ( inode - > i_mode ) ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
loff_t oldsize ;
2005-09-09 13:10:29 -07:00
int err ;
2021-10-22 17:03:03 +02:00
bool trust_local_cmtime = is_wb ;
virtiofs: serialize truncate/punch_hole and dax fault path
Currently in fuse we don't seem to have any lock which can serialize fault
path with truncate/punch_hole path. With dax support I need one for
following reasons.
1. Dax requirement
DAX fault code relies on inode size being stable for the duration of
fault and want to serialize with truncate/punch_hole and they explicitly
mention it.
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
/*
* Check whether offset isn't beyond end of file now. Caller is
* supposed to hold locks serializing us with truncate / punch hole so
* this is a reliable test.
*/
max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2. Make sure there are no users of pages being truncated/punch_hole
get_user_pages() might take references to page and then do some DMA
to said pages. Filesystem might truncate those pages without knowing
that a DMA is in progress or some I/O is in progress. So use
dax_layout_busy_page() to make sure there are no such references
and I/O is not in progress on said pages before moving ahead with
truncation.
3. Limitation of kvm page fault error reporting
If we are truncating file on host first and then removing mappings in
guest later (truncate page cache etc), then this could lead to a
problem with KVM. Say a mapping is in place in guest and truncation
happens on host. Now if guest accesses that mapping, then host will
take a fault and kvm will either exit to qemu or spin infinitely.
IOW, before we do truncation on host, we need to make sure that guest
inode does not have any mapping in that region or whole file.
4. virtiofs memory range reclaim
Soon I will introduce the notion of being able to reclaim dax memory
ranges from a fuse dax inode. There also I need to make sure that
no I/O or fault is going on in the reclaimed range and nobody is using
it so that range can be reclaimed without issues.
Currently if we take inode lock, that serializes read/write. But it does
not do anything for faults. So I add another semaphore fuse_inode->i_mmap_sem
for this purpose. It can be used to serialize with faults.
As of now, I am adding taking this semaphore only in dax fault path and
not regular fault path because existing code does not have one. May
be existing code can benefit from it as well to take care of some
races, but that we can fix later if need be. For now, I am just focussing
only on DAX path which is new path.
Also added logic to take fuse_inode->i_mmap_sem in
truncate/punch_hole/open(O_TRUNC) path to make sure file truncation and
fuse dax fault are mutually exclusive and avoid all the above problems.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-08-19 18:19:54 -04:00
bool fault_blocked = false ;
2005-09-09 13:10:29 -07:00
2016-10-01 07:32:32 +02:00
if ( ! fc - > default_permissions )
2010-06-04 11:30:03 +02:00
attr - > ia_valid | = ATTR_FORCE ;
2023-01-13 12:49:11 +01:00
err = setattr_prepare ( & nop_mnt_idmap , dentry , attr ) ;
2010-06-04 11:30:03 +02:00
if ( err )
return err ;
2005-09-09 13:10:31 -07:00
virtiofs: serialize truncate/punch_hole and dax fault path
Currently in fuse we don't seem to have any lock which can serialize fault
path with truncate/punch_hole path. With dax support I need one for
following reasons.
1. Dax requirement
DAX fault code relies on inode size being stable for the duration of
fault and want to serialize with truncate/punch_hole and they explicitly
mention it.
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
/*
* Check whether offset isn't beyond end of file now. Caller is
* supposed to hold locks serializing us with truncate / punch hole so
* this is a reliable test.
*/
max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2. Make sure there are no users of pages being truncated/punch_hole
get_user_pages() might take references to page and then do some DMA
to said pages. Filesystem might truncate those pages without knowing
that a DMA is in progress or some I/O is in progress. So use
dax_layout_busy_page() to make sure there are no such references
and I/O is not in progress on said pages before moving ahead with
truncation.
3. Limitation of kvm page fault error reporting
If we are truncating file on host first and then removing mappings in
guest later (truncate page cache etc), then this could lead to a
problem with KVM. Say a mapping is in place in guest and truncation
happens on host. Now if guest accesses that mapping, then host will
take a fault and kvm will either exit to qemu or spin infinitely.
IOW, before we do truncation on host, we need to make sure that guest
inode does not have any mapping in that region or whole file.
4. virtiofs memory range reclaim
Soon I will introduce the notion of being able to reclaim dax memory
ranges from a fuse dax inode. There also I need to make sure that
no I/O or fault is going on in the reclaimed range and nobody is using
it so that range can be reclaimed without issues.
Currently if we take inode lock, that serializes read/write. But it does
not do anything for faults. So I add another semaphore fuse_inode->i_mmap_sem
for this purpose. It can be used to serialize with faults.
As of now, I am adding taking this semaphore only in dax fault path and
not regular fault path because existing code does not have one. May
be existing code can benefit from it as well to take care of some
races, but that we can fix later if need be. For now, I am just focussing
only on DAX path which is new path.
Also added logic to take fuse_inode->i_mmap_sem in
truncate/punch_hole/open(O_TRUNC) path to make sure file truncation and
fuse dax fault are mutually exclusive and avoid all the above problems.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-08-19 18:19:54 -04:00
if ( attr - > ia_valid & ATTR_SIZE ) {
if ( WARN_ON ( ! S_ISREG ( inode - > i_mode ) ) )
return - EIO ;
is_truncate = true ;
}
if ( FUSE_IS_DAX ( inode ) & & is_truncate ) {
2021-04-21 17:18:39 +02:00
filemap_invalidate_lock ( mapping ) ;
virtiofs: serialize truncate/punch_hole and dax fault path
Currently in fuse we don't seem to have any lock which can serialize fault
path with truncate/punch_hole path. With dax support I need one for
following reasons.
1. Dax requirement
DAX fault code relies on inode size being stable for the duration of
fault and want to serialize with truncate/punch_hole and they explicitly
mention it.
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
/*
* Check whether offset isn't beyond end of file now. Caller is
* supposed to hold locks serializing us with truncate / punch hole so
* this is a reliable test.
*/
max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2. Make sure there are no users of pages being truncated/punch_hole
get_user_pages() might take references to page and then do some DMA
to said pages. Filesystem might truncate those pages without knowing
that a DMA is in progress or some I/O is in progress. So use
dax_layout_busy_page() to make sure there are no such references
and I/O is not in progress on said pages before moving ahead with
truncation.
3. Limitation of kvm page fault error reporting
If we are truncating file on host first and then removing mappings in
guest later (truncate page cache etc), then this could lead to a
problem with KVM. Say a mapping is in place in guest and truncation
happens on host. Now if guest accesses that mapping, then host will
take a fault and kvm will either exit to qemu or spin infinitely.
IOW, before we do truncation on host, we need to make sure that guest
inode does not have any mapping in that region or whole file.
4. virtiofs memory range reclaim
Soon I will introduce the notion of being able to reclaim dax memory
ranges from a fuse dax inode. There also I need to make sure that
no I/O or fault is going on in the reclaimed range and nobody is using
it so that range can be reclaimed without issues.
Currently if we take inode lock, that serializes read/write. But it does
not do anything for faults. So I add another semaphore fuse_inode->i_mmap_sem
for this purpose. It can be used to serialize with faults.
As of now, I am adding taking this semaphore only in dax fault path and
not regular fault path because existing code does not have one. May
be existing code can benefit from it as well to take care of some
races, but that we can fix later if need be. For now, I am just focussing
only on DAX path which is new path.
Also added logic to take fuse_inode->i_mmap_sem in
truncate/punch_hole/open(O_TRUNC) path to make sure file truncation and
fuse dax fault are mutually exclusive and avoid all the above problems.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-08-19 18:19:54 -04:00
fault_blocked = true ;
err = fuse_dax_break_layouts ( inode , 0 , 0 ) ;
if ( err ) {
2021-04-21 17:18:39 +02:00
filemap_invalidate_unlock ( mapping ) ;
virtiofs: serialize truncate/punch_hole and dax fault path
Currently in fuse we don't seem to have any lock which can serialize fault
path with truncate/punch_hole path. With dax support I need one for
following reasons.
1. Dax requirement
DAX fault code relies on inode size being stable for the duration of
fault and want to serialize with truncate/punch_hole and they explicitly
mention it.
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
/*
* Check whether offset isn't beyond end of file now. Caller is
* supposed to hold locks serializing us with truncate / punch hole so
* this is a reliable test.
*/
max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2. Make sure there are no users of pages being truncated/punch_hole
get_user_pages() might take references to page and then do some DMA
to said pages. Filesystem might truncate those pages without knowing
that a DMA is in progress or some I/O is in progress. So use
dax_layout_busy_page() to make sure there are no such references
and I/O is not in progress on said pages before moving ahead with
truncation.
3. Limitation of kvm page fault error reporting
If we are truncating file on host first and then removing mappings in
guest later (truncate page cache etc), then this could lead to a
problem with KVM. Say a mapping is in place in guest and truncation
happens on host. Now if guest accesses that mapping, then host will
take a fault and kvm will either exit to qemu or spin infinitely.
IOW, before we do truncation on host, we need to make sure that guest
inode does not have any mapping in that region or whole file.
4. virtiofs memory range reclaim
Soon I will introduce the notion of being able to reclaim dax memory
ranges from a fuse dax inode. There also I need to make sure that
no I/O or fault is going on in the reclaimed range and nobody is using
it so that range can be reclaimed without issues.
Currently if we take inode lock, that serializes read/write. But it does
not do anything for faults. So I add another semaphore fuse_inode->i_mmap_sem
for this purpose. It can be used to serialize with faults.
As of now, I am adding taking this semaphore only in dax fault path and
not regular fault path because existing code does not have one. May
be existing code can benefit from it as well to take care of some
races, but that we can fix later if need be. For now, I am just focussing
only on DAX path which is new path.
Also added logic to take fuse_inode->i_mmap_sem in
truncate/punch_hole/open(O_TRUNC) path to make sure file truncation and
fuse dax fault are mutually exclusive and avoid all the above problems.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-08-19 18:19:54 -04:00
return err ;
}
}
2011-02-25 14:44:58 +01:00
if ( attr - > ia_valid & ATTR_OPEN ) {
2018-02-08 15:17:38 +01:00
/* This is coming from open(..., ... | O_TRUNC); */
WARN_ON ( ! ( attr - > ia_valid & ATTR_SIZE ) ) ;
WARN_ON ( attr - > ia_size ! = 0 ) ;
if ( fc - > atomic_o_trunc ) {
/*
* No need to send request to userspace , since actual
* truncation has already been done by OPEN . But still
* need to truncate page cache .
*/
i_size_write ( inode , 0 ) ;
truncate_pagecache ( inode , 0 ) ;
virtiofs: serialize truncate/punch_hole and dax fault path
Currently in fuse we don't seem to have any lock which can serialize fault
path with truncate/punch_hole path. With dax support I need one for
following reasons.
1. Dax requirement
DAX fault code relies on inode size being stable for the duration of
fault and want to serialize with truncate/punch_hole and they explicitly
mention it.
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
/*
* Check whether offset isn't beyond end of file now. Caller is
* supposed to hold locks serializing us with truncate / punch hole so
* this is a reliable test.
*/
max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2. Make sure there are no users of pages being truncated/punch_hole
get_user_pages() might take references to page and then do some DMA
to said pages. Filesystem might truncate those pages without knowing
that a DMA is in progress or some I/O is in progress. So use
dax_layout_busy_page() to make sure there are no such references
and I/O is not in progress on said pages before moving ahead with
truncation.
3. Limitation of kvm page fault error reporting
If we are truncating file on host first and then removing mappings in
guest later (truncate page cache etc), then this could lead to a
problem with KVM. Say a mapping is in place in guest and truncation
happens on host. Now if guest accesses that mapping, then host will
take a fault and kvm will either exit to qemu or spin infinitely.
IOW, before we do truncation on host, we need to make sure that guest
inode does not have any mapping in that region or whole file.
4. virtiofs memory range reclaim
Soon I will introduce the notion of being able to reclaim dax memory
ranges from a fuse dax inode. There also I need to make sure that
no I/O or fault is going on in the reclaimed range and nobody is using
it so that range can be reclaimed without issues.
Currently if we take inode lock, that serializes read/write. But it does
not do anything for faults. So I add another semaphore fuse_inode->i_mmap_sem
for this purpose. It can be used to serialize with faults.
As of now, I am adding taking this semaphore only in dax fault path and
not regular fault path because existing code does not have one. May
be existing code can benefit from it as well to take care of some
races, but that we can fix later if need be. For now, I am just focussing
only on DAX path which is new path.
Also added logic to take fuse_inode->i_mmap_sem in
truncate/punch_hole/open(O_TRUNC) path to make sure file truncation and
fuse dax fault are mutually exclusive and avoid all the above problems.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-08-19 18:19:54 -04:00
goto out ;
2018-02-08 15:17:38 +01:00
}
2011-02-25 14:44:58 +01:00
file = NULL ;
}
2007-10-18 03:07:02 -07:00
2019-10-23 14:26:37 +02:00
/* Flush dirty data/metadata before non-truncate SETATTR */
2021-10-22 17:03:03 +02:00
if ( is_wb & &
2019-10-23 14:26:37 +02:00
attr - > ia_valid &
( ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_MTIME_SET |
ATTR_TIMES_SET ) ) {
err = write_inode_now ( inode , true ) ;
if ( err )
return err ;
fuse_set_nowrite ( inode ) ;
fuse_release_nowrite ( inode ) ;
}
fuse: hotfix truncate_pagecache() issue
The way how fuse calls truncate_pagecache() from fuse_change_attributes()
is completely wrong. Because, w/o i_mutex held, we are never sure whether
'oldsize' and 'attr->size' are valid by the time of execution of
truncate_pagecache(inode, oldsize, attr->size). In fact, as soon as we
released fc->lock in the middle of fuse_change_attributes(), we completely
lose control of actions which may happen with given inode until we reach
truncate_pagecache. The list of potentially dangerous actions includes
mmap-ed reads and writes, ftruncate(2) and write(2) extending file size.
The typical outcome of doing truncate_pagecache() with outdated arguments
is data corruption from user point of view. This is (in some sense)
acceptable in cases when the issue is triggered by a change of the file on
the server (i.e. externally wrt fuse operation), but it is absolutely
intolerable in scenarios when a single fuse client modifies a file without
any external intervention. A real life case I discovered by fsx-linux
looked like this:
1. Shrinking ftruncate(2) comes to fuse_do_setattr(). The latter sends
FUSE_SETATTR to the server synchronously, but before getting fc->lock ...
2. fuse_dentry_revalidate() is asynchronously called. It sends FUSE_LOOKUP
to the server synchronously, then calls fuse_change_attributes(). The
latter updates i_size, releases fc->lock, but before comparing oldsize vs
attr->size..
3. fuse_do_setattr() from the first step proceeds by acquiring fc->lock and
updating attributes and i_size, but now oldsize is equal to
outarg.attr.size because i_size has just been updated (step 2). Hence,
fuse_do_setattr() returns w/o calling truncate_pagecache().
4. As soon as ftruncate(2) completes, the user extends file size by
write(2) making a hole in the middle of file, then reads data from the hole
either by read(2) or mmap-ed read. The user expects to get zero data from
the hole, but gets stale data because truncate_pagecache() is not executed
yet.
The scenario above illustrates one side of the problem: not truncating the
page cache even though we should. Another side corresponds to truncating
page cache too late, when the state of inode changed significantly.
Theoretically, the following is possible:
1. As in the previous scenario fuse_dentry_revalidate() discovered that
i_size changed (due to our own fuse_do_setattr()) and is going to call
truncate_pagecache() for some 'new_size' it believes valid right now. But
by the time that particular truncate_pagecache() is called ...
2. fuse_do_setattr() returns (either having called truncate_pagecache() or
not -- it doesn't matter).
3. The file is extended either by write(2) or ftruncate(2) or fallocate(2).
4. mmap-ed write makes a page in the extended region dirty.
The result will be the loss of the data the user wrote in the fourth step.
The patch is a hotfix resolving the issue in a simplistic way: let's skip
dangerous i_size update and truncate_pagecache if an operation changing
file size is in progress. This simplistic approach looks correct for the
cases w/o external changes. And to handle them properly, more sophisticated
and intrusive techniques (e.g. NFS-like one) would be required. I'd like to
postpone it until the issue is well discussed on the mailing list(s).
Changed in v2:
- improved patch description to cover both sides of the issue.
Signed-off-by: Maxim Patlasov <mpatlasov@parallels.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: stable@vger.kernel.org
2013-08-30 17:06:04 +04:00
if ( is_truncate ) {
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
fuse_set_nowrite ( inode ) ;
fuse: hotfix truncate_pagecache() issue
The way how fuse calls truncate_pagecache() from fuse_change_attributes()
is completely wrong. Because, w/o i_mutex held, we are never sure whether
'oldsize' and 'attr->size' are valid by the time of execution of
truncate_pagecache(inode, oldsize, attr->size). In fact, as soon as we
released fc->lock in the middle of fuse_change_attributes(), we completely
lose control of actions which may happen with given inode until we reach
truncate_pagecache. The list of potentially dangerous actions includes
mmap-ed reads and writes, ftruncate(2) and write(2) extending file size.
The typical outcome of doing truncate_pagecache() with outdated arguments
is data corruption from user point of view. This is (in some sense)
acceptable in cases when the issue is triggered by a change of the file on
the server (i.e. externally wrt fuse operation), but it is absolutely
intolerable in scenarios when a single fuse client modifies a file without
any external intervention. A real life case I discovered by fsx-linux
looked like this:
1. Shrinking ftruncate(2) comes to fuse_do_setattr(). The latter sends
FUSE_SETATTR to the server synchronously, but before getting fc->lock ...
2. fuse_dentry_revalidate() is asynchronously called. It sends FUSE_LOOKUP
to the server synchronously, then calls fuse_change_attributes(). The
latter updates i_size, releases fc->lock, but before comparing oldsize vs
attr->size..
3. fuse_do_setattr() from the first step proceeds by acquiring fc->lock and
updating attributes and i_size, but now oldsize is equal to
outarg.attr.size because i_size has just been updated (step 2). Hence,
fuse_do_setattr() returns w/o calling truncate_pagecache().
4. As soon as ftruncate(2) completes, the user extends file size by
write(2) making a hole in the middle of file, then reads data from the hole
either by read(2) or mmap-ed read. The user expects to get zero data from
the hole, but gets stale data because truncate_pagecache() is not executed
yet.
The scenario above illustrates one side of the problem: not truncating the
page cache even though we should. Another side corresponds to truncating
page cache too late, when the state of inode changed significantly.
Theoretically, the following is possible:
1. As in the previous scenario fuse_dentry_revalidate() discovered that
i_size changed (due to our own fuse_do_setattr()) and is going to call
truncate_pagecache() for some 'new_size' it believes valid right now. But
by the time that particular truncate_pagecache() is called ...
2. fuse_do_setattr() returns (either having called truncate_pagecache() or
not -- it doesn't matter).
3. The file is extended either by write(2) or ftruncate(2) or fallocate(2).
4. mmap-ed write makes a page in the extended region dirty.
The result will be the loss of the data the user wrote in the fourth step.
The patch is a hotfix resolving the issue in a simplistic way: let's skip
dangerous i_size update and truncate_pagecache if an operation changing
file size is in progress. This simplistic approach looks correct for the
cases w/o external changes. And to handle them properly, more sophisticated
and intrusive techniques (e.g. NFS-like one) would be required. I'd like to
postpone it until the issue is well discussed on the mailing list(s).
Changed in v2:
- improved patch description to cover both sides of the issue.
Signed-off-by: Maxim Patlasov <mpatlasov@parallels.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: stable@vger.kernel.org
2013-08-30 17:06:04 +04:00
set_bit ( FUSE_I_SIZE_UNSTABLE , & fi - > state ) ;
2014-04-28 14:19:25 +02:00
if ( trust_local_cmtime & & attr - > ia_size ! = inode - > i_size )
attr - > ia_valid | = ATTR_MTIME | ATTR_CTIME ;
fuse: hotfix truncate_pagecache() issue
The way how fuse calls truncate_pagecache() from fuse_change_attributes()
is completely wrong. Because, w/o i_mutex held, we are never sure whether
'oldsize' and 'attr->size' are valid by the time of execution of
truncate_pagecache(inode, oldsize, attr->size). In fact, as soon as we
released fc->lock in the middle of fuse_change_attributes(), we completely
lose control of actions which may happen with given inode until we reach
truncate_pagecache. The list of potentially dangerous actions includes
mmap-ed reads and writes, ftruncate(2) and write(2) extending file size.
The typical outcome of doing truncate_pagecache() with outdated arguments
is data corruption from user point of view. This is (in some sense)
acceptable in cases when the issue is triggered by a change of the file on
the server (i.e. externally wrt fuse operation), but it is absolutely
intolerable in scenarios when a single fuse client modifies a file without
any external intervention. A real life case I discovered by fsx-linux
looked like this:
1. Shrinking ftruncate(2) comes to fuse_do_setattr(). The latter sends
FUSE_SETATTR to the server synchronously, but before getting fc->lock ...
2. fuse_dentry_revalidate() is asynchronously called. It sends FUSE_LOOKUP
to the server synchronously, then calls fuse_change_attributes(). The
latter updates i_size, releases fc->lock, but before comparing oldsize vs
attr->size..
3. fuse_do_setattr() from the first step proceeds by acquiring fc->lock and
updating attributes and i_size, but now oldsize is equal to
outarg.attr.size because i_size has just been updated (step 2). Hence,
fuse_do_setattr() returns w/o calling truncate_pagecache().
4. As soon as ftruncate(2) completes, the user extends file size by
write(2) making a hole in the middle of file, then reads data from the hole
either by read(2) or mmap-ed read. The user expects to get zero data from
the hole, but gets stale data because truncate_pagecache() is not executed
yet.
The scenario above illustrates one side of the problem: not truncating the
page cache even though we should. Another side corresponds to truncating
page cache too late, when the state of inode changed significantly.
Theoretically, the following is possible:
1. As in the previous scenario fuse_dentry_revalidate() discovered that
i_size changed (due to our own fuse_do_setattr()) and is going to call
truncate_pagecache() for some 'new_size' it believes valid right now. But
by the time that particular truncate_pagecache() is called ...
2. fuse_do_setattr() returns (either having called truncate_pagecache() or
not -- it doesn't matter).
3. The file is extended either by write(2) or ftruncate(2) or fallocate(2).
4. mmap-ed write makes a page in the extended region dirty.
The result will be the loss of the data the user wrote in the fourth step.
The patch is a hotfix resolving the issue in a simplistic way: let's skip
dangerous i_size update and truncate_pagecache if an operation changing
file size is in progress. This simplistic approach looks correct for the
cases w/o external changes. And to handle them properly, more sophisticated
and intrusive techniques (e.g. NFS-like one) would be required. I'd like to
postpone it until the issue is well discussed on the mailing list(s).
Changed in v2:
- improved patch description to cover both sides of the issue.
Signed-off-by: Maxim Patlasov <mpatlasov@parallels.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: stable@vger.kernel.org
2013-08-30 17:06:04 +04:00
}
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
2005-09-09 13:10:29 -07:00
memset ( & inarg , 0 , sizeof ( inarg ) ) ;
2007-10-18 03:07:05 -07:00
memset ( & outarg , 0 , sizeof ( outarg ) ) ;
2018-02-21 11:18:07 -06:00
iattr_to_fattr ( fc , attr , & inarg , trust_local_cmtime ) ;
2007-10-18 03:07:00 -07:00
if ( file ) {
struct fuse_file * ff = file - > private_data ;
inarg . valid | = FATTR_FH ;
inarg . fh = ff - > fh ;
}
2020-10-09 14:15:09 -04:00
/* Kill suid/sgid for non-directory chown unconditionally */
if ( fc - > handle_killpriv_v2 & & ! S_ISDIR ( inode - > i_mode ) & &
attr - > ia_valid & ( ATTR_UID | ATTR_GID ) )
inarg . valid | = FATTR_KILL_SUIDGID ;
2007-10-18 03:07:04 -07:00
if ( attr - > ia_valid & ATTR_SIZE ) {
/* For mandatory locking in truncate */
inarg . valid | = FATTR_LOCKOWNER ;
inarg . lock_owner = fuse_lock_owner_id ( fc , current - > files ) ;
2020-10-09 14:15:09 -04:00
/* Kill suid/sgid for truncate only if no CAP_FSETID */
if ( fc - > handle_killpriv_v2 & & ! capable ( CAP_FSETID ) )
inarg . valid | = FATTR_KILL_SUIDGID ;
2007-10-18 03:07:04 -07:00
}
2014-12-12 09:49:05 +01:00
fuse_setattr_fill ( fc , & args , inode , & inarg , & outarg ) ;
2020-05-06 17:44:12 +02:00
err = fuse_simple_request ( fm , & args ) ;
2007-10-16 23:31:01 -07:00
if ( err ) {
if ( err = = - EINTR )
fuse_invalidate_attr ( inode ) ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
goto error ;
2007-10-16 23:31:01 -07:00
}
2005-09-09 13:10:29 -07:00
2019-11-12 11:49:04 +01:00
if ( fuse_invalid_attr ( & outarg . attr ) | |
2021-03-01 20:37:10 -05:00
inode_wrong_type ( inode , outarg . attr . mode ) ) {
2020-12-10 15:33:14 +01:00
fuse_make_bad ( inode ) ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
err = - EIO ;
goto error ;
}
2018-11-09 13:33:22 +03:00
spin_lock ( & fi - > lock ) ;
2013-12-26 19:51:11 +04:00
/* the kernel maintains i_mtime locally */
2014-04-28 14:19:25 +02:00
if ( trust_local_cmtime ) {
if ( attr - > ia_valid & ATTR_MTIME )
inode - > i_mtime = attr - > ia_mtime ;
if ( attr - > ia_valid & ATTR_CTIME )
inode - > i_ctime = attr - > ia_ctime ;
2014-04-28 14:19:23 +02:00
/* FIXME: clear I_DIRTY_SYNC? */
2013-12-26 19:51:11 +04:00
}
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
fuse_change_attributes_common ( inode , & outarg . attr ,
2021-10-22 17:03:03 +02:00
attr_timeout ( & outarg ) ,
fuse_get_cache_mask ( inode ) ) ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
oldsize = inode - > i_size ;
2013-10-10 17:10:46 +04:00
/* see the comment in fuse_change_attributes() */
2021-10-22 17:03:03 +02:00
if ( ! is_wb | | is_truncate )
2013-10-10 17:10:46 +04:00
i_size_write ( inode , outarg . attr . size ) ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
if ( is_truncate ) {
2018-11-09 13:33:22 +03:00
/* NOTE: this may release/reacquire fi->lock */
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
__fuse_release_nowrite ( inode ) ;
}
2018-11-09 13:33:22 +03:00
spin_unlock ( & fi - > lock ) ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
/*
* Only call invalidate_inode_pages2 ( ) after removing
2022-02-09 20:21:56 +00:00
* FUSE_NOWRITE , otherwise fuse_launder_folio ( ) would deadlock .
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
*/
2013-10-10 17:10:46 +04:00
if ( ( is_truncate | | ! is_wb ) & &
S_ISREG ( inode - > i_mode ) & & oldsize ! = outarg . attr . size ) {
2013-09-12 15:13:56 -07:00
truncate_pagecache ( inode , outarg . attr . size ) ;
2021-04-21 17:18:39 +02:00
invalidate_inode_pages2 ( mapping ) ;
2007-10-16 23:31:01 -07:00
}
fuse: hotfix truncate_pagecache() issue
The way fuse calls truncate_pagecache() from fuse_change_attributes()
is completely wrong. Because, w/o i_mutex held, we are never sure whether
'oldsize' and 'attr->size' are valid by the time of execution of
truncate_pagecache(inode, oldsize, attr->size). In fact, as soon as we
released fc->lock in the middle of fuse_change_attributes(), we completely
lose control of actions which may happen with given inode until we reach
truncate_pagecache. The list of potentially dangerous actions includes
mmap-ed reads and writes, ftruncate(2) and write(2) extending file size.
The typical outcome of doing truncate_pagecache() with outdated arguments
is data corruption from user point of view. This is (in some sense)
acceptable in cases when the issue is triggered by a change of the file on
the server (i.e. externally wrt fuse operation), but it is absolutely
intolerable in scenarios when a single fuse client modifies a file without
any external intervention. A real life case I discovered by fsx-linux
looked like this:
1. Shrinking ftruncate(2) comes to fuse_do_setattr(). The latter sends
FUSE_SETATTR to the server synchronously, but before getting fc->lock ...
2. fuse_dentry_revalidate() is asynchronously called. It sends FUSE_LOOKUP
to the server synchronously, then calls fuse_change_attributes(). The
latter updates i_size, releases fc->lock, but before comparing oldsize vs
attr->size..
3. fuse_do_setattr() from the first step proceeds by acquiring fc->lock and
updating attributes and i_size, but now oldsize is equal to
outarg.attr.size because i_size has just been updated (step 2). Hence,
fuse_do_setattr() returns w/o calling truncate_pagecache().
4. As soon as ftruncate(2) completes, the user extends file size by
write(2) making a hole in the middle of file, then reads data from the hole
either by read(2) or mmap-ed read. The user expects to get zero data from
the hole, but gets stale data because truncate_pagecache() is not executed
yet.
The scenario above illustrates one side of the problem: not truncating the
page cache even though we should. Another side corresponds to truncating
page cache too late, when the state of inode changed significantly.
Theoretically, the following is possible:
1. As in the previous scenario fuse_dentry_revalidate() discovered that
i_size changed (due to our own fuse_do_setattr()) and is going to call
truncate_pagecache() for some 'new_size' it believes valid right now. But
by the time that particular truncate_pagecache() is called ...
2. fuse_do_setattr() returns (either having called truncate_pagecache() or
not -- it doesn't matter).
3. The file is extended either by write(2) or ftruncate(2) or fallocate(2).
4. mmap-ed write makes a page in the extended region dirty.
The result will be the loss of data the user wrote in the fourth step.
The patch is a hotfix resolving the issue in a simplistic way: let's skip
dangerous i_size update and truncate_pagecache if an operation changing
file size is in progress. This simplistic approach looks correct for the
cases w/o external changes. And to handle them properly, more sophisticated
and intrusive techniques (e.g. NFS-like one) would be required. I'd like to
postpone it until the issue is well discussed on the mailing list(s).
Changed in v2:
- improved patch description to cover both sides of the issue.
Signed-off-by: Maxim Patlasov <mpatlasov@parallels.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: stable@vger.kernel.org
2013-08-30 17:06:04 +04:00
clear_bit ( FUSE_I_SIZE_UNSTABLE , & fi - > state ) ;
virtiofs: serialize truncate/punch_hole and dax fault path
Currently in fuse we don't seem to have any lock which can serialize fault
path with truncate/punch_hole path. With dax support I need one for
following reasons.
1. Dax requirement
DAX fault code relies on inode size being stable for the duration of
fault and want to serialize with truncate/punch_hole and they explicitly
mention it.
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
/*
* Check whether offset isn't beyond end of file now. Caller is
* supposed to hold locks serializing us with truncate / punch hole so
* this is a reliable test.
*/
max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2. Make sure there are no users of pages being truncated/punch_hole
get_user_pages() might take references to page and then do some DMA
to said pages. Filesystem might truncate those pages without knowing
that a DMA is in progress or some I/O is in progress. So use
dax_layout_busy_page() to make sure there are no such references
and I/O is not in progress on said pages before moving ahead with
truncation.
3. Limitation of kvm page fault error reporting
If we are truncating file on host first and then removing mappings in
guest later (truncate page cache etc), then this could lead to a
problem with KVM. Say a mapping is in place in guest and truncation
happens on host. Now if guest accesses that mapping, then host will
take a fault and kvm will either exit to qemu or spin infinitely.
IOW, before we do truncation on host, we need to make sure that guest
inode does not have any mapping in that region or whole file.
4. virtiofs memory range reclaim
Soon I will introduce the notion of being able to reclaim dax memory
ranges from a fuse dax inode. There also I need to make sure that
no I/O or fault is going on in the reclaimed range and nobody is using
it so that range can be reclaimed without issues.
Currently if we take inode lock, that serializes read/write. But it does
not do anything for faults. So I add another semaphore fuse_inode->i_mmap_sem
for this purpose. It can be used to serialize with faults.
As of now, I am adding taking this semaphore only in dax fault path and
not regular fault path because existing code does not have one. May
be existing code can benefit from it as well to take care of some
races, but that we can fix later if need be. For now, I am just focussing
only on DAX path which is new path.
Also added logic to take fuse_inode->i_mmap_sem in
truncate/punch_hole/open(O_TRUNC) path to make sure file truncation and
fuse dax fault are mutually exclusive and avoid all the above problems.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-08-19 18:19:54 -04:00
out :
if ( fault_blocked )
2021-04-21 17:18:39 +02:00
filemap_invalidate_unlock ( mapping ) ;
virtiofs: serialize truncate/punch_hole and dax fault path
Currently in fuse we don't seem to have any lock which can serialize fault
path with truncate/punch_hole path. With dax support I need one for
following reasons.
1. Dax requirement
DAX fault code relies on inode size being stable for the duration of
fault and want to serialize with truncate/punch_hole and they explicitly
mention it.
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
/*
* Check whether offset isn't beyond end of file now. Caller is
* supposed to hold locks serializing us with truncate / punch hole so
* this is a reliable test.
*/
max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2. Make sure there are no users of pages being truncated/punch_hole
get_user_pages() might take references to page and then do some DMA
to said pages. Filesystem might truncate those pages without knowing
that a DMA is in progress or some I/O is in progress. So use
dax_layout_busy_page() to make sure there are no such references
and I/O is not in progress on said pages before moving ahead with
truncation.
3. Limitation of kvm page fault error reporting
If we are truncating file on host first and then removing mappings in
guest later (truncate page cache etc), then this could lead to a
problem with KVM. Say a mapping is in place in guest and truncation
happens on host. Now if guest accesses that mapping, then host will
take a fault and kvm will either exit to qemu or spin infinitely.
IOW, before we do truncation on host, we need to make sure that guest
inode does not have any mapping in that region or whole file.
4. virtiofs memory range reclaim
Soon I will introduce the notion of being able to reclaim dax memory
ranges from a fuse dax inode. There also I need to make sure that
no I/O or fault is going on in the reclaimed range and nobody is using
it so that range can be reclaimed without issues.
Currently if we take inode lock, that serializes read/write. But it does
not do anything for faults. So I add another semaphore fuse_inode->i_mmap_sem
for this purpose. It can be used to serialize with faults.
As of now, I am adding taking this semaphore only in dax fault path and
not regular fault path because existing code does not have one. May
be existing code can benefit from it as well to take care of some
races, but that we can fix later if need be. For now, I am just focussing
only on DAX path which is new path.
Also added logic to take fuse_inode->i_mmap_sem in
truncate/punch_hole/open(O_TRUNC) path to make sure file truncation and
fuse dax fault are mutually exclusive and avoid all the above problems.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-08-19 18:19:54 -04:00
2007-10-16 23:31:01 -07:00
return 0 ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
error :
if ( is_truncate )
fuse_release_nowrite ( inode ) ;
fuse: hotfix truncate_pagecache() issue
The way how fuse calls truncate_pagecache() from fuse_change_attributes()
is completely wrong. Because, w/o i_mutex held, we are never sure whether
'oldsize' and 'attr->size' are valid by the time of execution of
truncate_pagecache(inode, oldsize, attr->size). In fact, as soon as we
released fc->lock in the middle of fuse_change_attributes(), we completely
loose control of actions which may happen with given inode until we reach
truncate_pagecache. The list of potentially dangerous actions includes
mmap-ed reads and writes, ftruncate(2) and write(2) extending file size.
The typical outcome of doing truncate_pagecache() with outdated arguments
is data corruption from user point of view. This is (in some sense)
acceptable in cases when the issue is triggered by a change of the file on
the server (i.e. externally wrt fuse operation), but it is absolutely
intolerable in scenarios when a single fuse client modifies a file without
any external intervention. A real life case I discovered by fsx-linux
looked like this:
1. Shrinking ftruncate(2) comes to fuse_do_setattr(). The latter sends
FUSE_SETATTR to the server synchronously, but before getting fc->lock ...
2. fuse_dentry_revalidate() is asynchronously called. It sends FUSE_LOOKUP
to the server synchronously, then calls fuse_change_attributes(). The
latter updates i_size, releases fc->lock, but before comparing oldsize vs
attr->size..
3. fuse_do_setattr() from the first step proceeds by acquiring fc->lock and
updating attributes and i_size, but now oldsize is equal to
outarg.attr.size because i_size has just been updated (step 2). Hence,
fuse_do_setattr() returns w/o calling truncate_pagecache().
4. As soon as ftruncate(2) completes, the user extends file size by
write(2) making a hole in the middle of file, then reads data from the hole
either by read(2) or mmap-ed read. The user expects to get zero data from
the hole, but gets stale data because truncate_pagecache() is not executed
yet.
The scenario above illustrates one side of the problem: not truncating the
page cache even though we should. Another side corresponds to truncating
page cache too late, when the state of inode changed significantly.
Theoretically, the following is possible:
1. As in the previous scenario fuse_dentry_revalidate() discovered that
i_size changed (due to our own fuse_do_setattr()) and is going to call
truncate_pagecache() for some 'new_size' it believes valid right now. But
by the time that particular truncate_pagecache() is called ...
2. fuse_do_setattr() returns (either having called truncate_pagecache() or
not -- it doesn't matter).
3. The file is extended either by write(2) or ftruncate(2) or fallocate(2).
4. mmap-ed write makes a page in the extended region dirty.
The result will be the loss of data user wrote on the fourth step.
The patch is a hotfix resolving the issue in a simplistic way: let's skip
dangerous i_size update and truncate_pagecache if an operation changing
file size is in progress. This simplistic approach looks correct for the
cases w/o external changes. And to handle them properly, more sophisticated
and intrusive techniques (e.g. NFS-like one) would be required. I'd like to
postpone it until the issue is well discussed on the mailing list(s).
Changed in v2:
- improved patch description to cover both sides of the issue.
Signed-off-by: Maxim Patlasov <mpatlasov@parallels.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: stable@vger.kernel.org
2013-08-30 17:06:04 +04:00
clear_bit ( FUSE_I_SIZE_UNSTABLE , & fi - > state ) ;
virtiofs: serialize truncate/punch_hole and dax fault path
Currently in fuse we don't seem to have any lock which can serialize fault
path with truncate/punch_hole path. With dax support I need one for
following reasons.
1. Dax requirement
DAX fault code relies on inode size being stable for the duration of
fault and want to serialize with truncate/punch_hole and they explicitly
mention it.
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
/*
* Check whether offset isn't beyond end of file now. Caller is
* supposed to hold locks serializing us with truncate / punch hole so
* this is a reliable test.
*/
max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2. Make sure there are no users of pages being truncated/punch_hole
get_user_pages() might take references to page and then do some DMA
to said pages. Filesystem might truncate those pages without knowing
that a DMA is in progress or some I/O is in progress. So use
dax_layout_busy_page() to make sure there are no such references
and I/O is not in progress on said pages before moving ahead with
truncation.
3. Limitation of kvm page fault error reporting
If we are truncating file on host first and then removing mappings in
guest later (truncate page cache etc), then this could lead to a
problem with KVM. Say a mapping is in place in guest and truncation
happens on host. Now if guest accesses that mapping, then host will
take a fault and kvm will either exit to qemu or spin infinitely.
IOW, before we do truncation on host, we need to make sure that guest
inode does not have any mapping in that region or whole file.
4. virtiofs memory range reclaim
Soon I will introduce the notion of being able to reclaim dax memory
ranges from a fuse dax inode. There also I need to make sure that
no I/O or fault is going on in the reclaimed range and nobody is using
it so that range can be reclaimed without issues.
Currently if we take inode lock, that serializes read/write. But it does
not do anything for faults. So I add another semaphore fuse_inode->i_mmap_sem
for this purpose. It can be used to serialize with faults.
As of now, I am adding taking this semaphore only in dax fault path and
not regular fault path because existing code does not have one. May
be existing code can benefit from it as well to take care of some
races, but that we can fix later if need be. For now, I am just focussing
only on DAX path which is new path.
Also added logic to take fuse_inode->i_mmap_sem in
truncate/punch_hole/open(O_TRUNC) path to make sure file truncation and
fuse dax fault are mutually exclusive and avoid all the above problems.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-08-19 18:19:54 -04:00
if ( fault_blocked )
2021-04-21 17:18:39 +02:00
filemap_invalidate_unlock ( mapping ) ;
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions):
"User-space filesystems are hard to get right. I'd claim that they
are almost impossible, unless you limit them somehow (shared
writable mappings are the nastiest part - if you don't have those,
you can reasonably limit your problems by limiting the number of
dirty pages you accept through normal "write()" calls)."
Instead of attempting the impossible, I've just waited for the dirty page
accounting infrastructure to materialize (thanks to Peter Zijlstra and
others). This nicely solved the biggest problem: limiting the number of pages
used for write caching.
Some small details remained, however, which this largish patch attempts to
address. It provides a page writeback implementation for fuse, which is
completely safe against VM related deadlocks. Performance may not be very
good for certain usage patterns, but generally it should be acceptable.
It has been tested extensively with fsx-linux and bash-shared-mapping.
Fuse page writeback design
--------------------------
fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM.
It copies the contents of the original page, and queues a WRITE request to the
userspace filesystem using this temp page.
The writeback is finished instantly from the MM's point of view: the page is
removed from the radix trees, and the PageDirty and PageWriteback flags are
cleared.
For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented. The per-bdi writeback count is not decremented until the actual
write completes.
On dirtying the page, fuse waits for a previous write to finish before
proceeding. This makes sure, there can only be one temporary page used at a
time for one cached page.
This approach is wasteful in both memory and CPU bandwidth, so why is this
complication needed?
The basic problem is that there can be no guarantee about the time in which
the userspace filesystem will complete a write. It may be buggy or even
malicious, and fail to complete WRITE requests. We don't want unrelated parts
of the system to grind to a halt in such cases.
Also a filesystem may need additional resources (particularly memory) to
complete a WRITE request. There's a great danger of a deadlock if that
allocation may wait for the writepage to finish.
Currently there are several cases where the kernel can block on page
writeback:
- allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
- page migration
- throttle_vm_writeout (through NR_WRITEBACK)
- sync(2)
Of course in some cases (fsync, msync) we explicitly want to allow blocking.
So for these cases new code has to be added to fuse, since the VM is not
tracking writeback pages for us any more.
As an extra safety measure, the maximum dirty ratio allocated to a single
fuse filesystem is set to 1% by default. This way one (or several) buggy or
malicious fuse filesystems cannot slow down the rest of the system by hogging
dirty memory.
With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 00:54:41 -07:00
return err ;
2005-09-09 13:10:29 -07:00
}
2023-01-13 12:49:11 +01:00
/*
 * fuse_setattr - setattr entry point for a fuse inode .
 *
 * Validates the request , resolves the ATTR_KILL_SUID / ATTR_KILL_SGID
 * flags into an explicit mode change when the connection sets neither
 * handle_killpriv nor handle_killpriv_v2 , and then hands the remaining
 * attributes to fuse_do_setattr ( ) .
 *
 * idmap : not referenced in this function body .
 * entry : dentry whose inode is being changed .
 * attr  : requested changes ; ia_valid and ia_mode may be modified here .
 *
 * Returns 0 on success or when no attribute is left to change , - EIO if
 * fuse_is_bad ( ) reports the inode as bad , - EACCES if
 * fuse_allow_current_process ( ) refuses the caller , otherwise the
 * value returned by fuse_do_getattr ( ) or fuse_do_setattr ( ) .
 */
static int fuse_setattr ( struct mnt_idmap * idmap , struct dentry * entry ,
2021-01-21 14:19:43 +01:00
struct iattr * attr )
2007-10-18 03:07:00 -07:00
{
2015-03-17 22:25:59 +00:00
struct inode * inode = d_inode ( entry ) ;
2016-10-01 07:32:32 +02:00
struct fuse_conn * fc = get_fuse_conn ( inode ) ;
2016-10-01 07:32:32 +02:00
struct file * file = ( attr - > ia_valid & ATTR_FILE ) ? attr - > ia_file : NULL ;
2016-10-01 07:32:32 +02:00
int ret ;
2012-12-18 14:05:08 +04:00
2020-12-10 15:33:14 +01:00
if ( fuse_is_bad ( inode ) )
return - EIO ;
2012-12-18 14:05:08 +04:00
if ( ! fuse_allow_current_process ( get_fuse_conn ( inode ) ) )
return - EACCES ;
2016-10-01 07:32:32 +02:00
/*
 * Drop the kill flags together with ATTR_MODE ; ATTR_MODE is set
 * again below only if a suid / sgid bit actually has to be cleared .
 */
if ( attr - > ia_valid & ( ATTR_KILL_SUID | ATTR_KILL_SGID ) ) {
attr - > ia_valid & = ~ ( ATTR_KILL_SUID | ATTR_KILL_SGID |
ATTR_MODE ) ;
2016-10-01 07:32:32 +02:00
2016-10-01 07:32:32 +02:00
/*
2016-10-01 07:32:32 +02:00
* The only sane way to reliably kill suid / sgid is to do it in
* the userspace filesystem
*
* This should be done on write ( ) , truncate ( ) and chown ( ) .
2016-10-01 07:32:32 +02:00
*/
2020-10-09 14:15:10 -04:00
if ( ! fc - > handle_killpriv & & ! fc - > handle_killpriv_v2 ) {
2016-10-01 07:32:32 +02:00
/*
* ia_mode calculation may have used stale i_mode .
* Refresh and recalculate .
*/
ret = fuse_do_getattr ( inode , NULL , file ) ;
if ( ret )
return ret ;
attr - > ia_mode = inode - > i_mode ;
fuse: fix clearing suid, sgid for chown()
Basically, the pjdfstests set the ownership of a file to 06555, and then
chowns it (as root) to a new uid/gid. Prior to commit a09f99eddef4 ("fuse:
fix killing s[ug]id in setattr"), fuse would send down a setattr with both
the uid/gid change and a new mode. Now, it just sends down the uid/gid
change.
Technically this is NOTABUG, since POSIX doesn't _require_ that we clear
these bits for a privileged process, but Linux (wisely) has done that and I
think we don't want to change that behavior here.
This is caused by the use of should_remove_suid(), which will always return
0 when the process has CAP_FSETID.
In fact we really don't need to be calling should_remove_suid() at all,
since we've already been indicated that we should remove the suid, we just
don't want to use a (very) stale mode for that.
This patch should fix the above as well as simplify the logic.
Reported-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Fixes: a09f99eddef4 ("fuse: fix killing s[ug]id in setattr")
Cc: <stable@vger.kernel.org>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
2016-12-06 16:18:45 +01:00
if ( inode - > i_mode & S_ISUID ) {
2016-10-01 07:32:32 +02:00
attr - > ia_valid | = ATTR_MODE ;
attr - > ia_mode & = ~ S_ISUID ;
}
fuse: fix clearing suid, sgid for chown()
Basically, the pjdfstests set the ownership of a file to 06555, and then
chowns it (as root) to a new uid/gid. Prior to commit a09f99eddef4 ("fuse:
fix killing s[ug]id in setattr"), fuse would send down a setattr with both
the uid/gid change and a new mode. Now, it just sends down the uid/gid
change.
Technically this is NOTABUG, since POSIX doesn't _require_ that we clear
these bits for a privileged process, but Linux (wisely) has done that and I
think we don't want to change that behavior here.
This is caused by the use of should_remove_suid(), which will always return
0 when the process has CAP_FSETID.
In fact we really don't need to be calling should_remove_suid() at all,
since we've already been indicated that we should remove the suid, we just
don't want to use a (very) stale mode for that.
This patch should fix the above as well as simplify the logic.
Reported-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Fixes: a09f99eddef4 ("fuse: fix killing s[ug]id in setattr")
Cc: <stable@vger.kernel.org>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
2016-12-06 16:18:45 +01:00
if ( ( inode - > i_mode & ( S_ISGID | S_IXGRP ) ) = = ( S_ISGID | S_IXGRP ) ) {
2016-10-01 07:32:32 +02:00
attr - > ia_valid | = ATTR_MODE ;
attr - > ia_mode & = ~ S_ISGID ;
}
2016-10-01 07:32:32 +02:00
}
}
/* Nothing is left to change once the flags above are stripped . */
if ( ! attr - > ia_valid )
return 0 ;
2016-10-01 07:32:32 +02:00
2016-10-10 13:04:49 -07:00
ret = fuse_do_setattr ( entry , attr , file ) ;
2016-10-01 07:32:32 +02:00
if ( ! ret ) {
2016-08-29 08:46:37 -05:00
/*
* If filesystem supports acls it may have updated acl xattrs in
* the filesystem , so forget cached acls for the inode .
*/
if ( fc - > posix_acl )
forget_all_cached_acls ( inode ) ;
2016-10-01 07:32:32 +02:00
/* Directory mode changed, may need to revalidate access */
if ( d_is_dir ( entry ) & & ( attr - > ia_valid & ATTR_MODE ) )
fuse_invalidate_entry_cache ( entry ) ;
}
return ret ;
2007-10-18 03:07:00 -07:00
}
2023-01-13 12:49:12 +01:00
static int fuse_getattr ( struct mnt_idmap * idmap ,
2021-01-21 14:19:43 +01:00
const struct path * path , struct kstat * stat ,
statx: Add a system call to make enhanced file info available
Add a system call to make extended file information available, including
file creation and some attribute flags where available through the
underlying filesystem.
The getattr inode operation is altered to take two additional arguments: a
u32 request_mask and an unsigned int flags that indicate the
synchronisation mode. This change is propagated to the vfs_getattr*()
function.
Functions like vfs_stat() are now inline wrappers around new functions
vfs_statx() and vfs_statx_fd() to reduce stack usage.
========
OVERVIEW
========
The idea was initially proposed as a set of xattrs that could be retrieved
with getxattr(), but the general preference proved to be for a new syscall
with an extended stat structure.
A number of requests were gathered for features to be included. The
following have been included:
(1) Make the fields a consistent size on all arches and make them large.
(2) Spare space, request flags and information flags are provided for
future expansion.
(3) Better support for the y2038 problem [Arnd Bergmann] (tv_sec is an
__s64).
(4) Creation time: The SMB protocol carries the creation time, which could
be exported by Samba, which will in turn help CIFS make use of
FS-Cache as that can be used for coherency data (stx_btime).
This is also specified in NFSv4 as a recommended attribute and could
be exported by NFSD [Steve French].
(5) Lightweight stat: Ask for just those details of interest, and allow a
netfs (such as NFS) to approximate anything not of interest, possibly
without going to the server [Trond Myklebust, Ulrich Drepper, Andreas
Dilger] (AT_STATX_DONT_SYNC).
(6) Heavyweight stat: Force a netfs to go to the server, even if it thinks
its cached attributes are up to date [Trond Myklebust]
(AT_STATX_FORCE_SYNC).
And the following have been left out for future extension:
(7) Data version number: Could be used by userspace NFS servers [Aneesh
Kumar].
Can also be used to modify fill_post_wcc() in NFSD which retrieves
i_version directly, but has just called vfs_getattr(). It could get
it from the kstat struct if it used vfs_xgetattr() instead.
(There's disagreement on the exact semantics of a single field, since
not all filesystems do this the same way).
(8) BSD stat compatibility: Including more fields from the BSD stat such
as creation time (st_btime) and inode generation number (st_gen)
[Jeremy Allison, Bernd Schubert].
(9) Inode generation number: Useful for FUSE and userspace NFS servers
[Bernd Schubert].
(This was asked for but later deemed unnecessary with the
open-by-handle capability available and caused disagreement as to
whether it's a security hole or not).
(10) Extra coherency data may be useful in making backups [Andreas Dilger].
(No particular data were offered, but things like last backup
timestamp, the data version number and the DOS archive bit would come
into this category).
(11) Allow the filesystem to indicate what it can/cannot provide: A
filesystem can now say it doesn't support a standard stat feature if
that isn't available, so if, for instance, inode numbers or UIDs don't
exist or are fabricated locally...
(This requires a separate system call - I have an fsinfo() call idea
for this).
(12) Store a 16-byte volume ID in the superblock that can be returned in
struct xstat [Steve French].
(Deferred to fsinfo).
(13) Include granularity fields in the time data to indicate the
granularity of each of the times (NFSv4 time_delta) [Steve French].
(Deferred to fsinfo).
(14) FS_IOC_GETFLAGS value. These could be translated to BSD's st_flags.
Note that the Linux IOC flags are a mess and filesystems such as Ext4
define flags that aren't in linux/fs.h, so translation in the kernel
may be a necessity (or, possibly, we provide the filesystem type too).
(Some attributes are made available in stx_attributes, but the general
feeling was that the IOC flags were too ext[234]-specific and shouldn't
be exposed through statx this way).
(15) Mask of features available on file (eg: ACLs, seclabel) [Brad Boyer,
Michael Kerrisk].
(Deferred, probably to fsinfo. Finding out if there's an ACL or
seclabel might require extra filesystem operations).
(16) Femtosecond-resolution timestamps [Dave Chinner].
(A __reserved field has been left in the statx_timestamp struct for
this - if there proves to be a need).
(17) A set multiple attributes syscall to go with this.
===============
NEW SYSTEM CALL
===============
The new system call is:
int ret = statx(int dfd,
const char *filename,
unsigned int flags,
unsigned int mask,
struct statx *buffer);
The dfd, filename and flags parameters indicate the file to query, in a
similar way to fstatat(). There is no equivalent of lstat() as that can be
emulated with statx() by passing AT_SYMLINK_NOFOLLOW in flags. There is
also no equivalent of fstat() as that can be emulated by passing a NULL
filename to statx() with the fd of interest in dfd.
Whether or not statx() synchronises the attributes with the backing store
can be controlled by OR'ing a value into the flags argument (this typically
only affects network filesystems):
(1) AT_STATX_SYNC_AS_STAT tells statx() to behave as stat() does in this
respect.
(2) AT_STATX_FORCE_SYNC will require a network filesystem to synchronise
its attributes with the server - which might require data writeback to
occur to get the timestamps correct.
(3) AT_STATX_DONT_SYNC will suppress synchronisation with the server in a
network filesystem. The resulting values should be considered
approximate.
mask is a bitmask indicating the fields in struct statx that are of
interest to the caller. The user should set this to STATX_BASIC_STATS to
get the basic set returned by stat(). It should be noted that asking for
more information may entail extra I/O operations.
buffer points to the destination for the data. This must be 256 bytes in
size.
======================
MAIN ATTRIBUTES RECORD
======================
The following structures are defined in which to return the main attribute
set:
struct statx_timestamp {
__s64 tv_sec;
__s32 tv_nsec;
__s32 __reserved;
};
struct statx {
__u32 stx_mask;
__u32 stx_blksize;
__u64 stx_attributes;
__u32 stx_nlink;
__u32 stx_uid;
__u32 stx_gid;
__u16 stx_mode;
__u16 __spare0[1];
__u64 stx_ino;
__u64 stx_size;
__u64 stx_blocks;
__u64 __spare1[1];
struct statx_timestamp stx_atime;
struct statx_timestamp stx_btime;
struct statx_timestamp stx_ctime;
struct statx_timestamp stx_mtime;
__u32 stx_rdev_major;
__u32 stx_rdev_minor;
__u32 stx_dev_major;
__u32 stx_dev_minor;
__u64 __spare2[14];
};
The defined bits in request_mask and stx_mask are:
STATX_TYPE Want/got stx_mode & S_IFMT
STATX_MODE Want/got stx_mode & ~S_IFMT
STATX_NLINK Want/got stx_nlink
STATX_UID Want/got stx_uid
STATX_GID Want/got stx_gid
STATX_ATIME Want/got stx_atime{,_ns}
STATX_MTIME Want/got stx_mtime{,_ns}
STATX_CTIME Want/got stx_ctime{,_ns}
STATX_INO Want/got stx_ino
STATX_SIZE Want/got stx_size
STATX_BLOCKS Want/got stx_blocks
STATX_BASIC_STATS [The stuff in the normal stat struct]
STATX_BTIME Want/got stx_btime{,_ns}
STATX_ALL [All currently available stuff]
stx_btime is the file creation time, stx_mask is a bitmask indicating the
data provided and __spares*[] are where as-yet undefined fields can be
placed.
Time fields are structures with separate seconds and nanoseconds fields
plus a reserved field in case we want to add even finer resolution. Note
that times will be negative if before 1970; in such a case, the nanosecond
fields will also be negative if not zero.
The bits defined in the stx_attributes field convey information about a
file, how it is accessed, where it is and what it does. The following
attributes map to FS_*_FL flags and are the same numerical value:
STATX_ATTR_COMPRESSED File is compressed by the fs
STATX_ATTR_IMMUTABLE File is marked immutable
STATX_ATTR_APPEND File is append-only
STATX_ATTR_NODUMP File is not to be dumped
STATX_ATTR_ENCRYPTED File requires key to decrypt in fs
Within the kernel, the supported flags are listed by:
KSTAT_ATTR_FS_IOC_FLAGS
[Are any other IOC flags of sufficient general interest to be exposed
through this interface?]
New flags include:
STATX_ATTR_AUTOMOUNT Object is an automount trigger
These are for the use of GUI tools that might want to mark files specially,
depending on what they are.
Fields in struct statx come in a number of classes:
(0) stx_dev_*, stx_blksize.
These are local system information and are always available.
(1) stx_mode, stx_nlinks, stx_uid, stx_gid, stx_[amc]time, stx_ino,
stx_size, stx_blocks.
These will be returned whether the caller asks for them or not. The
corresponding bits in stx_mask will be set to indicate whether they
actually have valid values.
If the caller didn't ask for them, then they may be approximated. For
example, NFS won't waste any time updating them from the server,
unless as a byproduct of updating something requested.
If the values don't actually exist for the underlying object (such as
UID or GID on a DOS file), then the bit won't be set in the stx_mask,
even if the caller asked for the value. In such a case, the returned
value will be a fabrication.
Note that there are instances where the type might not be valid, for
instance Windows reparse points.
(2) stx_rdev_*.
This will be set only if stx_mode indicates we're looking at a
blockdev or a chardev, otherwise will be 0.
(3) stx_btime.
Similar to (1), except this will be set to 0 if it doesn't exist.
=======
TESTING
=======
The following test program can be used to test the statx system call:
samples/statx/test-statx.c
Just compile and run, passing it paths to the files you want to examine.
The file is built automatically if CONFIG_SAMPLES is enabled.
Here's some example output. Firstly, an NFS directory that crosses to
another FSID. Note that the AUTOMOUNT attribute is set because transiting
this directory will cause d_automount to be invoked by the VFS.
[root@andromeda ~]# /tmp/test-statx -A /warthog/data
statx(/warthog/data) = 0
results=7ff
Size: 4096 Blocks: 8 IO Block: 1048576 directory
Device: 00:26 Inode: 1703937 Links: 125
Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041
Access: 2016-11-24 09:02:12.219699527+0000
Modify: 2016-11-17 10:44:36.225653653+0000
Change: 2016-11-17 10:44:36.225653653+0000
Attributes: 0000000000001000 (-------- -------- -------- -------- -------- -------- ---m---- --------)
Secondly, the result of automounting on that directory.
[root@andromeda ~]# /tmp/test-statx /warthog/data
statx(/warthog/data) = 0
results=7ff
Size: 4096 Blocks: 8 IO Block: 1048576 directory
Device: 00:27 Inode: 2 Links: 125
Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041
Access: 2016-11-24 09:02:12.219699527+0000
Modify: 2016-11-17 10:44:36.225653653+0000
Change: 2016-11-17 10:44:36.225653653+0000
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2017-01-31 16:46:22 +00:00
u32 request_mask , unsigned int flags )
2005-09-09 13:10:28 -07:00
{
statx: Add a system call to make enhanced file info available
Add a system call to make extended file information available, including
file creation and some attribute flags where available through the
underlying filesystem.
The getattr inode operation is altered to take two additional arguments: a
u32 request_mask and an unsigned int flags that indicate the
synchronisation mode. This change is propagated to the vfs_getattr*()
function.
Functions like vfs_stat() are now inline wrappers around new functions
vfs_statx() and vfs_statx_fd() to reduce stack usage.
========
OVERVIEW
========
The idea was initially proposed as a set of xattrs that could be retrieved
with getxattr(), but the general preference proved to be for a new syscall
with an extended stat structure.
A number of requests were gathered for features to be included. The
following have been included:
(1) Make the fields a consistent size on all arches and make them large.
(2) Spare space, request flags and information flags are provided for
future expansion.
(3) Better support for the y2038 problem [Arnd Bergmann] (tv_sec is an
__s64).
(4) Creation time: The SMB protocol carries the creation time, which could
be exported by Samba, which will in turn help CIFS make use of
FS-Cache as that can be used for coherency data (stx_btime).
This is also specified in NFSv4 as a recommended attribute and could
be exported by NFSD [Steve French].
(5) Lightweight stat: Ask for just those details of interest, and allow a
netfs (such as NFS) to approximate anything not of interest, possibly
without going to the server [Trond Myklebust, Ulrich Drepper, Andreas
Dilger] (AT_STATX_DONT_SYNC).
(6) Heavyweight stat: Force a netfs to go to the server, even if it thinks
its cached attributes are up to date [Trond Myklebust]
(AT_STATX_FORCE_SYNC).
And the following have been left out for future extension:
(7) Data version number: Could be used by userspace NFS servers [Aneesh
Kumar].
Can also be used to modify fill_post_wcc() in NFSD which retrieves
i_version directly, but has just called vfs_getattr(). It could get
it from the kstat struct if it used vfs_xgetattr() instead.
(There's disagreement on the exact semantics of a single field, since
not all filesystems do this the same way).
(8) BSD stat compatibility: Including more fields from the BSD stat such
as creation time (st_btime) and inode generation number (st_gen)
[Jeremy Allison, Bernd Schubert].
(9) Inode generation number: Useful for FUSE and userspace NFS servers
[Bernd Schubert].
(This was asked for but later deemed unnecessary with the
open-by-handle capability available and caused disagreement as to
whether it's a security hole or not).
(10) Extra coherency data may be useful in making backups [Andreas Dilger].
(No particular data were offered, but things like last backup
timestamp, the data version number and the DOS archive bit would come
into this category).
(11) Allow the filesystem to indicate what it can/cannot provide: A
filesystem can now say it doesn't support a standard stat feature if
that isn't available, so if, for instance, inode numbers or UIDs don't
exist or are fabricated locally...
(This requires a separate system call - I have an fsinfo() call idea
for this).
(12) Store a 16-byte volume ID in the superblock that can be returned in
struct xstat [Steve French].
(Deferred to fsinfo).
(13) Include granularity fields in the time data to indicate the
granularity of each of the times (NFSv4 time_delta) [Steve French].
(Deferred to fsinfo).
(14) FS_IOC_GETFLAGS value. These could be translated to BSD's st_flags.
Note that the Linux IOC flags are a mess and filesystems such as Ext4
define flags that aren't in linux/fs.h, so translation in the kernel
may be a necessity (or, possibly, we provide the filesystem type too).
(Some attributes are made available in stx_attributes, but the general
feeling was that the IOC flags were too ext[234]-specific and shouldn't
be exposed through statx this way).
(15) Mask of features available on file (eg: ACLs, seclabel) [Brad Boyer,
Michael Kerrisk].
(Deferred, probably to fsinfo. Finding out if there's an ACL or
seclabel might require extra filesystem operations).
(16) Femtosecond-resolution timestamps [Dave Chinner].
(A __reserved field has been left in the statx_timestamp struct for
this - if there proves to be a need).
(17) A set multiple attributes syscall to go with this.
===============
NEW SYSTEM CALL
===============
The new system call is:
int ret = statx(int dfd,
const char *filename,
unsigned int flags,
unsigned int mask,
struct statx *buffer);
The dfd, filename and flags parameters indicate the file to query, in a
similar way to fstatat(). There is no equivalent of lstat() as that can be
emulated with statx() by passing AT_SYMLINK_NOFOLLOW in flags. There is
also no equivalent of fstat() as that can be emulated by passing a NULL
filename to statx() with the fd of interest in dfd.
Whether or not statx() synchronises the attributes with the backing store
can be controlled by OR'ing a value into the flags argument (this typically
only affects network filesystems):
(1) AT_STATX_SYNC_AS_STAT tells statx() to behave as stat() does in this
respect.
(2) AT_STATX_FORCE_SYNC will require a network filesystem to synchronise
its attributes with the server - which might require data writeback to
occur to get the timestamps correct.
(3) AT_STATX_DONT_SYNC will suppress synchronisation with the server in a
network filesystem. The resulting values should be considered
approximate.
mask is a bitmask indicating the fields in struct statx that are of
interest to the caller. The user should set this to STATX_BASIC_STATS to
get the basic set returned by stat(). It should be noted that asking for
more information may entail extra I/O operations.
buffer points to the destination for the data. This must be 256 bytes in
size.
======================
MAIN ATTRIBUTES RECORD
======================
The following structures are defined in which to return the main attribute
set:
struct statx_timestamp {
__s64 tv_sec;
__s32 tv_nsec;
__s32 __reserved;
};
struct statx {
__u32 stx_mask;
__u32 stx_blksize;
__u64 stx_attributes;
__u32 stx_nlink;
__u32 stx_uid;
__u32 stx_gid;
__u16 stx_mode;
__u16 __spare0[1];
__u64 stx_ino;
__u64 stx_size;
__u64 stx_blocks;
__u64 __spare1[1];
struct statx_timestamp stx_atime;
struct statx_timestamp stx_btime;
struct statx_timestamp stx_ctime;
struct statx_timestamp stx_mtime;
__u32 stx_rdev_major;
__u32 stx_rdev_minor;
__u32 stx_dev_major;
__u32 stx_dev_minor;
__u64 __spare2[14];
};
The defined bits in request_mask and stx_mask are:
STATX_TYPE Want/got stx_mode & S_IFMT
STATX_MODE Want/got stx_mode & ~S_IFMT
STATX_NLINK Want/got stx_nlink
STATX_UID Want/got stx_uid
STATX_GID Want/got stx_gid
STATX_ATIME Want/got stx_atime{,_ns}
STATX_MTIME Want/got stx_mtime{,_ns}
STATX_CTIME Want/got stx_ctime{,_ns}
STATX_INO Want/got stx_ino
STATX_SIZE Want/got stx_size
STATX_BLOCKS Want/got stx_blocks
STATX_BASIC_STATS [The stuff in the normal stat struct]
STATX_BTIME Want/got stx_btime{,_ns}
STATX_ALL [All currently available stuff]
stx_btime is the file creation time, stx_mask is a bitmask indicating the
data provided and __spare*[] are where as-yet undefined fields can be
placed.
Time fields are structures with separate seconds and nanoseconds fields
plus a reserved field in case we want to add even finer resolution. Note
that times will be negative if before 1970; in such a case, the nanosecond
fields will also be negative if not zero.
The bits defined in the stx_attributes field convey information about a
file, how it is accessed, where it is and what it does. The following
attributes map to FS_*_FL flags and are the same numerical value:
STATX_ATTR_COMPRESSED File is compressed by the fs
STATX_ATTR_IMMUTABLE File is marked immutable
STATX_ATTR_APPEND File is append-only
STATX_ATTR_NODUMP File is not to be dumped
STATX_ATTR_ENCRYPTED File requires key to decrypt in fs
Within the kernel, the supported flags are listed by:
KSTAT_ATTR_FS_IOC_FLAGS
[Are any other IOC flags of sufficient general interest to be exposed
through this interface?]
New flags include:
STATX_ATTR_AUTOMOUNT Object is an automount trigger
These are for the use of GUI tools that might want to mark files specially,
depending on what they are.
Fields in struct statx come in a number of classes:
(0) stx_dev_*, stx_blksize.
These are local system information and are always available.
(1) stx_mode, stx_nlink, stx_uid, stx_gid, stx_[amc]time, stx_ino,
stx_size, stx_blocks.
These will be returned whether the caller asks for them or not. The
corresponding bits in stx_mask will be set to indicate whether they
actually have valid values.
If the caller didn't ask for them, then they may be approximated. For
example, NFS won't waste any time updating them from the server,
unless as a byproduct of updating something requested.
If the values don't actually exist for the underlying object (such as
UID or GID on a DOS file), then the bit won't be set in the stx_mask,
even if the caller asked for the value. In such a case, the returned
value will be a fabrication.
Note that there are instances where the type might not be valid, for
instance Windows reparse points.
(2) stx_rdev_*.
This will be set only if stx_mode indicates we're looking at a
blockdev or a chardev, otherwise will be 0.
(3) stx_btime.
Similar to (1), except this will be set to 0 if it doesn't exist.
=======
TESTING
=======
The following test program can be used to test the statx system call:
samples/statx/test-statx.c
Just compile and run, passing it paths to the files you want to examine.
The file is built automatically if CONFIG_SAMPLES is enabled.
Here's some example output. Firstly, an NFS directory that crosses to
another FSID. Note that the AUTOMOUNT attribute is set because transiting
this directory will cause d_automount to be invoked by the VFS.
[root@andromeda ~]# /tmp/test-statx -A /warthog/data
statx(/warthog/data) = 0
results=7ff
Size: 4096 Blocks: 8 IO Block: 1048576 directory
Device: 00:26 Inode: 1703937 Links: 125
Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041
Access: 2016-11-24 09:02:12.219699527+0000
Modify: 2016-11-17 10:44:36.225653653+0000
Change: 2016-11-17 10:44:36.225653653+0000
Attributes: 0000000000001000 (-------- -------- -------- -------- -------- -------- ---m---- --------)
Secondly, the result of automounting on that directory.
[root@andromeda ~]# /tmp/test-statx /warthog/data
statx(/warthog/data) = 0
results=7ff
Size: 4096 Blocks: 8 IO Block: 1048576 directory
Device: 00:27 Inode: 2 Links: 125
Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041
Access: 2016-11-24 09:02:12.219699527+0000
Modify: 2016-11-17 10:44:36.225653653+0000
Change: 2016-11-17 10:44:36.225653653+0000
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2017-01-31 16:46:22 +00:00
struct inode * inode = d_inode ( path - > dentry ) ;
2007-10-16 23:31:02 -07:00
struct fuse_conn * fc = get_fuse_conn ( inode ) ;
2020-12-10 15:33:14 +01:00
if ( fuse_is_bad ( inode ) )
return - EIO ;
2020-05-19 14:50:37 +02:00
if ( ! fuse_allow_current_process ( fc ) ) {
if ( ! request_mask ) {
/*
* If user explicitly requested * nothing * then don ' t
* error out , but return st_dev only .
*/
stat - > result_mask = 0 ;
stat - > dev = inode - > i_sb - > s_dev ;
return 0 ;
}
2007-10-16 23:31:02 -07:00
return - EACCES ;
2020-05-19 14:50:37 +02:00
}
2007-10-16 23:31:02 -07:00
2018-10-15 15:43:06 +02:00
return fuse_update_get_attr ( inode , NULL , stat , request_mask , flags ) ;
2005-09-09 13:10:28 -07:00
}
2007-02-12 00:55:38 -08:00
static const struct inode_operations fuse_dir_inode_operations = {
2005-09-09 13:10:28 -07:00
. lookup = fuse_lookup ,
2005-09-09 13:10:29 -07:00
. mkdir = fuse_mkdir ,
. symlink = fuse_symlink ,
. unlink = fuse_unlink ,
. rmdir = fuse_rmdir ,
2016-09-27 11:03:58 +02:00
. rename = fuse_rename2 ,
2005-09-09 13:10:29 -07:00
. link = fuse_link ,
. setattr = fuse_setattr ,
. create = fuse_create ,
2012-06-05 15:10:22 +02:00
. atomic_open = fuse_atomic_open ,
2022-09-24 07:00:00 +02:00
. tmpfile = fuse_tmpfile ,
2005-09-09 13:10:29 -07:00
. mknod = fuse_mknod ,
2005-09-09 13:10:28 -07:00
. permission = fuse_permission ,
. getattr = fuse_getattr ,
2005-09-09 13:10:31 -07:00
. listxattr = fuse_listxattr ,
2023-01-20 12:55:04 +01:00
. get_inode_acl = fuse_get_inode_acl ,
. get_acl = fuse_get_acl ,
2016-08-29 08:46:37 -05:00
. set_acl = fuse_set_acl ,
2021-04-08 11:11:19 +02:00
. fileattr_get = fuse_fileattr_get ,
. fileattr_set = fuse_fileattr_set ,
2005-09-09 13:10:28 -07:00
} ;
2006-03-28 01:56:42 -08:00
static const struct file_operations fuse_dir_operations = {
2005-09-09 13:10:30 -07:00
. llseek = generic_file_llseek ,
2005-09-09 13:10:28 -07:00
. read = generic_read_dir ,
2016-04-20 17:30:32 -04:00
. iterate_shared = fuse_readdir ,
2005-09-09 13:10:28 -07:00
. open = fuse_dir_open ,
. release = fuse_dir_release ,
2005-09-09 13:10:38 -07:00
. fsync = fuse_dir_fsync ,
2011-12-13 11:58:49 +01:00
. unlocked_ioctl = fuse_dir_ioctl ,
. compat_ioctl = fuse_dir_compat_ioctl ,
2005-09-09 13:10:28 -07:00
} ;
2007-02-12 00:55:38 -08:00
static const struct inode_operations fuse_common_inode_operations = {
2005-09-09 13:10:29 -07:00
. setattr = fuse_setattr ,
2005-09-09 13:10:28 -07:00
. permission = fuse_permission ,
. getattr = fuse_getattr ,
2005-09-09 13:10:31 -07:00
. listxattr = fuse_listxattr ,
2023-01-20 12:55:04 +01:00
. get_inode_acl = fuse_get_inode_acl ,
. get_acl = fuse_get_acl ,
2016-08-29 08:46:37 -05:00
. set_acl = fuse_set_acl ,
2021-04-08 11:11:19 +02:00
. fileattr_get = fuse_fileattr_get ,
. fileattr_set = fuse_fileattr_set ,
2005-09-09 13:10:28 -07:00
} ;
2007-02-12 00:55:38 -08:00
static const struct inode_operations fuse_symlink_inode_operations = {
2005-09-09 13:10:29 -07:00
. setattr = fuse_setattr ,
2015-11-17 10:20:54 -05:00
. get_link = fuse_get_link ,
2005-09-09 13:10:28 -07:00
. getattr = fuse_getattr ,
2005-09-09 13:10:31 -07:00
. listxattr = fuse_listxattr ,
2005-09-09 13:10:28 -07:00
} ;
void fuse_init_common ( struct inode * inode )
{
inode - > i_op = & fuse_common_inode_operations ;
}
void fuse_init_dir ( struct inode * inode )
{
2018-10-01 10:07:05 +02:00
struct fuse_inode * fi = get_fuse_inode ( inode ) ;
2005-09-09 13:10:28 -07:00
inode - > i_op = & fuse_dir_inode_operations ;
inode - > i_fop = & fuse_dir_operations ;
2018-10-01 10:07:05 +02:00
spin_lock_init ( & fi - > rdc . lock ) ;
fi - > rdc . cached = false ;
fi - > rdc . size = 0 ;
fi - > rdc . pos = 0 ;
fi - > rdc . version = 0 ;
2005-09-09 13:10:28 -07:00
}
2022-04-29 11:12:16 -04:00
static int fuse_symlink_read_folio ( struct file * null , struct folio * folio )
2018-10-11 08:17:00 -07:00
{
2022-04-29 11:12:16 -04:00
int err = fuse_readlink_page ( folio - > mapping - > host , & folio - > page ) ;
2018-10-11 08:17:00 -07:00
if ( ! err )
2022-04-29 11:12:16 -04:00
folio_mark_uptodate ( folio ) ;
2018-10-11 08:17:00 -07:00
2022-04-29 11:12:16 -04:00
folio_unlock ( folio ) ;
2018-10-11 08:17:00 -07:00
return err ;
}
static const struct address_space_operations fuse_symlink_aops = {
2022-04-29 11:12:16 -04:00
. read_folio = fuse_symlink_read_folio ,
2018-10-11 08:17:00 -07:00
} ;
2005-09-09 13:10:28 -07:00
void fuse_init_symlink ( struct inode * inode )
{
inode - > i_op = & fuse_symlink_inode_operations ;
2018-10-11 08:17:00 -07:00
inode - > i_data . a_ops = & fuse_symlink_aops ;
inode_nohighmem ( inode ) ;
2005-09-09 13:10:28 -07:00
}