2019-05-19 15:08:55 +03:00
// SPDX-License-Identifier: GPL-2.0-only
2005-04-17 02:20:36 +04:00
/*
  File: fs/xattr.c

  Extended attribute handling.

  Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org>
  Copyright (C) 2001 SGI - Silicon Graphics, Inc <linux-xfs@oss.sgi.com>
  Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
*/
# include <linux/fs.h>
# include <linux/slab.h>
# include <linux/file.h>
# include <linux/xattr.h>
2008-02-16 01:37:38 +03:00
# include <linux/mount.h>
2005-04-17 02:20:36 +04:00
# include <linux/namei.h>
# include <linux/security.h>
2011-03-09 22:39:18 +03:00
# include <linux/evm.h>
2005-04-17 02:20:36 +04:00
# include <linux/syscalls.h>
2011-11-17 08:57:37 +04:00
# include <linux/export.h>
[PATCH] inotify
inotify is intended to correct the deficiencies of dnotify, particularly
its inability to scale and its terrible user interface:
* dnotify requires the opening of one fd per each directory
that you intend to watch. This quickly results in too many
open files and pins removable media, preventing unmount.
* dnotify is directory-based. You only learn about changes to
directories. Sure, a change to a file in a directory affects
the directory, but you are then forced to keep a cache of
stat structures.
* dnotify's interface to user-space is awful. Signals?
inotify provides a more usable, simple, powerful solution to file change
notification:
* inotify's interface is a system call that returns a fd, not SIGIO.
You get a single fd, which is select()-able.
* inotify has an event that says "the filesystem that the item
you were watching is on was unmounted."
* inotify can watch directories or files.
Inotify is currently used by Beagle (a desktop search infrastructure),
Gamin (a FAM replacement), and other projects.
See Documentation/filesystems/inotify.txt.
Signed-off-by: Robert Love <rml@novell.com>
Cc: John McCutchan <ttb@tentacle.dhs.org>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-07-13 01:06:03 +04:00
# include <linux/fsnotify.h>
2005-11-03 19:00:25 +03:00
# include <linux/audit.h>
2012-04-06 01:25:07 +04:00
# include <linux/vmalloc.h>
2012-02-08 06:52:57 +04:00
# include <linux/posix_acl_xattr.h>
2005-04-17 02:20:36 +04:00
2016-12-24 22:46:01 +03:00
# include <linux/uaccess.h>
2006-01-10 07:51:55 +03:00
2022-04-25 03:10:46 +03:00
# include "internal.h"
2016-09-29 18:48:38 +03:00
/*
 * strcmp_prefix - test whether string @a begins with @a_prefix.
 *
 * Returns a pointer to the first character of @a that follows the prefix
 * (which is @a itself when @a_prefix is empty, and the terminating NUL when
 * the two strings are equal), or NULL if @a does not start with @a_prefix.
 */
static const char *
strcmp_prefix(const char *a, const char *a_prefix)
{
	/* Advance both strings while the prefix lasts and still matches. */
	while (*a_prefix && *a == *a_prefix) {
		a++;
		a_prefix++;
	}
	/* Prefix fully consumed means a match; anything left over is a mismatch. */
	return *a_prefix ? NULL : a;
}
/*
 * In order to implement different sets of xattr operations for each xattr
 * prefix, a filesystem should create a null-terminated array of struct
 * xattr_handler (one for each prefix) and hang a pointer to it off of the
 * s_xattr field of the superblock.
 *
 * for_each_xattr_handler() walks such an array: the statement that follows
 * the macro runs once per entry with @handler set to that entry, and the
 * walk stops at the NULL terminator.  A NULL @handlers is skipped entirely.
 *
 * Note that @handlers itself is advanced by the loop, so pass a scratch copy
 * of the array pointer.  Because the expansion starts with an unbraced "if",
 * do not follow the loop body with an "else".
 */
#define for_each_xattr_handler(handlers, handler)		\
	if (handlers)						\
		for ((handler) = *(handlers)++;			\
			(handler) != NULL;			\
			(handler) = *(handlers)++)
/*
* Find the xattr_handler with the matching prefix .
*/
static const struct xattr_handler *
2016-09-29 18:48:39 +03:00
xattr_resolve_name ( struct inode * inode , const char * * name )
2016-09-29 18:48:38 +03:00
{
2016-09-29 18:48:39 +03:00
const struct xattr_handler * * handlers = inode - > i_sb - > s_xattr ;
2016-09-29 18:48:38 +03:00
const struct xattr_handler * handler ;
2016-09-29 18:48:40 +03:00
if ( ! ( inode - > i_opflags & IOP_XATTR ) ) {
if ( unlikely ( is_bad_inode ( inode ) ) )
return ERR_PTR ( - EIO ) ;
2016-09-29 18:48:39 +03:00
return ERR_PTR ( - EOPNOTSUPP ) ;
2016-09-29 18:48:40 +03:00
}
2016-09-29 18:48:38 +03:00
for_each_xattr_handler ( handlers , handler ) {
const char * n ;
n = strcmp_prefix ( * name , xattr_prefix ( handler ) ) ;
if ( n ) {
if ( ! handler - > prefix ^ ! * n ) {
if ( * n )
continue ;
return ERR_PTR ( - EINVAL ) ;
}
* name = n ;
return handler ;
}
}
return ERR_PTR ( - EOPNOTSUPP ) ;
}
2006-01-10 07:51:56 +03:00
/*
* Check permissions for extended attribute access . This is a bit complicated
* because different namespaces have very different rules .
*/
static int
2021-01-21 16:19:28 +03:00
xattr_permission ( struct user_namespace * mnt_userns , struct inode * inode ,
const char * name , int mask )
2006-01-10 07:51:56 +03:00
{
/*
* We can never set or remove an extended attribute on a read - only
* filesystem or on an immutable / append - only inode .
*/
if ( mask & MAY_WRITE ) {
if ( IS_IMMUTABLE ( inode ) | | IS_APPEND ( inode ) )
return - EPERM ;
2016-06-29 22:54:46 +03:00
/*
* Updating an xattr will likely cause i_uid and i_gid
* to be writen back improperly if their true value is
* unknown to the vfs .
*/
2021-01-21 16:19:31 +03:00
if ( HAS_UNMAPPED_ID ( mnt_userns , inode ) )
2016-06-29 22:54:46 +03:00
return - EPERM ;
2006-01-10 07:51:56 +03:00
}
/*
* No restriction for security . * and system . * from the VFS . Decision
* on these is left to the underlying filesystem / security module .
*/
if ( ! strncmp ( name , XATTR_SECURITY_PREFIX , XATTR_SECURITY_PREFIX_LEN ) | |
! strncmp ( name , XATTR_SYSTEM_PREFIX , XATTR_SYSTEM_PREFIX_LEN ) )
return 0 ;
/*
2011-05-27 16:50:36 +04:00
* The trusted . * namespace can only be accessed by privileged users .
2006-01-10 07:51:56 +03:00
*/
2011-05-27 16:50:36 +04:00
if ( ! strncmp ( name , XATTR_TRUSTED_PREFIX , XATTR_TRUSTED_PREFIX_LEN ) ) {
if ( ! capable ( CAP_SYS_ADMIN ) )
return ( mask & MAY_WRITE ) ? - EPERM : - ENODATA ;
return 0 ;
}
2006-01-10 07:51:56 +03:00
2011-05-27 16:50:36 +04:00
/*
* In the user . * namespace , only regular files and directories can have
2006-11-03 09:07:29 +03:00
* extended attributes . For sticky directories , only the owner and
2011-05-27 16:50:36 +04:00
* privileged users can write attributes .
2006-11-03 09:07:29 +03:00
*/
2006-01-10 07:51:56 +03:00
if ( ! strncmp ( name , XATTR_USER_PREFIX , XATTR_USER_PREFIX_LEN ) ) {
2006-11-03 09:07:29 +03:00
if ( ! S_ISREG ( inode - > i_mode ) & & ! S_ISDIR ( inode - > i_mode ) )
2011-05-27 16:50:36 +04:00
return ( mask & MAY_WRITE ) ? - EPERM : - ENODATA ;
2006-11-03 09:07:29 +03:00
if ( S_ISDIR ( inode - > i_mode ) & & ( inode - > i_mode & S_ISVTX ) & &
2021-01-21 16:19:25 +03:00
( mask & MAY_WRITE ) & &
2021-01-21 16:19:28 +03:00
! inode_owner_or_capable ( mnt_userns , inode ) )
2006-01-10 07:51:56 +03:00
return - EPERM ;
}
2021-01-21 16:19:28 +03:00
return inode_permission ( mnt_userns , inode , mask ) ;
2006-01-10 07:51:56 +03:00
}
2020-06-24 01:39:19 +03:00
/*
* Look for any handler that deals with the specified namespace .
*/
int
xattr_supported_namespace ( struct inode * inode , const char * prefix )
{
const struct xattr_handler * * handlers = inode - > i_sb - > s_xattr ;
const struct xattr_handler * handler ;
size_t preflen ;
if ( ! ( inode - > i_opflags & IOP_XATTR ) ) {
if ( unlikely ( is_bad_inode ( inode ) ) )
return - EIO ;
return - EOPNOTSUPP ;
}
preflen = strlen ( prefix ) ;
for_each_xattr_handler ( handlers , handler ) {
if ( ! strncmp ( xattr_prefix ( handler ) , prefix , preflen ) )
return 0 ;
}
return - EOPNOTSUPP ;
}
EXPORT_SYMBOL ( xattr_supported_namespace ) ;
2016-09-29 18:48:42 +03:00
int
2021-01-21 16:19:28 +03:00
__vfs_setxattr ( struct user_namespace * mnt_userns , struct dentry * dentry ,
struct inode * inode , const char * name , const void * value ,
size_t size , int flags )
2016-09-29 18:48:42 +03:00
{
xattr: Stop calling {get,set,remove}xattr inode operations
All filesystems that support xattrs by now do so via xattr handlers.
They all define sb->s_xattr, and their getxattr, setxattr, and
removexattr inode operations use the generic inode operations. On
filesystems that don't support xattrs, the xattr inode operations are
all NULL, and sb->s_xattr is also NULL.
This means that we can remove the getxattr, setxattr, and removexattr
inode operations and directly call the generic handlers, or better,
inline expand those handlers into fs/xattr.c.
Filesystems that do not support xattrs on some inodes should clear the
IOP_XATTR i_opflags flag in those inodes. (Right now, some filesystems
have checks to disable xattrs on some inodes in the ->list, ->get, and
->set xattr handler operations instead.) The IOP_XATTR flag is
automatically cleared in inodes of filesystems that don't have xattr
support.
In orangefs, symlinks do have a setxattr iop but no getxattr iop. Add a
check for symlinks to orangefs_inode_getxattr to preserve the current,
weird behavior; that check may not be necessary though.
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2016-09-29 18:48:44 +03:00
const struct xattr_handler * handler ;
handler = xattr_resolve_name ( inode , & name ) ;
if ( IS_ERR ( handler ) )
return PTR_ERR ( handler ) ;
if ( ! handler - > set )
2016-09-29 18:48:42 +03:00
return - EOPNOTSUPP ;
xattr: Stop calling {get,set,remove}xattr inode operations
All filesystems that support xattrs by now do so via xattr handlers.
They all define sb->s_xattr, and their getxattr, setxattr, and
removexattr inode operations use the generic inode operations. On
filesystems that don't support xattrs, the xattr inode operations are
all NULL, and sb->s_xattr is also NULL.
This means that we can remove the getxattr, setxattr, and removexattr
inode operations and directly call the generic handlers, or better,
inline expand those handlers into fs/xattr.c.
Filesystems that do not support xattrs on some inodes should clear the
IOP_XATTR i_opflags flag in those inodes. (Right now, some filesystems
have checks to disable xattrs on some inodes in the ->list, ->get, and
->set xattr handler operations instead.) The IOP_XATTR flag is
automatically cleared in inodes of filesystems that don't have xattr
support.
In orangefs, symlinks do have a setxattr iop but no getxattr iop. Add a
check for symlinks to orangefs_inode_getxattr to preserve the current,
weird behavior; that check may not be necessary though.
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2016-09-29 18:48:44 +03:00
if ( size = = 0 )
value = " " ; /* empty EA, do not remove */
2021-01-21 16:19:28 +03:00
return handler - > set ( handler , mnt_userns , dentry , inode , name , value ,
2021-01-21 16:19:27 +03:00
size , flags ) ;
2016-09-29 18:48:42 +03:00
}
EXPORT_SYMBOL ( __vfs_setxattr ) ;
2009-09-03 22:25:56 +04:00
/**
* __vfs_setxattr_noperm - perform setxattr operation without performing
* permission checks .
*
2021-02-16 07:29:29 +03:00
* @ mnt_userns : user namespace of the mount the inode was found from
* @ dentry : object to perform setxattr on
* @ name : xattr name to set
* @ value : value to set @ name to
* @ size : size of @ value
* @ flags : flags to pass into filesystem operations
2009-09-03 22:25:56 +04:00
*
* returns the result of the internal setxattr or setsecurity operations .
*
* This function requires the caller to lock the inode ' s i_mutex before it
* is executed . It also assumes that the caller will make the appropriate
* permission checks .
*/
2021-01-21 16:19:28 +03:00
int __vfs_setxattr_noperm ( struct user_namespace * mnt_userns ,
struct dentry * dentry , const char * name ,
const void * value , size_t size , int flags )
2006-01-10 07:51:55 +03:00
{
struct inode * inode = dentry - > d_inode ;
2016-11-13 23:23:34 +03:00
int error = - EAGAIN ;
2011-05-28 19:25:51 +04:00
int issec = ! strncmp ( name , XATTR_SECURITY_PREFIX ,
XATTR_SECURITY_PREFIX_LEN ) ;
2006-01-10 07:51:56 +03:00
2011-05-28 19:25:51 +04:00
if ( issec )
inode - > i_flags & = ~ S_NOSEC ;
xattr: Stop calling {get,set,remove}xattr inode operations
All filesystems that support xattrs by now do so via xattr handlers.
They all define sb->s_xattr, and their getxattr, setxattr, and
removexattr inode operations use the generic inode operations. On
filesystems that don't support xattrs, the xattr inode operations are
all NULL, and sb->s_xattr is also NULL.
This means that we can remove the getxattr, setxattr, and removexattr
inode operations and directly call the generic handlers, or better,
inline expand those handlers into fs/xattr.c.
Filesystems that do not support xattrs on some inodes should clear the
IOP_XATTR i_opflags flag in those inodes. (Right now, some filesystems
have checks to disable xattrs on some inodes in the ->list, ->get, and
->set xattr handler operations instead.) The IOP_XATTR flag is
automatically cleared in inodes of filesystems that don't have xattr
support.
In orangefs, symlinks do have a setxattr iop but no getxattr iop. Add a
check for symlinks to orangefs_inode_getxattr to preserve the current,
weird behavior; that check may not be necessary though.
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2016-09-29 18:48:44 +03:00
if ( inode - > i_opflags & IOP_XATTR ) {
2021-01-21 16:19:28 +03:00
error = __vfs_setxattr ( mnt_userns , dentry , inode , name , value ,
size , flags ) ;
2006-01-10 07:51:55 +03:00
if ( ! error ) {
fsnotify_xattr ( dentry ) ;
security_inode_post_setxattr ( dentry , name , value ,
size , flags ) ;
}
2016-11-13 23:23:34 +03:00
} else {
2016-09-29 18:48:40 +03:00
if ( unlikely ( is_bad_inode ( inode ) ) )
return - EIO ;
2016-11-13 23:23:34 +03:00
}
if ( error = = - EAGAIN ) {
error = - EOPNOTSUPP ;
if ( issec ) {
const char * suffix = name + XATTR_SECURITY_PREFIX_LEN ;
error = security_inode_setsecurity ( inode , suffix , value ,
size , flags ) ;
if ( ! error )
fsnotify_xattr ( dentry ) ;
}
2006-01-10 07:51:55 +03:00
}
2009-09-03 22:25:56 +04:00
return error ;
}
2020-06-24 01:39:18 +03:00
/**
2020-10-14 02:48:27 +03:00
* __vfs_setxattr_locked - set an extended attribute while holding the inode
2020-06-24 01:39:18 +03:00
* lock
*
2021-02-16 07:29:29 +03:00
* @ mnt_userns : user namespace of the mount of the target inode
2020-10-14 02:48:27 +03:00
* @ dentry : object to perform setxattr on
* @ name : xattr name to set
* @ value : value to set @ name to
* @ size : size of @ value
* @ flags : flags to pass into filesystem operations
* @ delegated_inode : on return , will contain an inode pointer that
2020-06-24 01:39:18 +03:00
* a delegation was broken on , NULL if none .
*/
2009-09-03 22:25:56 +04:00
int
2021-01-21 16:19:28 +03:00
__vfs_setxattr_locked ( struct user_namespace * mnt_userns , struct dentry * dentry ,
const char * name , const void * value , size_t size ,
int flags , struct inode * * delegated_inode )
2009-09-03 22:25:56 +04:00
{
struct inode * inode = dentry - > d_inode ;
int error ;
2021-01-21 16:19:28 +03:00
error = xattr_permission ( mnt_userns , inode , name , MAY_WRITE ) ;
2009-09-03 22:25:56 +04:00
if ( error )
return error ;
2021-01-21 16:19:29 +03:00
error = security_inode_setxattr ( mnt_userns , dentry , name , value , size ,
flags ) ;
2009-09-03 22:25:56 +04:00
if ( error )
goto out ;
2020-06-24 01:39:18 +03:00
error = try_break_deleg ( inode , delegated_inode ) ;
if ( error )
goto out ;
2021-01-21 16:19:28 +03:00
error = __vfs_setxattr_noperm ( mnt_userns , dentry , name , value ,
size , flags ) ;
2009-09-03 22:25:56 +04:00
2006-01-10 07:51:55 +03:00
out :
2020-06-24 01:39:18 +03:00
return error ;
}
EXPORT_SYMBOL_GPL ( __vfs_setxattr_locked ) ;
acl: move idmapped mount fixup into vfs_{g,s}etxattr()
This cycle we added support for mounting overlayfs on top of idmapped mounts.
Recently I've started looking into potential corner cases when trying to add
additional tests and I noticed that reporting for POSIX ACLs is currently wrong
when using idmapped layers with overlayfs mounted on top of it.
I'm going to give a rather detailed explanation to both the origin of the
problem and the solution.
Let's assume the user creates the following directory layout and they have a
rootfs /var/lib/lxc/c2/rootfs. The files in this rootfs are owned as you would
expect files on your host system to be owned. For example, ~/.bashrc for your
regular user would be owned by 1000:1000 and /root/.bashrc would be owned by
0:0. IOW, this is just regular boring filesystem tree on an ext4 or xfs
filesystem.
The user chooses to set POSIX ACLs using the setfacl binary granting the user
with uid 4 read, write, and execute permissions for their .bashrc file:
setfacl -m u:4:rwx /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc
Now they want to expose the whole rootfs to a container using an idmapped mount. So
they first create:
mkdir -pv /vol/contpool/{ctrover,merge,lowermap,overmap}
mkdir -pv /vol/contpool/ctrover/{over,work}
chown 10000000:10000000 /vol/contpool/ctrover/{over,work}
The user now creates an idmapped mount for the rootfs:
mount-idmapped/mount-idmapped --map-mount=b:0:10000000:65536 \
/var/lib/lxc/c2/rootfs \
/vol/contpool/lowermap
This for example makes it so that /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc
which is owned by uid and gid 1000 as being owned by uid and gid 10001000 at
/vol/contpool/lowermap/home/ubuntu/.bashrc.
Assume the user wants to expose these idmapped mounts through an overlayfs
mount to a container.
mount -t overlay overlay \
-o lowerdir=/vol/contpool/lowermap, \
upperdir=/vol/contpool/overmap/over, \
workdir=/vol/contpool/overmap/work \
/vol/contpool/merge
The user can do this in two ways:
(1) Mount overlayfs in the initial user namespace and expose it to the
container.
(2) Mount overlayfs on top of the idmapped mounts inside of the container's
user namespace.
Let's assume the user chooses the (1) option and mounts overlayfs on the host
and then changes into a container which uses the idmapping 0:10000000:65536
which is the same used for the two idmapped mounts.
Now the user tries to retrieve the POSIX ACLs using the getfacl command
getfacl -n /vol/contpool/lowermap/home/ubuntu/.bashrc
and to their surprise they see:
# file: vol/contpool/merge/home/ubuntu/.bashrc
# owner: 1000
# group: 1000
user::rw-
user:4294967295:rwx
group::r--
mask::rwx
other::r--
indicating that the uid wasn't correctly translated according to the idmapped
mount. The problem is how we currently translate POSIX ACLs. Let's inspect the
callchain in this example:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get == ovl_posix_acl_xattr_get()
| -> ovl_xattr_get()
| -> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get() /* lower filesystem callback */
|> posix_acl_fix_xattr_to_user()
{
4 = make_kuid(&init_user_ns, 4);
4 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 4);
/* FAILURE */
-1 = from_kuid(0:10000000:65536 /* caller's idmapping */, 4);
}
If the user chooses to use option (2) and mounts overlayfs on top of idmapped
mounts inside the container things don't look that much better:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:10000000:65536
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get == ovl_posix_acl_xattr_get()
| -> ovl_xattr_get()
| -> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get() /* lower filesystem callback */
|> posix_acl_fix_xattr_to_user()
{
4 = make_kuid(&init_user_ns, 4);
4 = mapped_kuid_fs(&init_user_ns, 4);
/* FAILURE */
-1 = from_kuid(0:10000000:65536 /* caller's idmapping */, 4);
}
As is easily seen the problem arises because the idmapping of the lower mount
isn't taken into account as all of this happens in do_getxattr(). But
do_getxattr() is always called on an overlayfs mount and inode and thus cannot
possibly take the idmapping of the lower layers into account.
This problem is similar for fscaps but there the translation happens as part of
vfs_getxattr() already. Let's walk through an fscaps overlayfs callchain:
setcap 'cap_net_raw+ep' /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc
The expected outcome here is that we'll receive the cap_net_raw capability as
we are able to map the uid associated with the fscap to 0 within our container.
IOW, we want to see 0 as the result of the idmapping translations.
If the user chooses option (1) we get the following callchain for fscaps:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
-> vfs_getxattr()
-> xattr_getsecurity()
-> security_inode_getsecurity() ________________________________
-> cap_inode_getsecurity() | |
{ V |
10000000 = make_kuid(0:0:4k /* overlayfs idmapping */, 10000000); |
10000000 = mapped_kuid_fs(0:0:4k /* no idmapped mount */, 10000000); |
/* Expected result is 0 and thus that we own the fscap. */ |
0 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000000); |
} |
-> vfs_getxattr_alloc() |
-> handler->get == ovl_other_xattr_get() |
-> vfs_getxattr() |
-> xattr_getsecurity() |
-> security_inode_getsecurity() |
-> cap_inode_getsecurity() |
{ |
0 = make_kuid(0:0:4k /* lower s_user_ns */, 0); |
10000000 = mapped_kuid_fs(0:10000000:65536 /* idmapped mount */, 0); |
10000000 = from_kuid(0:0:4k /* overlayfs idmapping */, 10000000); |
|____________________________________________________________________|
}
-> vfs_getxattr_alloc()
-> handler->get == /* lower filesystem callback */
And if the user chooses option (2) we get:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:10000000:65536
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
-> vfs_getxattr()
-> xattr_getsecurity()
-> security_inode_getsecurity() _______________________________
-> cap_inode_getsecurity() | |
{ V |
10000000 = make_kuid(0:10000000:65536 /* overlayfs idmapping */, 0); |
10000000 = mapped_kuid_fs(0:0:4k /* no idmapped mount */, 10000000); |
/* Expected result is 0 and thus that we own the fscap. */ |
0 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000000); |
} |
-> vfs_getxattr_alloc() |
-> handler->get == ovl_other_xattr_get() |
|-> vfs_getxattr() |
-> xattr_getsecurity() |
-> security_inode_getsecurity() |
-> cap_inode_getsecurity() |
{ |
0 = make_kuid(0:0:4k /* lower s_user_ns */, 0); |
10000000 = mapped_kuid_fs(0:10000000:65536 /* idmapped mount */, 0); |
0 = from_kuid(0:10000000:65536 /* overlayfs idmapping */, 10000000); |
|____________________________________________________________________|
}
-> vfs_getxattr_alloc()
-> handler->get == /* lower filesystem callback */
We can see how the translation happens correctly in those cases as the
conversion happens within the vfs_getxattr() helper.
For POSIX ACLs we need to do something similar. However, in contrast to fscaps
we cannot apply the fix directly to the kernel internal posix acl data
structure as this would alter the cached values and would also require a rework
of how we currently deal with POSIX ACLs in general which almost never take the
filesystem idmapping into account (the notable exception being FUSE but even
there the implementation is special) and instead retrieve the raw values based
on the initial idmapping.
The correct values are then generated right before returning to userspace. The
fix for this is to move taking the mount's idmapping into account directly in
vfs_getxattr() instead of having it be part of posix_acl_fix_xattr_to_user().
To this end we split out two small and unexported helpers
posix_acl_getxattr_idmapped_mnt() and posix_acl_setxattr_idmapped_mnt(). The
former to be called in vfs_getxattr() and the latter to be called in
vfs_setxattr().
Let's go back to the original example. Assume the user chose option (1) and
mounted overlayfs on top of idmapped mounts on the host:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| |> __vfs_getxattr()
| | -> handler->get == ovl_posix_acl_xattr_get()
| | -> ovl_xattr_get()
| | -> vfs_getxattr()
| | |> __vfs_getxattr()
| | | -> handler->get() /* lower filesystem callback */
| | |> posix_acl_getxattr_idmapped_mnt()
| | {
| | 4 = make_kuid(&init_user_ns, 4);
| | 10000004 = mapped_kuid_fs(0:10000000:65536 /* lower idmapped mount */, 4);
| | 10000004 = from_kuid(&init_user_ns, 10000004);
| | |_______________________
| | } |
| | |
| |> posix_acl_getxattr_idmapped_mnt() |
| { |
| V
| 10000004 = make_kuid(&init_user_ns, 10000004);
| 10000004 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 10000004);
| 10000004 = from_kuid(&init_user_ns, 10000004);
| } |_________________________________________________
| |
| |
|> posix_acl_fix_xattr_to_user() |
{ V
10000004 = make_kuid(0:0:4k /* init_user_ns */, 10000004);
/* SUCCESS */
4 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000004);
}
And similarly if the user chooses option (2) and mounted overlayfs on top of
idmapped mounts inside the container:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:10000000:65536
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| |> __vfs_getxattr()
| | -> handler->get == ovl_posix_acl_xattr_get()
| | -> ovl_xattr_get()
| | -> vfs_getxattr()
| | |> __vfs_getxattr()
| | | -> handler->get() /* lower filesystem callback */
| | |> posix_acl_getxattr_idmapped_mnt()
| | {
| | 4 = make_kuid(&init_user_ns, 4);
| | 10000004 = mapped_kuid_fs(0:10000000:65536 /* lower idmapped mount */, 4);
| | 10000004 = from_kuid(&init_user_ns, 10000004);
| | |_______________________
| | } |
| | |
| |> posix_acl_getxattr_idmapped_mnt() |
| { V
| 10000004 = make_kuid(&init_user_ns, 10000004);
| 10000004 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 10000004);
| 10000004 = from_kuid(&init_user_ns, 10000004);
| |_________________________________________________
| } |
| |
|> posix_acl_fix_xattr_to_user() |
{ V
10000004 = make_kuid(0:0:4k /* init_user_ns */, 10000004);
/* SUCCESS */
4 = from_kuid(0:10000000:65536 /* caller's idmappings */, 10000004);
}
The last remaining problem we need to fix here is ovl_get_acl(). During
ovl_permission() overlayfs will call:
ovl_permission()
-> generic_permission()
-> acl_permission_check()
-> check_acl()
-> get_acl()
-> inode->i_op->get_acl() == ovl_get_acl()
-> get_acl() /* on the underlying filesystem */
->inode->i_op->get_acl() == /*lower filesystem callback */
-> posix_acl_permission()
passing through the get_acl request to the underlying filesystem. This will
retrieve the acls stored in the lower filesystem without taking the idmapping
of the underlying mount into account as this would mean altering the cached
values for the lower filesystem. So we block using ACLs for now until we
decide on a nice way to fix this. Note this limitation both in the
documentation and in the code.
The most straightforward solution would be to have ovl_get_acl() simply
duplicate the ACLs, update the values according to the idmapped mount and
return it to acl_permission_check() so it can be used in posix_acl_permission()
forgetting them afterwards. This is a bit heavy handed but fairly
straightforward otherwise.
Link: https://github.com/brauner/mount-idmapped/issues/9
Link: https://lore.kernel.org/r/20220708090134.385160-2-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Cc: linux-unionfs@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
2022-07-06 19:30:59 +03:00
static inline bool is_posix_acl_xattr ( const char * name )
{
return ( strcmp ( name , XATTR_NAME_POSIX_ACL_ACCESS ) = = 0 ) | |
( strcmp ( name , XATTR_NAME_POSIX_ACL_DEFAULT ) = = 0 ) ;
}
2020-06-24 01:39:18 +03:00
int
2021-01-21 16:19:28 +03:00
vfs_setxattr ( struct user_namespace * mnt_userns , struct dentry * dentry ,
acl: move idmapped mount fixup into vfs_{g,s}etxattr()
This cycle we added support for mounting overlayfs on top of idmapped mounts.
Recently I've started looking into potential corner cases when trying to add
additional tests and I noticed that reporting for POSIX ACLs is currently wrong
when using idmapped layers with overlayfs mounted on top of it.
I'm going to give a rather detailed explanation to both the origin of the
problem and the solution.
Let's assume the user creates the following directory layout and they have a
rootfs /var/lib/lxc/c1/rootfs. The files in this rootfs are owned as you would
expect files on your host system to be owned. For example, ~/.bashrc for your
regular user would be owned by 1000:1000 and /root/.bashrc would be owned by
0:0. IOW, this is just regular boring filesystem tree on an ext4 or xfs
filesystem.
The user chooses to set POSIX ACLs using the setfacl binary granting the user
with uid 4 read, write, and execute permissions for their .bashrc file:
setfacl -m u:4:rwx /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc
Now they want to expose the whole rootfs to a container using an idmapped mount. So
they first create:
mkdir -pv /vol/contpool/{ctrover,merge,lowermap,overmap}
mkdir -pv /vol/contpool/ctrover/{over,work}
chown 10000000:10000000 /vol/contpool/ctrover/{over,work}
The user now creates an idmapped mount for the rootfs:
mount-idmapped/mount-idmapped --map-mount=b:0:10000000:65536 \
/var/lib/lxc/c2/rootfs \
/vol/contpool/lowermap
This, for example, makes /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc, which is
owned by uid and gid 1000, show up as being owned by uid and gid 10001000 at
/vol/contpool/lowermap/home/ubuntu/.bashrc.
Assume the user wants to expose these idmapped mounts through an overlayfs
mount to a container.
mount -t overlay overlay \
-o lowerdir=/vol/contpool/lowermap, \
upperdir=/vol/contpool/overmap/over, \
workdir=/vol/contpool/overmap/work \
/vol/contpool/merge
The user can do this in two ways:
(1) Mount overlayfs in the initial user namespace and expose it to the
container.
(2) Mount overlayfs on top of the idmapped mounts inside of the container's
user namespace.
Let's assume the user chooses the (1) option and mounts overlayfs on the host
and then changes into a container which uses the idmapping 0:10000000:65536
which is the same used for the two idmapped mounts.
Now the user tries to retrieve the POSIX ACLs using the getfacl command
getfacl -n /vol/contpool/lowermap/home/ubuntu/.bashrc
and to their surprise they see:
# file: vol/contpool/merge/home/ubuntu/.bashrc
# owner: 1000
# group: 1000
user::rw-
user:4294967295:rwx
group::r--
mask::rwx
other::r--
indicating that the uid wasn't correctly translated according to the idmapped
mount. The problem is how we currently translate POSIX ACLs. Let's inspect the
callchain in this example:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get == ovl_posix_acl_xattr_get()
| -> ovl_xattr_get()
| -> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get() /* lower filesystem callback */
|> posix_acl_fix_xattr_to_user()
{
4 = make_kuid(&init_user_ns, 4);
4 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 4);
/* FAILURE */
-1 = from_kuid(0:10000000:65536 /* caller's idmapping */, 4);
}
If the user chooses to use option (2) and mounts overlayfs on top of idmapped
mounts inside the container things don't look that much better:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:10000000:65536
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get == ovl_posix_acl_xattr_get()
| -> ovl_xattr_get()
| -> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get() /* lower filesystem callback */
|> posix_acl_fix_xattr_to_user()
{
4 = make_kuid(&init_user_ns, 4);
4 = mapped_kuid_fs(&init_user_ns, 4);
/* FAILURE */
-1 = from_kuid(0:10000000:65536 /* caller's idmapping */, 4);
}
As is easily seen the problem arises because the idmapping of the lower mount
isn't taken into account as all of this happens in do_getxattr(). But
do_getxattr() is always called on an overlayfs mount and inode and thus cannot
possibly take the idmapping of the lower layers into account.
This problem is similar for fscaps but there the translation happens as part of
vfs_getxattr() already. Let's walk through an fscaps overlayfs callchain:
setcap 'cap_net_raw+ep' /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc
The expected outcome here is that we'll receive the cap_net_raw capability as
we are able to map the uid associated with the fscap to 0 within our container.
IOW, we want to see 0 as the result of the idmapping translations.
If the user chooses option (1) we get the following callchain for fscaps:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
-> vfs_getxattr()
-> xattr_getsecurity()
-> security_inode_getsecurity() ________________________________
-> cap_inode_getsecurity() | |
{ V |
10000000 = make_kuid(0:0:4k /* overlayfs idmapping */, 10000000); |
10000000 = mapped_kuid_fs(0:0:4k /* no idmapped mount */, 10000000); |
/* Expected result is 0 and thus that we own the fscap. */ |
0 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000000); |
} |
-> vfs_getxattr_alloc() |
-> handler->get == ovl_other_xattr_get() |
-> vfs_getxattr() |
-> xattr_getsecurity() |
-> security_inode_getsecurity() |
-> cap_inode_getsecurity() |
{ |
0 = make_kuid(0:0:4k /* lower s_user_ns */, 0); |
10000000 = mapped_kuid_fs(0:10000000:65536 /* idmapped mount */, 0); |
10000000 = from_kuid(0:0:4k /* overlayfs idmapping */, 10000000); |
|____________________________________________________________________|
}
-> vfs_getxattr_alloc()
-> handler->get == /* lower filesystem callback */
And if the user chooses option (2) we get:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:10000000:65536
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
-> vfs_getxattr()
-> xattr_getsecurity()
-> security_inode_getsecurity() _______________________________
-> cap_inode_getsecurity() | |
{ V |
10000000 = make_kuid(0:10000000:65536 /* overlayfs idmapping */, 0); |
10000000 = mapped_kuid_fs(0:0:4k /* no idmapped mount */, 10000000); |
/* Expected result is 0 and thus that we own the fscap. */ |
0 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000000); |
} |
-> vfs_getxattr_alloc() |
-> handler->get == ovl_other_xattr_get() |
|-> vfs_getxattr() |
-> xattr_getsecurity() |
-> security_inode_getsecurity() |
-> cap_inode_getsecurity() |
{ |
0 = make_kuid(0:0:4k /* lower s_user_ns */, 0); |
10000000 = mapped_kuid_fs(0:10000000:65536 /* idmapped mount */, 0); |
0 = from_kuid(0:10000000:65536 /* overlayfs idmapping */, 10000000); |
|____________________________________________________________________|
}
-> vfs_getxattr_alloc()
-> handler->get == /* lower filesystem callback */
We can see how the translation happens correctly in those cases as the
conversion happens within the vfs_getxattr() helper.
For POSIX ACLs we need to do something similar. However, in contrast to fscaps
we cannot apply the fix directly to the kernel internal posix acl data
structure as this would alter the cached values and would also require a rework
of how we currently deal with POSIX ACLs in general which almost never take the
filesystem idmapping into account (the notable exception being FUSE but even
there the implementation is special) and instead retrieve the raw values based
on the initial idmapping.
The correct values are then generated right before returning to userspace. The
fix for this is to move taking the mount's idmapping into account directly in
vfs_getxattr() instead of having it be part of posix_acl_fix_xattr_to_user().
To this end we split out two small and unexported helpers
posix_acl_getxattr_idmapped_mnt() and posix_acl_setxattr_idmapped_mnt(). The
former to be called in vfs_getxattr() and the latter to be called in
vfs_setxattr().
Let's go back to the original example. Assume the user chose option (1) and
mounted overlayfs on top of idmapped mounts on the host:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| |> __vfs_getxattr()
| | -> handler->get == ovl_posix_acl_xattr_get()
| | -> ovl_xattr_get()
| | -> vfs_getxattr()
| | |> __vfs_getxattr()
| | | -> handler->get() /* lower filesystem callback */
| | |> posix_acl_getxattr_idmapped_mnt()
| | {
| | 4 = make_kuid(&init_user_ns, 4);
| | 10000004 = mapped_kuid_fs(0:10000000:65536 /* lower idmapped mount */, 4);
| | 10000004 = from_kuid(&init_user_ns, 10000004);
| | |_______________________
| | } |
| | |
| |> posix_acl_getxattr_idmapped_mnt() |
| { |
| V
| 10000004 = make_kuid(&init_user_ns, 10000004);
| 10000004 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 10000004);
| 10000004 = from_kuid(&init_user_ns, 10000004);
| } |_________________________________________________
| |
| |
|> posix_acl_fix_xattr_to_user() |
{ V
10000004 = make_kuid(0:0:4k /* init_user_ns */, 10000004);
/* SUCCESS */
4 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000004);
}
And similarly if the user chose option (2) and mounted overlayfs on top of
idmapped mounts inside the container:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:10000000:65536
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| |> __vfs_getxattr()
| | -> handler->get == ovl_posix_acl_xattr_get()
| | -> ovl_xattr_get()
| | -> vfs_getxattr()
| | |> __vfs_getxattr()
| | | -> handler->get() /* lower filesystem callback */
| | |> posix_acl_getxattr_idmapped_mnt()
| | {
| | 4 = make_kuid(&init_user_ns, 4);
| | 10000004 = mapped_kuid_fs(0:10000000:65536 /* lower idmapped mount */, 4);
| | 10000004 = from_kuid(&init_user_ns, 10000004);
| | |_______________________
| | } |
| | |
| |> posix_acl_getxattr_idmapped_mnt() |
| { V
| 10000004 = make_kuid(&init_user_ns, 10000004);
| 10000004 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 10000004);
| 10000004 = from_kuid(&init_user_ns, 10000004);
| |_________________________________________________
| } |
| |
|> posix_acl_fix_xattr_to_user() |
{ V
10000004 = make_kuid(0:0:4k /* init_user_ns */, 10000004);
/* SUCCESS */
4 = from_kuid(0:10000000:65536 /* caller's idmappings */, 10000004);
}
The last remaining problem we need to fix here is ovl_get_acl(). During
ovl_permission() overlayfs will call:
ovl_permission()
-> generic_permission()
-> acl_permission_check()
-> check_acl()
-> get_acl()
-> inode->i_op->get_acl() == ovl_get_acl()
-> get_acl() /* on the underlying filesystem */
-> inode->i_op->get_acl() == /* lower filesystem callback */
-> posix_acl_permission()
passing through the get_acl request to the underlying filesystem. This will
retrieve the acls stored in the lower filesystem without taking the idmapping
of the underlying mount into account as this would mean altering the cached
values for the lower filesystem. So we block using ACLs for now until we
have decided on a nice way to fix this. Note this limitation both in the
documentation and in the code.
The most straightforward solution would be to have ovl_get_acl() simply
duplicate the ACLs, update the values according to the idmapped mount and
return it to acl_permission_check() so it can be used in posix_acl_permission()
forgetting them afterwards. This is a bit heavy handed but fairly
straightforward otherwise.
Link: https://github.com/brauner/mount-idmapped/issues/9
Link: https://lore.kernel.org/r/20220708090134.385160-2-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Cc: linux-unionfs@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
2022-07-06 19:30:59 +03:00
const char * name , void * value , size_t size , int flags )
2020-06-24 01:39:18 +03:00
{
struct inode * inode = dentry - > d_inode ;
struct inode * delegated_inode = NULL ;
2020-12-14 17:26:13 +03:00
const void * orig_value = value ;
2020-06-24 01:39:18 +03:00
int error ;
2020-12-14 17:26:13 +03:00
if ( size & & strcmp ( name , XATTR_NAME_CAPS ) = = 0 ) {
acl: move idmapped mount fixup into vfs_{g,s}etxattr()
This cycle we added support for mounting overlayfs on top of idmapped mounts.
Recently I've started looking into potential corner cases when trying to add
additional tests and I noticed that reporting for POSIX ACLs is currently wrong
when using idmapped layers with overlayfs mounted on top of it.
I'm going to give a rather detailed explanation to both the origin of the
problem and the solution.
Let's assume the user creates the following directory layout and they have a
rootfs /var/lib/lxc/c1/rootfs. The files in this rootfs are owned as you would
expect files on your host system to be owned. For example, ~/.bashrc for your
regular user would be owned by 1000:1000 and /root/.bashrc would be owned by
0:0. IOW, this is just regular boring filesystem tree on an ext4 or xfs
filesystem.
The user chooses to set POSIX ACLs using the setfacl binary granting the user
with uid 4 read, write, and execute permissions for their .bashrc file:
setfacl -m u:4:rwx /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc
Now they want to expose the whole rootfs to a container using an idmapped mount. So
they first create:
mkdir -pv /vol/contpool/{ctrover,merge,lowermap,overmap}
mkdir -pv /vol/contpool/ctrover/{over,work}
chown 10000000:10000000 /vol/contpool/ctrover/{over,work}
The user now creates an idmapped mount for the rootfs:
mount-idmapped/mount-idmapped --map-mount=b:0:10000000:65536 \
/var/lib/lxc/c2/rootfs \
/vol/contpool/lowermap
This, for example, makes /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc, which is
owned by uid and gid 1000, show up as being owned by uid and gid 10001000 at
/vol/contpool/lowermap/home/ubuntu/.bashrc.
Assume the user wants to expose these idmapped mounts through an overlayfs
mount to a container.
mount -t overlay overlay \
-o lowerdir=/vol/contpool/lowermap, \
upperdir=/vol/contpool/overmap/over, \
workdir=/vol/contpool/overmap/work \
/vol/contpool/merge
The user can do this in two ways:
(1) Mount overlayfs in the initial user namespace and expose it to the
container.
(2) Mount overlayfs on top of the idmapped mounts inside of the container's
user namespace.
Let's assume the user chooses the (1) option and mounts overlayfs on the host
and then changes into a container which uses the idmapping 0:10000000:65536
which is the same used for the two idmapped mounts.
Now the user tries to retrieve the POSIX ACLs using the getfacl command
getfacl -n /vol/contpool/lowermap/home/ubuntu/.bashrc
and to their surprise they see:
# file: vol/contpool/merge/home/ubuntu/.bashrc
# owner: 1000
# group: 1000
user::rw-
user:4294967295:rwx
group::r--
mask::rwx
other::r--
indicating that the uid wasn't correctly translated according to the idmapped
mount. The problem is how we currently translate POSIX ACLs. Let's inspect the
callchain in this example:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get == ovl_posix_acl_xattr_get()
| -> ovl_xattr_get()
| -> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get() /* lower filesystem callback */
|> posix_acl_fix_xattr_to_user()
{
4 = make_kuid(&init_user_ns, 4);
4 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 4);
/* FAILURE */
-1 = from_kuid(0:10000000:65536 /* caller's idmapping */, 4);
}
If the user chooses to use option (2) and mounts overlayfs on top of idmapped
mounts inside the container things don't look that much better:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:10000000:65536
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get == ovl_posix_acl_xattr_get()
| -> ovl_xattr_get()
| -> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get() /* lower filesystem callback */
|> posix_acl_fix_xattr_to_user()
{
4 = make_kuid(&init_user_ns, 4);
4 = mapped_kuid_fs(&init_user_ns, 4);
/* FAILURE */
-1 = from_kuid(0:10000000:65536 /* caller's idmapping */, 4);
}
As is easily seen the problem arises because the idmapping of the lower mount
isn't taken into account as all of this happens in do_getxattr(). But
do_getxattr() is always called on an overlayfs mount and inode and thus cannot
possibly take the idmapping of the lower layers into account.
This problem is similar for fscaps but there the translation happens as part of
vfs_getxattr() already. Let's walk through an fscaps overlayfs callchain:
setcap 'cap_net_raw+ep' /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc
The expected outcome here is that we'll receive the cap_net_raw capability as
we are able to map the uid associated with the fscap to 0 within our container.
IOW, we want to see 0 as the result of the idmapping translations.
If the user chooses option (1) we get the following callchain for fscaps:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
-> vfs_getxattr()
-> xattr_getsecurity()
-> security_inode_getsecurity() ________________________________
-> cap_inode_getsecurity() | |
{ V |
10000000 = make_kuid(0:0:4k /* overlayfs idmapping */, 10000000); |
10000000 = mapped_kuid_fs(0:0:4k /* no idmapped mount */, 10000000); |
/* Expected result is 0 and thus that we own the fscap. */ |
0 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000000); |
} |
-> vfs_getxattr_alloc() |
-> handler->get == ovl_other_xattr_get() |
-> vfs_getxattr() |
-> xattr_getsecurity() |
-> security_inode_getsecurity() |
-> cap_inode_getsecurity() |
{ |
0 = make_kuid(0:0:4k /* lower s_user_ns */, 0); |
10000000 = mapped_kuid_fs(0:10000000:65536 /* idmapped mount */, 0); |
10000000 = from_kuid(0:0:4k /* overlayfs idmapping */, 10000000); |
|____________________________________________________________________|
}
-> vfs_getxattr_alloc()
-> handler->get == /* lower filesystem callback */
And if the user chooses option (2) we get:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:10000000:65536
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
-> vfs_getxattr()
-> xattr_getsecurity()
-> security_inode_getsecurity() _______________________________
-> cap_inode_getsecurity() | |
{ V |
10000000 = make_kuid(0:10000000:65536 /* overlayfs idmapping */, 0); |
10000000 = mapped_kuid_fs(0:0:4k /* no idmapped mount */, 10000000); |
/* Expected result is 0 and thus that we own the fscap. */ |
0 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000000); |
} |
-> vfs_getxattr_alloc() |
-> handler->get == ovl_other_xattr_get() |
|-> vfs_getxattr() |
-> xattr_getsecurity() |
-> security_inode_getsecurity() |
-> cap_inode_getsecurity() |
{ |
0 = make_kuid(0:0:4k /* lower s_user_ns */, 0); |
10000000 = mapped_kuid_fs(0:10000000:65536 /* idmapped mount */, 0); |
0 = from_kuid(0:10000000:65536 /* overlayfs idmapping */, 10000000); |
|____________________________________________________________________|
}
-> vfs_getxattr_alloc()
-> handler->get == /* lower filesystem callback */
We can see how the translation happens correctly in those cases as the
conversion happens within the vfs_getxattr() helper.
For POSIX ACLs we need to do something similar. However, in contrast to fscaps
we cannot apply the fix directly to the kernel internal posix acl data
structure as this would alter the cached values and would also require a rework
of how we currently deal with POSIX ACLs in general which almost never take the
filesystem idmapping into account (the notable exception being FUSE but even
there the implementation is special) and instead retrieve the raw values based
on the initial idmapping.
The correct values are then generated right before returning to userspace. The
fix for this is to move taking the mount's idmapping into account directly in
vfs_getxattr() instead of having it be part of posix_acl_fix_xattr_to_user().
To this end we split out two small and unexported helpers
posix_acl_getxattr_idmapped_mnt() and posix_acl_setxattr_idmapped_mnt(). The
former to be called in vfs_getxattr() and the latter to be called in
vfs_setxattr().
Let's go back to the original example. Assume the user chose option (1) and
mounted overlayfs on top of idmapped mounts on the host:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| |> __vfs_getxattr()
| | -> handler->get == ovl_posix_acl_xattr_get()
| | -> ovl_xattr_get()
| | -> vfs_getxattr()
| | |> __vfs_getxattr()
| | | -> handler->get() /* lower filesystem callback */
| | |> posix_acl_getxattr_idmapped_mnt()
| | {
| | 4 = make_kuid(&init_user_ns, 4);
| | 10000004 = mapped_kuid_fs(0:10000000:65536 /* lower idmapped mount */, 4);
| | 10000004 = from_kuid(&init_user_ns, 10000004);
| | |_______________________
| | } |
| | |
| |> posix_acl_getxattr_idmapped_mnt() |
| { |
| V
| 10000004 = make_kuid(&init_user_ns, 10000004);
| 10000004 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 10000004);
| 10000004 = from_kuid(&init_user_ns, 10000004);
| } |_________________________________________________
| |
| |
|> posix_acl_fix_xattr_to_user() |
{ V
10000004 = make_kuid(0:0:4k /* init_user_ns */, 10000004);
/* SUCCESS */
4 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000004);
}
And similarly if the user chose option (2) and mounted overlayfs on top of
idmapped mounts inside the container:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:10000000:65536
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| |> __vfs_getxattr()
| | -> handler->get == ovl_posix_acl_xattr_get()
| | -> ovl_xattr_get()
| | -> vfs_getxattr()
| | |> __vfs_getxattr()
| | | -> handler->get() /* lower filesystem callback */
| | |> posix_acl_getxattr_idmapped_mnt()
| | {
| | 4 = make_kuid(&init_user_ns, 4);
| | 10000004 = mapped_kuid_fs(0:10000000:65536 /* lower idmapped mount */, 4);
| | 10000004 = from_kuid(&init_user_ns, 10000004);
| | |_______________________
| | } |
| | |
| |> posix_acl_getxattr_idmapped_mnt() |
| { V
| 10000004 = make_kuid(&init_user_ns, 10000004);
| 10000004 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 10000004);
| 10000004 = from_kuid(&init_user_ns, 10000004);
| |_________________________________________________
| } |
| |
|> posix_acl_fix_xattr_to_user() |
{ V
10000004 = make_kuid(0:0:4k /* init_user_ns */, 10000004);
/* SUCCESS */
4 = from_kuid(0:10000000:65536 /* caller's idmappings */, 10000004);
}
The last remaining problem we need to fix here is ovl_get_acl(). During
ovl_permission() overlayfs will call:
ovl_permission()
-> generic_permission()
-> acl_permission_check()
-> check_acl()
-> get_acl()
-> inode->i_op->get_acl() == ovl_get_acl()
-> get_acl() /* on the underlying filesystem */
-> inode->i_op->get_acl() == /* lower filesystem callback */
-> posix_acl_permission()
passing through the get_acl request to the underlying filesystem. This will
retrieve the acls stored in the lower filesystem without taking the idmapping
of the underlying mount into account as this would mean altering the cached
values for the lower filesystem. So we block using ACLs for now until we
have decided on a nice way to fix this. Note this limitation both in the
documentation and in the code.
The most straightforward solution would be to have ovl_get_acl() simply
duplicate the ACLs, update the values according to the idmapped mount and
return it to acl_permission_check() so it can be used in posix_acl_permission()
forgetting them afterwards. This is a bit heavy handed but fairly
straightforward otherwise.
Link: https://github.com/brauner/mount-idmapped/issues/9
Link: https://lore.kernel.org/r/20220708090134.385160-2-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Cc: linux-unionfs@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
2022-07-06 19:30:59 +03:00
error = cap_convert_nscap ( mnt_userns , dentry ,
( const void * * ) & value , size ) ;
2020-12-14 17:26:13 +03:00
if ( error < 0 )
return error ;
size = error ;
}
acl: move idmapped mount fixup into vfs_{g,s}etxattr()
This cycle we added support for mounting overlayfs on top of idmapped mounts.
Recently I've started looking into potential corner cases when trying to add
additional tests and I noticed that reporting for POSIX ACLs is currently wrong
when using idmapped layers with overlayfs mounted on top of it.
I'm going to give a rather detailed explanation to both the origin of the
problem and the solution.
Let's assume the user creates the following directory layout and they have a
rootfs /var/lib/lxc/c1/rootfs. The files in this rootfs are owned as you would
expect files on your host system to be owned. For example, ~/.bashrc for your
regular user would be owned by 1000:1000 and /root/.bashrc would be owned by
0:0. IOW, this is just regular boring filesystem tree on an ext4 or xfs
filesystem.
The user chooses to set POSIX ACLs using the setfacl binary granting the user
with uid 4 read, write, and execute permissions for their .bashrc file:
setfacl -m u:4:rwx /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc
Now they want to expose the whole rootfs to a container using an idmapped mount. So
they first create:
mkdir -pv /vol/contpool/{ctrover,merge,lowermap,overmap}
mkdir -pv /vol/contpool/ctrover/{over,work}
chown 10000000:10000000 /vol/contpool/ctrover/{over,work}
The user now creates an idmapped mount for the rootfs:
mount-idmapped/mount-idmapped --map-mount=b:0:10000000:65536 \
/var/lib/lxc/c2/rootfs \
/vol/contpool/lowermap
This, for example, makes /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc, which is
owned by uid and gid 1000, show up as being owned by uid and gid 10001000 at
/vol/contpool/lowermap/home/ubuntu/.bashrc.
Assume the user wants to expose these idmapped mounts through an overlayfs
mount to a container.
mount -t overlay overlay \
-o lowerdir=/vol/contpool/lowermap, \
upperdir=/vol/contpool/overmap/over, \
workdir=/vol/contpool/overmap/work \
/vol/contpool/merge
The user can do this in two ways:
(1) Mount overlayfs in the initial user namespace and expose it to the
container.
(2) Mount overlayfs on top of the idmapped mounts inside of the container's
user namespace.
Let's assume the user chooses the (1) option and mounts overlayfs on the host
and then changes into a container which uses the idmapping 0:10000000:65536
which is the same used for the two idmapped mounts.
Now the user tries to retrieve the POSIX ACLs using the getfacl command
getfacl -n /vol/contpool/lowermap/home/ubuntu/.bashrc
and to their surprise they see:
# file: vol/contpool/merge/home/ubuntu/.bashrc
# owner: 1000
# group: 1000
user::rw-
user:4294967295:rwx
group::r--
mask::rwx
other::r--
indicating that the uid wasn't correctly translated according to the idmapped
mount. The problem is how we currently translate POSIX ACLs. Let's inspect the
callchain in this example:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get == ovl_posix_acl_xattr_get()
| -> ovl_xattr_get()
| -> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get() /* lower filesystem callback */
|> posix_acl_fix_xattr_to_user()
{
4 = make_kuid(&init_user_ns, 4);
4 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 4);
/* FAILURE */
-1 = from_kuid(0:10000000:65536 /* caller's idmapping */, 4);
}
If the user chooses to use option (2) and mounts overlayfs on top of idmapped
mounts inside the container things don't look that much better:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:10000000:65536
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get == ovl_posix_acl_xattr_get()
| -> ovl_xattr_get()
| -> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get() /* lower filesystem callback */
|> posix_acl_fix_xattr_to_user()
{
4 = make_kuid(&init_user_ns, 4);
4 = mapped_kuid_fs(&init_user_ns, 4);
/* FAILURE */
-1 = from_kuid(0:10000000:65536 /* caller's idmapping */, 4);
}
As is easily seen the problem arises because the idmapping of the lower mount
isn't taken into account as all of this happens in do_getxattr(). But
do_getxattr() is always called on an overlayfs mount and inode and thus cannot
possibly take the idmapping of the lower layers into account.
This problem is similar for fscaps but there the translation happens as part of
vfs_getxattr() already. Let's walk through an fscaps overlayfs callchain:
setcap 'cap_net_raw+ep' /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc
The expected outcome here is that we'll receive the cap_net_raw capability as
we are able to map the uid associated with the fscap to 0 within our container.
IOW, we want to see 0 as the result of the idmapping translations.
If the user chooses option (1) we get the following callchain for fscaps:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
-> vfs_getxattr()
-> xattr_getsecurity()
-> security_inode_getsecurity() ________________________________
-> cap_inode_getsecurity() | |
{ V |
10000000 = make_kuid(0:0:4k /* overlayfs idmapping */, 10000000); |
10000000 = mapped_kuid_fs(0:0:4k /* no idmapped mount */, 10000000); |
/* Expected result is 0 and thus that we own the fscap. */ |
0 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000000); |
} |
-> vfs_getxattr_alloc() |
-> handler->get == ovl_other_xattr_get() |
-> vfs_getxattr() |
-> xattr_getsecurity() |
-> security_inode_getsecurity() |
-> cap_inode_getsecurity() |
{ |
0 = make_kuid(0:0:4k /* lower s_user_ns */, 0); |
10000000 = mapped_kuid_fs(0:10000000:65536 /* idmapped mount */, 0); |
10000000 = from_kuid(0:0:4k /* overlayfs idmapping */, 10000000); |
|____________________________________________________________________|
}
-> vfs_getxattr_alloc()
-> handler->get == /* lower filesystem callback */
And if the user chooses option (2) we get:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:10000000:65536
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
-> vfs_getxattr()
-> xattr_getsecurity()
-> security_inode_getsecurity() _______________________________
-> cap_inode_getsecurity() | |
{ V |
10000000 = make_kuid(0:10000000:65536 /* overlayfs idmapping */, 0); |
10000000 = mapped_kuid_fs(0:0:4k /* no idmapped mount */, 10000000); |
/* Expected result is 0 and thus that we own the fscap. */ |
0 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000000); |
} |
-> vfs_getxattr_alloc() |
-> handler->get == ovl_other_xattr_get() |
|-> vfs_getxattr() |
-> xattr_getsecurity() |
-> security_inode_getsecurity() |
-> cap_inode_getsecurity() |
{ |
0 = make_kuid(0:0:4k /* lower s_user_ns */, 0); |
10000000 = mapped_kuid_fs(0:10000000:65536 /* idmapped mount */, 0); |
0 = from_kuid(0:10000000:65536 /* overlayfs idmapping */, 10000000); |
|____________________________________________________________________|
}
-> vfs_getxattr_alloc()
-> handler->get == /* lower filesystem callback */
We can see how the translation happens correctly in those cases as the
conversion happens within the vfs_getxattr() helper.
For POSIX ACLs we need to do something similar. However, in contrast to fscaps
we cannot apply the fix directly to the kernel internal posix acl data
structure as this would alter the cached values and would also require a rework
of how we currently deal with POSIX ACLs in general which almost never take the
filesystem idmapping into account (the notable exception being FUSE but even
there the implementation is special) and instead retrieve the raw values based
on the initial idmapping.
The correct values are then generated right before returning to userspace. The
fix for this is to move taking the mount's idmapping into account directly in
vfs_getxattr() instead of having it be part of posix_acl_fix_xattr_to_user().
To this end we split out two small and unexported helpers
posix_acl_getxattr_idmapped_mnt() and posix_acl_setxattr_idmapped_mnt(). The
former to be called in vfs_getxattr() and the latter to be called in
vfs_setxattr().
Let's go back to the original example. Assume the user chose option (1) and
mounted overlayfs on top of idmapped mounts on the host:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| |> __vfs_getxattr()
| | -> handler->get == ovl_posix_acl_xattr_get()
| | -> ovl_xattr_get()
| | -> vfs_getxattr()
| | |> __vfs_getxattr()
| | | -> handler->get() /* lower filesystem callback */
| | |> posix_acl_getxattr_idmapped_mnt()
| | {
| | 4 = make_kuid(&init_user_ns, 4);
| | 10000004 = mapped_kuid_fs(0:10000000:65536 /* lower idmapped mount */, 4);
| | 10000004 = from_kuid(&init_user_ns, 10000004);
| | |_______________________
| | } |
| | |
| |> posix_acl_getxattr_idmapped_mnt() |
| { |
| V
| 10000004 = make_kuid(&init_user_ns, 10000004);
| 10000004 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 10000004);
| 10000004 = from_kuid(&init_user_ns, 10000004);
| } |_________________________________________________
| |
| |
|> posix_acl_fix_xattr_to_user() |
{ V
10000004 = make_kuid(0:0:4k /* init_user_ns */, 10000004);
/* SUCCESS */
4 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000004);
}
And similarly if the user chooses option (2) and mounted overlayfs on top of
idmapped mounts inside the container:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:10000000:65536
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| |> __vfs_getxattr()
| | -> handler->get == ovl_posix_acl_xattr_get()
| | -> ovl_xattr_get()
| | -> vfs_getxattr()
| | |> __vfs_getxattr()
| | | -> handler->get() /* lower filesystem callback */
| | |> posix_acl_getxattr_idmapped_mnt()
| | {
| | 4 = make_kuid(&init_user_ns, 4);
| | 10000004 = mapped_kuid_fs(0:10000000:65536 /* lower idmapped mount */, 4);
| | 10000004 = from_kuid(&init_user_ns, 10000004);
| | |_______________________
| | } |
| | |
| |> posix_acl_getxattr_idmapped_mnt() |
| { V
| 10000004 = make_kuid(&init_user_ns, 10000004);
| 10000004 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 10000004);
| 10000004 = from_kuid(&init_user_ns, 10000004);
| |_________________________________________________
| } |
| |
|> posix_acl_fix_xattr_to_user() |
{ V
10000004 = make_kuid(0:0:4k /* init_user_ns */, 10000004);
/* SUCCESS */
4 = from_kuid(0:10000000:65536 /* caller's idmappings */, 10000004);
}
The last remaining problem we need to fix here is ovl_get_acl(). During
ovl_permission() overlayfs will call:
ovl_permission()
-> generic_permission()
-> acl_permission_check()
-> check_acl()
-> get_acl()
-> inode->i_op->get_acl() == ovl_get_acl()
-> get_acl() /* on the underlying filesystem */
-> inode->i_op->get_acl() == /* lower filesystem callback */
-> posix_acl_permission()
passing through the get_acl request to the underlying filesystem. This will
retrieve the acls stored in the lower filesystem without taking the idmapping
of the underlying mount into account as this would mean altering the cached
values for the lower filesystem. So we block using ACLs for now until we have
decided on a nice way to fix this. Note this limitation both in the
documentation and in the code.
The most straightforward solution would be to have ovl_get_acl() simply
duplicate the ACLs, update the values according to the idmapped mount and
return it to acl_permission_check() so it can be used in posix_acl_permission()
forgetting them afterwards. This is a bit heavy handed but fairly
straightforward otherwise.
Link: https://github.com/brauner/mount-idmapped/issues/9
Link: https://lore.kernel.org/r/20220708090134.385160-2-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Cc: linux-unionfs@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
2022-07-06 19:30:59 +03:00
if ( size & & is_posix_acl_xattr ( name ) )
posix_acl_setxattr_idmapped_mnt ( mnt_userns , inode , value , size ) ;
2020-06-24 01:39:18 +03:00
retry_deleg :
inode_lock ( inode ) ;
2021-01-21 16:19:28 +03:00
error = __vfs_setxattr_locked ( mnt_userns , dentry , name , value , size ,
flags , & delegated_inode ) ;
2016-01-22 23:40:57 +03:00
inode_unlock ( inode ) ;
2020-06-24 01:39:18 +03:00
if ( delegated_inode ) {
error = break_deleg_wait ( & delegated_inode ) ;
if ( ! error )
goto retry_deleg ;
}
2020-12-14 17:26:13 +03:00
if ( value ! = orig_value )
kfree ( value ) ;
2006-01-10 07:51:55 +03:00
return error ;
}
EXPORT_SYMBOL_GPL ( vfs_setxattr ) ;
2018-04-25 04:22:04 +03:00
/*
 * xattr_getsecurity - read a security attribute through the LSM hook
 *
 * Thin wrapper around security_inode_getsecurity(). @name is handed to the
 * hook unchanged; the caller in this file passes the part of the attribute
 * name that follows the security prefix.
 *
 * When @value is NULL or @size is 0 only the hook's return value is
 * reported and nothing is copied. Otherwise the hook's buffer is copied
 * into @value and then freed here.
 *
 * Returns the value returned by the hook, or -ERANGE if @size is smaller
 * than the length the hook reported.
 */
static ssize_t
2021-01-21 16:19:29 +03:00
xattr_getsecurity ( struct user_namespace * mnt_userns , struct inode * inode ,
const char * name , void * value , size_t size )
2008-02-05 09:29:39 +03:00
{
void * buffer = NULL ;
ssize_t len ;
/*
 * Length query only. The hook is called with false as its last argument
 * and the kfree() below is skipped; presumably false means "do not
 * allocate a buffer" - TODO confirm against the hook's contract.
 */
if ( ! value | | ! size ) {
2021-01-21 16:19:29 +03:00
len = security_inode_getsecurity ( mnt_userns , inode , name ,
& buffer , false ) ;
2008-02-05 09:29:39 +03:00
goto out_noalloc ;
}
2021-01-21 16:19:29 +03:00
/* Real read: last argument is true and buffer is freed on the way out. */
len = security_inode_getsecurity ( mnt_userns , inode , name , & buffer ,
true ) ;
2008-02-05 09:29:39 +03:00
/* On a negative result we return without touching buffer. */
if ( len < 0 )
return len ;
/* The caller's buffer is too small for what the hook produced. */
if ( size < len ) {
len = - ERANGE ;
goto out ;
}
memcpy ( value , buffer , len ) ;
out :
2017-09-19 19:39:08 +03:00
kfree ( buffer ) ;
2008-02-05 09:29:39 +03:00
out_noalloc :
return len ;
}
2011-03-09 22:23:34 +03:00
/*
 * vfs_getxattr_alloc - read an xattr into a buffer that is grown to fit
 *
 * @mnt_userns: passed to xattr_permission() for the MAY_READ check
 * @dentry: dentry whose d_inode is read
 * @name: attribute name; its address is handed to xattr_resolve_name()
 * @xattr_value: in/out buffer pointer; *xattr_value may be NULL on entry
 * @xattr_size: size of the buffer *xattr_value points to on entry
 * @flags: gfp flags passed to krealloc()
 *
 * The handler's ->get is called twice: once with a NULL buffer to learn
 * the value length and once to fill the buffer. If no buffer was supplied
 * or the reported length exceeds @xattr_size, the buffer is reallocated
 * to length + 1 bytes and zeroed before the second call.
 *
 * Returns the error from the permission check or handler lookup,
 * -EOPNOTSUPP if the handler has no ->get, -ENOMEM if the reallocation
 * fails (*xattr_value is left untouched in that case), or the result of
 * the second ->get call. *xattr_value is updated after the second ->get
 * call regardless of that call's result.
 */
ssize_t
2021-01-21 16:19:28 +03:00
vfs_getxattr_alloc ( struct user_namespace * mnt_userns , struct dentry * dentry ,
const char * name , char * * xattr_value , size_t xattr_size ,
gfp_t flags )
2011-03-09 22:23:34 +03:00
{
xattr: Stop calling {get,set,remove}xattr inode operations
All filesystems that support xattrs by now do so via xattr handlers.
They all define sb->s_xattr, and their getxattr, setxattr, and
removexattr inode operations use the generic inode operations. On
filesystems that don't support xattrs, the xattr inode operations are
all NULL, and sb->s_xattr is also NULL.
This means that we can remove the getxattr, setxattr, and removexattr
inode operations and directly call the generic handlers, or better,
inline expand those handlers into fs/xattr.c.
Filesystems that do not support xattrs on some inodes should clear the
IOP_XATTR i_opflags flag in those inodes. (Right now, some filesystems
have checks to disable xattrs on some inodes in the ->list, ->get, and
->set xattr handler operations instead.) The IOP_XATTR flag is
automatically cleared in inodes of filesystems that don't have xattr
support.
In orangefs, symlinks do have a setxattr iop but no getxattr iop. Add a
check for symlinks to orangefs_inode_getxattr to preserve the current,
weird behavior; that check may not be necessary though.
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2016-09-29 18:48:44 +03:00
const struct xattr_handler * handler ;
2011-03-09 22:23:34 +03:00
struct inode * inode = dentry - > d_inode ;
char * value = * xattr_value ;
int error ;
2021-01-21 16:19:28 +03:00
/* The caller must be allowed to read this attribute. */
error = xattr_permission ( mnt_userns , inode , name , MAY_READ ) ;
2011-03-09 22:23:34 +03:00
if ( error )
return error ;
xattr: Stop calling {get,set,remove}xattr inode operations
All filesystems that support xattrs by now do so via xattr handlers.
They all define sb->s_xattr, and their getxattr, setxattr, and
removexattr inode operations use the generic inode operations. On
filesystems that don't support xattrs, the xattr inode operations are
all NULL, and sb->s_xattr is also NULL.
This means that we can remove the getxattr, setxattr, and removexattr
inode operations and directly call the generic handlers, or better,
inline expand those handlers into fs/xattr.c.
Filesystems that do not support xattrs on some inodes should clear the
IOP_XATTR i_opflags flag in those inodes. (Right now, some filesystems
have checks to disable xattrs on some inodes in the ->list, ->get, and
->set xattr handler operations instead.) The IOP_XATTR flag is
automatically cleared in inodes of filesystems that don't have xattr
support.
In orangefs, symlinks do have a setxattr iop but no getxattr iop. Add a
check for symlinks to orangefs_inode_getxattr to preserve the current,
weird behavior; that check may not be necessary though.
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2016-09-29 18:48:44 +03:00
/*
 * name is passed by address, so the resolver may rewrite it
 * (presumably to strip the handler's prefix - TODO confirm).
 */
handler = xattr_resolve_name ( inode , & name ) ;
if ( IS_ERR ( handler ) )
return PTR_ERR ( handler ) ;
if ( ! handler - > get )
2011-03-09 22:23:34 +03:00
return - EOPNOTSUPP ;
xattr: Stop calling {get,set,remove}xattr inode operations
All filesystems that support xattrs by now do so via xattr handlers.
They all define sb->s_xattr, and their getxattr, setxattr, and
removexattr inode operations use the generic inode operations. On
filesystems that don't support xattrs, the xattr inode operations are
all NULL, and sb->s_xattr is also NULL.
This means that we can remove the getxattr, setxattr, and removexattr
inode operations and directly call the generic handlers, or better,
inline expand those handlers into fs/xattr.c.
Filesystems that do not support xattrs on some inodes should clear the
IOP_XATTR i_opflags flag in those inodes. (Right now, some filesystems
have checks to disable xattrs on some inodes in the ->list, ->get, and
->set xattr handler operations instead.) The IOP_XATTR flag is
automatically cleared in inodes of filesystems that don't have xattr
support.
In orangefs, symlinks do have a setxattr iop but no getxattr iop. Add a
check for symlinks to orangefs_inode_getxattr to preserve the current,
weird behavior; that check may not be necessary though.
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2016-09-29 18:48:44 +03:00
/* First pass: NULL buffer and size 0; the result is used as the length. */
error = handler - > get ( handler , dentry , inode , name , NULL , 0 ) ;
2011-03-09 22:23:34 +03:00
if ( error < 0 )
return error ;
/*
 * Grow (or create) the buffer when there is none or it is too small.
 * One byte more than the reported length is allocated and the whole
 * buffer is zeroed. On failure the caller's *xattr_value is not
 * modified, since only the local value receives the krealloc() result.
 */
if ( ! value | | ( error > xattr_size ) ) {
value = krealloc ( * xattr_value , error + 1 , flags ) ;
if ( ! value )
return - ENOMEM ;
memset ( value , 0 , error + 1 ) ;
}
xattr: Stop calling {get,set,remove}xattr inode operations
All filesystems that support xattrs by now do so via xattr handlers.
They all define sb->s_xattr, and their getxattr, setxattr, and
removexattr inode operations use the generic inode operations. On
filesystems that don't support xattrs, the xattr inode operations are
all NULL, and sb->s_xattr is also NULL.
This means that we can remove the getxattr, setxattr, and removexattr
inode operations and directly call the generic handlers, or better,
inline expand those handlers into fs/xattr.c.
Filesystems that do not support xattrs on some inodes should clear the
IOP_XATTR i_opflags flag in those inodes. (Right now, some filesystems
have checks to disable xattrs on some inodes in the ->list, ->get, and
->set xattr handler operations instead.) The IOP_XATTR flag is
automatically cleared in inodes of filesystems that don't have xattr
support.
In orangefs, symlinks do have a setxattr iop but no getxattr iop. Add a
check for symlinks to orangefs_inode_getxattr to preserve the current,
weird behavior; that check may not be necessary though.
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2016-09-29 18:48:44 +03:00
/* Second pass: fill the buffer, using the probed length as its size. */
error = handler - > get ( handler , dentry , inode , name , value , error ) ;
2011-03-09 22:23:34 +03:00
/* Publish the (possibly reallocated) buffer even if the read failed. */
* xattr_value = value ;
return error ;
}
2016-09-29 18:48:42 +03:00
/*
 * __vfs_getxattr - call the xattr handler's ->get for @name on @inode
 *
 * Looks up the handler with xattr_resolve_name() and forwards @dentry,
 * @inode, @name, @value and @size to its ->get callback. No permission or
 * security check is made in this function itself.
 *
 * Returns the handler lookup error, -EOPNOTSUPP if the handler has no
 * ->get callback, or whatever ->get returns.
 */
ssize_t
__vfs_getxattr ( struct dentry * dentry , struct inode * inode , const char * name ,
void * value , size_t size )
{
xattr: Stop calling {get,set,remove}xattr inode operations
All filesystems that support xattrs by now do so via xattr handlers.
They all define sb->s_xattr, and their getxattr, setxattr, and
removexattr inode operations use the generic inode operations. On
filesystems that don't support xattrs, the xattr inode operations are
all NULL, and sb->s_xattr is also NULL.
This means that we can remove the getxattr, setxattr, and removexattr
inode operations and directly call the generic handlers, or better,
inline expand those handlers into fs/xattr.c.
Filesystems that do not support xattrs on some inodes should clear the
IOP_XATTR i_opflags flag in those inodes. (Right now, some filesystems
have checks to disable xattrs on some inodes in the ->list, ->get, and
->set xattr handler operations instead.) The IOP_XATTR flag is
automatically cleared in inodes of filesystems that don't have xattr
support.
In orangefs, symlinks do have a setxattr iop but no getxattr iop. Add a
check for symlinks to orangefs_inode_getxattr to preserve the current,
weird behavior; that check may not be necessary though.
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2016-09-29 18:48:44 +03:00
const struct xattr_handler * handler ;
/*
 * name is passed by address, so the resolver may rewrite it
 * (presumably to strip the handler's prefix - TODO confirm).
 */
handler = xattr_resolve_name ( inode , & name ) ;
if ( IS_ERR ( handler ) )
return PTR_ERR ( handler ) ;
if ( ! handler - > get )
2016-09-29 18:48:42 +03:00
return - EOPNOTSUPP ;
xattr: Stop calling {get,set,remove}xattr inode operations
All filesystems that support xattrs by now do so via xattr handlers.
They all define sb->s_xattr, and their getxattr, setxattr, and
removexattr inode operations use the generic inode operations. On
filesystems that don't support xattrs, the xattr inode operations are
all NULL, and sb->s_xattr is also NULL.
This means that we can remove the getxattr, setxattr, and removexattr
inode operations and directly call the generic handlers, or better,
inline expand those handlers into fs/xattr.c.
Filesystems that do not support xattrs on some inodes should clear the
IOP_XATTR i_opflags flag in those inodes. (Right now, some filesystems
have checks to disable xattrs on some inodes in the ->list, ->get, and
->set xattr handler operations instead.) The IOP_XATTR flag is
automatically cleared in inodes of filesystems that don't have xattr
support.
In orangefs, symlinks do have a setxattr iop but no getxattr iop. Add a
check for symlinks to orangefs_inode_getxattr to preserve the current,
weird behavior; that check may not be necessary though.
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2016-09-29 18:48:44 +03:00
return handler - > get ( handler , dentry , inode , name , value , size ) ;
2016-09-29 18:48:42 +03:00
}
EXPORT_SYMBOL ( __vfs_getxattr ) ;
2006-01-10 07:51:55 +03:00
ssize_t
2021-01-21 16:19:28 +03:00
vfs_getxattr ( struct user_namespace * mnt_userns , struct dentry * dentry ,
const char * name , void * value , size_t size )
2006-01-10 07:51:55 +03:00
{
struct inode * inode = dentry - > d_inode ;
int error ;
2021-01-21 16:19:28 +03:00
error = xattr_permission ( mnt_userns , inode , name , MAY_READ ) ;
2006-01-10 07:51:56 +03:00
if ( error )
return error ;
2006-01-10 07:51:55 +03:00
error = security_inode_getxattr ( dentry , name ) ;
if ( error )
return error ;
if ( ! strncmp ( name , XATTR_SECURITY_PREFIX ,
2006-01-10 07:51:56 +03:00
XATTR_SECURITY_PREFIX_LEN ) ) {
const char * suffix = name + XATTR_SECURITY_PREFIX_LEN ;
2021-01-21 16:19:29 +03:00
int ret = xattr_getsecurity ( mnt_userns , inode , suffix , value ,
size ) ;
2006-01-10 07:51:55 +03:00
/*
* Only overwrite the return value if a security module
* is actually active .
*/
2008-02-05 09:29:40 +03:00
if ( ret = = - EOPNOTSUPP )
goto nolsm ;
return ret ;
2006-01-10 07:51:55 +03:00
}
2008-02-05 09:29:40 +03:00
nolsm :
acl: move idmapped mount fixup into vfs_{g,s}etxattr()
This cycle we added support for mounting overlayfs on top of idmapped mounts.
Recently I've started looking into potential corner cases when trying to add
additional tests and I noticed that reporting for POSIX ACLs is currently wrong
when using idmapped layers with overlayfs mounted on top of it.
I'm going to give a rather detailed explanation of both the origin of the
problem and the solution.
Let's assume the user creates the following directory layout and they have a
rootfs /var/lib/lxc/c1/rootfs. The files in this rootfs are owned as you would
expect files on your host system to be owned. For example, ~/.bashrc for your
regular user would be owned by 1000:1000 and /root/.bashrc would be owned by
0:0. IOW, this is just regular boring filesystem tree on an ext4 or xfs
filesystem.
The user chooses to set POSIX ACLs using the setfacl binary granting the user
with uid 4 read, write, and execute permissions for their .bashrc file:
setfacl -m u:4:rwx /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc
Now they want to expose the whole rootfs to a container using an idmapped mount. So
they first create:
mkdir -pv /vol/contpool/{ctrover,merge,lowermap,overmap}
mkdir -pv /vol/contpool/ctrover/{over,work}
chown 10000000:10000000 /vol/contpool/ctrover/{over,work}
The user now creates an idmapped mount for the rootfs:
mount-idmapped/mount-idmapped --map-mount=b:0:10000000:65536 \
/var/lib/lxc/c2/rootfs \
/vol/contpool/lowermap
For example, this makes /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc,
which is owned by uid and gid 1000, appear as being owned by uid and gid
10001000 at /vol/contpool/lowermap/home/ubuntu/.bashrc.
Assume the user wants to expose these idmapped mounts through an overlayfs
mount to a container.
mount -t overlay overlay \
-o lowerdir=/vol/contpool/lowermap, \
upperdir=/vol/contpool/overmap/over, \
workdir=/vol/contpool/overmap/work \
/vol/contpool/merge
The user can do this in two ways:
(1) Mount overlayfs in the initial user namespace and expose it to the
container.
(2) Mount overlayfs on top of the idmapped mounts inside of the container's
user namespace.
Let's assume the user chooses the (1) option and mounts overlayfs on the host
and then changes into a container which uses the idmapping 0:10000000:65536
which is the same used for the two idmapped mounts.
Now the user tries to retrieve the POSIX ACLs using the getfacl command
getfacl -n /vol/contpool/merge/home/ubuntu/.bashrc
and to their surprise they see:
# file: vol/contpool/merge/home/ubuntu/.bashrc
# owner: 1000
# group: 1000
user::rw-
user:4294967295:rwx
group::r--
mask::rwx
other::r--
indicating that the uid wasn't correctly translated according to the idmapped
mount. The problem is how we currently translate POSIX ACLs. Let's inspect the
callchain in this example:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get == ovl_posix_acl_xattr_get()
| -> ovl_xattr_get()
| -> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get() /* lower filesystem callback */
|> posix_acl_fix_xattr_to_user()
{
4 = make_kuid(&init_user_ns, 4);
4 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 4);
/* FAILURE */
-1 = from_kuid(0:10000000:65536 /* caller's idmapping */, 4);
}
If the user chooses to use option (2) and mounts overlayfs on top of idmapped
mounts inside the container things don't look that much better:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:10000000:65536
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get == ovl_posix_acl_xattr_get()
| -> ovl_xattr_get()
| -> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get() /* lower filesystem callback */
|> posix_acl_fix_xattr_to_user()
{
4 = make_kuid(&init_user_ns, 4);
4 = mapped_kuid_fs(&init_user_ns, 4);
/* FAILURE */
-1 = from_kuid(0:10000000:65536 /* caller's idmapping */, 4);
}
As is easily seen the problem arises because the idmapping of the lower mount
isn't taken into account as all of this happens in do_getxattr(). But
do_getxattr() is always called on an overlayfs mount and inode and thus cannot
possibly take the idmapping of the lower layers into account.
This problem is similar for fscaps but there the translation happens as part of
vfs_getxattr() already. Let's walk through an fscaps overlayfs callchain:
setcap 'cap_net_raw+ep' /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc
The expected outcome here is that we'll receive the cap_net_raw capability as
we are able to map the uid associated with the fscap to 0 within our container.
IOW, we want to see 0 as the result of the idmapping translations.
If the user chooses option (1) we get the following callchain for fscaps:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
-> vfs_getxattr()
-> xattr_getsecurity()
-> security_inode_getsecurity() ________________________________
-> cap_inode_getsecurity() | |
{ V |
10000000 = make_kuid(0:0:4k /* overlayfs idmapping */, 10000000); |
10000000 = mapped_kuid_fs(0:0:4k /* no idmapped mount */, 10000000); |
/* Expected result is 0 and thus that we own the fscap. */ |
0 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000000); |
} |
-> vfs_getxattr_alloc() |
-> handler->get == ovl_other_xattr_get() |
-> vfs_getxattr() |
-> xattr_getsecurity() |
-> security_inode_getsecurity() |
-> cap_inode_getsecurity() |
{ |
0 = make_kuid(0:0:4k /* lower s_user_ns */, 0); |
10000000 = mapped_kuid_fs(0:10000000:65536 /* idmapped mount */, 0); |
10000000 = from_kuid(0:0:4k /* overlayfs idmapping */, 10000000); |
|____________________________________________________________________|
}
-> vfs_getxattr_alloc()
-> handler->get == /* lower filesystem callback */
And if the user chooses option (2) we get:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:10000000:65536
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
-> vfs_getxattr()
-> xattr_getsecurity()
-> security_inode_getsecurity() _______________________________
-> cap_inode_getsecurity() | |
{ V |
10000000 = make_kuid(0:10000000:65536 /* overlayfs idmapping */, 0); |
10000000 = mapped_kuid_fs(0:0:4k /* no idmapped mount */, 10000000); |
/* Expected result is 0 and thus that we own the fscap. */ |
0 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000000); |
} |
-> vfs_getxattr_alloc() |
-> handler->get == ovl_other_xattr_get() |
|-> vfs_getxattr() |
-> xattr_getsecurity() |
-> security_inode_getsecurity() |
-> cap_inode_getsecurity() |
{ |
0 = make_kuid(0:0:4k /* lower s_user_ns */, 0); |
10000000 = mapped_kuid_fs(0:10000000:65536 /* idmapped mount */, 0); |
0 = from_kuid(0:10000000:65536 /* overlayfs idmapping */, 10000000); |
|____________________________________________________________________|
}
-> vfs_getxattr_alloc()
-> handler->get == /* lower filesystem callback */
We can see how the translation happens correctly in those cases as the
conversion happens within the vfs_getxattr() helper.
For POSIX ACLs we need to do something similar. However, in contrast to fscaps
we cannot apply the fix directly to the kernel internal posix acl data
structure as this would alter the cached values and would also require a rework
of how we currently deal with POSIX ACLs in general which almost never take the
filesystem idmapping into account (the notable exception being FUSE but even
there the implementation is special) and instead retrieve the raw values based
on the initial idmapping.
The correct values are then generated right before returning to userspace. The
fix for this is to move taking the mount's idmapping into account directly in
vfs_getxattr() instead of having it be part of posix_acl_fix_xattr_to_user().
To this end we split out two small and unexported helpers
posix_acl_getxattr_idmapped_mnt() and posix_acl_setxattr_idmapped_mnt(). The
former to be called in vfs_getxattr() and the latter to be called in
vfs_setxattr().
Let's go back to the original example. Assume the user chose option (1) and
mounted overlayfs on top of idmapped mounts on the host:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| |> __vfs_getxattr()
| | -> handler->get == ovl_posix_acl_xattr_get()
| | -> ovl_xattr_get()
| | -> vfs_getxattr()
| | |> __vfs_getxattr()
| | | -> handler->get() /* lower filesystem callback */
| | |> posix_acl_getxattr_idmapped_mnt()
| | {
| | 4 = make_kuid(&init_user_ns, 4);
| | 10000004 = mapped_kuid_fs(0:10000000:65536 /* lower idmapped mount */, 4);
| | 10000004 = from_kuid(&init_user_ns, 10000004);
| | |_______________________
| | } |
| | |
| |> posix_acl_getxattr_idmapped_mnt() |
| { |
| V
| 10000004 = make_kuid(&init_user_ns, 10000004);
| 10000004 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 10000004);
| 10000004 = from_kuid(&init_user_ns, 10000004);
| } |_________________________________________________
| |
| |
|> posix_acl_fix_xattr_to_user() |
{ V
10000004 = make_kuid(0:0:4k /* init_user_ns */, 10000004);
/* SUCCESS */
4 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000004);
}
And similarly if the user chooses option (2) and mounted overlayfs on top of
idmapped mounts inside the container:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:10000000:65536
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| |> __vfs_getxattr()
| | -> handler->get == ovl_posix_acl_xattr_get()
| | -> ovl_xattr_get()
| | -> vfs_getxattr()
| | |> __vfs_getxattr()
| | | -> handler->get() /* lower filesystem callback */
| | |> posix_acl_getxattr_idmapped_mnt()
| | {
| | 4 = make_kuid(&init_user_ns, 4);
| | 10000004 = mapped_kuid_fs(0:10000000:65536 /* lower idmapped mount */, 4);
| | 10000004 = from_kuid(&init_user_ns, 10000004);
| | |_______________________
| | } |
| | |
| |> posix_acl_getxattr_idmapped_mnt() |
| { V
| 10000004 = make_kuid(&init_user_ns, 10000004);
| 10000004 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 10000004);
| 10000004 = from_kuid(&init_user_ns, 10000004);
| |_________________________________________________
| } |
| |
|> posix_acl_fix_xattr_to_user() |
{ V
10000004 = make_kuid(0:0:4k /* init_user_ns */, 10000004);
/* SUCCESS */
4 = from_kuid(0:10000000:65536 /* caller's idmappings */, 10000004);
}
The last remaining problem we need to fix here is ovl_get_acl(). During
ovl_permission() overlayfs will call:
ovl_permission()
-> generic_permission()
-> acl_permission_check()
-> check_acl()
-> get_acl()
-> inode->i_op->get_acl() == ovl_get_acl()
-> get_acl() /* on the underlying filesystem */
-> inode->i_op->get_acl() == /* lower filesystem callback */
-> posix_acl_permission()
passing through the get_acl request to the underlying filesystem. This will
retrieve the acls stored in the lower filesystem without taking the idmapping
of the underlying mount into account as this would mean altering the cached
values for the lower filesystem. So we block using ACLs for now until we have
decided on a nice way to fix this. Note this limitation both in the
documentation and in the code.
The most straightforward solution would be to have ovl_get_acl() simply
duplicate the ACLs, update the values according to the idmapped mount and
return it to acl_permission_check() so it can be used in posix_acl_permission()
forgetting them afterwards. This is a bit heavy handed but fairly
straightforward otherwise.
Link: https://github.com/brauner/mount-idmapped/issues/9
Link: https://lore.kernel.org/r/20220708090134.385160-2-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Cc: linux-unionfs@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
2022-07-06 19:30:59 +03:00
error = __vfs_getxattr ( dentry , inode , name , value , size ) ;
if ( error > 0 & & is_posix_acl_xattr ( name ) )
posix_acl_getxattr_idmapped_mnt ( mnt_userns , inode , value , size ) ;
return error ;
2006-01-10 07:51:55 +03:00
}
EXPORT_SYMBOL_GPL ( vfs_getxattr ) ;
2006-10-10 00:10:48 +04:00
/*
 * vfs_listxattr - list the extended attribute names of @dentry's inode
 *
 * After security_inode_listxattr() allows the operation, the listing is
 * taken from the inode's ->listxattr operation when one is set and the
 * inode has IOP_XATTR in i_opflags. Otherwise only
 * security_inode_listsecurity() is consulted.
 *
 * Returns the error from the security check, the result of ->listxattr,
 * or the result of security_inode_listsecurity(); in the last case
 * -ERANGE is returned instead when @size is non-zero and smaller than
 * that result.
 */
ssize_t
2016-09-29 18:48:43 +03:00
vfs_listxattr ( struct dentry * dentry , char * list , size_t size )
2006-10-10 00:10:48 +04:00
{
2016-09-29 18:48:43 +03:00
struct inode * inode = d_inode ( dentry ) ;
2006-10-10 00:10:48 +04:00
ssize_t error ;
2016-09-29 18:48:43 +03:00
/* Any non-zero result from the security hook ends the call here. */
error = security_inode_listxattr ( dentry ) ;
2006-10-10 00:10:48 +04:00
if ( error )
return error ;
2016-09-29 18:48:43 +03:00
if ( inode - > i_op - > listxattr & & ( inode - > i_opflags & IOP_XATTR ) ) {
error = inode - > i_op - > listxattr ( dentry , list , size ) ;
2006-10-10 00:10:48 +04:00
} else {
2016-09-29 18:48:43 +03:00
/* No usable ->listxattr: fall back to the security hook's listing. */
error = security_inode_listsecurity ( inode , list , size ) ;
2006-10-10 00:10:48 +04:00
/*
 * size == 0 is not treated as "too small" here.
 * NOTE(review): error is ssize_t and size is size_t, so this looks
 * like an unsigned comparison in which a negative error would also
 * match and be replaced by -ERANGE - confirm that is acceptable for
 * every value the hook can return.
 */
if ( size & & error > size )
error = - ERANGE ;
}
return error ;
}
EXPORT_SYMBOL_GPL ( vfs_listxattr ) ;
2006-01-10 07:51:55 +03:00
/*
 * __vfs_removexattr - remove the extended attribute @name from @dentry
 *
 * Looks up the xattr handler for @name with xattr_resolve_name(), which is
 * given the address of the name pointer, and returns the PTR_ERR() value if
 * the lookup yields an error pointer. Returns -EOPNOTSUPP when the handler
 * has no ->set method. The removal itself is expressed as a ->set call with
 * a NULL value, size 0 and XATTR_REPLACE, and the result of that call is
 * returned. This function makes no permission, security or notification
 * calls of its own.
 */
int
2021-01-21 16:19:28 +03:00
__vfs_removexattr ( struct user_namespace * mnt_userns , struct dentry * dentry ,
const char * name )
2006-01-10 07:51:55 +03:00
{
xattr: Stop calling {get,set,remove}xattr inode operations
All filesystems that support xattrs by now do so via xattr handlers.
They all define sb->s_xattr, and their getxattr, setxattr, and
removexattr inode operations use the generic inode operations. On
filesystems that don't support xattrs, the xattr inode operations are
all NULL, and sb->s_xattr is also NULL.
This means that we can remove the getxattr, setxattr, and removexattr
inode operations and directly call the generic handlers, or better,
inline expand those handlers into fs/xattr.c.
Filesystems that do not support xattrs on some inodes should clear the
IOP_XATTR i_opflags flag in those inodes. (Right now, some filesystems
have checks to disable xattrs on some inodes in the ->list, ->get, and
->set xattr handler operations instead.) The IOP_XATTR flag is
automatically cleared in inodes of filesystems that don't have xattr
support.
In orangefs, symlinks do have a setxattr iop but no getxattr iop. Add a
check for symlinks to orangefs_inode_getxattr to preserve the current,
weird behavior; that check may not be necessary though.
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2016-09-29 18:48:44 +03:00
struct inode * inode = d_inode ( dentry ) ;
const struct xattr_handler * handler ;
2006-01-10 07:51:55 +03:00
xattr: Stop calling {get,set,remove}xattr inode operations
All filesystems that support xattrs by now do so via xattr handlers.
They all define sb->s_xattr, and their getxattr, setxattr, and
removexattr inode operations use the generic inode operations. On
filesystems that don't support xattrs, the xattr inode operations are
all NULL, and sb->s_xattr is also NULL.
This means that we can remove the getxattr, setxattr, and removexattr
inode operations and directly call the generic handlers, or better,
inline expand those handlers into fs/xattr.c.
Filesystems that do not support xattrs on some inodes should clear the
IOP_XATTR i_opflags flag in those inodes. (Right now, some filesystems
have checks to disable xattrs on some inodes in the ->list, ->get, and
->set xattr handler operations instead.) The IOP_XATTR flag is
automatically cleared in inodes of filesystems that don't have xattr
support.
In orangefs, symlinks do have a setxattr iop but no getxattr iop. Add a
check for symlinks to orangefs_inode_getxattr to preserve the current,
weird behavior; that check may not be necessary though.
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2016-09-29 18:48:44 +03:00
handler = xattr_resolve_name ( inode , & name ) ;
if ( IS_ERR ( handler ) )
return PTR_ERR ( handler ) ;
if ( ! handler - > set )
2006-01-10 07:51:55 +03:00
return - EOPNOTSUPP ;
2021-01-21 16:19:28 +03:00
return handler - > set ( handler , mnt_userns , dentry , inode , name , NULL , 0 ,
XATTR_REPLACE ) ;
2016-09-29 18:48:42 +03:00
}
EXPORT_SYMBOL ( __vfs_removexattr ) ;
2020-06-24 01:39:18 +03:00
/**
2020-10-14 02:48:27 +03:00
* __vfs_removexattr_locked - remove an extended attribute while holding the inode
2020-06-24 01:39:18 +03:00
* lock
*
2021-02-16 07:29:29 +03:00
* @ mnt_userns : user namespace of the mount of the target inode
2020-10-14 02:48:27 +03:00
* @ dentry : object to perform removexattr on
* @ name : name of xattr to remove
* @ delegated_inode : on return , will contain an inode pointer that
2020-06-24 01:39:18 +03:00
* a delegation was broken on , NULL if none .
*
* Checks xattr_permission ( ) for MAY_WRITE , then
* security_inode_removexattr ( ) , then try_break_deleg ( ) ; the first
* non-zero result is returned without attempting the removal . The
* removal is done by __vfs_removexattr ( ) . Only when it returns 0 are
* fsnotify_xattr ( ) and evm_inode_post_removexattr ( ) called .
*
* Return : 0 on success , otherwise the error from the failing step .
*/
2016-09-29 18:48:42 +03:00
int
2021-01-21 16:19:28 +03:00
__vfs_removexattr_locked ( struct user_namespace * mnt_userns ,
struct dentry * dentry , const char * name ,
struct inode * * delegated_inode )
2016-09-29 18:48:42 +03:00
{
struct inode * inode = dentry - > d_inode ;
int error ;
2006-01-10 07:51:55 +03:00
2021-01-21 16:19:28 +03:00
error = xattr_permission ( mnt_userns , inode , name , MAY_WRITE ) ;
2006-01-10 07:51:56 +03:00
if ( error )
return error ;
2021-01-21 16:19:29 +03:00
error = security_inode_removexattr ( mnt_userns , dentry , name ) ;
2014-11-20 17:31:01 +03:00
if ( error )
goto out ;
2006-01-10 07:51:55 +03:00
2020-06-24 01:39:18 +03:00
error = try_break_deleg ( inode , delegated_inode ) ;
if ( error )
goto out ;
2021-01-21 16:19:28 +03:00
error = __vfs_removexattr ( mnt_userns , dentry , name ) ;
2006-01-10 07:51:55 +03:00
2011-03-09 22:39:18 +03:00
/* Notifications are sent only for a removal that succeeded. */
if ( ! error ) {
2006-01-10 07:51:55 +03:00
fsnotify_xattr ( dentry ) ;
2011-03-09 22:39:18 +03:00
evm_inode_post_removexattr ( dentry , name ) ;
}
2014-11-20 17:31:01 +03:00
out :
2020-06-24 01:39:18 +03:00
return error ;
}
EXPORT_SYMBOL_GPL ( __vfs_removexattr_locked ) ;
/*
 * vfs_removexattr - remove the extended attribute @name from @dentry
 *
 * Takes the inode lock around __vfs_removexattr_locked(). If that call left
 * an inode in delegated_inode, break_deleg_wait() is called after the lock
 * has been dropped; when the wait returns 0 the whole operation is retried
 * from the locking step, otherwise the result of the wait is returned.
 * Without a delegated inode the result of the locked helper is returned.
 */
int
2021-01-21 16:19:28 +03:00
vfs_removexattr ( struct user_namespace * mnt_userns , struct dentry * dentry ,
const char * name )
2020-06-24 01:39:18 +03:00
{
struct inode * inode = dentry - > d_inode ;
struct inode * delegated_inode = NULL ;
int error ;
retry_deleg :
inode_lock ( inode ) ;
2021-01-21 16:19:28 +03:00
error = __vfs_removexattr_locked ( mnt_userns , dentry ,
name , & delegated_inode ) ;
2016-01-22 23:40:57 +03:00
inode_unlock ( inode ) ;
2020-06-24 01:39:18 +03:00
if ( delegated_inode ) {
error = break_deleg_wait ( & delegated_inode ) ;
if ( ! error )
goto retry_deleg ;
}
2006-01-10 07:51:55 +03:00
return error ;
}
EXPORT_SYMBOL_GPL ( vfs_removexattr ) ;
2005-04-17 02:20:36 +04:00
/*
* Extended attribute SET operations
*/
2022-04-25 03:10:46 +03:00
/*
 * setxattr_copy - copy the name and value of a setxattr request into @ctx
 *
 * @name: user-space pointer to the attribute name
 * @ctx:  request context; flags, size and cvalue are read, kname->name and
 *        kvalue are written
 *
 * Returns -EINVAL if ctx->flags has any bit other than XATTR_CREATE or
 * XATTR_REPLACE set. The name is copied with strncpy_from_user() into
 * ctx->kname->name; a result of 0 or of the full buffer size is rejected
 * with -ERANGE (presumably an empty or over-long name), and a negative
 * result is returned unchanged.
 *
 * When ctx->size is non-zero, a size above XATTR_SIZE_MAX gives -E2BIG;
 * otherwise the value is duplicated with vmemdup_user() into ctx->kvalue.
 * If the duplication fails, ctx->kvalue is reset to NULL and the error is
 * returned. Returns 0 on success.
 */
int setxattr_copy ( const char __user * name , struct xattr_ctx * ctx )
2005-04-17 02:20:36 +04:00
{
int error ;
2022-04-25 03:10:46 +03:00
if ( ctx - > flags & ~ ( XATTR_CREATE | XATTR_REPLACE ) )
2005-04-17 02:20:36 +04:00
return - EINVAL ;
2022-04-25 03:10:46 +03:00
error = strncpy_from_user ( ctx - > kname - > name , name ,
sizeof ( ctx - > kname - > name ) ) ;
if ( error = = 0 | | error = = sizeof ( ctx - > kname - > name ) )
return - ERANGE ;
2005-04-17 02:20:36 +04:00
if ( error < 0 )
return error ;
2022-04-25 03:10:46 +03:00
/* The positive copy result is discarded; from here on error is a status. */
error = 0 ;
if ( ctx - > size ) {
if ( ctx - > size > XATTR_SIZE_MAX )
2005-04-17 02:20:36 +04:00
return - E2BIG ;
2022-04-25 03:10:46 +03:00
ctx - > kvalue = vmemdup_user ( ctx - > cvalue , ctx - > size ) ;
if ( IS_ERR ( ctx - > kvalue ) ) {
error = PTR_ERR ( ctx - > kvalue ) ;
ctx - > kvalue = NULL ;
2012-04-06 01:25:07 +04:00
}
2005-04-17 02:20:36 +04:00
}
2022-04-25 03:10:46 +03:00
return error ;
}
/*
 * setxattr_convert - fix up a POSIX ACL value that came from user space
 *
 * When ctx->size is non-zero and ctx->kname->name equals
 * XATTR_NAME_POSIX_ACL_ACCESS or XATTR_NAME_POSIX_ACL_DEFAULT,
 * posix_acl_fix_xattr_from_user() is applied to ctx->kvalue with ctx->size.
 * Nothing is done for any other attribute name or for a zero size.
 * @mnt_userns and @d are not referenced in this body.
 */
static void setxattr_convert ( struct user_namespace * mnt_userns ,
struct dentry * d , struct xattr_ctx * ctx )
{
if ( ctx - > size & &
( ( strcmp ( ctx - > kname - > name , XATTR_NAME_POSIX_ACL_ACCESS ) = = 0 ) | |
( strcmp ( ctx - > kname - > name , XATTR_NAME_POSIX_ACL_DEFAULT ) = = 0 ) ) )
acl: move idmapped mount fixup into vfs_{g,s}etxattr()
This cycle we added support for mounting overlayfs on top of idmapped mounts.
Recently I've started looking into potential corner cases when trying to add
additional tests and I noticed that reporting for POSIX ACLs is currently wrong
when using idmapped layers with overlayfs mounted on top of it.
I'm going to give a rather detailed explanation to both the origin of the
problem and the solution.
Let's assume the user creates the following directory layout and they have a
rootfs /var/lib/lxc/c1/rootfs. The files in this rootfs are owned as you would
expect files on your host system to be owned. For example, ~/.bashrc for your
regular user would be owned by 1000:1000 and /root/.bashrc would be owned by
0:0. IOW, this is just regular boring filesystem tree on an ext4 or xfs
filesystem.
The user chooses to set POSIX ACLs using the setfacl binary granting the user
with uid 4 read, write, and execute permissions for their .bashrc file:
setfacl -m u:4:rwx /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc
Now they to expose the whole rootfs to a container using an idmapped mount. So
they first create:
mkdir -pv /vol/contpool/{ctrover,merge,lowermap,overmap}
mkdir -pv /vol/contpool/ctrover/{over,work}
chown 10000000:10000000 /vol/contpool/ctrover/{over,work}
The user now creates an idmapped mount for the rootfs:
mount-idmapped/mount-idmapped --map-mount=b:0:10000000:65536 \
/var/lib/lxc/c2/rootfs \
/vol/contpool/lowermap
This for example makes it so that /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc
which is owned by uid and gid 1000 as being owned by uid and gid 10001000 at
/vol/contpool/lowermap/home/ubuntu/.bashrc.
Assume the user wants to expose these idmapped mounts through an overlayfs
mount to a container.
mount -t overlay overlay \
-o lowerdir=/vol/contpool/lowermap, \
upperdir=/vol/contpool/overmap/over, \
workdir=/vol/contpool/overmap/work \
/vol/contpool/merge
The user can do this in two ways:
(1) Mount overlayfs in the initial user namespace and expose it to the
container.
(2) Mount overlayfs on top of the idmapped mounts inside of the container's
user namespace.
Let's assume the user chooses the (1) option and mounts overlayfs on the host
and then changes into a container which uses the idmapping 0:10000000:65536
which is the same used for the two idmapped mounts.
Now the user tries to retrieve the POSIX ACLs using the getfacl command
getfacl -n /vol/contpool/lowermap/home/ubuntu/.bashrc
and to their surprise they see:
# file: vol/contpool/merge/home/ubuntu/.bashrc
# owner: 1000
# group: 1000
user::rw-
user:4294967295:rwx
group::r--
mask::rwx
other::r--
indicating the the uid wasn't correctly translated according to the idmapped
mount. The problem is how we currently translate POSIX ACLs. Let's inspect the
callchain in this example:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get == ovl_posix_acl_xattr_get()
| -> ovl_xattr_get()
| -> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get() /* lower filesystem callback */
|> posix_acl_fix_xattr_to_user()
{
4 = make_kuid(&init_user_ns, 4);
4 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 4);
/* FAILURE */
-1 = from_kuid(0:10000000:65536 /* caller's idmapping */, 4);
}
If the user chooses to use option (2) and mounts overlayfs on top of idmapped
mounts inside the container things don't look that much better:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:10000000:65536
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get == ovl_posix_acl_xattr_get()
| -> ovl_xattr_get()
| -> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get() /* lower filesystem callback */
|> posix_acl_fix_xattr_to_user()
{
4 = make_kuid(&init_user_ns, 4);
4 = mapped_kuid_fs(&init_user_ns, 4);
/* FAILURE */
-1 = from_kuid(0:10000000:65536 /* caller's idmapping */, 4);
}
As is easily seen the problem arises because the idmapping of the lower mount
isn't taken into account as all of this happens in do_gexattr(). But
do_getxattr() is always called on an overlayfs mount and inode and thus cannot
possible take the idmapping of the lower layers into account.
This problem is similar for fscaps but there the translation happens as part of
vfs_getxattr() already. Let's walk through an fscaps overlayfs callchain:
setcap 'cap_net_raw+ep' /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc
The expected outcome here is that we'll receive the cap_net_raw capability as
we are able to map the uid associated with the fscap to 0 within our container.
IOW, we want to see 0 as the result of the idmapping translations.
If the user chooses option (1) we get the following callchain for fscaps:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
-> vfs_getxattr()
-> xattr_getsecurity()
-> security_inode_getsecurity() ________________________________
-> cap_inode_getsecurity() | |
{ V |
10000000 = make_kuid(0:0:4k /* overlayfs idmapping */, 10000000); |
10000000 = mapped_kuid_fs(0:0:4k /* no idmapped mount */, 10000000); |
/* Expected result is 0 and thus that we own the fscap. */ |
0 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000000); |
} |
-> vfs_getxattr_alloc() |
-> handler->get == ovl_other_xattr_get() |
-> vfs_getxattr() |
-> xattr_getsecurity() |
-> security_inode_getsecurity() |
-> cap_inode_getsecurity() |
{ |
0 = make_kuid(0:0:4k /* lower s_user_ns */, 0); |
10000000 = mapped_kuid_fs(0:10000000:65536 /* idmapped mount */, 0); |
10000000 = from_kuid(0:0:4k /* overlayfs idmapping */, 10000000); |
|____________________________________________________________________|
}
-> vfs_getxattr_alloc()
-> handler->get == /* lower filesystem callback */
And if the user chooses option (2) we get:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:10000000:65536
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
-> vfs_getxattr()
-> xattr_getsecurity()
-> security_inode_getsecurity() _______________________________
-> cap_inode_getsecurity() | |
{ V |
10000000 = make_kuid(0:10000000:65536 /* overlayfs idmapping */, 0); |
10000000 = mapped_kuid_fs(0:0:4k /* no idmapped mount */, 10000000); |
/* Expected result is 0 and thus that we own the fscap. */ |
0 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000000); |
} |
-> vfs_getxattr_alloc() |
-> handler->get == ovl_other_xattr_get() |
|-> vfs_getxattr() |
-> xattr_getsecurity() |
-> security_inode_getsecurity() |
-> cap_inode_getsecurity() |
{ |
0 = make_kuid(0:0:4k /* lower s_user_ns */, 0); |
10000000 = mapped_kuid_fs(0:10000000:65536 /* idmapped mount */, 0); |
0 = from_kuid(0:10000000:65536 /* overlayfs idmapping */, 10000000); |
|____________________________________________________________________|
}
-> vfs_getxattr_alloc()
-> handler->get == /* lower filesystem callback */
We can see how the translation happens correctly in those cases as the
conversion happens within the vfs_getxattr() helper.
For POSIX ACLs we need to do something similar. However, in contrast to fscaps
we cannot apply the fix directly to the kernel internal posix acl data
structure as this would alter the cached values and would also require a rework
of how we currently deal with POSIX ACLs in general which almost never take the
filesystem idmapping into account (the noteable exception being FUSE but even
there the implementation is special) and instead retrieve the raw values based
on the initial idmapping.
The correct values are then generated right before returning to userspace. The
fix for this is to move taking the mount's idmapping into account directly in
vfs_getxattr() instead of having it be part of posix_acl_fix_xattr_to_user().
To this end we split out two small and unexported helpers
posix_acl_getxattr_idmapped_mnt() and posix_acl_setxattr_idmapped_mnt(). The
former to be called in vfs_getxattr() and the latter to be called in
vfs_setxattr().
Let's go back to the original example. Assume the user chose option (1) and
mounted overlayfs on top of idmapped mounts on the host:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| |> __vfs_getxattr()
| | -> handler->get == ovl_posix_acl_xattr_get()
| | -> ovl_xattr_get()
| | -> vfs_getxattr()
| | |> __vfs_getxattr()
| | | -> handler->get() /* lower filesystem callback */
| | |> posix_acl_getxattr_idmapped_mnt()
| | {
| | 4 = make_kuid(&init_user_ns, 4);
| | 10000004 = mapped_kuid_fs(0:10000000:65536 /* lower idmapped mount */, 4);
| | 10000004 = from_kuid(&init_user_ns, 10000004);
| | |_______________________
| | } |
| | |
| |> posix_acl_getxattr_idmapped_mnt() |
| { |
| V
| 10000004 = make_kuid(&init_user_ns, 10000004);
| 10000004 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 10000004);
| 10000004 = from_kuid(&init_user_ns, 10000004);
| } |_________________________________________________
| |
| |
|> posix_acl_fix_xattr_to_user() |
{ V
10000004 = make_kuid(0:0:4k /* init_user_ns */, 10000004);
/* SUCCESS */
4 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000004);
}
And similarly if the user chooses option (1) and mounted overayfs on top of
idmapped mounts inside the container:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:10000000:65536
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| |> __vfs_getxattr()
| | -> handler->get == ovl_posix_acl_xattr_get()
| | -> ovl_xattr_get()
| | -> vfs_getxattr()
| | |> __vfs_getxattr()
| | | -> handler->get() /* lower filesystem callback */
| | |> posix_acl_getxattr_idmapped_mnt()
| | {
| | 4 = make_kuid(&init_user_ns, 4);
| | 10000004 = mapped_kuid_fs(0:10000000:65536 /* lower idmapped mount */, 4);
| | 10000004 = from_kuid(&init_user_ns, 10000004);
| | |_______________________
| | } |
| | |
| |> posix_acl_getxattr_idmapped_mnt() |
| { V
| 10000004 = make_kuid(&init_user_ns, 10000004);
| 10000004 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 10000004);
| 10000004 = from_kuid(0(&init_user_ns, 10000004);
| |_________________________________________________
| } |
| |
|> posix_acl_fix_xattr_to_user() |
{ V
10000004 = make_kuid(0:0:4k /* init_user_ns */, 10000004);
/* SUCCESS */
4 = from_kuid(0:10000000:65536 /* caller's idmappings */, 10000004);
}
The last remaining problem we need to fix here is ovl_get_acl(). During
ovl_permission() overlayfs will call:
ovl_permission()
-> generic_permission()
-> acl_permission_check()
-> check_acl()
-> get_acl()
-> inode->i_op->get_acl() == ovl_get_acl()
> get_acl() /* on the underlying filesystem)
->inode->i_op->get_acl() == /*lower filesystem callback */
-> posix_acl_permission()
passing through the get_acl request to the underlying filesystem. This will
retrieve the acls stored in the lower filesystem without taking the idmapping
of the underlying mount into account as this would mean altering the cached
values for the lower filesystem. So we block using ACLs for now until we
decided on a nice way to fix this. Note this limitation both in the
documentation and in the code.
The most straightforward solution would be to have ovl_get_acl() simply
duplicate the ACLs, update the values according to the idmapped mount and
return it to acl_permission_check() so it can be used in posix_acl_permission()
forgetting them afterwards. This is a bit heavy handed but fairly
straightforward otherwise.
Link: https://github.com/brauner/mount-idmapped/issues/9
Link: https://lore.kernel.org/r/20220708090134.385160-2-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Cc: linux-unionfs@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
2022-07-06 19:30:59 +03:00
posix_acl_fix_xattr_from_user ( ctx - > kvalue , ctx - > size ) ;
2022-04-25 03:10:46 +03:00
}
/*
 * do_setxattr - set an extended attribute from an already filled xattr_ctx
 *
 * Runs setxattr_convert() on @ctx, then passes ctx->kname->name,
 * ctx->kvalue, ctx->size and ctx->flags to vfs_setxattr() and returns its
 * result. ctx->kvalue is not freed here.
 */
int do_setxattr ( struct user_namespace * mnt_userns , struct dentry * dentry ,
struct xattr_ctx * ctx )
{
setxattr_convert ( mnt_userns , dentry , ctx ) ;
return vfs_setxattr ( mnt_userns , dentry , ctx - > kname - > name ,
ctx - > kvalue , ctx - > size , ctx - > flags ) ;
}
/*
 * setxattr - common body of the setxattr family of system calls
 *
 * Builds an xattr_ctx on the stack from the user-space arguments, fills it
 * with setxattr_copy() and returns early if that fails. Otherwise the
 * attribute is set with do_setxattr(), and ctx.kvalue is released with
 * kvfree() whatever do_setxattr() returned.
 */
static long
setxattr ( struct user_namespace * mnt_userns , struct dentry * d ,
const char __user * name , const void __user * value , size_t size ,
int flags )
{
struct xattr_name kname ;
struct xattr_ctx ctx = {
. cvalue = value ,
. kvalue = NULL ,
. size = size ,
. kname = & kname ,
. flags = flags ,
} ;
int error ;
error = setxattr_copy ( name , & ctx ) ;
if ( error )
return error ;
error = do_setxattr ( mnt_userns , d , & ctx ) ;
2016-01-03 01:09:47 +03:00
2022-04-25 03:10:46 +03:00
kvfree ( ctx . kvalue ) ;
2005-04-17 02:20:36 +04:00
return error ;
}
2014-10-12 20:59:58 +04:00
/*
 * path_setxattr - set an extended attribute on the object named by @pathname
 *
 * Resolves @pathname with user_path_at() relative to AT_FDCWD using
 * @lookup_flags. Write access to the mount is requested with
 * mnt_want_write(); only if that succeeds is setxattr() called, with the
 * user namespace taken from mnt_user_ns(path.mnt), and the write access is
 * dropped again afterwards. The path reference is always released. When
 * retry_estale() accepts the error, LOOKUP_REVAL is added to the lookup
 * flags and the whole sequence is repeated.
 */
static int path_setxattr ( const char __user * pathname ,
const char __user * name , const void __user * value ,
size_t size , int flags , unsigned int lookup_flags )
2005-04-17 02:20:36 +04:00
{
2008-07-22 17:59:21 +04:00
struct path path ;
2005-04-17 02:20:36 +04:00
int error ;
2021-01-21 16:19:28 +03:00
2012-12-11 21:10:15 +04:00
retry :
error = user_path_at ( AT_FDCWD , pathname , lookup_flags , & path ) ;
2005-04-17 02:20:36 +04:00
if ( error )
return error ;
2008-07-22 17:59:21 +04:00
error = mnt_want_write ( path . mnt ) ;
2008-02-16 01:37:38 +03:00
if ( ! error ) {
2021-01-21 16:19:28 +03:00
error = setxattr ( mnt_user_ns ( path . mnt ) , path . dentry , name ,
value , size , flags ) ;
2008-07-22 17:59:21 +04:00
mnt_drop_write ( path . mnt ) ;
2008-02-16 01:37:38 +03:00
}
2008-07-22 17:59:21 +04:00
path_put ( & path ) ;
2012-12-11 21:10:15 +04:00
if ( retry_estale ( error , lookup_flags ) ) {
lookup_flags | = LOOKUP_REVAL ;
goto retry ;
}
2005-04-17 02:20:36 +04:00
return error ;
}
2014-10-12 20:59:58 +04:00
/*
 * setxattr system call: forwards to path_setxattr() with LOOKUP_FOLLOW as
 * the lookup flags.
 */
SYSCALL_DEFINE5 ( setxattr , const char __user * , pathname ,
const char __user * , name , const void __user * , value ,
size_t , size , int , flags )
{
return path_setxattr ( pathname , name , value , size , flags , LOOKUP_FOLLOW ) ;
}
2009-01-14 16:14:14 +03:00
/*
 * lsetxattr system call: forwards to path_setxattr() with lookup flags 0,
 * i.e. without the LOOKUP_FOLLOW flag that the setxattr system call passes.
 */
SYSCALL_DEFINE5 ( lsetxattr , const char __user * , pathname ,
const char __user * , name , const void __user * , value ,
size_t , size , int , flags )
2005-04-17 02:20:36 +04:00
{
2014-10-12 20:59:58 +04:00
return path_setxattr ( pathname , name , value , size , flags , 0 ) ;
2005-04-17 02:20:36 +04:00
}
2009-01-14 16:14:14 +03:00
/*
 * fsetxattr system call: set an extended attribute on the file behind @fd.
 *
 * Returns -EBADF when fdget() yields no file. Otherwise the file is passed
 * to audit_file(), write access is requested with mnt_want_write_file(),
 * and only if that succeeds is setxattr() called on the dentry of the file
 * with the user namespace from file_mnt_user_ns(); the write access is
 * dropped again afterwards. The fd reference is released with fdput()
 * before returning.
 */
SYSCALL_DEFINE5 ( fsetxattr , int , fd , const char __user * , name ,
const void __user * , value , size_t , size , int , flags )
2005-04-17 02:20:36 +04:00
{
2012-08-28 20:52:22 +04:00
struct fd f = fdget ( fd ) ;
2005-04-17 02:20:36 +04:00
int error = - EBADF ;
2012-08-28 20:52:22 +04:00
if ( ! f . file )
2005-04-17 02:20:36 +04:00
return error ;
2014-11-01 00:44:57 +03:00
audit_file ( f . file ) ;
2018-07-18 16:44:43 +03:00
error = mnt_want_write_file ( f . file ) ;
2008-02-16 01:37:38 +03:00
if ( ! error ) {
2021-01-21 16:19:28 +03:00
error = setxattr ( file_mnt_user_ns ( f . file ) ,
f . file - > f_path . dentry , name ,
value , size , flags ) ;
2018-07-18 16:44:43 +03:00
mnt_drop_write_file ( f . file ) ;
2008-02-16 01:37:38 +03:00
}
2012-08-28 20:52:22 +04:00
fdput ( f ) ;
2005-04-17 02:20:36 +04:00
return error ;
}
/*
* Extended attribute GET operations
*/
2022-04-25 03:13:50 +03:00
ssize_t
do_getxattr ( struct user_namespace * mnt_userns , struct dentry * d ,
struct xattr_ctx * ctx )
2005-04-17 02:20:36 +04:00
{
ssize_t error ;
2022-04-25 03:13:50 +03:00
char * kname = ctx - > kname - > name ;
2005-04-17 02:20:36 +04:00
2022-04-25 03:13:50 +03:00
if ( ctx - > size ) {
if ( ctx - > size > XATTR_SIZE_MAX )
ctx - > size = XATTR_SIZE_MAX ;
ctx - > kvalue = kvzalloc ( ctx - > size , GFP_KERNEL ) ;
if ( ! ctx - > kvalue )
2017-05-09 01:57:27 +03:00
return - ENOMEM ;
2005-04-17 02:20:36 +04:00
}
2022-04-25 03:13:50 +03:00
error = vfs_getxattr ( mnt_userns , d , kname , ctx - > kvalue , ctx - > size ) ;
2005-09-04 02:55:18 +04:00
if ( error > 0 ) {
2012-02-08 06:52:57 +04:00
if ( ( strcmp ( kname , XATTR_NAME_POSIX_ACL_ACCESS ) = = 0 ) | |
( strcmp ( kname , XATTR_NAME_POSIX_ACL_DEFAULT ) = = 0 ) )
acl: move idmapped mount fixup into vfs_{g,s}etxattr()
This cycle we added support for mounting overlayfs on top of idmapped mounts.
Recently I've started looking into potential corner cases when trying to add
additional tests and I noticed that reporting for POSIX ACLs is currently wrong
when using idmapped layers with overlayfs mounted on top of it.
I'm going to give a rather detailed explanation to both the origin of the
problem and the solution.
Let's assume the user creates the following directory layout and they have a
rootfs /var/lib/lxc/c2/rootfs. The files in this rootfs are owned as you would
expect files on your host system to be owned. For example, ~/.bashrc for your
regular user would be owned by 1000:1000 and /root/.bashrc would be owned by
0:0. IOW, this is just regular boring filesystem tree on an ext4 or xfs
filesystem.
The user chooses to set POSIX ACLs using the setfacl binary granting the user
with uid 4 read, write, and execute permissions for their .bashrc file:
setfacl -m u:4:rwx /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc
Now they want to expose the whole rootfs to a container using an idmapped mount. So
they first create:
mkdir -pv /vol/contpool/{ctrover,merge,lowermap,overmap}
mkdir -pv /vol/contpool/ctrover/{over,work}
chown 10000000:10000000 /vol/contpool/ctrover/{over,work}
The user now creates an idmapped mount for the rootfs:
mount-idmapped/mount-idmapped --map-mount=b:0:10000000:65536 \
/var/lib/lxc/c2/rootfs \
/vol/contpool/lowermap
For example, this makes /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc,
which is owned by uid and gid 1000, appear as being owned by uid and gid
10001000 at /vol/contpool/lowermap/home/ubuntu/.bashrc.
Assume the user wants to expose these idmapped mounts through an overlayfs
mount to a container.
mount -t overlay overlay \
-o lowerdir=/vol/contpool/lowermap, \
upperdir=/vol/contpool/overmap/over, \
workdir=/vol/contpool/overmap/work \
/vol/contpool/merge
The user can do this in two ways:
(1) Mount overlayfs in the initial user namespace and expose it to the
container.
(2) Mount overlayfs on top of the idmapped mounts inside of the container's
user namespace.
Let's assume the user chooses the (1) option and mounts overlayfs on the host
and then changes into a container which uses the idmapping 0:10000000:65536
which is the same used for the two idmapped mounts.
Now the user tries to retrieve the POSIX ACLs using the getfacl command
getfacl -n /vol/contpool/lowermap/home/ubuntu/.bashrc
and to their surprise they see:
# file: vol/contpool/merge/home/ubuntu/.bashrc
# owner: 1000
# group: 1000
user::rw-
user:4294967295:rwx
group::r--
mask::rwx
other::r--
indicating that the uid wasn't correctly translated according to the idmapped
mount. The problem is how we currently translate POSIX ACLs. Let's inspect the
callchain in this example:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get == ovl_posix_acl_xattr_get()
| -> ovl_xattr_get()
| -> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get() /* lower filesystem callback */
|> posix_acl_fix_xattr_to_user()
{
4 = make_kuid(&init_user_ns, 4);
4 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 4);
/* FAILURE */
-1 = from_kuid(0:10000000:65536 /* caller's idmapping */, 4);
}
If the user chooses to use option (2) and mounts overlayfs on top of idmapped
mounts inside the container things don't look that much better:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:10000000:65536
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get == ovl_posix_acl_xattr_get()
| -> ovl_xattr_get()
| -> vfs_getxattr()
| -> __vfs_getxattr()
| -> handler->get() /* lower filesystem callback */
|> posix_acl_fix_xattr_to_user()
{
4 = make_kuid(&init_user_ns, 4);
4 = mapped_kuid_fs(&init_user_ns, 4);
/* FAILURE */
-1 = from_kuid(0:10000000:65536 /* caller's idmapping */, 4);
}
As is easily seen, the problem arises because the idmapping of the lower mount
isn't taken into account as all of this happens in do_getxattr(). But
do_getxattr() is always called on an overlayfs mount and inode and thus cannot
possibly take the idmapping of the lower layers into account.
This problem is similar for fscaps but there the translation happens as part of
vfs_getxattr() already. Let's walk through an fscaps overlayfs callchain:
setcap 'cap_net_raw+ep' /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc
The expected outcome here is that we'll receive the cap_net_raw capability as
we are able to map the uid associated with the fscap to 0 within our container.
IOW, we want to see 0 as the result of the idmapping translations.
If the user chooses option (1) we get the following callchain for fscaps:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
-> vfs_getxattr()
-> xattr_getsecurity()
-> security_inode_getsecurity() ________________________________
-> cap_inode_getsecurity() | |
{ V |
10000000 = make_kuid(0:0:4k /* overlayfs idmapping */, 10000000); |
10000000 = mapped_kuid_fs(0:0:4k /* no idmapped mount */, 10000000); |
/* Expected result is 0 and thus that we own the fscap. */ |
0 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000000); |
} |
-> vfs_getxattr_alloc() |
-> handler->get == ovl_other_xattr_get() |
-> vfs_getxattr() |
-> xattr_getsecurity() |
-> security_inode_getsecurity() |
-> cap_inode_getsecurity() |
{ |
0 = make_kuid(0:0:4k /* lower s_user_ns */, 0); |
10000000 = mapped_kuid_fs(0:10000000:65536 /* idmapped mount */, 0); |
10000000 = from_kuid(0:0:4k /* overlayfs idmapping */, 10000000); |
|____________________________________________________________________|
}
-> vfs_getxattr_alloc()
-> handler->get == /* lower filesystem callback */
And if the user chooses option (2) we get:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:10000000:65536
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
-> vfs_getxattr()
-> xattr_getsecurity()
-> security_inode_getsecurity() _______________________________
-> cap_inode_getsecurity() | |
{ V |
10000000 = make_kuid(0:10000000:65536 /* overlayfs idmapping */, 0); |
10000000 = mapped_kuid_fs(0:0:4k /* no idmapped mount */, 10000000); |
/* Expected result is 0 and thus that we own the fscap. */ |
0 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000000); |
} |
-> vfs_getxattr_alloc() |
-> handler->get == ovl_other_xattr_get() |
|-> vfs_getxattr() |
-> xattr_getsecurity() |
-> security_inode_getsecurity() |
-> cap_inode_getsecurity() |
{ |
0 = make_kuid(0:0:4k /* lower s_user_ns */, 0); |
10000000 = mapped_kuid_fs(0:10000000:65536 /* idmapped mount */, 0); |
0 = from_kuid(0:10000000:65536 /* overlayfs idmapping */, 10000000); |
|____________________________________________________________________|
}
-> vfs_getxattr_alloc()
-> handler->get == /* lower filesystem callback */
We can see how the translation happens correctly in those cases as the
conversion happens within the vfs_getxattr() helper.
For POSIX ACLs we need to do something similar. However, in contrast to fscaps
we cannot apply the fix directly to the kernel internal posix acl data
structure as this would alter the cached values and would also require a rework
of how we currently deal with POSIX ACLs in general which almost never take the
filesystem idmapping into account (the notable exception being FUSE but even
there the implementation is special) and instead retrieve the raw values based
on the initial idmapping.
The correct values are then generated right before returning to userspace. The
fix for this is to move taking the mount's idmapping into account directly in
vfs_getxattr() instead of having it be part of posix_acl_fix_xattr_to_user().
To this end we split out two small and unexported helpers
posix_acl_getxattr_idmapped_mnt() and posix_acl_setxattr_idmapped_mnt(). The
former to be called in vfs_getxattr() and the latter to be called in
vfs_setxattr().
Let's go back to the original example. Assume the user chose option (1) and
mounted overlayfs on top of idmapped mounts on the host:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| |> __vfs_getxattr()
| | -> handler->get == ovl_posix_acl_xattr_get()
| | -> ovl_xattr_get()
| | -> vfs_getxattr()
| | |> __vfs_getxattr()
| | | -> handler->get() /* lower filesystem callback */
| | |> posix_acl_getxattr_idmapped_mnt()
| | {
| | 4 = make_kuid(&init_user_ns, 4);
| | 10000004 = mapped_kuid_fs(0:10000000:65536 /* lower idmapped mount */, 4);
| | 10000004 = from_kuid(&init_user_ns, 10000004);
| | |_______________________
| | } |
| | |
| |> posix_acl_getxattr_idmapped_mnt() |
| { |
| V
| 10000004 = make_kuid(&init_user_ns, 10000004);
| 10000004 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 10000004);
| 10000004 = from_kuid(&init_user_ns, 10000004);
| } |_________________________________________________
| |
| |
|> posix_acl_fix_xattr_to_user() |
{ V
10000004 = make_kuid(0:0:4k /* init_user_ns */, 10000004);
/* SUCCESS */
4 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000004);
}
And similarly if the user chooses option (2) and mounted overlayfs on top of
idmapped mounts inside the container:
idmapped mount /vol/contpool/merge: 0:10000000:65536
caller's idmapping: 0:10000000:65536
overlayfs idmapping (ofs->creator_cred): 0:10000000:65536
sys_getxattr()
-> path_getxattr()
-> getxattr()
-> do_getxattr()
|> vfs_getxattr()
| |> __vfs_getxattr()
| | -> handler->get == ovl_posix_acl_xattr_get()
| | -> ovl_xattr_get()
| | -> vfs_getxattr()
| | |> __vfs_getxattr()
| | | -> handler->get() /* lower filesystem callback */
| | |> posix_acl_getxattr_idmapped_mnt()
| | {
| | 4 = make_kuid(&init_user_ns, 4);
| | 10000004 = mapped_kuid_fs(0:10000000:65536 /* lower idmapped mount */, 4);
| | 10000004 = from_kuid(&init_user_ns, 10000004);
| | |_______________________
| | } |
| | |
| |> posix_acl_getxattr_idmapped_mnt() |
| { V
| 10000004 = make_kuid(&init_user_ns, 10000004);
| 10000004 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 10000004);
| 10000004 = from_kuid(&init_user_ns, 10000004);
| |_________________________________________________
| } |
| |
|> posix_acl_fix_xattr_to_user() |
{ V
10000004 = make_kuid(0:0:4k /* init_user_ns */, 10000004);
/* SUCCESS */
4 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000004);
}
The last remaining problem we need to fix here is ovl_get_acl(). During
ovl_permission() overlayfs will call:
ovl_permission()
-> generic_permission()
-> acl_permission_check()
-> check_acl()
-> get_acl()
-> inode->i_op->get_acl() == ovl_get_acl()
-> get_acl() /* on the underlying filesystem */
-> inode->i_op->get_acl() == /* lower filesystem callback */
-> posix_acl_permission()
passing through the get_acl request to the underlying filesystem. This will
retrieve the acls stored in the lower filesystem without taking the idmapping
of the underlying mount into account as this would mean altering the cached
values for the lower filesystem. So we block using ACLs for now until we have
decided on a nice way to fix this. Note this limitation both in the
documentation and in the code.
The most straightforward solution would be to have ovl_get_acl() simply
duplicate the ACLs, update the values according to the idmapped mount and
return it to acl_permission_check() so it can be used in posix_acl_permission()
forgetting them afterwards. This is a bit heavy handed but fairly
straightforward otherwise.
Link: https://github.com/brauner/mount-idmapped/issues/9
Link: https://lore.kernel.org/r/20220708090134.385160-2-brauner@kernel.org
Cc: Seth Forshee <sforshee@digitalocean.com>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Cc: linux-unionfs@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
2022-07-06 19:30:59 +03:00
posix_acl_fix_xattr_to_user ( ctx - > kvalue , error ) ;
2022-04-25 03:13:50 +03:00
if ( ctx - > size & & copy_to_user ( ctx - > value , ctx - > kvalue , error ) )
2005-09-04 02:55:18 +04:00
error = - EFAULT ;
2022-04-25 03:13:50 +03:00
} else if ( error = = - ERANGE & & ctx - > size > = XATTR_SIZE_MAX ) {
2005-09-04 02:55:18 +04:00
/* The file system tried to returned a value bigger
than XATTR_SIZE_MAX bytes . Not possible . */
error = - E2BIG ;
2005-04-17 02:20:36 +04:00
}
2016-01-03 01:09:47 +03:00
2022-04-25 03:13:50 +03:00
return error ;
}
/*
 * Common back end of the getxattr() family of system calls.
 *
 * The attribute name is copied from user space into an on-stack buffer.  A
 * copy result of 0, or one equal to the size of that buffer, is rejected with
 * -ERANGE; a negative result is passed through unchanged.  The actual lookup
 * and the copy-out to @value are delegated to do_getxattr(); whatever kernel
 * buffer it left in the context is released here.
 *
 * Returns what do_getxattr() returned, or a negative errno.
 */
static ssize_t
getxattr(struct user_namespace *mnt_userns, struct dentry *d,
	 const char __user *name, void __user *value, size_t size)
{
	struct xattr_name kname;
	struct xattr_ctx ctx = {
		.value	= value,
		.kvalue	= NULL,
		.size	= size,
		.kname	= &kname,
		.flags	= 0,
	};
	ssize_t ret;

	ret = strncpy_from_user(kname.name, name, sizeof(kname.name));
	if (ret == 0 || ret == sizeof(kname.name))
		return -ERANGE;
	if (ret < 0)
		return ret;

	ret = do_getxattr(mnt_userns, d, &ctx);

	/* do_getxattr() may have allocated ctx.kvalue; kvfree(NULL) is fine. */
	kvfree(ctx.kvalue);

	return ret;
}
2014-10-12 20:59:58 +04:00
/*
 * Resolve @pathname relative to the current working directory and read the
 * extended attribute @name of the resulting dentry into @value.
 *
 * @lookup_flags is handed to user_path_at().  Whenever retry_estale() asks
 * for it, the whole lookup is redone with LOOKUP_REVAL added to the flags.
 *
 * Returns the result of getxattr(), or the path lookup error.
 */
static ssize_t path_getxattr(const char __user *pathname,
			     const char __user *name, void __user *value,
			     size_t size, unsigned int lookup_flags)
{
	struct path path;
	ssize_t error;
	bool again;

	do {
		error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
		if (error)
			return error;

		error = getxattr(mnt_user_ns(path.mnt), path.dentry,
				 name, value, size);
		path_put(&path);

		again = retry_estale(error, lookup_flags);
		if (again)
			lookup_flags |= LOOKUP_REVAL;
	} while (again);

	return error;
}
2014-10-12 20:59:58 +04:00
/* getxattr(2): path-based lookup performed with LOOKUP_FOLLOW. */
SYSCALL_DEFINE4(getxattr, const char __user *, pathname,
		const char __user *, name, void __user *, value, size_t, size)
{
	const unsigned int lookup_flags = LOOKUP_FOLLOW;

	return path_getxattr(pathname, name, value, size, lookup_flags);
}
2009-01-14 16:14:14 +03:00
/* lgetxattr(2): same as getxattr(2) but with no lookup flags set. */
SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
		const char __user *, name, void __user *, value, size_t, size)
{
	const unsigned int lookup_flags = 0;

	return path_getxattr(pathname, name, value, size, lookup_flags);
}
2009-01-14 16:14:14 +03:00
/*
 * fgetxattr(2): read an extended attribute of the file behind @fd.
 *
 * Returns -EBADF when fdget() yields no file; otherwise the file is passed
 * to audit_file() and the result of getxattr() on its dentry is returned.
 */
SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
		void __user *, value, size_t, size)
{
	struct fd f = fdget(fd);
	ssize_t error;

	if (!f.file)
		return -EBADF;

	audit_file(f.file);
	error = getxattr(file_mnt_user_ns(f.file), f.file->f_path.dentry,
			 name, value, size);
	fdput(f);

	return error;
}
/*
* Extended attribute LIST operations
*/
/*
 * Common back end of the listxattr() family of system calls.
 *
 * With @size == 0 no kernel buffer is allocated and vfs_listxattr() is called
 * with a NULL buffer.  Otherwise @size is clamped to XATTR_LIST_MAX, the list
 * is produced into a temporary kernel buffer and then copied to @list.
 *
 * Returns the value of vfs_listxattr(), -ENOMEM if the temporary buffer could
 * not be allocated, -EFAULT if the copy to user space failed, or -E2BIG if
 * the filesystem reported -ERANGE although the buffer was already
 * XATTR_LIST_MAX bytes large.
 */
static ssize_t
listxattr(struct dentry *d, char __user *list, size_t size)
{
	char *kbuf = NULL;
	ssize_t len;

	if (size) {
		if (size > XATTR_LIST_MAX)
			size = XATTR_LIST_MAX;

		kbuf = kvmalloc(size, GFP_KERNEL);
		if (!kbuf)
			return -ENOMEM;
	}

	len = vfs_listxattr(d, kbuf, size);
	if (len == -ERANGE && size >= XATTR_LIST_MAX) {
		/*
		 * The filesystem tried to return a list bigger than
		 * XATTR_LIST_MAX bytes.  Not possible.
		 */
		len = -E2BIG;
	} else if (len > 0 && size && copy_to_user(list, kbuf, len)) {
		len = -EFAULT;
	}

	kvfree(kbuf);
	return len;
}
2014-10-12 20:59:58 +04:00
/*
 * Resolve @pathname relative to the current working directory and list the
 * extended attribute names of the resulting dentry into @list.
 *
 * @lookup_flags is handed to user_path_at().  Whenever retry_estale() asks
 * for it, the whole lookup is redone with LOOKUP_REVAL added to the flags.
 *
 * Returns the result of listxattr(), or the path lookup error.
 */
static ssize_t path_listxattr(const char __user *pathname, char __user *list,
			      size_t size, unsigned int lookup_flags)
{
	struct path path;
	ssize_t error;
	bool again;

	do {
		error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
		if (error)
			return error;

		error = listxattr(path.dentry, list, size);
		path_put(&path);

		again = retry_estale(error, lookup_flags);
		if (again)
			lookup_flags |= LOOKUP_REVAL;
	} while (again);

	return error;
}
2014-10-12 20:59:58 +04:00
/* listxattr(2): path-based listing performed with LOOKUP_FOLLOW. */
SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,
		size_t, size)
{
	const unsigned int lookup_flags = LOOKUP_FOLLOW;

	return path_listxattr(pathname, list, size, lookup_flags);
}
2009-01-14 16:14:14 +03:00
/* llistxattr(2): same as listxattr(2) but with no lookup flags set. */
SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
		size_t, size)
{
	const unsigned int lookup_flags = 0;

	return path_listxattr(pathname, list, size, lookup_flags);
}
2009-01-14 16:14:14 +03:00
/*
 * flistxattr(2): list the extended attribute names of the file behind @fd.
 *
 * Returns -EBADF when fdget() yields no file; otherwise the file is passed
 * to audit_file() and the result of listxattr() on its dentry is returned.
 */
SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
{
	struct fd f = fdget(fd);
	ssize_t error;

	if (!f.file)
		return -EBADF;

	audit_file(f.file);
	error = listxattr(f.file->f_path.dentry, list, size);
	fdput(f);

	return error;
}
/*
* Extended attribute REMOVE operations
*/
static long
2021-01-21 16:19:28 +03:00
removexattr ( struct user_namespace * mnt_userns , struct dentry * d ,
const char __user * name )
2005-04-17 02:20:36 +04:00
{
int error ;
char kname [ XATTR_NAME_MAX + 1 ] ;
error = strncpy_from_user ( kname , name , sizeof ( kname ) ) ;
if ( error = = 0 | | error = = sizeof ( kname ) )
error = - ERANGE ;
if ( error < 0 )
return error ;
2021-01-21 16:19:28 +03:00
return vfs_removexattr ( mnt_userns , d , kname ) ;
2005-04-17 02:20:36 +04:00
}
2014-10-12 20:59:58 +04:00
/*
 * Resolve @pathname relative to the current working directory and remove the
 * extended attribute @name from the resulting dentry.
 *
 * The removal only runs when mnt_want_write() on the mount succeeds; the
 * write access is dropped again right after.  Whenever retry_estale() asks
 * for it, the whole sequence is redone with LOOKUP_REVAL added to the flags.
 *
 * Returns 0 or a negative errno from the lookup, mnt_want_write() or
 * removexattr().
 */
static int path_removexattr(const char __user *pathname,
			    const char __user *name, unsigned int lookup_flags)
{
	struct path path;
	int error;
	bool again;

	do {
		error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
		if (error)
			return error;

		error = mnt_want_write(path.mnt);
		if (error == 0) {
			error = removexattr(mnt_user_ns(path.mnt),
					    path.dentry, name);
			mnt_drop_write(path.mnt);
		}
		path_put(&path);

		again = retry_estale(error, lookup_flags);
		if (again)
			lookup_flags |= LOOKUP_REVAL;
	} while (again);

	return error;
}
2014-10-12 20:59:58 +04:00
/* removexattr(2): path-based removal performed with LOOKUP_FOLLOW. */
SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
		const char __user *, name)
{
	const unsigned int lookup_flags = LOOKUP_FOLLOW;

	return path_removexattr(pathname, name, lookup_flags);
}
2009-01-14 16:14:15 +03:00
/* lremovexattr(2): same as removexattr(2) but with no lookup flags set. */
SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
		const char __user *, name)
{
	const unsigned int lookup_flags = 0;

	return path_removexattr(pathname, name, lookup_flags);
}
2009-01-14 16:14:15 +03:00
/*
 * fremovexattr(2): remove an extended attribute from the file behind @fd.
 *
 * Returns -EBADF when fdget() yields no file.  Otherwise the file is passed
 * to audit_file(), and the removal runs only if mnt_want_write_file()
 * succeeds; the write access is dropped again right after.
 */
SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
{
	struct fd f = fdget(fd);
	int error;

	if (!f.file)
		return -EBADF;

	audit_file(f.file);

	error = mnt_want_write_file(f.file);
	if (error == 0) {
		error = removexattr(file_mnt_user_ns(f.file),
				    f.file->f_path.dentry, name);
		mnt_drop_write_file(f.file);
	}

	fdput(f);
	return error;
}
/*
* Combine the results of the list ( ) operation from every xattr_handler in the
* list .
*/
ssize_t
generic_listxattr ( struct dentry * dentry , char * buffer , size_t buffer_size )
{
2010-05-14 04:53:14 +04:00
const struct xattr_handler * handler , * * handlers = dentry - > d_sb - > s_xattr ;
2005-04-17 02:20:36 +04:00
unsigned int size = 0 ;
if ( ! buffer ) {
2009-11-13 12:52:56 +03:00
for_each_xattr_handler ( handlers , handler ) {
2015-12-02 16:44:43 +03:00
if ( ! handler - > name | |
( handler - > list & & ! handler - > list ( dentry ) ) )
2015-12-02 16:44:41 +03:00
continue ;
2015-12-02 16:44:43 +03:00
size + = strlen ( handler - > name ) + 1 ;
2009-11-13 12:52:56 +03:00
}
2005-04-17 02:20:36 +04:00
} else {
char * buf = buffer ;
2015-12-02 16:44:43 +03:00
size_t len ;
2005-04-17 02:20:36 +04:00
for_each_xattr_handler ( handlers , handler ) {
2015-12-02 16:44:43 +03:00
if ( ! handler - > name | |
( handler - > list & & ! handler - > list ( dentry ) ) )
2015-12-02 16:44:41 +03:00
continue ;
2015-12-02 16:44:43 +03:00
len = strlen ( handler - > name ) ;
if ( len + 1 > buffer_size )
2005-04-17 02:20:36 +04:00
return - ERANGE ;
2015-12-02 16:44:43 +03:00
memcpy ( buf , handler - > name , len + 1 ) ;
buf + = len + 1 ;
buffer_size - = len + 1 ;
2005-04-17 02:20:36 +04:00
}
size = buf - buffer ;
}
return size ;
}
EXPORT_SYMBOL ( generic_listxattr ) ;
2012-08-24 00:53:28 +04:00
2015-10-04 20:18:52 +03:00
/**
 * xattr_full_name  -  Compute full attribute name from suffix
 *
 * @handler:	handler of the xattr_handler operation
 * @name:	name passed to the xattr_handler operation
 *
 * The get and set xattr handler operations receive only the part of the
 * attribute name that follows the handler's prefix: a handler with prefix
 * "user." is called with "foo" for the attribute "user.foo".  That suffix
 * still points into the full name, so stepping back by the length of the
 * prefix yields the full name again.
 *
 * Note: the list xattr handler operation when called from the vfs is passed a
 * NULL name; some file systems use this operation internally, with varying
 * semantics.
 */
const char *xattr_full_name(const struct xattr_handler *handler,
			    const char *name)
{
	return name - strlen(xattr_prefix(handler));
}
EXPORT_SYMBOL(xattr_full_name);
2012-08-24 00:53:28 +04:00
/*
* Allocate new xattr and copy in the value ; but leave the name to callers .
*/
struct simple_xattr * simple_xattr_alloc ( const void * value , size_t size )
{
struct simple_xattr * new_xattr ;
size_t len ;
/* wrap around? */
len = sizeof ( * new_xattr ) + size ;
2014-07-24 01:00:17 +04:00
if ( len < sizeof ( * new_xattr ) )
2012-08-24 00:53:28 +04:00
return NULL ;
2020-03-12 23:03:14 +03:00
new_xattr = kvmalloc ( len , GFP_KERNEL ) ;
2012-08-24 00:53:28 +04:00
if ( ! new_xattr )
return NULL ;
new_xattr - > size = size ;
memcpy ( new_xattr - > value , value , size ) ;
return new_xattr ;
}
/*
* xattr GET operation for in - memory / pseudo filesystems
*/
int simple_xattr_get ( struct simple_xattrs * xattrs , const char * name ,
void * buffer , size_t size )
{
struct simple_xattr * xattr ;
int ret = - ENODATA ;
spin_lock ( & xattrs - > lock ) ;
list_for_each_entry ( xattr , & xattrs - > head , list ) {
if ( strcmp ( name , xattr - > name ) )
continue ;
ret = xattr - > size ;
if ( buffer ) {
if ( size < xattr - > size )
ret = - ERANGE ;
else
memcpy ( buffer , xattr - > value , xattr - > size ) ;
}
break ;
}
spin_unlock ( & xattrs - > lock ) ;
return ret ;
}
2015-12-02 16:44:38 +03:00
/**
* simple_xattr_set - xattr SET operation for in - memory / pseudo filesystems
* @ xattrs : target simple_xattr list
* @ name : name of the extended attribute
* @ value : value of the xattr . If % NULL , will remove the attribute .
* @ size : size of the new xattr
* @ flags : % XATTR_ { CREATE | REPLACE }
2020-03-12 23:03:15 +03:00
* @ removed_size : returns size of the removed xattr , - 1 if none removed
2015-12-02 16:44:38 +03:00
*
* % XATTR_CREATE is set , the xattr shouldn ' t exist already ; otherwise fails
* with - EEXIST . If % XATTR_REPLACE is set , the xattr should exist ;
* otherwise , fails with - ENODATA .
*
* Returns 0 on success , - errno on failure .
*/
int simple_xattr_set ( struct simple_xattrs * xattrs , const char * name ,
2020-03-12 23:03:15 +03:00
const void * value , size_t size , int flags ,
ssize_t * removed_size )
2012-08-24 00:53:28 +04:00
{
struct simple_xattr * xattr ;
2012-10-18 07:41:15 +04:00
struct simple_xattr * new_xattr = NULL ;
2012-08-24 00:53:28 +04:00
int err = 0 ;
2020-04-09 09:27:29 +03:00
if ( removed_size )
* removed_size = - 1 ;
2012-08-24 00:53:28 +04:00
/* value == NULL means remove */
if ( value ) {
new_xattr = simple_xattr_alloc ( value , size ) ;
if ( ! new_xattr )
return - ENOMEM ;
new_xattr - > name = kstrdup ( name , GFP_KERNEL ) ;
if ( ! new_xattr - > name ) {
2020-03-12 23:03:14 +03:00
kvfree ( new_xattr ) ;
2012-08-24 00:53:28 +04:00
return - ENOMEM ;
}
}
spin_lock ( & xattrs - > lock ) ;
list_for_each_entry ( xattr , & xattrs - > head , list ) {
if ( ! strcmp ( name , xattr - > name ) ) {
if ( flags & XATTR_CREATE ) {
xattr = new_xattr ;
err = - EEXIST ;
} else if ( new_xattr ) {
list_replace ( & xattr - > list , & new_xattr - > list ) ;
2020-03-12 23:03:15 +03:00
if ( removed_size )
* removed_size = xattr - > size ;
2012-08-24 00:53:28 +04:00
} else {
list_del ( & xattr - > list ) ;
2020-03-12 23:03:15 +03:00
if ( removed_size )
* removed_size = xattr - > size ;
2012-08-24 00:53:28 +04:00
}
goto out ;
}
}
if ( flags & XATTR_REPLACE ) {
xattr = new_xattr ;
err = - ENODATA ;
} else {
list_add ( & new_xattr - > list , & xattrs - > head ) ;
xattr = NULL ;
}
out :
spin_unlock ( & xattrs - > lock ) ;
if ( xattr ) {
kfree ( xattr - > name ) ;
2020-03-12 23:03:14 +03:00
kvfree ( xattr ) ;
2012-08-24 00:53:28 +04:00
}
return err ;
}
/* True if @name starts with XATTR_TRUSTED_PREFIX. */
static bool xattr_is_trusted(const char *name)
{
	return strncmp(name, XATTR_TRUSTED_PREFIX,
		       XATTR_TRUSTED_PREFIX_LEN) == 0;
}
2015-12-02 16:44:39 +03:00
/*
 * Account for one attribute name in an xattr list.
 *
 * The name including its terminating NUL is always subtracted from
 * *@remaining_size.  If *@buffer is non-NULL the name is also copied there
 * and *@buffer is advanced past it; when the name does not fit, -ERANGE is
 * returned and nothing is modified.
 *
 * Returns 0 on success or -ERANGE.
 */
static int xattr_list_one(char **buffer, ssize_t *remaining_size,
			  const char *name)
{
	const size_t need = strlen(name) + 1;
	char *dst = *buffer;

	if (dst != NULL) {
		if (*remaining_size < need)
			return -ERANGE;
		memcpy(dst, name, need);
		*buffer = dst + need;
	}

	*remaining_size -= need;
	return 0;
}
2012-08-24 00:53:28 +04:00
/*
* xattr LIST operation for in - memory / pseudo filesystems
*/
2015-12-02 16:44:39 +03:00
ssize_t simple_xattr_list ( struct inode * inode , struct simple_xattrs * xattrs ,
char * buffer , size_t size )
2012-08-24 00:53:28 +04:00
{
bool trusted = capable ( CAP_SYS_ADMIN ) ;
struct simple_xattr * xattr ;
2015-12-02 16:44:39 +03:00
ssize_t remaining_size = size ;
2016-02-04 04:56:30 +03:00
int err = 0 ;
2015-12-02 16:44:39 +03:00
# ifdef CONFIG_FS_POSIX_ACL
2018-09-18 07:36:36 +03:00
if ( IS_POSIXACL ( inode ) ) {
if ( inode - > i_acl ) {
err = xattr_list_one ( & buffer , & remaining_size ,
XATTR_NAME_POSIX_ACL_ACCESS ) ;
if ( err )
return err ;
}
if ( inode - > i_default_acl ) {
err = xattr_list_one ( & buffer , & remaining_size ,
XATTR_NAME_POSIX_ACL_DEFAULT ) ;
if ( err )
return err ;
}
2015-12-02 16:44:39 +03:00
}
# endif
2012-08-24 00:53:28 +04:00
spin_lock ( & xattrs - > lock ) ;
list_for_each_entry ( xattr , & xattrs - > head , list ) {
/* skip "trusted." attributes for unprivileged callers */
if ( ! trusted & & xattr_is_trusted ( xattr - > name ) )
continue ;
2015-12-02 16:44:39 +03:00
err = xattr_list_one ( & buffer , & remaining_size , xattr - > name ) ;
if ( err )
2016-02-04 04:56:30 +03:00
break ;
2012-08-24 00:53:28 +04:00
}
spin_unlock ( & xattrs - > lock ) ;
2016-02-04 04:56:30 +03:00
return err ? err : size - remaining_size ;
2012-08-24 00:53:28 +04:00
}
2012-09-12 00:28:11 +04:00
/*
* Adds an extended attribute to the list
*/
2012-08-24 00:53:28 +04:00
void simple_xattr_list_add ( struct simple_xattrs * xattrs ,
struct simple_xattr * new_xattr )
{
spin_lock ( & xattrs - > lock ) ;
list_add ( & new_xattr - > list , & xattrs - > head ) ;
spin_unlock ( & xattrs - > lock ) ;
}