b36a5780cb
We ran into issues where mount(8) passed multiple lower layers as one big string through fsconfig(). But the fsconfig() FSCONFIG_SET_STRING option is limited to 256 bytes in strndup_user(). While this would be fixable by extending the fsconfig() buffer I'd rather encourage users to append layers via multiple fsconfig() calls as the interface allows nicely for this. This has also been requested as a feature before. With this port to the new mount api the following will be possible: fsconfig(fs_fd, FSCONFIG_SET_STRING, "lowerdir", "/lower1", 0); /* set upper layer */ fsconfig(fs_fd, FSCONFIG_SET_STRING, "upperdir", "/upper", 0); /* append "/lower2", "/lower3", and "/lower4" */ fsconfig(fs_fd, FSCONFIG_SET_STRING, "lowerdir", ":/lower2:/lower3:/lower4", 0); /* turn index feature on */ fsconfig(fs_fd, FSCONFIG_SET_STRING, "index", "on", 0); /* append "/lower5" */ fsconfig(fs_fd, FSCONFIG_SET_STRING, "lowerdir", ":/lower5", 0); Specifying ':' would have been rejected so this isn't a regression. And we can't simply use "lowerdir=/lower" to append on top of existing layers as "lowerdir=/lower,lowerdir=/other-lower" would make "/other-lower" the only lower layer so we'd break uapi if we changed this. So the ':' prefix seems a good compromise. Users can choose to specify multiple layers at once or individual layers. A layer is appended if it starts with ":". This requires that the user has already added at least one layer before. If lowerdir is specified again without a leading ":" then all previous layers are dropped and replaced with the new layers. If lowerdir is specified and empty than all layers are simply dropped. An additional change is that overlayfs will now parse and resolve layers right when they are specified in fsconfig() instead of deferring until super block creation. This allows users to receive early errors. It also allows users to actually use up to 500 layers something which was theoretically possible but ended up not working due to the mount option string passed via mount(2) being too large. This also allows a more privileged process to set config options for a lesser privileged process as the creds for fsconfig() and the creds for fsopen() can differ. We could restrict that they match by enforcing that the creds of fsopen() and fsconfig() match but I don't see why that needs to be the case and allows for a good delegation mechanism. Plus, in the future it means we're able to extend overlayfs mount options and allow users to specify layers via file descriptors instead of paths: fsconfig(FSCONFIG_SET_PATH{_EMPTY}, "lowerdir", "lower1", dirfd); /* append */ fsconfig(FSCONFIG_SET_PATH{_EMPTY}, "lowerdir", "lower2", dirfd); /* append */ fsconfig(FSCONFIG_SET_PATH{_EMPTY}, "lowerdir", "lower3", dirfd); /* clear all layers specified until now */ fsconfig(FSCONFIG_SET_STRING, "lowerdir", NULL, 0); This would be especially nice if users create an overlayfs mount on top of idmapped layers or just in general private mounts created via open_tree(OPEN_TREE_CLONE). Those mounts would then never have to appear anywhere in the filesystem. But for now just do the minimal thing. We should probably aim to move more validation into ovl_fs_parse_param() so users get errors before fsconfig(FSCONFIG_CMD_CREATE). But that can be done in additional patches later. This is now also rebased on top of the lazy lowerdata lookup which allows the specificatin of data only layers using the new "::" syntax. The rules are simple. A data only layers cannot be followed by any regular layers and data layers must be preceeded by at least one regular layer. Parsing the lowerdir mount option must change because of this. The original patchset used the old lowerdir parsing function to split a lowerdir mount option string such as: lowerdir=/lower1:/lower2::/lower3::/lower4 simply replacing each non-escaped ":" by "\0". So sequences of non-escaped ":" were counted as layers. For example, the previous lowerdir mount option above would've counted 6 layers instead of 4 and a lowerdir mount option such as: lowerdir="/lower1:/lower2::/lower3::/lower4:::::::::::::::::::::::::::" would be counted as 33 layers. Other than being ugly this didn't matter much because kern_path() would reject the first "\0" layer. However, this overcounting of layers becomes problematic when we base allocations on it where we very much only want to allocate space for 4 layers instead of 33. So the new parsing function rejects non-escaped sequences of colons other than ":" and "::" immediately instead of relying on kern_path(). Link: https://github.com/util-linux/util-linux/issues/2287 Link: https://github.com/util-linux/util-linux/issues/1992 Link: https://bugs.archlinux.org/task/78702 Link: https://lore.kernel.org/linux-unionfs/20230530-klagen-zudem-32c0908c2108@brauner Signed-off-by: Christian Brauner <brauner@kernel.org> Signed-off-by: Amir Goldstein <amir73il@gmail.com>
199 lines
4.8 KiB
C
199 lines
4.8 KiB
C
/* SPDX-License-Identifier: GPL-2.0-only */
|
|
/*
|
|
*
|
|
* Copyright (C) 2011 Novell Inc.
|
|
* Copyright (C) 2016 Red Hat, Inc.
|
|
*/
|
|
|
|
struct ovl_config {
|
|
char *upperdir;
|
|
char *workdir;
|
|
bool default_permissions;
|
|
int redirect_mode;
|
|
bool index;
|
|
bool uuid;
|
|
bool nfs_export;
|
|
int xino;
|
|
bool metacopy;
|
|
bool userxattr;
|
|
bool ovl_volatile;
|
|
};
|
|
|
|
struct ovl_sb {
|
|
struct super_block *sb;
|
|
dev_t pseudo_dev;
|
|
/* Unusable (conflicting) uuid */
|
|
bool bad_uuid;
|
|
/* Used as a lower layer (but maybe also as upper) */
|
|
bool is_lower;
|
|
};
|
|
|
|
struct ovl_layer {
|
|
/* ovl_free_fs() relies on @mnt being the first member! */
|
|
struct vfsmount *mnt;
|
|
/* Trap in ovl inode cache */
|
|
struct inode *trap;
|
|
struct ovl_sb *fs;
|
|
/* Index of this layer in fs root (upper idx == 0) */
|
|
int idx;
|
|
/* One fsid per unique underlying sb (upper fsid == 0) */
|
|
int fsid;
|
|
char *name;
|
|
};
|
|
|
|
/*
|
|
* ovl_free_fs() relies on @mnt being the first member when unmounting
|
|
* the private mounts created for each layer. Let's check both the
|
|
* offset and type.
|
|
*/
|
|
static_assert(offsetof(struct ovl_layer, mnt) == 0);
|
|
static_assert(__same_type(typeof_member(struct ovl_layer, mnt), struct vfsmount *));
|
|
|
|
struct ovl_path {
|
|
const struct ovl_layer *layer;
|
|
struct dentry *dentry;
|
|
};
|
|
|
|
struct ovl_entry {
|
|
unsigned int __numlower;
|
|
struct ovl_path __lowerstack[];
|
|
};
|
|
|
|
/* private information held for overlayfs's superblock */
|
|
struct ovl_fs {
|
|
unsigned int numlayer;
|
|
/* Number of unique fs among layers including upper fs */
|
|
unsigned int numfs;
|
|
/* Number of data-only lower layers */
|
|
unsigned int numdatalayer;
|
|
const struct ovl_layer *layers;
|
|
struct ovl_sb *fs;
|
|
/* workbasedir is the path at workdir= mount option */
|
|
struct dentry *workbasedir;
|
|
/* workdir is the 'work' directory under workbasedir */
|
|
struct dentry *workdir;
|
|
/* index directory listing overlay inodes by origin file handle */
|
|
struct dentry *indexdir;
|
|
long namelen;
|
|
/* pathnames of lower and upper dirs, for show_options */
|
|
struct ovl_config config;
|
|
/* creds of process who forced instantiation of super block */
|
|
const struct cred *creator_cred;
|
|
bool tmpfile;
|
|
bool noxattr;
|
|
/* Did we take the inuse lock? */
|
|
bool upperdir_locked;
|
|
bool workdir_locked;
|
|
/* Traps in ovl inode cache */
|
|
struct inode *workbasedir_trap;
|
|
struct inode *workdir_trap;
|
|
struct inode *indexdir_trap;
|
|
/* -1: disabled, 0: same fs, 1..32: number of unused ino bits */
|
|
int xino_mode;
|
|
/* For allocation of non-persistent inode numbers */
|
|
atomic_long_t last_ino;
|
|
/* Shared whiteout cache */
|
|
struct dentry *whiteout;
|
|
bool no_shared_whiteout;
|
|
/* r/o snapshot of upperdir sb's only taken on volatile mounts */
|
|
errseq_t errseq;
|
|
};
|
|
|
|
/* Number of lower layers, not including data-only layers */
|
|
static inline unsigned int ovl_numlowerlayer(struct ovl_fs *ofs)
|
|
{
|
|
return ofs->numlayer - ofs->numdatalayer - 1;
|
|
}
|
|
|
|
static inline struct vfsmount *ovl_upper_mnt(struct ovl_fs *ofs)
|
|
{
|
|
return ofs->layers[0].mnt;
|
|
}
|
|
|
|
static inline struct mnt_idmap *ovl_upper_mnt_idmap(struct ovl_fs *ofs)
|
|
{
|
|
return mnt_idmap(ovl_upper_mnt(ofs));
|
|
}
|
|
|
|
static inline struct ovl_fs *OVL_FS(struct super_block *sb)
|
|
{
|
|
return (struct ovl_fs *)sb->s_fs_info;
|
|
}
|
|
|
|
static inline bool ovl_should_sync(struct ovl_fs *ofs)
|
|
{
|
|
return !ofs->config.ovl_volatile;
|
|
}
|
|
|
|
static inline unsigned int ovl_numlower(struct ovl_entry *oe)
|
|
{
|
|
return oe ? oe->__numlower : 0;
|
|
}
|
|
|
|
static inline struct ovl_path *ovl_lowerstack(struct ovl_entry *oe)
|
|
{
|
|
return ovl_numlower(oe) ? oe->__lowerstack : NULL;
|
|
}
|
|
|
|
static inline struct ovl_path *ovl_lowerpath(struct ovl_entry *oe)
|
|
{
|
|
return ovl_lowerstack(oe);
|
|
}
|
|
|
|
static inline struct ovl_path *ovl_lowerdata(struct ovl_entry *oe)
|
|
{
|
|
struct ovl_path *lowerstack = ovl_lowerstack(oe);
|
|
|
|
return lowerstack ? &lowerstack[oe->__numlower - 1] : NULL;
|
|
}
|
|
|
|
/* May return NULL if lazy lookup of lowerdata is needed */
|
|
static inline struct dentry *ovl_lowerdata_dentry(struct ovl_entry *oe)
|
|
{
|
|
struct ovl_path *lowerdata = ovl_lowerdata(oe);
|
|
|
|
return lowerdata ? READ_ONCE(lowerdata->dentry) : NULL;
|
|
}
|
|
|
|
/* private information held for every overlayfs dentry */
|
|
static inline unsigned long *OVL_E_FLAGS(struct dentry *dentry)
|
|
{
|
|
return (unsigned long *) &dentry->d_fsdata;
|
|
}
|
|
|
|
struct ovl_inode {
|
|
union {
|
|
struct ovl_dir_cache *cache; /* directory */
|
|
const char *lowerdata_redirect; /* regular file */
|
|
};
|
|
const char *redirect;
|
|
u64 version;
|
|
unsigned long flags;
|
|
struct inode vfs_inode;
|
|
struct dentry *__upperdentry;
|
|
struct ovl_entry *oe;
|
|
|
|
/* synchronize copy up and more */
|
|
struct mutex lock;
|
|
};
|
|
|
|
static inline struct ovl_inode *OVL_I(struct inode *inode)
|
|
{
|
|
return container_of(inode, struct ovl_inode, vfs_inode);
|
|
}
|
|
|
|
static inline struct ovl_entry *OVL_I_E(struct inode *inode)
|
|
{
|
|
return inode ? OVL_I(inode)->oe : NULL;
|
|
}
|
|
|
|
static inline struct ovl_entry *OVL_E(struct dentry *dentry)
|
|
{
|
|
return OVL_I_E(d_inode(dentry));
|
|
}
|
|
|
|
static inline struct dentry *ovl_upperdentry_dereference(struct ovl_inode *oi)
|
|
{
|
|
return READ_ONCE(oi->__upperdentry);
|
|
}
|