1
0
mirror of https://github.com/systemd/systemd.git synced 2025-01-05 13:18:06 +03:00

Merge pull request #33475 from poettering/name-to-handle-at-fid

teach inode_same() the concept of name_to_handle_at() FIDs to properly detect inode identities
This commit is contained in:
Lennart Poettering 2024-07-01 18:53:14 +02:00 committed by GitHub
commit 1c30bf35bc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 227 additions and 34 deletions

View File

@ -92,3 +92,7 @@
#define RAW_O_LARGEFILE 00100000
#endif
#endif
#ifndef AT_HANDLE_FID
#define AT_HANDLE_FID AT_REMOVEDIR
#endif

View File

@ -13,6 +13,7 @@
#include "fileio.h"
#include "filesystems.h"
#include "fs-util.h"
#include "missing_fcntl.h"
#include "missing_fs.h"
#include "missing_mount.h"
#include "missing_stat.h"
@ -35,6 +36,24 @@
* with large file handles anyway. */
#define ORIGINAL_MAX_HANDLE_SZ 128
bool is_name_to_handle_at_fatal_error(int err) {
        /* Classifies a negative errno-style result of name_to_handle_at(): returns false for errors that
         * merely reflect the context we are running in, and true for everything else.
         *
         * The tolerated ("acceptable") errors are:
         *   - the kernel does not support name_to_handle_at() at all (ENOSYS and friends),
         *   - the syscall was blocked (EACCES/EPERM; maybe through seccomp, because we are running
         *     inside of a container),
         *   - the mount point is not triggered yet (EOVERFLOW, think autofs+nfs4),
         *   - some general name_to_handle_at() flakiness (EINVAL).
         *
         * Other errors are not supposed to happen and are therefore considered fatal. */

        assert(err < 0);

        /* Evaluated left to right with short-circuiting, i.e. in the order listed above. */
        bool tolerated =
                ERRNO_IS_NEG_NOT_SUPPORTED(err) ||
                ERRNO_IS_NEG_PRIVILEGE(err) ||
                IN_SET(err, -EOVERFLOW, -EINVAL);

        return !tolerated;
}
int name_to_handle_at_loop(
int fd,
const char *path,
@ -44,7 +63,8 @@ int name_to_handle_at_loop(
size_t n = ORIGINAL_MAX_HANDLE_SZ;
assert((flags & ~(AT_SYMLINK_FOLLOW|AT_EMPTY_PATH)) == 0);
assert(fd >= 0 || fd == AT_FDCWD);
assert((flags & ~(AT_SYMLINK_FOLLOW|AT_EMPTY_PATH|AT_HANDLE_FID)) == 0);
/* We need to invoke name_to_handle_at() in a loop, given that it might return EOVERFLOW when the specified
* buffer is too small. Note that in contrast to what the docs might suggest, MAX_HANDLE_SZ is only good as a
@ -86,9 +106,9 @@ int name_to_handle_at_loop(
return 0;
}
/* If name_to_handle_at() didn't increase the byte size, then this EOVERFLOW is caused by something
* else (apparently EOVERFLOW is returned for untriggered nfs4 mounts sometimes), not by the too small
* buffer. In that case propagate EOVERFLOW */
/* If name_to_handle_at() didn't increase the byte size, then this EOVERFLOW is caused by
* something else (apparently EOVERFLOW is returned for untriggered nfs4 autofs mounts
* sometimes), not by the too small buffer. In that case propagate EOVERFLOW */
if (h->handle_bytes <= n)
return -EOVERFLOW;
@ -101,6 +121,30 @@ int name_to_handle_at_loop(
}
}
int name_to_handle_at_try_fid(
                int fd,
                const char *path,
                struct file_handle **ret_handle,
                int *ret_mnt_id,
                int flags) {

        int r;

        assert(fd >= 0 || fd == AT_FDCWD);

        /* Acquires a file handle via name_to_handle_at_loop(), preferring an AT_HANDLE_FID handle.
         *
         * The first attempt is made with AT_HANDLE_FID added to the flags. If that fails with an error
         * that is_name_to_handle_at_fatal_error() does not consider fatal, the call is repeated without
         * the flag, in order to support older kernels that didn't have AT_HANDLE_FID (i.e. older than
         * Linux 6.5).
         *
         * Returns the result of whichever name_to_handle_at_loop() call was made last: non-negative on
         * success, a negative errno-style value on failure. */

        r = name_to_handle_at_loop(fd, path, ret_handle, ret_mnt_id, flags | AT_HANDLE_FID);
        if (r < 0 && !is_name_to_handle_at_fatal_error(r))
                /* Non-fatal failure: retry the classic way, without AT_HANDLE_FID */
                r = name_to_handle_at_loop(fd, path, ret_handle, ret_mnt_id, flags & ~AT_HANDLE_FID);

        return r;
}
static int fd_fdinfo_mnt_id(int fd, const char *filename, int flags, int *ret_mnt_id) {
char path[STRLEN("/proc/self/fdinfo/") + DECIMAL_STR_MAX(int)];
_cleanup_free_ char *fdinfo = NULL;
@ -160,17 +204,15 @@ static bool filename_possibly_with_slash_suffix(const char *s) {
return filename_is_valid(copied);
}
static bool is_name_to_handle_at_fatal_error(int err) {
/* name_to_handle_at() can return "acceptable" errors that are due to the context. For
* example the kernel does not support name_to_handle_at() at all (ENOSYS), or the syscall
* was blocked (EACCES/EPERM; maybe through seccomp, because we are running inside of a
* container), or the mount point is not triggered yet (EOVERFLOW, think nfs4), or some
* general name_to_handle_at() flakiness (EINVAL). However other errors are not supposed to
* happen and therefore are considered fatal ones. */
bool file_handle_equal(const struct file_handle *a, const struct file_handle *b) {
if (a == b)
return true;
if (!a != !b)
return false;
if (a->handle_type != b->handle_type)
return false;
assert(err < 0);
return !IN_SET(err, -EOPNOTSUPP, -ENOSYS, -EACCES, -EPERM, -EOVERFLOW, -EINVAL);
return memcmp_nn(a->f_handle, a->handle_bytes, b->f_handle, b->handle_bytes) == 0;
}
int fd_is_mount_point(int fd, const char *filename, int flags) {
@ -234,11 +276,11 @@ int fd_is_mount_point(int fd, const char *filename, int flags) {
else if (FLAGS_SET(sx.stx_mask, STATX_TYPE) && S_ISLNK(sx.stx_mode))
return false; /* symlinks are never mount points */
r = name_to_handle_at_loop(fd, filename, &h, &mount_id, flags);
r = name_to_handle_at_try_fid(fd, filename, &h, &mount_id, flags);
if (r < 0) {
if (is_name_to_handle_at_fatal_error(r))
return r;
if (r != -EOPNOTSUPP)
if (!ERRNO_IS_NOT_SUPPORTED(r))
goto fallback_fdinfo;
/* This kernel or file system does not support name_to_handle_at(), hence let's see
@ -248,13 +290,13 @@ int fd_is_mount_point(int fd, const char *filename, int flags) {
}
if (isempty(filename))
r = name_to_handle_at_loop(fd, "..", &h_parent, &mount_id_parent, 0); /* can't work for non-directories 😢 */
r = name_to_handle_at_try_fid(fd, "..", &h_parent, &mount_id_parent, 0); /* can't work for non-directories 😢 */
else
r = name_to_handle_at_loop(fd, "", &h_parent, &mount_id_parent, AT_EMPTY_PATH);
r = name_to_handle_at_try_fid(fd, "", &h_parent, &mount_id_parent, AT_EMPTY_PATH);
if (r < 0) {
if (is_name_to_handle_at_fatal_error(r))
return r;
if (r != -EOPNOTSUPP)
if (!ERRNO_IS_NOT_SUPPORTED(r))
goto fallback_fdinfo;
if (nosupp)
/* Both the parent and the directory can't do name_to_handle_at() */
@ -272,17 +314,14 @@ int fd_is_mount_point(int fd, const char *filename, int flags) {
/* If the file handle for the directory we are interested in and its parent are identical,
* we assume this is the root directory, which is a mount point. */
if (h->handle_type == h_parent->handle_type &&
memcmp_nn(h->f_handle, h->handle_bytes,
h_parent->f_handle, h_parent->handle_bytes) == 0)
if (file_handle_equal(h_parent, h))
return 1;
return mount_id != mount_id_parent;
fallback_fdinfo:
r = fd_fdinfo_mnt_id(fd, filename, flags, &mount_id);
if (IN_SET(r, -EOPNOTSUPP, -EACCES, -EPERM, -ENOSYS))
if (ERRNO_IS_NEG_NOT_SUPPORTED(r) || ERRNO_IS_NEG_PRIVILEGE(r))
goto fallback_fstat;
if (r < 0)
return r;

View File

@ -36,7 +36,12 @@
#define TMPFS_LIMITS_ROOTFS TMPFS_LIMITS_VAR
#define TMPFS_LIMITS_VOLATILE_STATE TMPFS_LIMITS_VAR
/* Takes a negative error as returned by name_to_handle_at(). Returns false for "acceptable" errors that are
 * due to the context (not supported, blocked, mount point not triggered yet, general flakiness), and true
 * for all other errors. */
bool is_name_to_handle_at_fatal_error(int err);
/* Invokes name_to_handle_at() in a loop, since it might return EOVERFLOW when the specified buffer is too
 * small. */
int name_to_handle_at_loop(int fd, const char *path, struct file_handle **ret_handle, int *ret_mnt_id, int flags);
/* Like name_to_handle_at_loop(), but first tries with AT_HANDLE_FID set, and retries without the flag if
 * that fails with a non-fatal error. */
int name_to_handle_at_try_fid(int fd, const char *path, struct file_handle **ret_handle, int *ret_mnt_id, int flags);
/* Returns true if a and b are the same pointer (including both NULL), or both are non-NULL and have the same
 * handle type and the same handle data. */
bool file_handle_equal(const struct file_handle *a, const struct file_handle *b);
int path_get_mnt_id_at_fallback(int dir_fd, const char *path, int *ret);
int path_get_mnt_id_at(int dir_fd, const char *path, int *ret);

View File

@ -20,6 +20,7 @@
#include "missing_fs.h"
#include "missing_magic.h"
#include "missing_syscall.h"
#include "mountpoint-util.h"
#include "nulstr-util.h"
#include "parse-util.h"
#include "stat-util.h"
@ -271,18 +272,103 @@ int path_is_read_only_fs(const char *path) {
}
int inode_same_at(int fda, const char *filea, int fdb, const char *fileb, int flags) {
struct stat a, b;
struct stat sta, stb;
int r;
assert(fda >= 0 || fda == AT_FDCWD);
assert(fdb >= 0 || fdb == AT_FDCWD);
assert((flags & ~(AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW|AT_NO_AUTOMOUNT)) == 0);
if (fstatat(fda, strempty(filea), &a, flags) < 0)
return log_debug_errno(errno, "Cannot stat %s: %m", filea);
/* Refuse an unset filea or fileb early unless AT_EMPTY_PATH is set */
if ((isempty(filea) || isempty(fileb)) && !FLAGS_SET(flags, AT_EMPTY_PATH))
return -EINVAL;
if (fstatat(fdb, strempty(fileb), &b, flags) < 0)
return log_debug_errno(errno, "Cannot stat %s: %m", fileb);
/* Shortcut: comparing the same fd with itself means we can return true */
if (fda >= 0 && fda == fdb && isempty(filea) && isempty(fileb) && FLAGS_SET(flags, AT_SYMLINK_NOFOLLOW))
return true;
return stat_inode_same(&a, &b);
_cleanup_close_ int pin_a = -EBADF, pin_b = -EBADF;
if (!FLAGS_SET(flags, AT_NO_AUTOMOUNT)) {
/* Let's try to use the name_to_handle_at() AT_HANDLE_FID API to identify identical
* inodes. We have to issue multiple calls on the same file for that (first, to acquire the
* FID, and then to check if .st_dev is actually the same). Hence let's pin the inode in
* between via O_PATH, unless we already have an fd for it. */
if (!isempty(filea)) {
pin_a = openat(fda, filea, O_PATH|O_CLOEXEC|(FLAGS_SET(flags, AT_SYMLINK_NOFOLLOW) ? O_NOFOLLOW : 0));
if (pin_a < 0)
return -errno;
fda = pin_a;
filea = NULL;
flags |= AT_EMPTY_PATH;
}
if (!isempty(fileb)) {
pin_b = openat(fdb, fileb, O_PATH|O_CLOEXEC|(FLAGS_SET(flags, AT_SYMLINK_NOFOLLOW) ? O_NOFOLLOW : 0));
if (pin_b < 0)
return -errno;
fdb = pin_b;
fileb = NULL;
flags |= AT_EMPTY_PATH;
}
int ntha_flags = (flags & AT_EMPTY_PATH) | (FLAGS_SET(flags, AT_SYMLINK_NOFOLLOW) ? 0 : AT_SYMLINK_FOLLOW);
_cleanup_free_ struct file_handle *ha = NULL, *hb = NULL;
int mntida = -1, mntidb = -1;
r = name_to_handle_at_try_fid(
fda,
filea,
&ha,
&mntida,
ntha_flags);
if (r < 0) {
if (is_name_to_handle_at_fatal_error(r))
return r;
goto fallback;
}
r = name_to_handle_at_try_fid(
fdb,
fileb,
&hb,
&mntidb,
ntha_flags);
if (r < 0) {
if (is_name_to_handle_at_fatal_error(r))
return r;
goto fallback;
}
/* Now compare the two file handles */
if (!file_handle_equal(ha, hb))
return false;
/* If the file handles are the same and they come from the same mount ID? Great, then we are
* good, they are definitely the same */
if (mntida == mntidb)
return true;
/* File handles are the same, but they are not on the same mount ID. This might either be because
* they are on two entirely different file systems that just happen to have the same FIDs
* (because they originally were created off the same disk images), or it could be because
* they are located on two distinct bind mounts of the same fs. To check that, let's look at
* .st_dev of the inode. We simply reuse the fallback codepath for that, since it checks
* exactly that (it checks slightly more, but we don't care.) */
}
fallback:
if (fstatat(fda, strempty(filea), &sta, flags) < 0)
return log_debug_errno(errno, "Cannot stat %s: %m", strna(filea));
if (fstatat(fdb, strempty(fileb), &stb, flags) < 0)
return log_debug_errno(errno, "Cannot stat %s: %m", strna(fileb));
return stat_inode_same(&sta, &stb);
}
bool is_fs_type(const struct statfs *s, statfs_f_type_t magic_value) {

View File

@ -1815,3 +1815,13 @@ int make_fsmount(
return TAKE_FD(mnt_fd);
}
char* umount_and_unlink_and_free(char *p) {
        /* Destructor-style helper: unmounts whatever is mounted on the path p, unlinks the path, and
         * releases the string. Failures of umount2() and unlink() are deliberately ignored. A NULL p is
         * a no-op. Returns NULL for a NULL p, otherwise whatever mfree() returns. */

        if (p) {
                /* NOTE(review): PROTECT_ERRNO presumably preserves the caller's errno across the calls
                 * below; it is declared only once we know there is work to do. */
                PROTECT_ERRNO;

                (void) umount2(p, 0);
                (void) unlink(p);

                return mfree(p);
        }

        return NULL;
}

View File

@ -100,6 +100,9 @@ static inline char *umount_and_free(char *p) {
}
DEFINE_TRIVIAL_CLEANUP_FUNC(char*, umount_and_free);
/* Unmounts and unlinks the path p (ignoring failures of either step), then frees the string. NULL is
 * accepted and is a no-op. The cleanup function declared below is what _cleanup_(umount_and_unlink_and_freep)
 * refers to. */
char* umount_and_unlink_and_free(char *p);
DEFINE_TRIVIAL_CLEANUP_FUNC(char*, umount_and_unlink_and_free);
int bind_mount_in_namespace(PidRef *target, const char *propagate_path, const char *incoming_path, const char *src, const char *dest, bool read_only, bool make_file_or_directory);
int mount_image_in_namespace(PidRef *target, const char *propagate_path, const char *incoming_path, const char *src, const char *dest, bool read_only, bool make_file_or_directory, const MountOptions *options, const ImagePolicy *image_policy);

View File

@ -4,6 +4,7 @@
#include <linux/magic.h>
#include <sched.h>
#include <sys/eventfd.h>
#include <sys/mount.h>
#include <unistd.h>
#include "alloc-util.h"
@ -11,6 +12,8 @@
#include "fd-util.h"
#include "fs-util.h"
#include "macro.h"
#include "missing_mount.h"
#include "mount-util.h"
#include "mountpoint-util.h"
#include "namespace-util.h"
#include "path-util.h"
@ -47,15 +50,58 @@ TEST(inode_same) {
_cleanup_close_ int fd = -EBADF;
_cleanup_(unlink_tempfilep) char name[] = "/tmp/test-files_same.XXXXXX";
_cleanup_(unlink_tempfilep) char name_alias[] = "/tmp/test-files_same.alias";
int r;
fd = mkostemp_safe(name);
assert_se(fd >= 0);
assert_se(symlink(name, name_alias) >= 0);
assert_se(inode_same(name, name, 0));
assert_se(inode_same(name, name, AT_SYMLINK_NOFOLLOW));
assert_se(inode_same(name, name_alias, 0));
assert_se(!inode_same(name, name_alias, AT_SYMLINK_NOFOLLOW));
assert_se(inode_same(name, name, 0) > 0);
assert_se(inode_same(name, name, AT_SYMLINK_NOFOLLOW) > 0);
assert_se(inode_same(name, name_alias, 0) > 0);
assert_se(inode_same(name, name_alias, AT_SYMLINK_NOFOLLOW) == 0);
assert_se(inode_same("/proc", "/proc", 0));
assert_se(inode_same("/proc", "/proc", AT_SYMLINK_NOFOLLOW));
_cleanup_close_ int fd1 = open("/dev/null", O_CLOEXEC|O_RDONLY),
fd2 = open("/dev/null", O_CLOEXEC|O_RDONLY);
assert_se(fd1 >= 0);
assert_se(fd2 >= 0);
assert_se(inode_same_at(fd1, NULL, fd2, NULL, AT_EMPTY_PATH) > 0);
assert_se(inode_same_at(fd2, NULL, fd1, NULL, AT_EMPTY_PATH) > 0);
assert_se(inode_same_at(fd1, NULL, fd2, NULL, AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW) > 0);
assert_se(inode_same_at(fd2, NULL, fd1, NULL, AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW) > 0);
assert_se(inode_same_at(fd1, NULL, fd1, NULL, AT_EMPTY_PATH) > 0);
assert_se(inode_same_at(fd2, NULL, fd2, NULL, AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW) > 0);
safe_close(fd2);
fd2 = open("/dev/urandom", O_CLOEXEC|O_RDONLY);
assert_se(fd2 >= 0);
assert_se(inode_same_at(fd1, NULL, fd2, NULL, AT_EMPTY_PATH) == 0);
assert_se(inode_same_at(fd2, NULL, fd1, NULL, AT_EMPTY_PATH) == 0);
assert_se(inode_same_at(fd1, NULL, fd2, NULL, AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW) == 0);
assert_se(inode_same_at(fd2, NULL, fd1, NULL, AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW) == 0);
assert_se(inode_same_at(AT_FDCWD, NULL, AT_FDCWD, NULL, AT_EMPTY_PATH) > 0);
assert_se(inode_same_at(AT_FDCWD, NULL, fd1, NULL, AT_EMPTY_PATH) == 0);
assert_se(inode_same_at(fd1, NULL, AT_FDCWD, NULL, AT_EMPTY_PATH) == 0);
_cleanup_(umount_and_unlink_and_freep) char *p = NULL;
assert_se(tempfn_random_child(NULL, NULL, &p) >= 0);
assert_se(touch(p) >= 0);
r = mount_nofollow_verbose(LOG_ERR, name, p, NULL, MS_BIND, NULL);
if (r < 0)
assert_se(ERRNO_IS_NEG_PRIVILEGE(r));
else {
assert_se(inode_same(name, p, 0) > 0);
assert_se(inode_same(name, p, AT_SYMLINK_NOFOLLOW) > 0);
}
}
TEST(is_symlink) {