Merge pull request #33475 from poettering/name-to-handle-at-fid

teach inode_same() the concept of name_to_handle_at() FIDs to properly detect inode identities
2025-01-05 13:18:06 +03:00 · 2024-07-01 18:53:14 +02:00 · 2024-07-01 18:53:14 +02:00 · 1c30bf35bc
commit 1c30bf35bc
parent 9fc4388ae4 a8db162d8a
7 changed files with 227 additions and 34 deletions
--- a/src/basic/missing_fcntl.h
+++ b/src/basic/missing_fcntl.h
@ -92,3 +92,7 @@
 #define RAW_O_LARGEFILE 00100000
 #endif
 #endif
+
+#ifndef AT_HANDLE_FID
+#define AT_HANDLE_FID AT_REMOVEDIR
+#endif
--- a/src/basic/mountpoint-util.c
+++ b/src/basic/mountpoint-util.c
@ -13,6 +13,7 @@
 #include "fileio.h"
 #include "filesystems.h"
 #include "fs-util.h"
+#include "missing_fcntl.h"
 #include "missing_fs.h"
 #include "missing_mount.h"
 #include "missing_stat.h"
@ -35,6 +36,24 @@
 * with large file handles anyway. */
 #define ORIGINAL_MAX_HANDLE_SZ 128

+bool is_name_to_handle_at_fatal_error(int err) {
+        /* name_to_handle_at() can return "acceptable" errors that are due to the context. For example the
+         * kernel does not support name_to_handle_at() at all (ENOSYS), or the syscall was blocked
+         * (EACCES/EPERM; maybe through seccomp, because we are running inside of a container), or the mount
+         * point is not triggered yet (EOVERFLOW, think autofs+nfs4), or some general name_to_handle_at()
+         * flakiness (EINVAL). However other errors are not supposed to happen and therefore are considered
+         * fatal ones. */
+
+        assert(err < 0);
+
+        if (ERRNO_IS_NEG_NOT_SUPPORTED(err))
+                return false;
+        if (ERRNO_IS_NEG_PRIVILEGE(err))
+                return false;
+
+        return !IN_SET(err, -EOVERFLOW, -EINVAL);
+}
+
 int name_to_handle_at_loop(
                int fd,
                const char *path,
@ -44,7 +63,8 @@ int name_to_handle_at_loop(

        size_t n = ORIGINAL_MAX_HANDLE_SZ;

-        assert((flags & ~(AT_SYMLINK_FOLLOW|AT_EMPTY_PATH)) == 0);
+        assert(fd >= 0 || fd == AT_FDCWD);
+        assert((flags & ~(AT_SYMLINK_FOLLOW|AT_EMPTY_PATH|AT_HANDLE_FID)) == 0);

        /* We need to invoke name_to_handle_at() in a loop, given that it might return EOVERFLOW when the specified
         * buffer is too small. Note that in contrast to what the docs might suggest, MAX_HANDLE_SZ is only good as a
@ -86,9 +106,9 @@ int name_to_handle_at_loop(
                        return 0;
                }

-                /* If name_to_handle_at() didn't increase the byte size, then this EOVERFLOW is caused by something
-                 * else (apparently EOVERFLOW is returned for untriggered nfs4 mounts sometimes), not by the too small
-                 * buffer. In that case propagate EOVERFLOW */
+                /* If name_to_handle_at() didn't increase the byte size, then this EOVERFLOW is caused by
+                 * something else (apparently EOVERFLOW is returned for untriggered nfs4 autofs mounts
+                 * sometimes), not by the too small buffer. In that case propagate EOVERFLOW */
                if (h->handle_bytes <= n)
                        return -EOVERFLOW;

@ -101,6 +121,30 @@ int name_to_handle_at_loop(
        }
 }

+int name_to_handle_at_try_fid(
+                int fd,
+                const char *path,
+                struct file_handle **ret_handle,
+                int *ret_mnt_id,
+                int flags) {
+
+        int r;
+
+        assert(fd >= 0 || fd == AT_FDCWD);
+
+        /* First issue name_to_handle_at() with AT_HANDLE_FID. If this fails and the error is not a fatal
+         * one, we'll retry without the flag, in order to support older kernels that didn't have
+         * AT_HANDLE_FID (i.e. older than Linux 6.5). */
+
+        r = name_to_handle_at_loop(fd, path, ret_handle, ret_mnt_id, flags | AT_HANDLE_FID);
+        if (r >= 0)
+                return r;
+        if (is_name_to_handle_at_fatal_error(r))
+                return r;
+
+        return name_to_handle_at_loop(fd, path, ret_handle, ret_mnt_id, flags & ~AT_HANDLE_FID);
+}
+
 static int fd_fdinfo_mnt_id(int fd, const char *filename, int flags, int *ret_mnt_id) {
        char path[STRLEN("/proc/self/fdinfo/") + DECIMAL_STR_MAX(int)];
        _cleanup_free_ char *fdinfo = NULL;
@ -160,17 +204,15 @@ static bool filename_possibly_with_slash_suffix(const char *s) {
        return filename_is_valid(copied);
 }

-static bool is_name_to_handle_at_fatal_error(int err) {
-        /* name_to_handle_at() can return "acceptable" errors that are due to the context. For
-         * example the kernel does not support name_to_handle_at() at all (ENOSYS), or the syscall
-         * was blocked (EACCES/EPERM; maybe through seccomp, because we are running inside of a
-         * container), or the mount point is not triggered yet (EOVERFLOW, think nfs4), or some
-         * general name_to_handle_at() flakiness (EINVAL). However other errors are not supposed to
-         * happen and therefore are considered fatal ones. */
+bool file_handle_equal(const struct file_handle *a, const struct file_handle *b) {
+        if (a == b)
+                return true;
+        if (!a != !b)
+                return false;
+        if (a->handle_type != b->handle_type)
+                return false;

-        assert(err < 0);
-
-        return !IN_SET(err, -EOPNOTSUPP, -ENOSYS, -EACCES, -EPERM, -EOVERFLOW, -EINVAL);
+        return memcmp_nn(a->f_handle, a->handle_bytes, b->f_handle, b->handle_bytes) == 0;
 }

 int fd_is_mount_point(int fd, const char *filename, int flags) {
@ -234,11 +276,11 @@ int fd_is_mount_point(int fd, const char *filename, int flags) {
        else if (FLAGS_SET(sx.stx_mask, STATX_TYPE) && S_ISLNK(sx.stx_mode))
                return false; /* symlinks are never mount points */

-        r = name_to_handle_at_loop(fd, filename, &h, &mount_id, flags);
+        r = name_to_handle_at_try_fid(fd, filename, &h, &mount_id, flags);
        if (r < 0) {
                if (is_name_to_handle_at_fatal_error(r))
                        return r;
-                if (r != -EOPNOTSUPP)
+                if (!ERRNO_IS_NOT_SUPPORTED(r))
                        goto fallback_fdinfo;

                /* This kernel or file system does not support name_to_handle_at(), hence let's see
@ -248,13 +290,13 @@ int fd_is_mount_point(int fd, const char *filename, int flags) {
        }

        if (isempty(filename))
-                r = name_to_handle_at_loop(fd, "..", &h_parent, &mount_id_parent, 0); /* can't work for non-directories 😢 */
+                r = name_to_handle_at_try_fid(fd, "..", &h_parent, &mount_id_parent, 0); /* can't work for non-directories 😢 */
        else
-                r = name_to_handle_at_loop(fd, "", &h_parent, &mount_id_parent, AT_EMPTY_PATH);
+                r = name_to_handle_at_try_fid(fd, "", &h_parent, &mount_id_parent, AT_EMPTY_PATH);
        if (r < 0) {
                if (is_name_to_handle_at_fatal_error(r))
                        return r;
-                if (r != -EOPNOTSUPP)
+                if (!ERRNO_IS_NOT_SUPPORTED(r))
                        goto fallback_fdinfo;
                if (nosupp)
                        /* Both the parent and the directory can't do name_to_handle_at() */
@ -272,17 +314,14 @@ int fd_is_mount_point(int fd, const char *filename, int flags) {

        /* If the file handle for the directory we are interested in and its parent are identical,
         * we assume this is the root directory, which is a mount point. */
-
-        if (h->handle_type == h_parent->handle_type &&
-            memcmp_nn(h->f_handle, h->handle_bytes,
-                      h_parent->f_handle, h_parent->handle_bytes) == 0)
+        if (file_handle_equal(h_parent, h))
                return 1;

        return mount_id != mount_id_parent;

 fallback_fdinfo:
        r = fd_fdinfo_mnt_id(fd, filename, flags, &mount_id);
-        if (IN_SET(r, -EOPNOTSUPP, -EACCES, -EPERM, -ENOSYS))
+        if (ERRNO_IS_NEG_NOT_SUPPORTED(r) || ERRNO_IS_NEG_PRIVILEGE(r))
                goto fallback_fstat;
        if (r < 0)
                return r;
--- a/src/basic/mountpoint-util.h
+++ b/src/basic/mountpoint-util.h
@ -36,7 +36,12 @@
 #define TMPFS_LIMITS_ROOTFS          TMPFS_LIMITS_VAR
 #define TMPFS_LIMITS_VOLATILE_STATE  TMPFS_LIMITS_VAR

+bool is_name_to_handle_at_fatal_error(int err);
+
 int name_to_handle_at_loop(int fd, const char *path, struct file_handle **ret_handle, int *ret_mnt_id, int flags);
+int name_to_handle_at_try_fid(int fd, const char *path, struct file_handle **ret_handle, int *ret_mnt_id, int flags);
+
+bool file_handle_equal(const struct file_handle *a, const struct file_handle *b);

 int path_get_mnt_id_at_fallback(int dir_fd, const char *path, int *ret);
 int path_get_mnt_id_at(int dir_fd, const char *path, int *ret);
--- a/src/basic/stat-util.c
+++ b/src/basic/stat-util.c
@ -20,6 +20,7 @@
 #include "missing_fs.h"
 #include "missing_magic.h"
 #include "missing_syscall.h"
+#include "mountpoint-util.h"
 #include "nulstr-util.h"
 #include "parse-util.h"
 #include "stat-util.h"
@ -271,18 +272,103 @@ int path_is_read_only_fs(const char *path) {
 }

 int inode_same_at(int fda, const char *filea, int fdb, const char *fileb, int flags) {
-        struct stat a, b;
+        struct stat sta, stb;
+        int r;

        assert(fda >= 0 || fda == AT_FDCWD);
        assert(fdb >= 0 || fdb == AT_FDCWD);
+        assert((flags & ~(AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW|AT_NO_AUTOMOUNT)) == 0);

-        if (fstatat(fda, strempty(filea), &a, flags) < 0)
-                return log_debug_errno(errno, "Cannot stat %s: %m", filea);
+        /* Refuse an unset filea or fileb early unless AT_EMPTY_PATH is set */
+        if ((isempty(filea) || isempty(fileb)) && !FLAGS_SET(flags, AT_EMPTY_PATH))
+                return -EINVAL;

-        if (fstatat(fdb, strempty(fileb), &b, flags) < 0)
-                return log_debug_errno(errno, "Cannot stat %s: %m", fileb);
+        /* Shortcut: comparing the same fd with itself means we can return true */
+        if (fda >= 0 && fda == fdb && isempty(filea) && isempty(fileb) && FLAGS_SET(flags, AT_SYMLINK_NOFOLLOW))
+                return true;

-        return stat_inode_same(&a, &b);
+        _cleanup_close_ int pin_a = -EBADF, pin_b = -EBADF;
+        if (!FLAGS_SET(flags, AT_NO_AUTOMOUNT)) {
+                /* Let's try to use the name_to_handle_at() AT_HANDLE_FID API to identify identical
+                 * inodes. We have to issue multiple calls on the same file for that (first, to acquire the
+                 * FID, and then to check if .st_dev is actually the same). Hence let's pin the inode in
+                 * between via O_PATH, unless we already have an fd for it. */
+
+                if (!isempty(filea)) {
+                        pin_a = openat(fda, filea, O_PATH|O_CLOEXEC|(FLAGS_SET(flags, AT_SYMLINK_NOFOLLOW) ? O_NOFOLLOW : 0));
+                        if (pin_a < 0)
+                                return -errno;
+
+                        fda = pin_a;
+                        filea = NULL;
+                        flags |= AT_EMPTY_PATH;
+                }
+
+                if (!isempty(fileb)) {
+                        pin_b = openat(fdb, fileb, O_PATH|O_CLOEXEC|(FLAGS_SET(flags, AT_SYMLINK_NOFOLLOW) ? O_NOFOLLOW : 0));
+                        if (pin_b < 0)
+                                return -errno;
+
+                        fdb = pin_b;
+                        fileb = NULL;
+                        flags |= AT_EMPTY_PATH;
+                }
+
+                int ntha_flags = (flags & AT_EMPTY_PATH) | (FLAGS_SET(flags, AT_SYMLINK_NOFOLLOW) ? 0 : AT_SYMLINK_FOLLOW);
+                _cleanup_free_ struct file_handle *ha = NULL, *hb = NULL;
+                int mntida = -1, mntidb = -1;
+
+                r = name_to_handle_at_try_fid(
+                                fda,
+                                filea,
+                                &ha,
+                                &mntida,
+                                ntha_flags);
+                if (r < 0) {
+                        if (is_name_to_handle_at_fatal_error(r))
+                                return r;
+
+                        goto fallback;
+                }
+
+                r = name_to_handle_at_try_fid(
+                                fdb,
+                                fileb,
+                                &hb,
+                                &mntidb,
+                                ntha_flags);
+                if (r < 0) {
+                        if (is_name_to_handle_at_fatal_error(r))
+                                return r;
+
+                        goto fallback;
+                }
+
+                /* Now compare the two file handles */
+                if (!file_handle_equal(ha, hb))
+                        return false;
+
+                /* If the file handles are the same and they come from the same mount ID, then we are good:
+                 * they are definitely the same inode. */
+                if (mntida == mntidb)
+                        return true;
+
+                /* File handles are the same, but they are not on the same mount ID. This might either be
+                 * because they are on two entirely different file systems that just happen to have the same
+                 * FIDs (because they originally were created off the same disk image), or it could be
+                 * because they are located on two distinct bind mounts of the same fs. To check that, let's
+                 * look at .st_dev of the inode. We simply reuse the fallback codepath for that, since it
+                 * checks exactly that (it checks slightly more, but we don't care.) */
+        }
+
+fallback:
+        if (fstatat(fda, strempty(filea), &sta, flags) < 0)
+                return log_debug_errno(errno, "Cannot stat %s: %m", strna(filea));
+
+        if (fstatat(fdb, strempty(fileb), &stb, flags) < 0)
+                return log_debug_errno(errno, "Cannot stat %s: %m", strna(fileb));
+
+        return stat_inode_same(&sta, &stb);
 }

 bool is_fs_type(const struct statfs *s, statfs_f_type_t magic_value) {
--- a/src/shared/mount-util.c
+++ b/src/shared/mount-util.c
@ -1815,3 +1815,13 @@ int make_fsmount(

        return TAKE_FD(mnt_fd);
 }
+
+char* umount_and_unlink_and_free(char *p) {
+        if (!p)
+                return NULL;
+
+        PROTECT_ERRNO;
+        (void) umount2(p, 0);
+        (void) unlink(p);
+        return mfree(p);
+}
--- a/src/shared/mount-util.h
+++ b/src/shared/mount-util.h
@ -100,6 +100,9 @@ static inline char *umount_and_free(char *p) {
 }
 DEFINE_TRIVIAL_CLEANUP_FUNC(char*, umount_and_free);

+char* umount_and_unlink_and_free(char *p);
+DEFINE_TRIVIAL_CLEANUP_FUNC(char*, umount_and_unlink_and_free);
+
 int bind_mount_in_namespace(PidRef *target, const char *propagate_path, const char *incoming_path, const char *src, const char *dest, bool read_only, bool make_file_or_directory);
 int mount_image_in_namespace(PidRef *target, const char *propagate_path, const char *incoming_path, const char *src, const char *dest, bool read_only, bool make_file_or_directory, const MountOptions *options, const ImagePolicy *image_policy);

--- a/src/test/test-stat-util.c
+++ b/src/test/test-stat-util.c
@ -4,6 +4,7 @@
 #include <linux/magic.h>
 #include <sched.h>
 #include <sys/eventfd.h>
+#include <sys/mount.h>
 #include <unistd.h>

 #include "alloc-util.h"
@ -11,6 +12,8 @@
 #include "fd-util.h"
 #include "fs-util.h"
 #include "macro.h"
+#include "missing_mount.h"
+#include "mount-util.h"
 #include "mountpoint-util.h"
 #include "namespace-util.h"
 #include "path-util.h"
@ -47,15 +50,58 @@ TEST(inode_same) {
        _cleanup_close_ int fd = -EBADF;
        _cleanup_(unlink_tempfilep) char name[] = "/tmp/test-files_same.XXXXXX";
        _cleanup_(unlink_tempfilep) char name_alias[] = "/tmp/test-files_same.alias";
+        int r;

        fd = mkostemp_safe(name);
        assert_se(fd >= 0);
        assert_se(symlink(name, name_alias) >= 0);

-        assert_se(inode_same(name, name, 0));
-        assert_se(inode_same(name, name, AT_SYMLINK_NOFOLLOW));
-        assert_se(inode_same(name, name_alias, 0));
-        assert_se(!inode_same(name, name_alias, AT_SYMLINK_NOFOLLOW));
+        assert_se(inode_same(name, name, 0) > 0);
+        assert_se(inode_same(name, name, AT_SYMLINK_NOFOLLOW) > 0);
+        assert_se(inode_same(name, name_alias, 0) > 0);
+        assert_se(inode_same(name, name_alias, AT_SYMLINK_NOFOLLOW) == 0);
+
+        assert_se(inode_same("/proc", "/proc", 0));
+        assert_se(inode_same("/proc", "/proc", AT_SYMLINK_NOFOLLOW));
+
+        _cleanup_close_ int fd1 = open("/dev/null", O_CLOEXEC|O_RDONLY),
+                fd2 = open("/dev/null", O_CLOEXEC|O_RDONLY);
+
+        assert_se(fd1 >= 0);
+        assert_se(fd2 >= 0);
+
+        assert_se(inode_same_at(fd1, NULL, fd2, NULL, AT_EMPTY_PATH) > 0);
+        assert_se(inode_same_at(fd2, NULL, fd1, NULL, AT_EMPTY_PATH) > 0);
+        assert_se(inode_same_at(fd1, NULL, fd2, NULL, AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW) > 0);
+        assert_se(inode_same_at(fd2, NULL, fd1, NULL, AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW) > 0);
+        assert_se(inode_same_at(fd1, NULL, fd1, NULL, AT_EMPTY_PATH) > 0);
+        assert_se(inode_same_at(fd2, NULL, fd2, NULL, AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW) > 0);
+
+        safe_close(fd2);
+        fd2 = open("/dev/urandom", O_CLOEXEC|O_RDONLY);
+        assert_se(fd2 >= 0);
+
+        assert_se(inode_same_at(fd1, NULL, fd2, NULL, AT_EMPTY_PATH) == 0);
+        assert_se(inode_same_at(fd2, NULL, fd1, NULL, AT_EMPTY_PATH) == 0);
+        assert_se(inode_same_at(fd1, NULL, fd2, NULL, AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW) == 0);
+        assert_se(inode_same_at(fd2, NULL, fd1, NULL, AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW) == 0);
+
+        assert_se(inode_same_at(AT_FDCWD, NULL, AT_FDCWD, NULL, AT_EMPTY_PATH) > 0);
+        assert_se(inode_same_at(AT_FDCWD, NULL, fd1, NULL, AT_EMPTY_PATH) == 0);
+        assert_se(inode_same_at(fd1, NULL, AT_FDCWD, NULL, AT_EMPTY_PATH) == 0);
+
+        _cleanup_(umount_and_unlink_and_freep) char *p = NULL;
+
+        assert_se(tempfn_random_child(NULL, NULL, &p) >= 0);
+        assert_se(touch(p) >= 0);
+
+        r = mount_nofollow_verbose(LOG_ERR, name, p, NULL, MS_BIND, NULL);
+        if (r < 0)
+                assert_se(ERRNO_IS_NEG_PRIVILEGE(r));
+        else {
+                assert_se(inode_same(name, p, 0) > 0);
+                assert_se(inode_same(name, p, AT_SYMLINK_NOFOLLOW) > 0);
+        }
 }

 TEST(is_symlink) {