1
0
mirror of https://github.com/systemd/systemd.git synced 2025-01-27 18:04:05 +03:00

fd-util: add new helper call fd_duplicate_data_fd()

This call creates an fd from another fd containing the same data.
Specifically, repeated read() on the returned fd should return the same
data as the original fd. This call is useful when we want to copy data
out of disk images and suchlike, and want to be able to pass fds with the
data around without having to keep the disk image continuously mounted.

The implementation tries to be somewhat smart and tries to prefer
memfds/pipes over files in /tmp or /var/tmp based on the size of the
data, but has appropriate fallbacks in place.
This commit is contained in:
Lennart Poettering 2018-03-09 22:45:08 +01:00
parent cdc0f9be92
commit 4960ce43ff
3 changed files with 289 additions and 0 deletions

View File

@ -12,10 +12,13 @@
#include <sys/stat.h>
#include <unistd.h>
#include "alloc-util.h"
#include "copy.h"
#include "dirent-util.h"
#include "fd-util.h"
#include "fileio.h"
#include "fs-util.h"
#include "io-util.h"
#include "macro.h"
#include "memfd-util.h"
#include "missing.h"
@ -562,6 +565,202 @@ try_dev_shm_without_o_tmpfile:
return -EOPNOTSUPP;
}
/* Threshold (64K) for placing the copy in a memfd/pipe: a regular file is only tried this way if its reported size
 * is strictly below this value, and at most this many bytes are copied into the memfd/pipe. */
#define DATA_FD_MEMORY_LIMIT (64U*1024U)
/* If memfd/pipe didn't work out, then let's use a file in /tmp up to a size of 1M. If it's larger than that use
 * /var/tmp instead. */
#define DATA_FD_TMP_LIMIT (1024U*1024U)
/* Returns a new fd carrying a copy of the data readable from 'fd', or a negative errno-style error code. Fails with
 * -EISDIR for directories, -ELOOP for symlinks and -EBADFD for anything that is not a regular file, socket, FIFO or
 * character device. The data is consumed from the current read position of 'fd'. */
int fd_duplicate_data_fd(int fd) {
_cleanup_close_ int copy_fd = -1, tmp_fd = -1;
_cleanup_free_ void *remains = NULL;
_cleanup_free_ char *t = NULL;
size_t remains_size = 0;
const char *td;
struct stat st;
int r;
/* Creates a 'data' fd from the specified source fd, containing all the same data in a read-only fashion, but
 * independent of it (i.e. the source fd can be closed and unmounted after this call succeeded). Tries to be
 * somewhat smart about where to place the data. In the best case uses a memfd(). If memfd() are not supported
 * uses a pipe instead. For larger data will use an unlinked file in /tmp, and for even larger data one in
 * /var/tmp. */
/* NOTE(review): 't' is declared above but never used in this function. */
if (fstat(fd, &st) < 0)
return -errno;
/* For now, let's only accept regular files, sockets, pipes and char devices */
if (S_ISDIR(st.st_mode))
return -EISDIR;
if (S_ISLNK(st.st_mode))
return -ELOOP;
if (!S_ISREG(st.st_mode) && !S_ISSOCK(st.st_mode) && !S_ISFIFO(st.st_mode) && !S_ISCHR(st.st_mode))
return -EBADFD;
/* If we have reason to believe the data is bounded in size, then let's use memfds or pipes as backing fd. Note
 * that we use the reported regular file size only as a hint, given that there are plenty of special files in
 * /proc and /sys which report a zero file size but can be read from. */
if (!S_ISREG(st.st_mode) || st.st_size < DATA_FD_MEMORY_LIMIT) {
/* Try a memfd first */
copy_fd = memfd_new("data-fd");
if (copy_fd >= 0) {
off_t f;
/* Copy at most DATA_FD_MEMORY_LIMIT bytes. The code below treats r == 0 as "source hit EOF within the limit"
 * and r > 0 as "limit reached, more data may follow" (presumably copy_bytes()' contract, verify there). */
r = copy_bytes(fd, copy_fd, DATA_FD_MEMORY_LIMIT, 0);
if (r < 0)
return r;
/* Rewind the memfd, so that both the caller (on success) and the fallback copy below (on overflow) start
 * reading at the first byte. */
f = lseek(copy_fd, 0, SEEK_SET);
if (f != 0)
return -errno;
if (r == 0) {
/* Did it fit into the limit? If so, we are done. */
r = memfd_set_sealed(copy_fd);
if (r < 0)
return r;
return TAKE_FD(copy_fd);
}
/* Hmm, pity, this didn't fit. Let's fall back to /tmp then, see below */
} else {
_cleanup_(close_pairp) int pipefds[2] = { -1, -1 };
int isz;
/* If memfds aren't available, use a pipe. Set O_NONBLOCK so that we will get EAGAIN rather
 * than block indefinitely when we hit the pipe size limit */
if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0)
return -errno;
isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
if (isz < 0)
return -errno;
/* Try to enlarge the pipe size if necessary */
if ((size_t) isz < DATA_FD_MEMORY_LIMIT) {
/* Enlarging is best-effort: the result is ignored and the effective size is simply queried again. */
(void) fcntl(pipefds[1], F_SETPIPE_SZ, DATA_FD_MEMORY_LIMIT);
isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
if (isz < 0)
return -errno;
}
/* Only use the pipe if its buffer can hold DATA_FD_MEMORY_LIMIT bytes; otherwise skip it and go straight to
 * the temporary file fallbacks with copy_fd still unset. */
if ((size_t) isz >= DATA_FD_MEMORY_LIMIT) {
r = copy_bytes_full(fd, pipefds[1], DATA_FD_MEMORY_LIMIT, 0, &remains, &remains_size);
if (r < 0 && r != -EAGAIN)
return r; /* If we get EAGAIN it could be because of the source or because of
* the destination fd, we can't know, as sendfile() and friends won't
* tell us. Hence, treat this as reason to fall back, just to be
* sure. */
if (r == 0) {
/* Everything fit in, yay! */
/* The read side is handed out in blocking mode; failure to switch is ignored. */
(void) fd_nonblock(pipefds[0], false);
return TAKE_FD(pipefds[0]);
}
/* Things didn't fit in. But we read data into the pipe, let's remember that, so that
 * when writing the new file we incorporate this first. */
copy_fd = TAKE_FD(pipefds[0]);
}
}
}
/* If we have reason to believe this will fit fine in /tmp, then use that as first fallback. */
/* The second condition accounts for what may already have been consumed from the source: up to
 * DATA_FD_MEMORY_LIMIT bytes sitting in copy_fd plus remains_size bytes buffered in 'remains'. */
if ((!S_ISREG(st.st_mode) || st.st_size < DATA_FD_TMP_LIMIT) &&
(DATA_FD_MEMORY_LIMIT + remains_size) < DATA_FD_TMP_LIMIT) {
off_t f;
tmp_fd = open_tmpfile_unlinkable(NULL /* NULL as directory means /tmp */, O_RDWR|O_CLOEXEC);
if (tmp_fd < 0)
return tmp_fd;
if (copy_fd >= 0) {
/* If we tried a memfd/pipe first and it ended up being too large, then copy this into the
 * temporary file first. */
r = copy_bytes(copy_fd, tmp_fd, UINT64_MAX, 0);
if (r < 0)
return r;
assert(r == 0);
}
if (remains_size > 0) {
/* If there were remaining bytes (i.e. read into memory, but not written out yet) from the
 * failed copy operation, let's flush them out next. */
r = loop_write(tmp_fd, remains, remains_size, false);
if (r < 0)
return r;
}
/* Copy only as much as is left of the /tmp budget after subtracting the memfd/pipe limit and the buffered
 * remainder. */
r = copy_bytes(fd, tmp_fd, DATA_FD_TMP_LIMIT - DATA_FD_MEMORY_LIMIT - remains_size, COPY_REFLINK);
if (r < 0)
return r;
if (r == 0)
goto finish; /* Yay, it fit in */
/* It didn't fit in. Let's not forget to use what we already used */
f = lseek(tmp_fd, 0, SEEK_SET);
if (f != 0)
return -errno;
/* The rewound /tmp file now holds everything consumed so far, hence it takes over the role of copy_fd for the
 * /var/tmp pass below, and the in-memory remainder (already written into it) is dropped. */
safe_close(copy_fd);
copy_fd = TAKE_FD(tmp_fd);
remains = mfree(remains);
remains_size = 0;
}
/* As last fallback use /var/tmp */
r = var_tmp_dir(&td);
if (r < 0)
return r;
tmp_fd = open_tmpfile_unlinkable(td, O_RDWR|O_CLOEXEC);
if (tmp_fd < 0)
return tmp_fd;
if (copy_fd >= 0) {
/* If we tried a memfd/pipe first, or a file in /tmp, and it ended up being too large, then copy this
 * into the temporary file first. */
r = copy_bytes(copy_fd, tmp_fd, UINT64_MAX, COPY_REFLINK);
if (r < 0)
return r;
assert(r == 0);
}
if (remains_size > 0) {
/* Then, copy in any read but not yet written bytes. */
r = loop_write(tmp_fd, remains, remains_size, false);
if (r < 0)
return r;
}
/* Copy in the rest */
r = copy_bytes(fd, tmp_fd, UINT64_MAX, COPY_REFLINK);
if (r < 0)
return r;
assert(r == 0);
finish:
/* Now convert the O_RDWR file descriptor into an O_RDONLY one (and as side effect seek to the beginning of the
 * file again). The O_RDWR fd in tmp_fd itself is released by its _cleanup_close_ handler on return. */
return fd_reopen(tmp_fd, O_RDONLY|O_CLOEXEC);
}
int fd_move_above_stdio(int fd) {
int flags, copy;
PROTECT_ERRNO;

View File

@ -81,6 +81,8 @@ enum {
int acquire_data_fd(const void *data, size_t size, unsigned flags);
/* Returns a new read-only fd containing a copy of the data readable from 'fd', independent of the original (which
 * may be closed afterwards), or a negative errno-style error code. Only regular files, sockets, FIFOs and character
 * devices are accepted as source. */
int fd_duplicate_data_fd(int fd);
/* Hint: ENETUNREACH happens if we try to connect to "non-existing" special IP addresses, such as ::5 */
#define ERRNO_IS_DISCONNECT(r) \
IN_SET(r, ENOTCONN, ECONNRESET, ECONNREFUSED, ECONNABORTED, EPIPE, ENETUNREACH)

View File

@ -228,6 +228,93 @@ static void test_rearrange_stdio(void) {
}
}
/* Reads from 'fd' until 'size' bytes have been collected or EOF is hit, so that short reads (which are perfectly
 * legal on pipes and sockets) do not leak out to the caller. Returns the number of bytes stored in 'buf'; a value
 * smaller than 'size' means EOF was reached. Aborts if read() fails. */
static size_t read_until_full(int fd, uint8_t *buf, size_t size) {
        size_t n = 0;

        while (n < size) {
                ssize_t k;

                k = read(fd, buf + n, size - n);
                assert(k >= 0);
                if (k == 0) /* EOF */
                        break;

                n += (size_t) k;
        }

        return n;
}

/* Asserts that the remaining data readable from fd1 and fd2 is identical, consuming both fds up to EOF.
 *
 * The comparison is done on completely filled buffers rather than on whatever a single read() happens to return:
 * two fds carrying the same bytes may hand them out in differently sized chunks (e.g. a regular file vs. a pipe or
 * a message-oriented socket), and that must not be reported as a mismatch.
 *
 * NOTE(review): the other tests in this file use assert_se(), which stays active with NDEBUG; consider switching
 * the assert() calls here over for consistency. */
static void assert_equal_fd(int fd1, int fd2) {
        for (;;) {
                uint8_t a[4096], b[4096];
                size_t x, y;

                x = read_until_full(fd1, a, sizeof(a));
                y = read_until_full(fd2, b, sizeof(b));

                /* Same amount of data, and same contents */
                assert(x == y);
                assert(memcmp(a, b, x) == 0);

                /* A partially filled buffer means both fds hit EOF */
                if (x < sizeof(a))
                        break;
        }
}
static void test_fd_duplicate_data_fd(void) {
_cleanup_close_ int fd1 = -1, fd2 = -1;
_cleanup_(close_pairp) int sfd[2] = { -1, -1 };
_cleanup_(sigkill_waitp) pid_t pid = -1;
uint64_t i, j;
int r;
fd1 = open("/etc/fstab", O_RDONLY|O_CLOEXEC);
if (fd1 >= 0) {
fd2 = fd_duplicate_data_fd(fd1);
assert_se(fd2 >= 0);
assert_se(lseek(fd1, 0, SEEK_SET) == 0);
assert_equal_fd(fd1, fd2);
}
fd1 = safe_close(fd1);
fd2 = safe_close(fd2);
fd1 = acquire_data_fd("hallo", 6, 0);
assert_se(fd1 >= 0);
fd2 = fd_duplicate_data_fd(fd1);
assert_se(fd2 >= 0);
safe_close(fd1);
fd1 = acquire_data_fd("hallo", 6, 0);
assert_se(fd1 >= 0);
assert_equal_fd(fd1, fd2);
fd1 = safe_close(fd1);
fd2 = safe_close(fd2);
assert_se(socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0, sfd) >= 0);
r = safe_fork("(sd-pipe)", FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_LOG, &pid);
assert_se(r >= 0);
if (r == 0) {
/* child */
sfd[0] = safe_close(sfd[0]);
for (i = 0; i < 1536*1024 / sizeof(uint64_t); i++)
assert_se(write(sfd[1], &i, sizeof(i)) == sizeof(i));
sfd[1] = safe_close(sfd[1]);
_exit(EXIT_SUCCESS);
}
sfd[1] = safe_close(sfd[1]);
fd2 = fd_duplicate_data_fd(sfd[0]);
assert_se(fd2 >= 0);
for (i = 0; i < 1536*1024 / sizeof(uint64_t); i++) {
assert_se(read(fd2, &j, sizeof(j)) == sizeof(j));
assert_se(i == j);
}
assert_se(read(fd2, &j, sizeof(j)) == 0);
}
int main(int argc, char *argv[]) {
test_close_many();
test_close_nointr();
@ -236,6 +323,7 @@ int main(int argc, char *argv[]) {
test_acquire_data_fd();
test_fd_move_above_stdio();
test_rearrange_stdio();
test_fd_duplicate_data_fd();
return 0;
}