lib/deploy: Use a FIFREEZE/FITHAW cycle for /boot

See: http://marc.info/?l=linux-fsdevel&m=149520244919284&w=2

XFS doesn't flush the journal on `syncfs()`. GRUB doesn't know how to follow the
XFS journal, so if the filesystem is in a dirty state (possible with xfs
`/boot`, extremely likely with `/`, if the journaled data includes content for
`/boot`, the system may be unbootable if a system crash occurs.

Fix this by doing a `FIFREEZE`+`FITHAW` cycle.  Now, most people
probably would have replaced the `syncfs()` invocation with those two
ioctls.  But this would have become (I believe) the *only* place in
libostree where we weren't safe against interruption.  The failure
mode would be ugly; nothing else would be able to write to the filesystem
until manual intervention.

The real fix here I think is to land an atomic `FIFREEZETHAW` ioctl
in the kernel.  I might try a patch.

In the meantime though, let's jump through some hoops and set up
a "watchdog" child process that acts as a fallback unfreezer.

Closes: https://github.com/ostreedev/ostree/issues/876

Closes: #1049
Approved by: jlebon
This commit is contained in:
Colin Walters 2017-08-02 22:07:26 -04:00 committed by Atomic Bot
parent 9f8f351cd4
commit 8642ef5ab3
7 changed files with 141 additions and 14 deletions

View File

@ -22,8 +22,14 @@
#include <gio/gunixinputstream.h>
#include <gio/gunixoutputstream.h>
#include <glib-unix.h>
#include <sys/mount.h>
#include <sys/statvfs.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/poll.h>
#include <linux/fs.h>
#include <err.h>
#ifdef HAVE_LIBMOUNT
#include <libmount.h>
@ -973,18 +979,126 @@ checksum_from_kernel_src (const char *name,
return TRUE;
}
/* We used to syncfs(), but that doesn't flush the journal on XFS,
* and since GRUB2 can't read the XFS journal, the system
* could fail to boot.
*
* http://marc.info/?l=linux-fsdevel&m=149520244919284&w=2
* https://github.com/ostreedev/ostree/pull/1049
*/
static gboolean
syncfs_dir_at (int dfd,
const char *path,
GCancellable *cancellable,
GError **error)
fsfreeze_thaw_cycle (OstreeSysroot *self,
int rootfs_dfd,
GCancellable *cancellable,
GError **error)
{
glnx_fd_close int child_dfd = -1;
if (!glnx_opendirat (dfd, path, TRUE, &child_dfd, error))
return FALSE;
if (syncfs (child_dfd) != 0)
return glnx_throw_errno_prefix (error, "syncfs(%s)", path);
GLNX_AUTO_PREFIX_ERROR ("During fsfreeze-thaw", error);
int sockpair[2];
if (socketpair (AF_UNIX, SOCK_SEQPACKET | SOCK_CLOEXEC, 0, sockpair) < 0)
return glnx_throw_errno_prefix (error, "socketpair");
glnx_fd_close int sock_parent = sockpair[0];
glnx_fd_close int sock_watchdog = sockpair[1];
pid_t pid = fork ();
if (pid < 0)
return glnx_throw_errno_prefix (error, "fork");
const gboolean debug_fifreeze = (self->debug_flags & OSTREE_SYSROOT_DEBUG_TEST_FIFREEZE)>0;
char c = '!';
if (pid == 0) /* Child watchdog/unfreezer process. */
{
(void) close (glnx_steal_fd (&sock_parent));
/* Daemonize, and mask SIGINT/SIGTERM, so we're likely to survive e.g.
* someone doing a `systemctl restart rpm-ostreed` or a Ctrl-C of
* `ostree admin upgrade`.
*/
if (daemon (0, debug_fifreeze ? 1 : 0) < 0)
err (1, "daemon");
int sigs[] = { SIGINT, SIGTERM };
for (guint i = 0; i < G_N_ELEMENTS (sigs); i++)
{
if (signal (sigs[i], SIG_IGN) == SIG_ERR)
err (1, "signal");
}
/* Tell the parent we're ready */
if (write (sock_watchdog, &c, sizeof (c)) != 1)
err (1, "write");
/* Wait for the parent to say it's going to freeze. */
ssize_t bytes_read = TEMP_FAILURE_RETRY (read (sock_watchdog, &c, sizeof (c)));
if (bytes_read < 0)
err (1, "read");
if (bytes_read != 1)
errx (1, "failed to read from parent");
/* Now we wait for the second message from the parent saying the freeze is
* complete. We have a 30 second timeout; if somehow the parent hasn't
* signaled completion, go ahead and unfreeze. But for debugging, just 1
* second to avoid exessively lengthining the test suite.
*/
const int timeout_ms = debug_fifreeze ? 1000 : 30000;
struct pollfd pfds[1];
pfds[0].fd = sock_watchdog;
pfds[0].events = POLLIN | POLLHUP;
int r = TEMP_FAILURE_RETRY (poll (pfds, 1, timeout_ms));
/* Do a thaw if we hit an error, or if the poll timed out */
if (r <= 0)
{
if (TEMP_FAILURE_RETRY (ioctl (rootfs_dfd, FITHAW, 0)) != 0)
{
if (errno == EPERM)
; /* Ignore this for the test suite */
else
err (1, "FITHAW");
}
/* But if we got an error from poll, let's log it */
if (r < 0)
err (1, "poll");
}
if (debug_fifreeze)
g_printerr ("fifreeze watchdog was run\n");
exit (EXIT_SUCCESS);
}
else /* Parent process. */
{
(void) close (glnx_steal_fd (&sock_watchdog));
/* Wait for the watchdog to say it's set up; mainly that it's
* masked SIGTERM successfully.
*/
ssize_t bytes_read = TEMP_FAILURE_RETRY (read (sock_parent, &c, sizeof (c)));
if (bytes_read < 0)
return glnx_throw_errno_prefix (error, "read(watchdog init)");
if (bytes_read != 1)
return glnx_throw (error, "read(watchdog init)");
/* And tell the watchdog that we're ready to start */
if (write (sock_parent, &c, sizeof (c)) != sizeof (c))
return glnx_throw_errno_prefix (error, "write(watchdog start)");
/* Testing infrastructure */
if (debug_fifreeze)
return glnx_throw (error, "aborting due to test-fifreeze");
/* Do a freeze/thaw cycle; TODO add a FIFREEZETHAW ioctl */
if (ioctl (rootfs_dfd, FIFREEZE, 0) != 0)
{
/* Not supported, or we're running in the unit tests (as non-root)?
* OK, let's just do a syncfs.
*/
if (G_IN_SET (errno, EOPNOTSUPP, EPERM))
{
if (TEMP_FAILURE_RETRY (syncfs (rootfs_dfd)) != 0)
return glnx_throw_errno_prefix (error, "syncfs");
/* Write the completion, and return */
if (write (sock_parent, &c, sizeof (c)) != sizeof (c))
return glnx_throw_errno_prefix (error, "write(watchdog syncfs complete)");
return TRUE;
}
else
return glnx_throw_errno_prefix (error, "ioctl(FIFREEZE)");
}
/* And finally thaw, then signal our completion to the watchdog */
if (TEMP_FAILURE_RETRY (ioctl (rootfs_dfd, FITHAW, 0)) != 0)
return glnx_throw_errno_prefix (error, "ioctl(FITHAW)");
if (write (sock_parent, &c, sizeof (c)) != sizeof (c))
return glnx_throw_errno_prefix (error, "write(watchdog FITHAW complete)");
}
return TRUE;
}
@ -1012,7 +1126,10 @@ full_system_sync (OstreeSysroot *self,
out_stats->root_syncfs_msec = (end_msec - start_msec);
start_msec = g_get_monotonic_time () / 1000;
if (!syncfs_dir_at (self->sysroot_fd, "boot", cancellable, error))
glnx_fd_close int boot_dfd = -1;
if (!glnx_opendirat (self->sysroot_fd, "boot", TRUE, &boot_dfd, error))
return FALSE;
if (!fsfreeze_thaw_cycle (self, boot_dfd, cancellable, error))
return FALSE;
end_msec = g_get_monotonic_time () / 1000;
out_stats->boot_syncfs_msec = (end_msec - start_msec);

View File

@ -33,7 +33,8 @@ typedef enum {
OSTREE_SYSROOT_DEBUG_MUTABLE_DEPLOYMENTS = 1 << 0,
/* See https://github.com/ostreedev/ostree/pull/759 */
OSTREE_SYSROOT_DEBUG_NO_XATTRS = 1 << 1,
/* https://github.com/ostreedev/ostree/pull/1049 */
OSTREE_SYSROOT_DEBUG_TEST_FIFREEZE = 1 << 2,
} OstreeSysrootDebugFlags;
/**

View File

@ -166,6 +166,7 @@ ostree_sysroot_init (OstreeSysroot *self)
{
const GDebugKey keys[] = {
{ "mutable-deployments", OSTREE_SYSROOT_DEBUG_MUTABLE_DEPLOYMENTS },
{ "test-fifreeze", OSTREE_SYSROOT_DEBUG_TEST_FIFREEZE },
{ "no-xattrs", OSTREE_SYSROOT_DEBUG_NO_XATTRS },
};

View File

@ -249,3 +249,11 @@ ${CMD_PREFIX} ostree --sysroot=${deployment} remote add --set=gpg-verify=false r
assert_not_file_has_content sysroot/ostree/repo/config remote-test-nonphysical
assert_file_has_content ${deployment}/etc/ostree/remotes.d/remote-test-nonphysical.conf testos-repo
echo "ok remote add nonphysical sysroot"
if env OSTREE_SYSROOT_DEBUG="${OSTREE_SYSROOT_DEBUG},test-fifreeze" \
${CMD_PREFIX} ostree admin deploy --os=testos testos:testos/buildmaster/x86_64-runtime 2>err.txt; then
fatal "fifreeze-test exited successfully?"
fi
assert_file_has_content err.txt "fifreeze watchdog was run"
assert_file_has_content err.txt "During fsfreeze-thaw: aborting due to test-fifreeze"
echo "ok fifreeze test"

View File

@ -19,7 +19,7 @@
set -euo pipefail
echo "1..18"
echo "1..19"
. $(dirname $0)/libtest.sh

View File

@ -19,7 +19,7 @@
set -euo pipefail
echo "1..18"
echo "1..19"
. $(dirname $0)/libtest.sh

View File

@ -20,7 +20,7 @@
set -euo pipefail
echo "1..19"
echo "1..20"
. $(dirname $0)/libtest.sh