Add an ostree-boot-complete.service to propagate staging failures

Quite a while ago we added staged deployments, which solved
a bunch of issues around the `/etc` merge.  However...a persistent
problem since then is that any failures in that process that
happened in the *previous* boot are not very visible.

We ship custom code in `rpm-ostree status` to query the previous
journal.  But that has a few problems - one is that on systems
that have been up a while, that failure message may even get
rotated out.  And second, some systems may not even have a persistent
journal at all.

A general thing we do in e.g. Fedora CoreOS testing is to check
for systemd unit failures.  We do that both in our automated tests,
and we even ship code that displays them on ssh logins.  And beyond
that obviously a lot of other projects do the same; it's easy via
`systemctl --failed`.

So to make failures more visible, change our `ostree-finalize-staged.service`
to have an internal wrapper around the process that "catches" any
errors, and copies the error message into a file in `/boot/ostree`.

Then, a new `ostree-boot-complete.service` looks for this file on
startup and re-emits the error message, and fails.

It also deletes the file.  The rationale is to avoid *continually*
warning.  For example we need to handle the case when an upgrade
process creates a new staged deployment.  Now, we could change the
ostree core code to delete the warning file when that happens instead,
but this is trying to be a conservative change.

This should make failures here much more visible as is.
This commit is contained in:
Colin Walters 2022-04-22 18:46:28 -04:00
parent 98587a72db
commit d3d3e4ea13
13 changed files with 181 additions and 4 deletions

View File

@ -38,6 +38,7 @@ endif
if BUILDOPT_SYSTEMD
systemdsystemunit_DATA = src/boot/ostree-prepare-root.service \
src/boot/ostree-remount.service \
src/boot/ostree-boot-complete.service \
src/boot/ostree-finalize-staged.service \
src/boot/ostree-finalize-staged.path \
$(NULL)
@ -64,6 +65,7 @@ endif
EXTRA_DIST += src/boot/dracut/module-setup.sh \
src/boot/dracut/ostree.conf \
src/boot/mkinitcpio \
src/boot/ostree-boot-complete.service \
src/boot/ostree-prepare-root.service \
src/boot/ostree-finalize-staged.path \
src/boot/ostree-remount.service \

View File

@ -70,6 +70,7 @@ ostree_SOURCES += \
src/ostree/ot-admin-builtin-diff.c \
src/ostree/ot-admin-builtin-deploy.c \
src/ostree/ot-admin-builtin-finalize-staged.c \
src/ostree/ot-admin-builtin-boot-complete.c \
src/ostree/ot-admin-builtin-undeploy.c \
src/ostree/ot-admin-builtin-instutil.c \
src/ostree/ot-admin-builtin-cleanup.c \

View File

@ -0,0 +1,33 @@
# Copyright (C) 2022 Red Hat, Inc.
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library. If not, see <https://www.gnu.org/licenses/>.
[Unit]
Description=OSTree Complete Boot
Documentation=man:ostree(1)
# For now, this is the only condition on which we start, but it's
# marked as a triggering condition in case in the future we want
# to do something else.
ConditionPathExists=|/boot/ostree/finalize-failure.stamp
RequiresMountsFor=/boot
# Ensure that we propagate the failure into the current boot before
# any further finalization attempts.
Before=ostree-finalize-staged.service
[Service]
Type=oneshot
# To write to /boot while keeping it read-only
MountFlags=slave
RemainAfterExit=yes
ExecStart=/usr/bin/ostree admin boot-complete

View File

@ -51,6 +51,7 @@ ostree_cmd__private__ (void)
_ostree_repo_static_delta_delete,
_ostree_repo_verify_bindings,
_ostree_sysroot_finalize_staged,
_ostree_sysroot_boot_complete,
};
return &table;

View File

@ -33,6 +33,7 @@ typedef struct {
gboolean (* ostree_static_delta_delete) (OstreeRepo *repo, const char *delta_id, GCancellable *cancellable, GError **error);
gboolean (* ostree_repo_verify_bindings) (const char *collection_id, const char *ref_name, GVariant *commit, GError **error);
gboolean (* ostree_finalize_staged) (OstreeSysroot *sysroot, GCancellable *cancellable, GError **error);
gboolean (* ostree_boot_complete) (OstreeSysroot *sysroot, GCancellable *cancellable, GError **error);
} OstreeCmdPrivateVTable;
/* Note this not really "public", we just export the symbol, but not the header */

View File

@ -134,6 +134,8 @@ require_internal_units (const char *normal_dir,
return FALSE;
if (symlinkat (SYSTEM_DATA_UNIT_PATH "/ostree-finalize-staged.path", normal_dir_dfd, "multi-user.target.wants/ostree-finalize-staged.path") < 0)
return glnx_throw_errno_prefix (error, "symlinkat");
if (symlinkat (SYSTEM_DATA_UNIT_PATH "/ostree-boot-complete.service", normal_dir_dfd, "multi-user.target.wants/ostree-boot-complete.service") < 0)
return glnx_throw_errno_prefix (error, "symlinkat");
return TRUE;
#else

View File

@ -3375,10 +3375,10 @@ ostree_sysroot_stage_tree_with_options (OstreeSysroot *self,
}
/* Invoked at shutdown time by ostree-finalize-staged.service */
gboolean
_ostree_sysroot_finalize_staged (OstreeSysroot *self,
GCancellable *cancellable,
GError **error)
static gboolean
_ostree_sysroot_finalize_staged_inner (OstreeSysroot *self,
GCancellable *cancellable,
GError **error)
{
/* It's totally fine if there's no staged deployment; perhaps down the line
* though we could teach the ostree cmdline to tell systemd to activate the
@ -3475,9 +3475,63 @@ _ostree_sysroot_finalize_staged (OstreeSysroot *self,
if (!ostree_sysroot_prepare_cleanup (self, cancellable, error))
return FALSE;
// Cleanup will have closed some FDs, re-ensure writability
if (!_ostree_sysroot_ensure_writable (self, error))
return FALSE;
return TRUE;
}
/* Invoked at shutdown time by ostree-finalize-staged.service */
gboolean
_ostree_sysroot_finalize_staged (OstreeSysroot *self,
GCancellable *cancellable,
GError **error)
{
g_autoptr(GError) finalization_error = NULL;
if (!_ostree_sysroot_ensure_boot_fd (self, error))
return FALSE;
if (!_ostree_sysroot_finalize_staged_inner (self, cancellable, &finalization_error))
{
g_autoptr(GError) writing_error = NULL;
g_assert_cmpint (self->boot_fd, !=, -1);
if (!glnx_file_replace_contents_at (self->boot_fd, _OSTREE_FINALIZE_STAGED_FAILURE_PATH,
(guint8*)finalization_error->message, -1,
0, cancellable, &writing_error))
{
// We somehow failed to write the failure message...that's not great. Maybe ENOSPC on /boot.
g_printerr ("Failed to write %s: %s\n", _OSTREE_FINALIZE_STAGED_FAILURE_PATH, writing_error->message);
}
g_propagate_error (error, g_steal_pointer (&finalization_error));
return FALSE;
}
return TRUE;
}
/* Invoked at bootup time by ostree-boot-complete.service */
gboolean
_ostree_sysroot_boot_complete (OstreeSysroot *self,
GCancellable *cancellable,
GError **error)
{
if (!_ostree_sysroot_ensure_boot_fd (self, error))
return FALSE;
glnx_autofd int failure_fd = -1;
if (!ot_openat_ignore_enoent (self->boot_fd, _OSTREE_FINALIZE_STAGED_FAILURE_PATH, &failure_fd, error))
return FALSE;
// If we didn't find a failure log, then there's nothing to do right now.
// (Actually this unit shouldn't even be invoked, but we may do more in the future)
if (failure_fd == -1)
return TRUE;
g_autofree char *failure_data = glnx_fd_readall_utf8 (failure_fd, NULL, cancellable, error);
if (failure_data == NULL)
return glnx_prefix_error (error, "Reading from %s", _OSTREE_FINALIZE_STAGED_FAILURE_PATH);
// Remove the file; we don't want to continually error out.
(void) unlinkat (self->boot_fd, _OSTREE_FINALIZE_STAGED_FAILURE_PATH, 0);
return glnx_throw (error, "ostree-finalize-staged.service failed on previous boot: %s", failure_data);
}
/**
* ostree_sysroot_deployment_set_kargs:
* @self: Sysroot

View File

@ -96,6 +96,9 @@ struct OstreeSysroot {
#define _OSTREE_SYSROOT_BOOT_INITRAMFS_OVERLAYS "ostree/initramfs-overlays"
#define _OSTREE_SYSROOT_INITRAMFS_OVERLAYS "boot/" _OSTREE_SYSROOT_BOOT_INITRAMFS_OVERLAYS
// Relative to /boot, consumed by ostree-boot-complete.service
#define _OSTREE_FINALIZE_STAGED_FAILURE_PATH "ostree/finalize-failure.stamp"
gboolean
_ostree_sysroot_ensure_writable (OstreeSysroot *self,
GError **error);
@ -142,6 +145,10 @@ gboolean
_ostree_sysroot_finalize_staged (OstreeSysroot *self,
GCancellable *cancellable,
GError **error);
gboolean
_ostree_sysroot_boot_complete (OstreeSysroot *self,
GCancellable *cancellable,
GError **error);
OstreeDeployment *
_ostree_sysroot_deserialize_deployment_from_variant (GVariant *v,

View File

@ -357,6 +357,8 @@ _ostree_sysroot_ensure_writable (OstreeSysroot *self,
ostree_sysroot_unload (self);
if (!ensure_sysroot_fd (self, error))
return FALSE;
if (!_ostree_sysroot_ensure_boot_fd (self, error))
return FALSE;
return TRUE;
}

View File

@ -0,0 +1,58 @@
/*
* Copyright (C) 2022 Red Hat, Inc.
*
* SPDX-License-Identifier: LGPL-2.0+
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <https://www.gnu.org/licenses/>.
*/
#include "config.h"
#include <stdlib.h>
#include "ot-main.h"
#include "ot-admin-builtins.h"
#include "ot-admin-functions.h"
#include "ostree.h"
#include "otutil.h"
#include "ostree-cmdprivate.h"
static GOptionEntry options[] = {
{ NULL }
};
gboolean
ot_admin_builtin_boot_complete (int argc, char **argv, OstreeCommandInvocation *invocation, GCancellable *cancellable, GError **error)
{
/* Just a sanity check; we shouldn't be called outside of the service though.
*/
struct stat stbuf;
if (fstatat (AT_FDCWD, OSTREE_PATH_BOOTED, &stbuf, 0) < 0)
return TRUE;
// We must have been invoked via systemd which should have set up a mount namespace.
g_assert (getenv ("INVOCATION_ID"));
g_autoptr(GOptionContext) context = g_option_context_new ("");
g_autoptr(OstreeSysroot) sysroot = NULL;
if (!ostree_admin_option_context_parse (context, options, &argc, &argv,
OSTREE_ADMIN_BUILTIN_FLAG_SUPERUSER,
invocation, &sysroot, cancellable, error))
return FALSE;
if (!ostree_cmd__private__()->ostree_boot_complete (sysroot, cancellable, error))
return FALSE;
return TRUE;
}

View File

@ -39,6 +39,7 @@ BUILTINPROTO(deploy);
BUILTINPROTO(cleanup);
BUILTINPROTO(pin);
BUILTINPROTO(finalize_staged);
BUILTINPROTO(boot_complete);
BUILTINPROTO(unlock);
BUILTINPROTO(status);
BUILTINPROTO(set_origin);

View File

@ -43,6 +43,9 @@ static OstreeCommand admin_subcommands[] = {
{ "finalize-staged", OSTREE_BUILTIN_FLAG_NO_REPO | OSTREE_BUILTIN_FLAG_HIDDEN,
ot_admin_builtin_finalize_staged,
"Internal command to run at shutdown time" },
{ "boot-complete", OSTREE_BUILTIN_FLAG_NO_REPO | OSTREE_BUILTIN_FLAG_HIDDEN,
ot_admin_builtin_boot_complete,
"Internal command to run at boot after an update was applied" },
{ "init-fs", OSTREE_BUILTIN_FLAG_NO_REPO,
ot_admin_builtin_init_fs,
"Initialize a root filesystem" },

View File

@ -146,6 +146,18 @@ EOF
# Cleanup refs
ostree refs --delete staged-deploy nonstaged-deploy
echo "ok cleanup refs"
# Now finally, try breaking staged updates and verify that ostree-boot-complete fails on the next boot
unshare -m /bin/sh -c 'mount -o remount,rw /boot; chattr +i /boot'
rpm-ostree kargs --append=foo=bar
/tmp/autopkgtest-reboot "3"
;;
"3")
(systemctl status ostree-boot-complete.service || true) | tee out.txt
assert_file_has_content out.txt 'error: ostree-finalize-staged.service failed on previous boot.*Operation not permitted'
systemctl show -p Result ostree-boot-complete.service > out.txt
assert_file_has_content out.txt='Result=exit-code'
echo "ok boot-complete.service"
;;
*) fatal "Unexpected AUTOPKGTEST_REBOOT_MARK=${AUTOPKGTEST_REBOOT_MARK}" ;;
esac