mirror of
https://github.com/systemd/systemd.git
synced 2025-01-09 01:18:19 +03:00
service: add 'debug' option to RestartMode=
One of the major pait points of managing fleets of headless nodes is that when something fails at startup, unless debug level was already enabled (which usually isn't, as it's a firehose), one needs to manually enable it and pray the issue can be reproduced, which often is really hard and time consuming, just to get extra info. Usually the extra log messages are enough to triage an issue. This new option makes it so that when a service fails and is restarted due to Restart=, log level for that unit is set to debug, so that all setup code in pid1 and sd-executor logs at debug level, and also a new DEBUG_INVOCATION=1 env var is passed to the service itself, so that it knows it should start with a higher log level. Once the unit succeeds or reaches the rate limit the original level is restored.
This commit is contained in:
parent
23e3bddc48
commit
7d8bbfbe08
@ -2115,6 +2115,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
|
||||
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
|
||||
readonly as Refs = ['...', ...];
|
||||
readonly a(ss) ActivationDetails = [...];
|
||||
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
|
||||
readonly b DebugInvocation = ...;
|
||||
};
|
||||
interface org.freedesktop.DBus.Peer { ... };
|
||||
interface org.freedesktop.DBus.Introspectable { ... };
|
||||
@ -2458,6 +2460,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
|
||||
|
||||
<variablelist class="dbus-property" generated="True" extra-ref="ActivationDetails"/>
|
||||
|
||||
<variablelist class="dbus-property" generated="True" extra-ref="DebugInvocation"/>
|
||||
|
||||
<!--End of Autogenerated section-->
|
||||
|
||||
<refsect2>
|
||||
@ -2644,6 +2648,13 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
|
||||
<citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>. Note
|
||||
that new key value pair may be added at any time in future versions. Existing entries will not be
|
||||
removed.</para>
|
||||
|
||||
<para><varname>DebugInvocation</varname> contains a boolean that will change to
|
||||
<constant>true</constant> when the unit is restarted with log level set to debug due to an earlier
|
||||
failure, and will change back to <constant>false</constant> when either the unit successfully starts
|
||||
or the restart rate limit is reached. See the <literal>RestartMode=</literal> section in
|
||||
<citerefentry><refentrytitle>systemd.service</refentrytitle><manvolnum>5</manvolnum></citerefentry>
|
||||
for more details.</para>
|
||||
</refsect2>
|
||||
|
||||
<refsect2>
|
||||
@ -12079,6 +12090,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
|
||||
<para><function>QueueSignal()</function> was added in version 254.</para>
|
||||
<para><varname>SurviveFinalKillSignal</varname> was added in version 255.</para>
|
||||
<para><varname>WantsMountsFor</varname> was added in version 256.</para>
|
||||
<para><varname>DebugInvocation</varname> was added in version 257.</para>
|
||||
</refsect2>
|
||||
<refsect2>
|
||||
<title>Service Unit Objects</title>
|
||||
|
@ -4184,6 +4184,18 @@ StandardInputData=V2XigLJyZSBubyBzdHJhbmdlcnMgdG8gbG92ZQpZb3Uga25vdyB0aGUgcnVsZX
|
||||
<xi:include href="version-info.xml" xpointer="v254"/></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><varname>$DEBUG_INVOCATION</varname></term>
|
||||
|
||||
<listitem><para>If <varname>RestartMode=debug</varname> is set, and a previous attempt at starting
|
||||
the unit failed, this variable will be passed to the service to indicate that additional logging
|
||||
should be enabled at startup. See
|
||||
<citerefentry><refentrytitle>systemd.service</refentrytitle><manvolnum>5</manvolnum></citerefentry>
|
||||
for more details.</para>
|
||||
|
||||
<xi:include href="version-info.xml" xpointer="v257"/></listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
|
||||
<para>For system services, when <varname>PAMName=</varname> is enabled and <command>pam_systemd</command> is part
|
||||
|
@ -936,20 +936,40 @@
|
||||
<listitem>
|
||||
<para>Takes a string value that specifies how a service should restart:
|
||||
<itemizedlist>
|
||||
<listitem><para>If set to <option>normal</option> (the default), the service restarts by
|
||||
going through a failed/inactive state.</para></listitem>
|
||||
<listitem>
|
||||
<para>If set to <option>normal</option> (the default), the service restarts by going through
|
||||
a failed/inactive state.</para>
|
||||
|
||||
<listitem><para>If set to <option>direct</option>, the service transitions to the activating
|
||||
state directly during auto-restart, skipping failed/inactive state.
|
||||
<varname>ExecStopPost=</varname> is invoked.
|
||||
<varname>OnSuccess=</varname> and <varname>OnFailure=</varname> are skipped.</para></listitem>
|
||||
<xi:include href="version-info.xml" xpointer="v254"/>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>If set to <option>direct</option>, the service transitions to the activating
|
||||
state directly during auto-restart, skipping failed/inactive state.
|
||||
<varname>ExecStopPost=</varname> is invoked.
|
||||
<varname>OnSuccess=</varname> and <varname>OnFailure=</varname> are skipped.</para>
|
||||
|
||||
<para>This option is useful in cases where a dependency can fail temporarily but we don't
|
||||
want these temporary failures to make the dependent units fail. Dependent units are not
|
||||
notified of these temporary failures.</para>
|
||||
|
||||
<xi:include href="version-info.xml" xpointer="v254"/>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>If set to <option>debug</option>, the service manager will log messages that are
|
||||
related to this unit at debug level while automated restarts are attempted, until either the
|
||||
service hits the rate limit or it succeeds, and the <varname>$DEBUG_INVOCATION=1</varname>
|
||||
environment variable will be set for the unit. This is useful to be able to get additional
|
||||
information when a service fails to start, without needing to proactively or permanently
|
||||
enable debug level logging in systemd, which is very verbose. This is otherwise equivalent
|
||||
to <option>normal</option> mode.</para>
|
||||
|
||||
<xi:include href="version-info.xml" xpointer="v257"/>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
<para>This option is useful in cases where a dependency can fail temporarily
|
||||
but we don't want these temporary failures to make the dependent units fail.
|
||||
When this option is set to <option>direct</option>, dependent units are not notified of these temporary failures.</para>
|
||||
|
||||
<xi:include href="version-info.xml" xpointer="v254"/>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
@ -920,6 +920,7 @@ const sd_bus_vtable bus_unit_vtable[] = {
|
||||
SD_BUS_PROPERTY("CollectMode", "s", property_get_collect_mode, offsetof(Unit, collect_mode), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("Refs", "as", property_get_refs, 0, 0),
|
||||
SD_BUS_PROPERTY("ActivationDetails", "a(ss)", bus_property_get_activation_details, offsetof(Unit, activation_details), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
|
||||
SD_BUS_PROPERTY("DebugInvocation", "b", bus_property_get_bool, offsetof(Unit, debug_invocation), 0),
|
||||
|
||||
SD_BUS_METHOD_WITH_ARGS("Start",
|
||||
SD_BUS_ARGS("s", mode),
|
||||
|
@ -4107,7 +4107,9 @@ int exec_invoke(
|
||||
|
||||
/* This should be mostly redundant, as the log level is also passed as an argument of the executor,
|
||||
* and is already applied earlier. Just for safety. */
|
||||
if (context->log_level_max >= 0)
|
||||
if (params->debug_invocation)
|
||||
log_set_max_level(LOG_PRI(LOG_DEBUG));
|
||||
else if (context->log_level_max >= 0)
|
||||
log_set_max_level(context->log_level_max);
|
||||
|
||||
/* Explicitly test for CVE-2021-4034 inspired invocations */
|
||||
|
@ -1419,6 +1419,10 @@ static int exec_parameters_serialize(const ExecParameters *p, const ExecContext
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = serialize_bool_elide(f, "exec-parameters-debug-invocation", p->debug_invocation);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
fputc('\n', f); /* End marker */
|
||||
|
||||
return 0;
|
||||
@ -1681,6 +1685,12 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) {
|
||||
return r;
|
||||
|
||||
sd_id128_to_string(p->invocation_id, p->invocation_id_string);
|
||||
} else if ((val = startswith(l, "exec-parameters-debug-invocation="))) {
|
||||
r = parse_boolean(val);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
p->debug_invocation = r;
|
||||
} else
|
||||
log_warning("Failed to parse serialized line, ignoring: %s", l);
|
||||
}
|
||||
|
@ -927,6 +927,7 @@ void exec_params_dump(const ExecParameters *p, FILE* f, const char *prefix) {
|
||||
"%sShallConfirmSpawn: %s\n"
|
||||
"%sWatchdogUSec: " USEC_FMT "\n"
|
||||
"%sNotifySocket: %s\n"
|
||||
"%sDebugInvocation: %s\n"
|
||||
"%sFallbackSmackProcessLabel: %s\n",
|
||||
prefix, runtime_scope_to_string(p->runtime_scope),
|
||||
prefix, p->flags,
|
||||
@ -939,6 +940,7 @@ void exec_params_dump(const ExecParameters *p, FILE* f, const char *prefix) {
|
||||
prefix, yes_no(p->shall_confirm_spawn),
|
||||
prefix, p->watchdog_usec,
|
||||
prefix, strempty(p->notify_socket),
|
||||
prefix, yes_no(p->debug_invocation),
|
||||
prefix, strempty(p->fallback_smack_process_label));
|
||||
|
||||
strv_dump(f, prefix, "FdNames", p->fd_names);
|
||||
|
@ -462,6 +462,8 @@ struct ExecParameters {
|
||||
char *unit_id;
|
||||
sd_id128_t invocation_id;
|
||||
char invocation_id_string[SD_ID128_STRING_MAX];
|
||||
|
||||
bool debug_invocation;
|
||||
};
|
||||
|
||||
#define EXEC_PARAMETERS_INIT(_flags) \
|
||||
@ -629,7 +631,8 @@ bool exec_needs_ipc_namespace(const ExecContext *context);
|
||||
const ExecContext *_c = (ec); \
|
||||
const ExecParameters *_p = (ep); \
|
||||
const int _l = (level); \
|
||||
bool _do_log = _c->log_level_max < 0 || \
|
||||
bool _do_log = _p->debug_invocation || \
|
||||
_c->log_level_max < 0 || \
|
||||
_c->log_level_max >= LOG_PRI(_l); \
|
||||
LOG_CONTEXT_PUSH_IOV(_c->log_extra_fields, \
|
||||
_c->n_log_extra_fields); \
|
||||
@ -674,7 +677,8 @@ bool exec_needs_ipc_namespace(const ExecContext *context);
|
||||
const ExecContext *_c = (ec); \
|
||||
const ExecParameters *_p = (ep); \
|
||||
const int _l = (level); \
|
||||
bool _do_log = _c->log_level_max < 0 || \
|
||||
bool _do_log = _p->debug_invocation || \
|
||||
_c->log_level_max < 0 || \
|
||||
_c->log_level_max >= LOG_PRI(_l); \
|
||||
LOG_CONTEXT_PUSH_IOV(_c->log_extra_fields, \
|
||||
_c->n_log_extra_fields); \
|
||||
|
@ -1726,7 +1726,7 @@ static int service_spawn_internal(
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
our_env = new0(char*, 13);
|
||||
our_env = new0(char*, 14);
|
||||
if (!our_env)
|
||||
return -ENOMEM;
|
||||
|
||||
@ -1830,6 +1830,13 @@ static int service_spawn_internal(
|
||||
}
|
||||
}
|
||||
|
||||
if (s->restart_mode == SERVICE_RESTART_MODE_DEBUG && UNIT(s)->debug_invocation) {
|
||||
char *t = strdup("DEBUG_INVOCATION=1");
|
||||
if (!t)
|
||||
return -ENOMEM;
|
||||
our_env[n_env++] = t;
|
||||
}
|
||||
|
||||
if (UNIT(s)->activation_details) {
|
||||
r = activation_details_append_env(UNIT(s)->activation_details, &our_env);
|
||||
if (r < 0)
|
||||
@ -2006,6 +2013,22 @@ static ServiceState service_determine_dead_state(Service *s) {
|
||||
return s->fd_store && s->fd_store_preserve_mode == EXEC_PRESERVE_YES ? SERVICE_DEAD_RESOURCES_PINNED : SERVICE_DEAD;
|
||||
}
|
||||
|
||||
static void service_set_debug_invocation(Service *s, bool enable) {
|
||||
assert(s);
|
||||
|
||||
if (s->restart_mode != SERVICE_RESTART_MODE_DEBUG)
|
||||
return;
|
||||
|
||||
if (enable == UNIT(s)->debug_invocation)
|
||||
return; /* Nothing to do */
|
||||
|
||||
UNIT(s)->debug_invocation = enable;
|
||||
unit_overwrite_log_level_max(UNIT(s), enable ? LOG_PRI(LOG_DEBUG) : s->exec_context.log_level_max);
|
||||
|
||||
if (enable)
|
||||
log_unit_notice(UNIT(s), "Service failed, subsequent restarts will be executed with debug level logging.");
|
||||
}
|
||||
|
||||
static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart) {
|
||||
ServiceState end_state, restart_state;
|
||||
int r;
|
||||
@ -2071,13 +2094,22 @@ static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart)
|
||||
|
||||
log_unit_debug(UNIT(s), "Next restart interval calculated as: %s", FORMAT_TIMESPAN(restart_usec_next, 0));
|
||||
|
||||
/* If the relevant option is set, and the unit doesn't already have logging level set to
|
||||
* debug, enable it now. Make sure to overwrite the state in /run/systemd/units/ too, to
|
||||
* ensure journald doesn't prune the messages. The previous state is saved and restored
|
||||
* once the auto-restart flow ends. */
|
||||
service_set_debug_invocation(s, /* enable= */ true);
|
||||
|
||||
service_set_state(s, SERVICE_AUTO_RESTART);
|
||||
} else
|
||||
} else {
|
||||
/* If we shan't restart, the restart counter would be flushed out. But rather than doing that
|
||||
* immediately here, this is delegated to service_start(), i.e. next start, so that the user
|
||||
* can still introspect the counter. */
|
||||
service_set_state(s, end_state);
|
||||
|
||||
service_set_debug_invocation(s, /* enable= */ false);
|
||||
}
|
||||
|
||||
/* The new state is in effect, let's decrease the fd store ref counter again. Let's also re-add us to the GC
|
||||
* queue, so that the fd store is possibly gc'ed again */
|
||||
unit_add_to_gc_queue(UNIT(s));
|
||||
@ -4861,6 +4893,8 @@ static void service_reset_failed(Unit *u) {
|
||||
s->reload_result = SERVICE_SUCCESS;
|
||||
s->clean_result = SERVICE_SUCCESS;
|
||||
s->n_restarts = 0;
|
||||
|
||||
service_set_debug_invocation(s, /* enable= */ false);
|
||||
}
|
||||
|
||||
static PidRef* service_main_pid(Unit *u, bool *ret_is_alien) {
|
||||
@ -5112,6 +5146,7 @@ DEFINE_STRING_TABLE_LOOKUP(service_restart, ServiceRestart);
|
||||
static const char* const service_restart_mode_table[_SERVICE_RESTART_MODE_MAX] = {
|
||||
[SERVICE_RESTART_MODE_NORMAL] = "normal",
|
||||
[SERVICE_RESTART_MODE_DIRECT] = "direct",
|
||||
[SERVICE_RESTART_MODE_DEBUG] = "debug",
|
||||
};
|
||||
|
||||
DEFINE_STRING_TABLE_LOOKUP(service_restart_mode, ServiceRestartMode);
|
||||
|
@ -95,6 +95,7 @@ typedef enum ServiceTimeoutFailureMode {
|
||||
typedef enum ServiceRestartMode {
|
||||
SERVICE_RESTART_MODE_NORMAL,
|
||||
SERVICE_RESTART_MODE_DIRECT,
|
||||
SERVICE_RESTART_MODE_DEBUG,
|
||||
_SERVICE_RESTART_MODE_MAX,
|
||||
_SERVICE_RESTART_MODE_INVALID = -EINVAL,
|
||||
} ServiceRestartMode;
|
||||
|
@ -3830,6 +3830,7 @@ void unit_reset_failed(Unit *u) {
|
||||
|
||||
ratelimit_reset(&u->start_ratelimit);
|
||||
u->start_limit_hit = false;
|
||||
u->debug_invocation = false;
|
||||
}
|
||||
|
||||
Unit *unit_following(Unit *u) {
|
||||
@ -5406,6 +5407,8 @@ int unit_set_exec_params(Unit *u, ExecParameters *p) {
|
||||
if (!p->unit_id)
|
||||
return -ENOMEM;
|
||||
|
||||
p->debug_invocation = u->debug_invocation;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -5606,23 +5609,25 @@ static int unit_export_invocation_id(Unit *u) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int unit_export_log_level_max(Unit *u, const ExecContext *c) {
|
||||
static int unit_export_log_level_max(Unit *u, int log_level_max, bool overwrite) {
|
||||
const char *p;
|
||||
char buf[2];
|
||||
int r;
|
||||
|
||||
assert(u);
|
||||
assert(c);
|
||||
|
||||
if (u->exported_log_level_max)
|
||||
/* When the debug_invocation logic runs, overwrite will be true as we always want to switch the max
|
||||
* log level that the journal applies, and we want to always restore the previous level once done */
|
||||
|
||||
if (!overwrite && u->exported_log_level_max)
|
||||
return 0;
|
||||
|
||||
if (c->log_level_max < 0)
|
||||
if (log_level_max < 0)
|
||||
return 0;
|
||||
|
||||
assert(c->log_level_max <= 7);
|
||||
assert(log_level_max <= 7);
|
||||
|
||||
buf[0] = '0' + c->log_level_max;
|
||||
buf[0] = '0' + log_level_max;
|
||||
buf[1] = 0;
|
||||
|
||||
p = strjoina("/run/systemd/units/log-level-max:", u->id);
|
||||
@ -5772,7 +5777,7 @@ void unit_export_state_files(Unit *u) {
|
||||
|
||||
c = unit_get_exec_context(u);
|
||||
if (c) {
|
||||
(void) unit_export_log_level_max(u, c);
|
||||
(void) unit_export_log_level_max(u, c->log_level_max, /* overwrite= */ false);
|
||||
(void) unit_export_log_extra_fields(u, c);
|
||||
(void) unit_export_log_ratelimit_interval(u, c);
|
||||
(void) unit_export_log_ratelimit_burst(u, c);
|
||||
@ -5830,6 +5835,16 @@ void unit_unlink_state_files(Unit *u) {
|
||||
}
|
||||
}
|
||||
|
||||
int unit_overwrite_log_level_max(Unit *u, int log_level_max) {
|
||||
assert(u);
|
||||
|
||||
if (!u->exported_log_level_max)
|
||||
return 0; /* Nothing to do */
|
||||
|
||||
/* Ensure that the new log level is exported for the journal, in place of the previous one */
|
||||
return unit_export_log_level_max(u, log_level_max, /* overwrite= */ true);
|
||||
}
|
||||
|
||||
int unit_prepare_exec(Unit *u) {
|
||||
int r;
|
||||
|
||||
|
@ -421,6 +421,9 @@ typedef struct Unit {
|
||||
/* Is this a unit that is always running and cannot be stopped? */
|
||||
bool perpetual;
|
||||
|
||||
/* When true logs about this unit will be at debug level regardless of other log level settings */
|
||||
bool debug_invocation;
|
||||
|
||||
/* Booleans indicating membership of this unit in the various queues */
|
||||
bool in_load_queue:1;
|
||||
bool in_dbus_queue:1;
|
||||
@ -984,6 +987,7 @@ void unit_remove_dependencies(Unit *u, UnitDependencyMask mask);
|
||||
|
||||
void unit_export_state_files(Unit *u);
|
||||
void unit_unlink_state_files(Unit *u);
|
||||
int unit_overwrite_log_level_max(Unit *u, int log_level_max);
|
||||
|
||||
int unit_prepare_exec(Unit *u);
|
||||
|
||||
@ -998,8 +1002,9 @@ static inline bool unit_has_job_type(Unit *u, JobType type) {
|
||||
}
|
||||
|
||||
static inline bool unit_log_level_test(const Unit *u, int level) {
|
||||
assert(u);
|
||||
ExecContext *ec = unit_get_exec_context(u);
|
||||
return !ec || ec->log_level_max < 0 || ec->log_level_max >= LOG_PRI(level);
|
||||
return !ec || ec->log_level_max < 0 || ec->log_level_max >= LOG_PRI(level) || u->debug_invocation;
|
||||
}
|
||||
|
||||
/* unit_log_skip is for cases like ExecCondition= where a unit is considered "done"
|
||||
|
@ -165,4 +165,25 @@ rm /run/notify-reload-test.sh
|
||||
|
||||
systemd-analyze log-level info
|
||||
|
||||
# Ensure that, with system log level info, we get debug level messages when a unit fails to start and is
|
||||
# restarted with RestartMode=debug
|
||||
cat >/run/systemd/system/testservice-fail-restart-debug-59.service <<EOF
|
||||
[Unit]
|
||||
Description=TEST-59-RELOADING-RESTART Restart=on-failure RestartMode=debug
|
||||
|
||||
[Service]
|
||||
ExecStart=echo hello
|
||||
Restart=on-failure
|
||||
RestartMode=debug
|
||||
StartLimitBurst=3
|
||||
MountAPIVFS=yes
|
||||
BindPaths=/nonexistent-debug-59
|
||||
EOF
|
||||
|
||||
systemctl daemon-reload
|
||||
systemctl start testservice-fail-restart-debug-59.service
|
||||
wait_on_state_or_fail "testservice-fail-restart-debug-59.service" "failed" "15"
|
||||
journalctl --sync
|
||||
journalctl -b | grep -q "Failed to follow symlinks on /nonexistent-debug-59: No such file or directory"
|
||||
|
||||
touch /testok
|
||||
|
Loading…
Reference in New Issue
Block a user