From 0c59d2e4abf383595069824b6504b6e9ba9e307a Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 30 Jun 2023 18:16:55 +0200 Subject: [PATCH 1/4] service: re-linebreak some comments matching current coding style --- src/core/service.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/core/service.c b/src/core/service.c index 93545a252ab..7f6ee26209d 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -285,11 +285,11 @@ usec_t service_restart_usec_next(Service *s) { assert(s); - /* When the service state is in SERVICE_*_BEFORE_AUTO_RESTART or SERVICE_AUTO_RESTART, - * we still need to add 1 to s->n_restarts manually because s->n_restarts is not updated - * until a restart job is enqueued. Note that for SERVICE_AUTO_RESTART, that might have been - * the case, i.e. s->n_restarts is already increased. But we assume it's not since the time - * between job enqueuing and running is usually neglectable compared to the time we'll be sleeping. */ + /* When the service state is in SERVICE_*_BEFORE_AUTO_RESTART or SERVICE_AUTO_RESTART, we still need + * to add 1 to s->n_restarts manually because s->n_restarts is not updated until a restart job is + * enqueued. Note that for SERVICE_AUTO_RESTART, that might have been the case, i.e. s->n_restarts is + * already increased. But we assume it's not since the time between job enqueuing and running is + * usually neglectable compared to the time we'll be sleeping. */ n_restarts_next = s->n_restarts + 1; if (n_restarts_next <= 1 || @@ -2519,9 +2519,9 @@ static void service_enter_restart(Service *s) { if (r < 0) goto fail; - /* Count the jobs we enqueue for restarting. This counter is maintained as long as the unit isn't fully - * stopped, i.e. as long as it remains up or remains in auto-start states. The user can reset the counter - * explicitly however via the usual "systemctl reset-failure" logic. */ + /* Count the jobs we enqueue for restarting. This counter is maintained as long as the unit isn't + * fully stopped, i.e. as long as it remains up or remains in auto-start states. The user can reset + * the counter explicitly however via the usual "systemctl reset-failure" logic. */ s->n_restarts ++; s->flush_n_restarts = false; @@ -2537,10 +2537,8 @@ static void service_enter_restart(Service *s) { /* Notify clients about changed restart counter */ unit_add_to_dbus_queue(UNIT(s)); - /* Note that we stay in the SERVICE_AUTO_RESTART state here, - * it will be canceled as part of the service_stop() call that - * is executed as part of JOB_RESTART. */ - + /* Note that we stay in the SERVICE_AUTO_RESTART state here, it will be canceled as part of the + * service_stop() call that is executed as part of JOB_RESTART. */ return; fail: From 09d04ad325473e05e23e6ba8382d7de1dd819bda Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 30 Jun 2023 18:17:06 +0200 Subject: [PATCH 2/4] core: introduce a new job mode JOB_RESTART_DEPENDENCIES This new job mode will enqueue a start job for a unit, and all units depending on the unit will get a restart job enqueued. This is then used for automatic sevice restarts: the unit itself is only started, the depending units restarted. This way the unit will not go down unnecessarily, triggering OnSuccess= needlessly. This also introduces a new state SERVICE_AUTO_RESTART_QUEUED that is entered once the restart jobs are enqueued. Previously we'd stay in SERVICE_AUTO_RESTART, but that's problematic, since we'd lose information whether we still need to enqueue the restart job during a serialization/deserialization cycle or not. By having an explicit state for this we know exactly whether we still need to enqueue the job or not. It's also good since when we are in SERVICE_AUTO_RESTART_QUEUED we want to act on unit_start(), but on SERVICE_AUTO_RESTART we want to wait for the holdoff time to pass before we act on unit_start(). Fixes: #27722 --- src/basic/unit-def.c | 1 + src/basic/unit-def.h | 1 + src/core/job.c | 1 + src/core/job.h | 1 + src/core/manager.c | 6 +++++- src/core/service.c | 27 +++++++++++++------------- src/core/transaction.c | 44 +++++++++++++++++++++++------------------- src/core/transaction.h | 11 +++++++---- 8 files changed, 54 insertions(+), 38 deletions(-) diff --git a/src/basic/unit-def.c b/src/basic/unit-def.c index bfd748ada4a..595e4414c7e 100644 --- a/src/basic/unit-def.c +++ b/src/basic/unit-def.c @@ -204,6 +204,7 @@ static const char* const service_state_table[_SERVICE_STATE_MAX] = { [SERVICE_FAILED_BEFORE_AUTO_RESTART] = "failed-before-auto-restart", [SERVICE_DEAD_RESOURCES_PINNED] = "dead-resources-pinned", [SERVICE_AUTO_RESTART] = "auto-restart", + [SERVICE_AUTO_RESTART_QUEUED] = "auto-restart-queued", [SERVICE_CLEANING] = "cleaning", }; diff --git a/src/basic/unit-def.h b/src/basic/unit-def.h index 7bd86cb1d2f..f67fc1d620d 100644 --- a/src/basic/unit-def.h +++ b/src/basic/unit-def.h @@ -149,6 +149,7 @@ typedef enum ServiceState { SERVICE_FAILED_BEFORE_AUTO_RESTART, SERVICE_DEAD_RESOURCES_PINNED, /* Like SERVICE_DEAD, but with pinned resources */ SERVICE_AUTO_RESTART, + SERVICE_AUTO_RESTART_QUEUED, SERVICE_CLEANING, _SERVICE_STATE_MAX, _SERVICE_STATE_INVALID = -EINVAL, diff --git a/src/core/job.c b/src/core/job.c index 50f9581d727..3d5e4e42d12 100644 --- a/src/core/job.c +++ b/src/core/job.c @@ -1634,6 +1634,7 @@ static const char* const job_mode_table[_JOB_MODE_MAX] = { [JOB_IGNORE_DEPENDENCIES] = "ignore-dependencies", [JOB_IGNORE_REQUIREMENTS] = "ignore-requirements", [JOB_TRIGGERING] = "triggering", + [JOB_RESTART_DEPENDENCIES] = "restart-dependencies", }; DEFINE_STRING_TABLE_LOOKUP(job_mode, JobMode); diff --git a/src/core/job.h b/src/core/job.h index f3e0b93fed5..d3b98d98b61 100644 --- a/src/core/job.h +++ b/src/core/job.h @@ -79,6 +79,7 @@ enum JobMode { JOB_IGNORE_DEPENDENCIES, /* Ignore both requirement and ordering dependencies */ JOB_IGNORE_REQUIREMENTS, /* Ignore requirement dependencies */ JOB_TRIGGERING, /* Adds TRIGGERED_BY dependencies to the same transaction */ + JOB_RESTART_DEPENDENCIES,/* A "start" job for the specified unit becomes "restart" for depending units */ _JOB_MODE_MAX, _JOB_MODE_INVALID = -EINVAL, }; diff --git a/src/core/manager.c b/src/core/manager.c index f4591002944..cc4fc1679c2 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -2044,6 +2044,9 @@ int manager_add_job( if (mode == JOB_TRIGGERING && type != JOB_STOP) return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "--job-mode=triggering is only valid for stop."); + if (mode == JOB_RESTART_DEPENDENCIES && type != JOB_START) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "--job-mode=restart-dependencies is only valid for start."); + log_unit_debug(unit, "Trying to enqueue job %s/%s/%s", unit->id, job_type_to_string(type), job_mode_to_string(mode)); type = job_type_collapse(type, unit); @@ -2059,7 +2062,8 @@ int manager_add_job( /* by= */ NULL, TRANSACTION_MATTERS | (IN_SET(mode, JOB_IGNORE_DEPENDENCIES, JOB_IGNORE_REQUIREMENTS) ? TRANSACTION_IGNORE_REQUIREMENTS : 0) | - (mode == JOB_IGNORE_DEPENDENCIES ? TRANSACTION_IGNORE_ORDER : 0), + (mode == JOB_IGNORE_DEPENDENCIES ? TRANSACTION_IGNORE_ORDER : 0) | + (mode == JOB_RESTART_DEPENDENCIES ? TRANSACTION_PROPAGATE_START_AS_RESTART : 0), error); if (r < 0) return r; diff --git a/src/core/service.c b/src/core/service.c index 7f6ee26209d..0759838e3b9 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -72,6 +72,7 @@ static const UnitActiveState state_translation_table[_SERVICE_STATE_MAX] = { [SERVICE_FAILED_BEFORE_AUTO_RESTART] = UNIT_FAILED, [SERVICE_DEAD_RESOURCES_PINNED] = UNIT_INACTIVE, [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING, + [SERVICE_AUTO_RESTART_QUEUED] = UNIT_ACTIVATING, [SERVICE_CLEANING] = UNIT_MAINTENANCE, }; @@ -101,6 +102,7 @@ static const UnitActiveState state_translation_table_idle[_SERVICE_STATE_MAX] = [SERVICE_FAILED_BEFORE_AUTO_RESTART] = UNIT_FAILED, [SERVICE_DEAD_RESOURCES_PINNED] = UNIT_INACTIVE, [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING, + [SERVICE_AUTO_RESTART_QUEUED] = UNIT_ACTIVATING, [SERVICE_CLEANING] = UNIT_MAINTENANCE, }; @@ -1258,7 +1260,7 @@ static void service_set_state(Service *s, ServiceState state) { if (IN_SET(state, SERVICE_DEAD, SERVICE_FAILED, - SERVICE_DEAD_BEFORE_AUTO_RESTART, SERVICE_FAILED_BEFORE_AUTO_RESTART, SERVICE_AUTO_RESTART, + SERVICE_DEAD_BEFORE_AUTO_RESTART, SERVICE_FAILED_BEFORE_AUTO_RESTART, SERVICE_AUTO_RESTART, SERVICE_AUTO_RESTART_QUEUED, SERVICE_DEAD_RESOURCES_PINNED)) { unit_unwatch_all_pids(UNIT(s)); unit_dequeue_rewatch_pids(UNIT(s)); @@ -1363,7 +1365,7 @@ static int service_coldplug(Unit *u) { if (!IN_SET(s->deserialized_state, SERVICE_DEAD, SERVICE_FAILED, - SERVICE_DEAD_BEFORE_AUTO_RESTART, SERVICE_FAILED_BEFORE_AUTO_RESTART, SERVICE_AUTO_RESTART, + SERVICE_DEAD_BEFORE_AUTO_RESTART, SERVICE_FAILED_BEFORE_AUTO_RESTART, SERVICE_AUTO_RESTART, SERVICE_AUTO_RESTART_QUEUED, SERVICE_CLEANING, SERVICE_DEAD_RESOURCES_PINNED)) { (void) unit_enqueue_rewatch_pids(u); @@ -1945,7 +1947,7 @@ static bool service_will_restart(Unit *u) { assert(s); - if (IN_SET(s->state, SERVICE_DEAD_BEFORE_AUTO_RESTART, SERVICE_FAILED_BEFORE_AUTO_RESTART, SERVICE_AUTO_RESTART)) + if (IN_SET(s->state, SERVICE_DEAD_BEFORE_AUTO_RESTART, SERVICE_FAILED_BEFORE_AUTO_RESTART, SERVICE_AUTO_RESTART, SERVICE_AUTO_RESTART_QUEUED)) return true; return unit_will_restart_default(u); @@ -2511,11 +2513,9 @@ static void service_enter_restart(Service *s) { return; } - /* Any units that are bound to this service must also be - * restarted. We use JOB_RESTART (instead of the more obvious - * JOB_START) here so that those dependency jobs will be added - * as well. */ - r = manager_add_job(UNIT(s)->manager, JOB_RESTART, UNIT(s), JOB_REPLACE, NULL, &error, NULL); + /* Any units that are bound to this service must also be restarted. We use JOB_START for ourselves + * but then set JOB_RESTART_DEPENDENCIES which will enqueue JOB_RESTART for those dependency jobs. */ + r = manager_add_job(UNIT(s)->manager, JOB_START, UNIT(s), JOB_RESTART_DEPENDENCIES, NULL, &error, NULL); if (r < 0) goto fail; @@ -2534,11 +2534,10 @@ static void service_enter_restart(Service *s) { "Scheduled restart job, restart counter is at %u.", s->n_restarts), "N_RESTARTS=%u", s->n_restarts); + service_set_state(s, SERVICE_AUTO_RESTART_QUEUED); + /* Notify clients about changed restart counter */ unit_add_to_dbus_queue(UNIT(s)); - - /* Note that we stay in the SERVICE_AUTO_RESTART state here, it will be canceled as part of the - * service_stop() call that is executed as part of JOB_RESTART. */ return; fail: @@ -2717,7 +2716,7 @@ static int service_start(Unit *u) { if (IN_SET(s->state, SERVICE_AUTO_RESTART, SERVICE_DEAD_BEFORE_AUTO_RESTART, SERVICE_FAILED_BEFORE_AUTO_RESTART)) return -EAGAIN; - assert(IN_SET(s->state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_DEAD_RESOURCES_PINNED)); + assert(IN_SET(s->state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_DEAD_RESOURCES_PINNED, SERVICE_AUTO_RESTART_QUEUED)); r = unit_acquire_invocation_id(u); if (r < 0) @@ -2775,7 +2774,8 @@ static int service_stop(Unit *u) { return 0; case SERVICE_AUTO_RESTART: - /* A restart will be scheduled or is in progress. */ + case SERVICE_AUTO_RESTART_QUEUED: + /* Give up on the auto restart */ service_set_state(s, service_determine_dead_state(s)); return 0; @@ -3648,6 +3648,7 @@ static void service_notify_cgroup_empty_event(Unit *u) { /* If the cgroup empty notification comes when the unit is not active, we must have failed to clean * up the cgroup earlier and should do it now. */ case SERVICE_AUTO_RESTART: + case SERVICE_AUTO_RESTART_QUEUED: unit_prune_cgroup(u); break; diff --git a/src/core/transaction.c b/src/core/transaction.c index 9c1523c47ad..68614de7a06 100644 --- a/src/core/transaction.c +++ b/src/core/transaction.c @@ -1054,34 +1054,38 @@ int transaction_add_job_and_dependencies( } } - if (IN_SET(type, JOB_STOP, JOB_RESTART)) { - _cleanup_set_free_ Set *propagated_restart = NULL; - /* We propagate STOP as STOP, but RESTART only as TRY_RESTART, in order not to start - * dependencies that are not around. */ + _cleanup_set_free_ Set *propagated_restart = NULL; - if (type == JOB_RESTART) - UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PROPAGATE_RESTART) { - JobType nt; + if (type == JOB_RESTART || (type == JOB_START && FLAGS_SET(flags, TRANSACTION_PROPAGATE_START_AS_RESTART))) { - r = set_ensure_put(&propagated_restart, NULL, dep); - if (r < 0) + /* We propagate RESTART only as TRY_RESTART, in order not to start dependencies that + * are not around. */ + + UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PROPAGATE_RESTART) { + JobType nt; + + r = set_ensure_put(&propagated_restart, NULL, dep); + if (r < 0) + return r; + + nt = job_type_collapse(JOB_TRY_RESTART, dep); + if (nt == JOB_NOP) + continue; + + r = transaction_add_job_and_dependencies(tr, nt, dep, ret, TRANSACTION_MATTERS | (flags & TRANSACTION_IGNORE_ORDER), e); + if (r < 0) { + if (r != -EBADR) /* job type not applicable */ return r; - nt = job_type_collapse(JOB_TRY_RESTART, dep); - if (nt == JOB_NOP) - continue; - - r = transaction_add_job_and_dependencies(tr, nt, dep, ret, TRANSACTION_MATTERS | (flags & TRANSACTION_IGNORE_ORDER), e); - if (r < 0) { - if (r != -EBADR) /* job type not applicable */ - return r; - - sd_bus_error_free(e); - } + sd_bus_error_free(e); } + } + } + if (type == JOB_STOP) { /* The 'stop' part of a restart job is also propagated to units with * UNIT_ATOM_PROPAGATE_STOP */ + UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PROPAGATE_STOP) { /* Units experienced restart propagation are skipped */ if (set_contains(propagated_restart, dep)) diff --git a/src/core/transaction.h b/src/core/transaction.h index 5874077aefd..db2d0566499 100644 --- a/src/core/transaction.h +++ b/src/core/transaction.h @@ -21,10 +21,13 @@ Transaction *transaction_abort_and_free(Transaction *tr); DEFINE_TRIVIAL_CLEANUP_FUNC(Transaction*, transaction_abort_and_free); typedef enum TransactionAddFlags { - TRANSACTION_MATTERS = 1 << 0, - TRANSACTION_CONFLICTS = 1 << 1, - TRANSACTION_IGNORE_REQUIREMENTS = 1 << 2, - TRANSACTION_IGNORE_ORDER = 1 << 3, + TRANSACTION_MATTERS = 1 << 0, + TRANSACTION_CONFLICTS = 1 << 1, + TRANSACTION_IGNORE_REQUIREMENTS = 1 << 2, + TRANSACTION_IGNORE_ORDER = 1 << 3, + + /* Propagate a START job to other units like a RESTART */ + TRANSACTION_PROPAGATE_START_AS_RESTART = 1 << 4, } TransactionAddFlags; void transaction_add_propagate_reload_jobs( From f4b24db7c3285bd333b3dd80eab21303b6d704ed Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 3 Jul 2023 14:49:46 +0200 Subject: [PATCH 3/4] test: add test case for recent OnSuccess=/OnFailure= state machine changes --- .../success-failure-test-failure.service | 3 ++ .../success-failure-test-success.service | 3 ++ test/units/success-failure-test.service | 9 ++++ test/units/testsuite-23.success-failure.sh | 49 +++++++++++++++++++ 4 files changed, 64 insertions(+) create mode 100644 test/units/success-failure-test-failure.service create mode 100644 test/units/success-failure-test-success.service create mode 100644 test/units/success-failure-test.service create mode 100755 test/units/testsuite-23.success-failure.sh diff --git a/test/units/success-failure-test-failure.service b/test/units/success-failure-test-failure.service new file mode 100644 index 00000000000..f4ce013da8d --- /dev/null +++ b/test/units/success-failure-test-failure.service @@ -0,0 +1,3 @@ +[Service] +Type=notify +ExecStart=bash -c "echo failure >> /tmp/success-failure-test-result && systemd-notify --ready && sleep infinity" diff --git a/test/units/success-failure-test-success.service b/test/units/success-failure-test-success.service new file mode 100644 index 00000000000..8503c451220 --- /dev/null +++ b/test/units/success-failure-test-success.service @@ -0,0 +1,3 @@ +[Service] +Type=notify +ExecStart=bash -c "echo success >> /tmp/success-failure-test-result && systemd-notify --ready && sleep infinity" diff --git a/test/units/success-failure-test.service b/test/units/success-failure-test.service new file mode 100644 index 00000000000..f66ff6cd6b8 --- /dev/null +++ b/test/units/success-failure-test.service @@ -0,0 +1,9 @@ +[Unit] +OnSuccess=success-failure-test-success.service +OnFailure=success-failure-test-failure.service + +[Service] +Type=notify +Restart=always +ExecStart=bash -c 'test -f /tmp/success-failure-test-ran && touch /tmp/success-failure-test-ran2 && systemd-notify --ready && sleep infinity' +ExecStopPost=touch /tmp/success-failure-test-ran diff --git a/test/units/testsuite-23.success-failure.sh b/test/units/testsuite-23.success-failure.sh new file mode 100755 index 00000000000..8fc9596cc57 --- /dev/null +++ b/test/units/testsuite-23.success-failure.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: LGPL-2.1-or-later +set -eux +set -o pipefail + +# Test OnSuccess=/OnFailure= in combination + +systemd-analyze log-level debug + +# Start-up should fail, but the automatic restart should fix it +(! systemctl start success-failure-test ) + +# Wait until the first invocation finished & failed +while test ! -f /tmp/success-failure-test-ran ; do + sleep .5 +done + +# Wait until the second invocation finished & succeeded +while test ! -f /tmp/success-failure-test-ran2 ; do + sleep .5 +done + +# Verify it is indeed running +systemctl is-active -q success-failure-test + +# The above should have caused the failure service to start (asynchronously) +while test "$(systemctl is-active success-failure-test-failure)" != "active" ; do + sleep .5 +done + +# But the success service should not have started +test "$(systemctl is-active success-failure-test-success)" = "inactive" + +systemctl stop success-failure-test-failure + +# Do a clean kill of the service now +systemctl kill success-failure-test + +# This should result in the success service to start +while test "$(systemctl is-active success-failure-test-success)" != "active" ; do + sleep .5 +done + +# But the failure service should not have started again +test "$(systemctl is-active success-failure-test-failure)" = "inactive" + +systemctl stop success-failure-test success-failure-test-success + +systemd-analyze log-level info From ba5e342c0ebc0bfc9036aa70a2009d1601561167 Mon Sep 17 00:00:00 2001 From: Mike Yuan Date: Mon, 3 Jul 2023 22:32:36 +0800 Subject: [PATCH 4/4] core/service: show correct restart usec for services in SERVICE_AUTO_RESTART_QUEUED Follow-up for #28215 We can now correctly distinguish enqueued auto-restarts from those that are still pending. Let's take advantage of that. --- src/core/service.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/core/service.c b/src/core/service.c index 0759838e3b9..6831b847e87 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -288,11 +288,9 @@ usec_t service_restart_usec_next(Service *s) { assert(s); /* When the service state is in SERVICE_*_BEFORE_AUTO_RESTART or SERVICE_AUTO_RESTART, we still need - * to add 1 to s->n_restarts manually because s->n_restarts is not updated until a restart job is - * enqueued. Note that for SERVICE_AUTO_RESTART, that might have been the case, i.e. s->n_restarts is - * already increased. But we assume it's not since the time between job enqueuing and running is - * usually neglectable compared to the time we'll be sleeping. */ - n_restarts_next = s->n_restarts + 1; + * to add 1 to s->n_restarts manually, because s->n_restarts is not updated until a restart job is + * enqueued, i.e. state has transitioned to SERVICE_AUTO_RESTART_QUEUED. */ + n_restarts_next = s->n_restarts + (s->state == SERVICE_AUTO_RESTART_QUEUED ? 0 : 1); if (n_restarts_next <= 1 || s->restart_steps == 0 || @@ -305,7 +303,7 @@ usec_t service_restart_usec_next(Service *s) { /* Enforced in service_verify() and above */ assert(s->restart_max_delay_usec > s->restart_usec); - /* ((restart_usec_max - restart_usec)^(1/restart_steps))^(n_restart_next - 1) */ + /* ((restart_max_delay_usec - restart_usec)^(1/restart_steps))^(n_restart_next - 1) */ value = usec_add(s->restart_usec, (usec_t) powl(s->restart_max_delay_usec - s->restart_usec, (long double) (n_restarts_next - 1) / s->restart_steps));