fix: store and execute desired action on emergency action

Fixes #7854

Talos runs an emergency handler if the sequence experiences an
unrecoverable failure. The emergency handler was unconditionally
executing the "reboot" action if no other action was received (an action
is only received if the sequence completes successfully), so a Shutdown
request might result in a reboot if an error occurred during the
shutdown phase.

This is not a pretty fix, but it's hard to deliver the intent from one
part of the code to another right now, so instead use a global variable
which stores the default emergency intention, and gets overridden early
in the Shutdown sequence.

Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
This commit is contained in:
Andrey Smirnov 2023-12-04 18:27:18 +04:00
parent 515ae2a184
commit 474fa0480d
No known key found for this signature in database
GPG Key ID: FE042E3D4085A811
5 changed files with 44 additions and 2 deletions

View File

@ -27,6 +27,7 @@ import (
"github.com/siderolabs/talos/internal/app/apid"
"github.com/siderolabs/talos/internal/app/dashboard"
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime"
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime/emergency"
v1alpha1runtime "github.com/siderolabs/talos/internal/app/machined/pkg/runtime/v1alpha1"
"github.com/siderolabs/talos/internal/app/machined/pkg/system"
"github.com/siderolabs/talos/internal/app/machined/pkg/system/services"
@ -95,7 +96,7 @@ func syncNonVolatileStorageBuffers() {
//nolint:gocyclo
func handle(ctx context.Context, err error) {
rebootCmd := unix.LINUX_REBOOT_CMD_RESTART
rebootCmd := int(emergency.RebootCmd.Load())
var rebootErr runtime.RebootError

View File

@ -534,6 +534,13 @@ func (ctrl *PlatformConfigController) runWithRestarts(ctx context.Context, logge
return
}
// skip restarting if context is already done
select {
case <-ctx.Done():
return
default:
}
interval := backoff.NextBackOff()
logger.Error("restarting platform network config", zap.Duration("interval", interval), zap.Error(err))

View File

@ -0,0 +1,19 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Package emergency provides values used by machined's emergency (panic/unrecoverable error) handling.
package emergency
import (
"sync/atomic"
"golang.org/x/sys/unix"
)
// defaultRebootCmd is the reboot command used when nothing has overridden
// RebootCmd: restart the machine.
const defaultRebootCmd int64 = unix.LINUX_REBOOT_CMD_RESTART

// RebootCmd holds the reboot command to issue after an unrecoverable error.
//
// It is an atomic value so that it can be overridden with Store and read with
// Load without additional locking.
var RebootCmd atomic.Int64

// init seeds RebootCmd with the default (restart) command, as the zero value
// of atomic.Int64 is not a meaningful reboot command.
func init() {
	RebootCmd.Store(defaultRebootCmd)
}

View File

@ -375,7 +375,10 @@ func (*Sequencer) Reset(r runtime.Runtime, in runtime.ResetOptions) []runtime.Ph
// Shutdown is the shutdown sequence.
func (*Sequencer) Shutdown(r runtime.Runtime, in *machineapi.ShutdownRequest) []runtime.Phase {
phases := PhaseList{}.AppendWhen(
phases := PhaseList{}.Append(
"storeShudown",
StoreShutdownEmergency,
).AppendWhen(
!in.GetForce() && !r.Config().Machine().Kubelet().SkipNodeRegistration(),
"drain",
CordonAndDrainNode,

View File

@ -47,6 +47,7 @@ import (
installer "github.com/siderolabs/talos/cmd/installer/pkg/install"
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime"
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime/disk"
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime/emergency"
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime/v1alpha1/bootloader/grub"
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime/v1alpha1/platform"
"github.com/siderolabs/talos/internal/app/machined/pkg/system"
@ -2255,6 +2256,17 @@ func FlushMeta(runtime.Sequence, any) (runtime.TaskExecutionFunc, string) {
}, "flushMeta"
}
// StoreShutdownEmergency returns a task which records power-off as the
// emergency reboot command, together with the task name.
//
// The sequence and data arguments are not used.
func StoreShutdownEmergency(runtime.Sequence, any) (runtime.TaskExecutionFunc, string) {
	const taskName = "storeShutdownEmergency"

	// None of the task arguments are needed: the task only updates the
	// package-level emergency.RebootCmd value.
	task := func(context.Context, *log.Logger, runtime.Runtime) error {
		// For the shutdown sequence the intent is power-off; the emergency
		// handler in machined/main.go picks this value up if the Shutdown
		// sequence fails.
		emergency.RebootCmd.Store(unix.LINUX_REBOOT_CMD_POWER_OFF)

		return nil
	}

	return task, taskName
}
func pauseOnFailure(callback func(runtime.Sequence, any) (runtime.TaskExecutionFunc, string),
timeout time.Duration,
) func(seq runtime.Sequence, data any) (runtime.TaskExecutionFunc, string) {