talos/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_controller.go
Andrey Smirnov 139c62d762
feat: allow upgrades in maintenance mode (only over SideroLink)
This implements a simple way to upgrade Talos node running in
maintenance mode (only if Talos is installed, i.e. if `STATE` and
`EPHEMERAL` partitions are wiped).

Upgrade is only available over SideroLink for security reasons.

Upgrade in maintenance mode doesn't support any options, and it works
without machine configuration, so proxy environment variables are not
available, registry mirrors can't be used, and extensions are not
installed.

Fixes #6224

Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com>
2022-09-30 21:16:15 +04:00

421 lines
9.6 KiB
Go

// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
package v1alpha1
import (
"context"
"errors"
"fmt"
"log"
"os"
"os/signal"
"strconv"
"strings"
"syscall"
"time"
"golang.org/x/sync/errgroup"
"github.com/talos-systems/talos/internal/app/machined/pkg/runtime"
"github.com/talos-systems/talos/internal/app/machined/pkg/runtime/logging"
"github.com/talos-systems/talos/internal/app/machined/pkg/runtime/v1alpha1/acpi"
"github.com/talos-systems/talos/internal/app/machined/pkg/runtime/v1alpha2"
"github.com/talos-systems/talos/pkg/machinery/api/common"
"github.com/talos-systems/talos/pkg/machinery/api/machine"
)
// Controller represents the controller responsible for managing the execution
// of sequences.
type Controller struct {
s runtime.Sequencer
r *Runtime
v2 *v1alpha2.Controller
priorityLock *PriorityLock[runtime.Sequence]
}
// NewController intializes and returns a controller.
func NewController() (*Controller, error) {
// Wait for USB storage in the case that the install disk is supplied over
// USB. If we don't wait, there is the chance that we will fail to detect the
// install disk.
err := waitForUSBDelay()
if err != nil {
return nil, err
}
s, err := NewState()
if err != nil {
return nil, err
}
// TODO: this should be streaming capacity and probably some constant
e := NewEvents(1000, 10)
l := logging.NewCircularBufferLoggingManager(log.New(os.Stdout, "machined fallback logger: ", log.Flags()))
ctlr := &Controller{
r: NewRuntime(nil, s, e, l),
s: NewSequencer(),
priorityLock: NewPriorityLock[runtime.Sequence](),
}
ctlr.v2, err = v1alpha2.NewController(ctlr.r)
if err != nil {
return nil, err
}
return ctlr, nil
}
// Run executes all phases known to the controller in serial. `Controller`
// aborts immediately if any phase fails.
//
//nolint:gocyclo
func (c *Controller) Run(ctx context.Context, seq runtime.Sequence, data interface{}, setters ...runtime.LockOption) error {
// We must ensure that the runtime is configured since all sequences depend
// on the runtime.
if c.r == nil {
return runtime.ErrUndefinedRuntime
}
ctx, err := c.priorityLock.Lock(ctx, time.Minute, seq, setters...)
if err != nil {
if errors.Is(err, runtime.ErrLocked) {
c.Runtime().Events().Publish(context.Background(), &machine.SequenceEvent{
Sequence: seq.String(),
Action: machine.SequenceEvent_NOOP,
Error: &common.Error{
Code: common.Code_LOCKED,
Message: fmt.Sprintf("sequence not started: %s", runtime.ErrLocked.Error()),
},
})
}
return err
}
defer c.priorityLock.Unlock()
phases, err := c.phases(seq, data)
if err != nil {
return err
}
err = c.run(ctx, seq, phases, data)
if err != nil {
code := common.Code_FATAL
if errors.Is(err, context.Canceled) {
code = common.Code_CANCELED
}
c.Runtime().Events().Publish(ctx, &machine.SequenceEvent{
Sequence: seq.String(),
Action: machine.SequenceEvent_NOOP,
Error: &common.Error{
Code: code,
Message: fmt.Sprintf("sequence failed: %s", err.Error()),
},
})
return err
}
return nil
}
// V1Alpha2 implements the controller interface.
func (c *Controller) V1Alpha2() runtime.V1Alpha2Controller {
return c.v2
}
// Runtime implements the controller interface.
func (c *Controller) Runtime() runtime.Runtime {
return c.r
}
// Sequencer implements the controller interface.
func (c *Controller) Sequencer() runtime.Sequencer {
return c.s
}
// ListenForEvents starts the event listener. The listener will trigger a
// shutdown in response to a SIGTERM signal and ACPI button/power event.
func (c *Controller) ListenForEvents(ctx context.Context) error {
sigs := make(chan os.Signal, 1)
signal.Notify(sigs, syscall.SIGTERM)
errCh := make(chan error, 2)
go func() {
<-sigs
signal.Stop(sigs)
log.Printf("shutdown via SIGTERM received")
if err := c.Run(ctx, runtime.SequenceShutdown, &machine.ShutdownRequest{Force: true}, runtime.WithTakeover()); err != nil {
log.Printf("shutdown failed: %v", err)
}
errCh <- nil
}()
if c.r.State().Platform().Mode() == runtime.ModeContainer {
return nil
}
go func() {
if err := acpi.StartACPIListener(); err != nil {
errCh <- err
return
}
log.Printf("shutdown via ACPI received")
if err := c.Run(ctx, runtime.SequenceShutdown, &machine.ShutdownRequest{Force: true}, runtime.WithTakeover()); err != nil {
log.Printf("failed to run shutdown sequence: %s", err)
}
errCh <- nil
}()
err := <-errCh
return err
}
func (c *Controller) run(ctx context.Context, seq runtime.Sequence, phases []runtime.Phase, data interface{}) error {
c.Runtime().Events().Publish(ctx, &machine.SequenceEvent{
Sequence: seq.String(),
Action: machine.SequenceEvent_START,
})
defer c.Runtime().Events().Publish(ctx, &machine.SequenceEvent{
Sequence: seq.String(),
Action: machine.SequenceEvent_STOP,
})
start := time.Now()
var (
number int
phase runtime.Phase
err error
)
log.Printf("%s sequence: %d phase(s)", seq.String(), len(phases))
defer func() {
if err != nil {
if !runtime.IsRebootError(err) {
log.Printf("%s sequence: failed", seq.String())
}
} else {
log.Printf("%s sequence: done: %s", seq.String(), time.Since(start))
}
}()
for number, phase = range phases {
// Make the phase number human friendly.
number++
start := time.Now()
progress := fmt.Sprintf("%d/%d", number, len(phases))
log.Printf("phase %s (%s): %d tasks(s)", phase.Name, progress, len(phase.Tasks))
if err = c.runPhase(ctx, phase, seq, data); err != nil {
if !runtime.IsRebootError(err) {
log.Printf("phase %s (%s): failed", phase.Name, progress)
}
return fmt.Errorf("error running phase %d in %s sequence: %w", number, seq.String(), err)
}
log.Printf("phase %s (%s): done, %s", phase.Name, progress, time.Since(start))
select {
case <-ctx.Done():
return ctx.Err()
default:
}
}
return nil
}
func (c *Controller) runPhase(ctx context.Context, phase runtime.Phase, seq runtime.Sequence, data interface{}) error {
c.Runtime().Events().Publish(ctx, &machine.PhaseEvent{
Phase: phase.Name,
Action: machine.PhaseEvent_START,
})
defer c.Runtime().Events().Publish(ctx, &machine.PhaseEvent{
Phase: phase.Name,
Action: machine.PhaseEvent_START,
})
eg, ctx := errgroup.WithContext(ctx)
for number, task := range phase.Tasks {
// Make the task number human friendly.
number := number
number++
task := task
eg.Go(func() error {
progress := fmt.Sprintf("%d/%d", number, len(phase.Tasks))
if err := c.runTask(ctx, progress, task, seq, data); err != nil {
return fmt.Errorf("task %s: failed, %w", progress, err)
}
return nil
})
}
return eg.Wait()
}
func (c *Controller) runTask(ctx context.Context, progress string, f runtime.TaskSetupFunc, seq runtime.Sequence, data interface{}) error {
task, taskName := f(seq, data)
if task == nil {
return nil
}
start := time.Now()
c.Runtime().Events().Publish(ctx, &machine.TaskEvent{
Task: taskName,
Action: machine.TaskEvent_START,
})
var err error
log.Printf("task %s (%s): starting", taskName, progress)
defer func() {
if err != nil {
if !runtime.IsRebootError(err) {
log.Printf("task %s (%s): failed: %s", taskName, progress, err)
}
} else {
log.Printf("task %s (%s): done, %s", taskName, progress, time.Since(start))
}
}()
defer c.Runtime().Events().Publish(ctx, &machine.TaskEvent{
Task: taskName,
Action: machine.TaskEvent_STOP,
})
logger := log.New(log.Writer(), fmt.Sprintf("[talos] task %s (%s): ", taskName, progress), log.Flags())
err = task(ctx, logger, c.r)
return err
}
//nolint:gocyclo
func (c *Controller) phases(seq runtime.Sequence, data interface{}) ([]runtime.Phase, error) {
var phases []runtime.Phase
switch seq {
case runtime.SequenceBoot:
phases = c.s.Boot(c.r)
case runtime.SequenceInitialize:
phases = c.s.Initialize(c.r)
case runtime.SequenceInstall:
phases = c.s.Install(c.r)
case runtime.SequenceShutdown:
in, ok := data.(*machine.ShutdownRequest)
if !ok {
return nil, runtime.ErrInvalidSequenceData
}
phases = c.s.Shutdown(c.r, in)
case runtime.SequenceReboot:
phases = c.s.Reboot(c.r)
case runtime.SequenceUpgrade:
in, ok := data.(*machine.UpgradeRequest)
if !ok {
return nil, runtime.ErrInvalidSequenceData
}
phases = c.s.Upgrade(c.r, in)
case runtime.SequenceStageUpgrade:
in, ok := data.(*machine.UpgradeRequest)
if !ok {
return nil, runtime.ErrInvalidSequenceData
}
phases = c.s.StageUpgrade(c.r, in)
case runtime.SequenceMaintenanceUpgrade:
in, ok := data.(*machine.UpgradeRequest)
if !ok {
return nil, runtime.ErrInvalidSequenceData
}
phases = c.s.MaintenanceUpgrade(c.r, in)
case runtime.SequenceReset:
in, ok := data.(runtime.ResetOptions)
if !ok {
return nil, runtime.ErrInvalidSequenceData
}
phases = c.s.Reset(c.r, in)
case runtime.SequenceNoop:
default:
return nil, fmt.Errorf("sequence not implemented: %q", seq)
}
return phases, nil
}
func waitForUSBDelay() (err error) {
wait := true
file := "/sys/module/usb_storage/parameters/delay_use"
_, err = os.Stat(file)
if err != nil {
if os.IsNotExist(err) {
wait = false
} else {
return err
}
}
if wait {
var b []byte
b, err = os.ReadFile(file)
if err != nil {
return err
}
val := strings.TrimSuffix(string(b), "\n")
var i int
i, err = strconv.Atoi(val)
if err != nil {
return err
}
log.Printf("waiting %d second(s) for USB storage", i)
time.Sleep(time.Duration(i) * time.Second)
}
return nil
}