fix: service lifecycle issues

The core change is moving the context out of the `ServiceRunner` struct
to be a local variable, and using a channel to notify about shutdown
events.

Add more synchronization between Run and the moment service started to
avoid mis-identifying not running (yet) service as successfully finished.

Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
Co-authored-by: Dmitriy Matrenichev <dmitry.matrenichev@siderolabs.com>
This commit is contained in:
Andrey Smirnov 2024-03-19 16:05:56 +04:00
parent ead37abf09
commit 89fc68b459
No known key found for this signature in database
GPG Key ID: FE042E3D4085A811
10 changed files with 1708 additions and 1551 deletions

View File

@ -206,6 +206,7 @@ message ServiceStateEvent {
FINISHED = 5;
FAILED = 6;
SKIPPED = 7;
STARTING = 8;
}
Action action = 2;
string message = 3;

View File

@ -30,12 +30,15 @@ const (
StateFinished
StateFailed
StateSkipped
StateStarting
)
func (state ServiceState) String() string {
switch state {
case StateInitialized:
return "Initialized"
case StateStarting:
return "Starting"
case StatePreparing:
return "Preparing"
case StateWaiting:

View File

@ -0,0 +1,18 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
package system
import (
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime"
"github.com/siderolabs/talos/pkg/conditions"
)
func NewServices(runtime runtime.Runtime) *singleton { //nolint:revive
return newServices(runtime)
}
func WaitForServiceWithInstance(instance *singleton, event StateEvent, service string) conditions.Condition {
return waitForService(instance, event, service)
}

View File

@ -0,0 +1,93 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
package system_test
import (
"context"
"io"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime"
"github.com/siderolabs/talos/internal/app/machined/pkg/system"
"github.com/siderolabs/talos/internal/app/machined/pkg/system/events"
"github.com/siderolabs/talos/internal/app/machined/pkg/system/runner"
"github.com/siderolabs/talos/internal/app/machined/pkg/system/runner/goroutine"
"github.com/siderolabs/talos/pkg/conditions"
)
type TestCondition struct{}
func (TestCondition) String() string {
return "test-condition"
}
func (TestCondition) Wait(ctx context.Context) error {
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(10 * time.Millisecond):
return nil
}
}
type TestService struct{}
func (TestService) ID(runtime.Runtime) string {
return "test-service"
}
func (TestService) PreFunc(ctx context.Context, r runtime.Runtime) error {
select {
case <-ctx.Done():
return ctx.Err()
default:
return nil
}
}
func (TestService) Runner(r runtime.Runtime) (runner.Runner, error) {
return goroutine.NewRunner(r, "test-service", func(ctx context.Context, r runtime.Runtime, logOutput io.Writer) error {
<-ctx.Done()
return nil
}), nil
}
func (TestService) PostFunc(runtime.Runtime, events.ServiceState) error {
return nil
}
func (TestService) Condition(runtime.Runtime) conditions.Condition {
return TestCondition{}
}
func (TestService) DependsOn(runtime.Runtime) []string {
return nil
}
func TestRestartService(t *testing.T) {
deadline, ok := t.Deadline()
if !ok {
deadline = time.Now().Add(15 * time.Second)
}
ctx, cancel := context.WithDeadline(context.Background(), deadline)
defer cancel()
services := system.NewServices(nil)
services.Load(TestService{})
for i := 0; i < 100; i++ {
require.NoError(t, services.Start("test-service"))
require.NoError(t, system.WaitForServiceWithInstance(services, system.StateEventUp, "test-service").Wait(ctx))
require.NoError(t, services.Stop(ctx, "test-service"))
}
}

View File

@ -26,15 +26,16 @@ const (
type serviceCondition struct {
mu sync.Mutex
waitingRegister bool
instance *singleton
event StateEvent
service string
}
func (sc *serviceCondition) Wait(ctx context.Context) error {
instance.mu.Lock()
svcrunner := instance.state[sc.service]
instance.mu.Unlock()
sc.instance.mu.Lock()
svcrunner := sc.instance.state[sc.service]
sc.instance.mu.Unlock()
if svcrunner == nil {
return sc.waitRegister(ctx)
@ -68,9 +69,9 @@ func (sc *serviceCondition) waitRegister(ctx context.Context) error {
var svcrunner *ServiceRunner
for {
instance.mu.Lock()
svcrunner = instance.state[sc.service]
instance.mu.Unlock()
sc.instance.mu.Lock()
svcrunner = sc.instance.state[sc.service]
sc.instance.mu.Unlock()
if svcrunner != nil {
break
@ -103,8 +104,13 @@ func (sc *serviceCondition) String() string {
// WaitForService waits for service to reach some state event.
func WaitForService(event StateEvent, service string) conditions.Condition {
return waitForService(instance, event, service)
}
func waitForService(instance *singleton, event StateEvent, service string) conditions.Condition {
return &serviceCondition{
event: event,
service: service,
instance: instance,
event: event,
service: service,
}
}

View File

@ -33,9 +33,10 @@ var WaitConditionCheckInterval = time.Second
type ServiceRunner struct {
mu sync.Mutex
runtime runtime.Runtime
service Service
id string
runtime runtime.Runtime
service Service
id string
instance *singleton
state events.ServiceState
events events.ServiceEvents
@ -44,23 +45,19 @@ type ServiceRunner struct {
stateSubscribers map[StateEvent][]chan<- struct{}
ctxMu sync.Mutex
ctx context.Context //nolint:containedctx
ctxCancel context.CancelFunc
stopCh chan struct{}
}
// NewServiceRunner creates new ServiceRunner around Service instance.
func NewServiceRunner(service Service, runtime runtime.Runtime) *ServiceRunner {
ctx, ctxCancel := context.WithCancel(context.Background())
func NewServiceRunner(instance *singleton, service Service, runtime runtime.Runtime) *ServiceRunner {
return &ServiceRunner{
service: service,
instance: instance,
runtime: runtime,
id: service.ID(runtime),
state: events.StateInitialized,
stateSubscribers: make(map[StateEvent][]chan<- struct{}),
ctx: ctx,
ctxCancel: ctxCancel,
stopCh: make(chan struct{}, 1),
}
}
@ -192,23 +189,32 @@ var ErrSkip = errors.New("service skipped")
// Run returns an error when a service stops.
//
// Run should be run in a goroutine.
func (svcrunner *ServiceRunner) Run() error {
defer func() {
// reset context for the next run
svcrunner.ctxMu.Lock()
svcrunner.ctx, svcrunner.ctxCancel = context.WithCancel(context.Background())
svcrunner.ctxMu.Unlock()
//
//nolint:gocyclo
func (svcrunner *ServiceRunner) Run(notifyChannels ...chan<- struct{}) error {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
go func() {
select {
case <-ctx.Done():
return
case <-svcrunner.stopCh:
cancel()
}
}()
svcrunner.ctxMu.Lock()
ctx := svcrunner.ctx
svcrunner.ctxMu.Unlock()
svcrunner.UpdateState(ctx, events.StateStarting, "Starting service")
for _, notifyCh := range notifyChannels {
close(notifyCh)
}
condition := svcrunner.service.Condition(svcrunner.runtime)
dependencies := svcrunner.service.DependsOn(svcrunner.runtime)
if len(dependencies) > 0 {
serviceConditions := xslices.Map(dependencies, func(dep string) conditions.Condition { return WaitForService(StateEventUp, dep) })
serviceConditions := xslices.Map(dependencies, func(dep string) conditions.Condition { return waitForService(instance, StateEventUp, dep) })
serviceDependencies := conditions.WaitForAll(serviceConditions...)
if condition != nil {
@ -318,10 +324,6 @@ func (svcrunner *ServiceRunner) run(ctx context.Context, runnr runner.Runner) er
}()
}
// when service run finishes, cancel context, this is important if service
// terminates on its own before being terminated by Stop()
defer svcrunner.ctxCancel()
select {
case <-ctx.Done():
err := runnr.Stop()
@ -344,9 +346,10 @@ func (svcrunner *ServiceRunner) run(ctx context.Context, runnr runner.Runner) er
//
// Shutdown completes when Start() returns.
func (svcrunner *ServiceRunner) Shutdown() {
svcrunner.ctxMu.Lock()
defer svcrunner.ctxMu.Unlock()
svcrunner.ctxCancel()
select {
case svcrunner.stopCh <- struct{}{}:
default:
}
}
// AsProto returns protobuf struct with the state of the service runner.

View File

@ -32,7 +32,7 @@ func (suite *ServiceRunnerSuite) assertStateSequence(expectedStates []events.Ser
}
func (suite *ServiceRunnerSuite) TestFullFlow() {
sr := system.NewServiceRunner(&MockService{
sr := system.NewServiceRunner(system.Services(nil), &MockService{
condition: conditions.None(),
}, nil)
@ -62,6 +62,7 @@ func (suite *ServiceRunnerSuite) TestFullFlow() {
suite.Assert().NoError(<-errCh)
suite.assertStateSequence([]events.ServiceState{
events.StateStarting,
events.StateWaiting,
events.StatePreparing,
events.StatePreparing,
@ -72,11 +73,11 @@ func (suite *ServiceRunnerSuite) TestFullFlow() {
suite.Assert().Equal("MockRunner", protoService.Id)
suite.Assert().Equal("Running", protoService.State)
suite.Assert().True(protoService.Health.Unknown)
suite.Assert().Len(protoService.Events.Events, 4)
suite.Assert().Len(protoService.Events.Events, 5)
}
func (suite *ServiceRunnerSuite) TestFullFlowHealthy() {
sr := system.NewServiceRunner(&MockHealthcheckedService{}, nil)
sr := system.NewServiceRunner(system.Services(nil), &MockHealthcheckedService{}, nil)
errCh := make(chan error)
@ -104,6 +105,7 @@ func (suite *ServiceRunnerSuite) TestFullFlowHealthy() {
suite.Assert().NoError(<-errCh)
suite.assertStateSequence([]events.ServiceState{
events.StateStarting,
events.StatePreparing,
events.StatePreparing,
events.StateRunning,
@ -117,7 +119,7 @@ func (suite *ServiceRunnerSuite) TestFullFlowHealthChanges() {
condition: conditions.None(),
},
}
sr := system.NewServiceRunner(&m, nil)
sr := system.NewServiceRunner(system.Services(nil), &m, nil)
errCh := make(chan error)
@ -161,6 +163,7 @@ func (suite *ServiceRunnerSuite) TestFullFlowHealthChanges() {
suite.Assert().NoError(<-errCh)
suite.assertStateSequence([]events.ServiceState{
events.StateStarting,
events.StateWaiting,
events.StatePreparing,
events.StatePreparing,
@ -181,7 +184,7 @@ func (suite *ServiceRunnerSuite) TestWaitingDescriptionChange() {
cond1 := NewMockCondition("cond1")
cond2 := NewMockCondition("cond2")
sr := system.NewServiceRunner(&MockService{
sr := system.NewServiceRunner(system.Services(nil), &MockService{
condition: conditions.WaitForAll(cond1, cond2),
}, nil)
@ -241,6 +244,7 @@ func (suite *ServiceRunnerSuite) TestWaitingDescriptionChange() {
suite.Assert().NoError(<-errCh)
suite.assertStateSequence([]events.ServiceState{
events.StateStarting,
events.StateWaiting,
events.StateWaiting,
events.StatePreparing,
@ -249,18 +253,19 @@ func (suite *ServiceRunnerSuite) TestWaitingDescriptionChange() {
}, sr)
events := sr.GetEventHistory(10000)
suite.Assert().Equal("Waiting for cond1, cond2", events[0].Message)
suite.Assert().Equal("Waiting for cond2", events[1].Message)
suite.Assert().Equal("Waiting for cond1, cond2", events[1].Message)
suite.Assert().Equal("Waiting for cond2", events[2].Message)
}
func (suite *ServiceRunnerSuite) TestPreStageFail() {
svc := &MockService{
preError: errors.New("pre failed"),
}
sr := system.NewServiceRunner(svc, nil)
sr := system.NewServiceRunner(system.Services(nil), svc, nil)
err := sr.Run()
suite.assertStateSequence([]events.ServiceState{
events.StateStarting,
events.StatePreparing,
}, sr)
suite.Assert().EqualError(err, "failed to run pre stage: pre failed")
@ -270,10 +275,11 @@ func (suite *ServiceRunnerSuite) TestRunnerStageFail() {
svc := &MockService{
runnerError: errors.New("runner failed"),
}
sr := system.NewServiceRunner(svc, nil)
sr := system.NewServiceRunner(system.Services(nil), svc, nil)
err := sr.Run()
suite.assertStateSequence([]events.ServiceState{
events.StateStarting,
events.StatePreparing,
events.StatePreparing,
}, sr)
@ -284,10 +290,11 @@ func (suite *ServiceRunnerSuite) TestRunnerStageSkipped() {
svc := &MockService{
nilRunner: true,
}
sr := system.NewServiceRunner(svc, nil)
sr := system.NewServiceRunner(system.Services(nil), svc, nil)
err := sr.Run()
suite.assertStateSequence([]events.ServiceState{
events.StateStarting,
events.StatePreparing,
events.StatePreparing,
}, sr)
@ -298,7 +305,7 @@ func (suite *ServiceRunnerSuite) TestAbortOnCondition() {
svc := &MockService{
condition: conditions.WaitForFileToExist("/doesntexistever"),
}
sr := system.NewServiceRunner(svc, nil)
sr := system.NewServiceRunner(system.Services(nil), svc, nil)
errCh := make(chan error)
@ -326,6 +333,7 @@ func (suite *ServiceRunnerSuite) TestAbortOnCondition() {
suite.Assert().EqualError(<-errCh, "condition failed: context canceled")
suite.assertStateSequence([]events.ServiceState{
events.StateStarting,
events.StateWaiting,
}, sr)
}
@ -335,19 +343,23 @@ func (suite *ServiceRunnerSuite) TestPostStateFail() {
condition: conditions.None(),
postError: errors.New("post failed"),
}
sr := system.NewServiceRunner(svc, nil)
sr := system.NewServiceRunner(system.Services(nil), svc, nil)
errCh := make(chan error)
runNotify := make(chan struct{})
go func() {
errCh <- sr.Run()
errCh <- sr.Run(runNotify)
}()
<-runNotify
sr.Shutdown()
suite.Assert().NoError(<-errCh)
suite.assertStateSequence([]events.ServiceState{
events.StateStarting,
events.StateWaiting,
events.StatePreparing,
events.StatePreparing,
@ -359,7 +371,7 @@ func (suite *ServiceRunnerSuite) TestPostStateFail() {
func (suite *ServiceRunnerSuite) TestRunFail() {
runner := &MockRunner{exitCh: make(chan error)}
svc := &MockService{runner: runner}
sr := system.NewServiceRunner(svc, nil)
sr := system.NewServiceRunner(system.Services(nil), svc, nil)
errCh := make(chan error)
@ -372,6 +384,7 @@ func (suite *ServiceRunnerSuite) TestRunFail() {
suite.Assert().EqualError(<-errCh, "failed running service: error running service: run failed")
suite.assertStateSequence([]events.ServiceState{
events.StateStarting,
events.StatePreparing,
events.StatePreparing,
events.StateRunning,
@ -379,7 +392,7 @@ func (suite *ServiceRunnerSuite) TestRunFail() {
}
func (suite *ServiceRunnerSuite) TestFullFlowRestart() {
sr := system.NewServiceRunner(&MockService{
sr := system.NewServiceRunner(system.Services(nil), &MockService{
condition: conditions.None(),
}, nil)
@ -408,10 +421,14 @@ func (suite *ServiceRunnerSuite) TestFullFlowRestart() {
suite.Assert().NoError(<-errCh)
notifyCh := make(chan struct{})
go func() {
errCh <- sr.Run()
errCh <- sr.Run(notifyCh)
}()
<-notifyCh
suite.Require().NoError(retry.Constant(time.Minute, retry.WithUnits(10*time.Millisecond)).Retry(func() error {
state := sr.AsProto().State
if state != events.StateRunning.String() {
@ -432,10 +449,12 @@ func (suite *ServiceRunnerSuite) TestFullFlowRestart() {
suite.Assert().NoError(<-errCh)
suite.assertStateSequence([]events.ServiceState{
events.StateStarting,
events.StateWaiting,
events.StatePreparing,
events.StatePreparing,
events.StateRunning,
events.StateStarting,
events.StateWaiting,
events.StatePreparing,
events.StatePreparing,

View File

@ -47,16 +47,20 @@ var (
once sync.Once
)
func newServices(runtime runtime.Runtime) *singleton {
return &singleton{
runtime: runtime,
state: map[string]*ServiceRunner{},
running: map[string]struct{}{},
}
}
// Services returns the instance of the system services API.
//
//nolint:revive,golint
func Services(runtime runtime.Runtime) *singleton {
once.Do(func() {
instance = &singleton{
runtime: runtime,
state: map[string]*ServiceRunner{},
running: map[string]struct{}{},
}
instance = newServices(runtime)
})
return instance
@ -84,7 +88,7 @@ func (s *singleton) Load(services ...Service) []string {
continue
}
svcrunner := NewServiceRunner(service, s.runtime)
svcrunner := NewServiceRunner(s, service, s.runtime)
s.state[id] = svcrunner
}
@ -162,6 +166,8 @@ func (s *singleton) Start(serviceIDs ...string) error {
continue
}
runNotify := make(chan struct{})
s.wg.Add(1)
go func(id string, svcrunner *ServiceRunner) {
@ -173,7 +179,7 @@ func (s *singleton) Start(serviceIDs ...string) error {
}()
defer s.wg.Done()
return svcrunner.Run()
return svcrunner.Run(runNotify)
}()
switch {
@ -190,6 +196,9 @@ func (s *singleton) Start(serviceIDs ...string) error {
svcrunner.UpdateState(context.Background(), events.StateFailed, msg)
}
}(id, svcrunner)
// wait for svcrunner.Run to enter the running phase, and then return
<-runNotify
}
return multiErr.ErrorOrNil()
@ -347,12 +356,12 @@ func (s *singleton) stopServices(ctx context.Context, services []string, waitFor
for name, svcrunner := range servicesToStop {
shutdownWg.Add(1)
stoppedConds = append(stoppedConds, WaitForService(StateEventDown, name))
stoppedConds = append(stoppedConds, waitForService(s, StateEventDown, name))
go func(svcrunner *ServiceRunner, reverseDeps []string) {
defer shutdownWg.Done()
conds := xslices.Map(reverseDeps, func(dep string) conditions.Condition { return WaitForService(StateEventDown, dep) })
conds := xslices.Map(reverseDeps, func(dep string) conditions.Condition { return waitForService(s, StateEventDown, dep) })
allDeps := conditions.WaitForAll(conds...)
if err := allDeps.Wait(shutdownCtx); err != nil {

File diff suppressed because it is too large Load Diff

View File

@ -7714,6 +7714,7 @@ File type.
| FINISHED | 5 | |
| FAILED | 6 | |
| SKIPPED | 7 | |
| STARTING | 8 | |