fix: implement Unload() for services to make sure bootkube always runs
The problem was that the flow for re-running a service with different parameters was not consistent: it depended on whether the service had been loaded before, which is not reliable. For example, with the bootstrap API, `bootkube` is loaded for the bootstrap and stays loaded until reboot, and it is never loaded on any other boot. `Unload()` stops the service and removes it completely, so that a new instance of the service can be loaded and started. This fixes the edge case where the recovery API did not run bootkube properly when invoked after bootstrap but before a reboot. Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
This commit is contained in:
parent
ac28b9c976
commit
d210d7f1a3
@ -1573,18 +1573,15 @@ func Recover(seq runtime.Sequence, data interface{}) runtime.TaskExecutionFunc {
|
||||
|
||||
svc := &services.Bootkube{Recover: true}
|
||||
|
||||
if r.Config().Machine().Type() == runtime.MachineTypeControlPlane {
|
||||
system.Services(r).LoadAndStart(svc)
|
||||
} else {
|
||||
loaded := system.Services(r).Reload(svc)
|
||||
// unload bootkube (if any instance ran before)
|
||||
if err = system.Services(r).Unload(ctx, svc.ID(r)); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if len(loaded) == 0 {
|
||||
return fmt.Errorf("bootkube service is already running")
|
||||
}
|
||||
system.Services(r).Load(svc)
|
||||
|
||||
if err = system.Services(r).Start(svc.ID(r)); err != nil {
|
||||
return fmt.Errorf("failed to start bootkube: %w", err)
|
||||
}
|
||||
if err = system.Services(r).Start(svc.ID(r)); err != nil {
|
||||
return fmt.Errorf("failed to start bootkube: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
@ -1633,7 +1630,15 @@ func BootstrapEtcd(seq runtime.Sequence, data interface{}) runtime.TaskExecution
|
||||
|
||||
svc := &services.Etcd{Bootstrap: true}
|
||||
|
||||
system.Services(r).ReloadAndStart(svc)
|
||||
if err = system.Services(r).Unload(ctx, svc.ID(r)); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
system.Services(r).Load(svc)
|
||||
|
||||
if err = system.Services(r).Start(svc.ID(r)); err != nil {
|
||||
return fmt.Errorf("error starting etcd in bootstrap mode: %w", err)
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(ctx, 10*time.Minute)
|
||||
defer cancel()
|
||||
|
@ -14,7 +14,6 @@ import (
|
||||
"github.com/hashicorp/go-multierror"
|
||||
|
||||
"github.com/talos-systems/talos/internal/app/machined/pkg/runtime"
|
||||
"github.com/talos-systems/talos/internal/app/machined/pkg/system/events"
|
||||
"github.com/talos-systems/talos/internal/pkg/conditions"
|
||||
)
|
||||
|
||||
@ -84,33 +83,35 @@ func (s *singleton) Load(services ...Service) []string {
|
||||
return ids
|
||||
}
|
||||
|
||||
// Reload adds service to the list of services managed by the runner.
|
||||
// Unload stops the service and removes it from the list of running services.
|
||||
//
|
||||
// Reload returns service IDs for each of the services.
|
||||
func (s *singleton) Reload(services ...Service) []string {
|
||||
// It is not an error to unload a service which was already removed or stopped.
|
||||
func (s *singleton) Unload(ctx context.Context, serviceIDs ...string) error {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
if s.terminating {
|
||||
s.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
ids := make([]string, 0, len(services))
|
||||
servicesToRemove := []string{}
|
||||
|
||||
for _, service := range services {
|
||||
id := service.ID(s.runtime)
|
||||
ids = append(ids, id)
|
||||
|
||||
if svc, exists := s.state[id]; exists {
|
||||
switch svc.GetState() {
|
||||
case events.StateFailed, events.StateFinished, events.StateSkipped:
|
||||
svcrunner := NewServiceRunner(service, s.runtime)
|
||||
s.state[id] = svcrunner
|
||||
|
||||
return ids
|
||||
}
|
||||
for _, id := range serviceIDs {
|
||||
if _, exists := s.state[id]; exists {
|
||||
servicesToRemove = append(servicesToRemove, id)
|
||||
}
|
||||
}
|
||||
s.mu.Unlock()
|
||||
|
||||
if err := s.Stop(ctx, servicesToRemove...); err != nil {
|
||||
return fmt.Errorf("error stopping services %v: %w", servicesToRemove, err)
|
||||
}
|
||||
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
for _, id := range servicesToRemove {
|
||||
delete(s.state, id)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
@ -189,15 +190,6 @@ func (s *singleton) LoadAndStart(services ...Service) {
|
||||
}
|
||||
}
|
||||
|
||||
// ReloadAndStart combines Reload and Start into single call.
|
||||
func (s *singleton) ReloadAndStart(services ...Service) {
|
||||
err := s.Start(s.Reload(services...)...)
|
||||
if err != nil {
|
||||
// should never happen
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
// Shutdown all the services.
|
||||
func (s *singleton) Shutdown() {
|
||||
s.mu.Lock()
|
||||
|
@ -22,9 +22,12 @@ func (suite *SystemServicesSuite) TestStartShutdown() {
|
||||
system.Services(nil).LoadAndStart(
|
||||
&MockService{name: "containerd"},
|
||||
&MockService{name: "trustd", dependencies: []string{"containerd"}},
|
||||
&MockService{name: "osd", dependencies: []string{"containerd", "osd"}},
|
||||
&MockService{name: "osd", dependencies: []string{"containerd", "trustd"}},
|
||||
)
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
suite.Require().NoError(system.Services(nil).Unload(context.Background(), "trustd", "notrunning"))
|
||||
|
||||
system.Services(nil).Shutdown()
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user