fix: implement Unload() for services to make sure bootkube always runs

The problem was that the flow to re-run a service with different
parameters was not consistent: it depended on whether the service was
loaded before or not, and that is not reliable. For example, with the
bootstrap API `bootkube` is loaded for the bootstrap and stays until
reboot, and is never loaded for any other boot.

`Unload()` stops and removes the service completely so that a new instance
of the service can be loaded and started.

This fixes the edge case where the recovery API did not run bootkube
properly when used after bootstrap but before a reboot.

Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
This commit is contained in:
Andrey Smirnov 2020-07-08 16:53:45 +03:00 committed by talos-bot
parent ac28b9c976
commit d210d7f1a3
3 changed files with 40 additions and 40 deletions

View File

@ -1573,18 +1573,15 @@ func Recover(seq runtime.Sequence, data interface{}) runtime.TaskExecutionFunc {
svc := &services.Bootkube{Recover: true}
if r.Config().Machine().Type() == runtime.MachineTypeControlPlane {
system.Services(r).LoadAndStart(svc)
} else {
loaded := system.Services(r).Reload(svc)
// unload bootkube (if any instance ran before)
if err = system.Services(r).Unload(ctx, svc.ID(r)); err != nil {
return err
}
if len(loaded) == 0 {
return fmt.Errorf("bootkube service is already running")
}
system.Services(r).Load(svc)
if err = system.Services(r).Start(svc.ID(r)); err != nil {
return fmt.Errorf("failed to start bootkube: %w", err)
}
if err = system.Services(r).Start(svc.ID(r)); err != nil {
return fmt.Errorf("failed to start bootkube: %w", err)
}
return nil
@ -1633,7 +1630,15 @@ func BootstrapEtcd(seq runtime.Sequence, data interface{}) runtime.TaskExecution
svc := &services.Etcd{Bootstrap: true}
system.Services(r).ReloadAndStart(svc)
if err = system.Services(r).Unload(ctx, svc.ID(r)); err != nil {
return err
}
system.Services(r).Load(svc)
if err = system.Services(r).Start(svc.ID(r)); err != nil {
return fmt.Errorf("error starting etcd in bootstrap mode: %w", err)
}
ctx, cancel := context.WithTimeout(ctx, 10*time.Minute)
defer cancel()

View File

@ -14,7 +14,6 @@ import (
"github.com/hashicorp/go-multierror"
"github.com/talos-systems/talos/internal/app/machined/pkg/runtime"
"github.com/talos-systems/talos/internal/app/machined/pkg/system/events"
"github.com/talos-systems/talos/internal/pkg/conditions"
)
@ -84,33 +83,35 @@ func (s *singleton) Load(services ...Service) []string {
return ids
}
// Reload adds service to the list of services managed by the runner.
// Unload stops the service and removes it from the list of running services.
//
// Reload returns service IDs for each of the services.
func (s *singleton) Reload(services ...Service) []string {
// It is not an error to unload a service which was already removed or stopped.
func (s *singleton) Unload(ctx context.Context, serviceIDs ...string) error {
s.mu.Lock()
defer s.mu.Unlock()
if s.terminating {
s.mu.Unlock()
return nil
}
ids := make([]string, 0, len(services))
servicesToRemove := []string{}
for _, service := range services {
id := service.ID(s.runtime)
ids = append(ids, id)
if svc, exists := s.state[id]; exists {
switch svc.GetState() {
case events.StateFailed, events.StateFinished, events.StateSkipped:
svcrunner := NewServiceRunner(service, s.runtime)
s.state[id] = svcrunner
return ids
}
for _, id := range serviceIDs {
if _, exists := s.state[id]; exists {
servicesToRemove = append(servicesToRemove, id)
}
}
s.mu.Unlock()
if err := s.Stop(ctx, servicesToRemove...); err != nil {
return fmt.Errorf("error stopping services %v: %w", servicesToRemove, err)
}
s.mu.Lock()
defer s.mu.Unlock()
for _, id := range servicesToRemove {
delete(s.state, id)
}
return nil
}
@ -189,15 +190,6 @@ func (s *singleton) LoadAndStart(services ...Service) {
}
}
// ReloadAndStart combines Reload and Start into single call.
func (s *singleton) ReloadAndStart(services ...Service) {
err := s.Start(s.Reload(services...)...)
if err != nil {
// should never happen
panic(err)
}
}
// Shutdown all the services.
func (s *singleton) Shutdown() {
s.mu.Lock()

View File

@ -22,9 +22,12 @@ func (suite *SystemServicesSuite) TestStartShutdown() {
system.Services(nil).LoadAndStart(
&MockService{name: "containerd"},
&MockService{name: "trustd", dependencies: []string{"containerd"}},
&MockService{name: "osd", dependencies: []string{"containerd", "osd"}},
&MockService{name: "osd", dependencies: []string{"containerd", "trustd"}},
)
time.Sleep(10 * time.Millisecond)
suite.Require().NoError(system.Services(nil).Unload(context.Background(), "trustd", "notrunning"))
system.Services(nil).Shutdown()
}