fix: properly cleanup legacy static pod manifests directory

When upgrading from an older version of Talos, which used the static pod
manifests directory, to a new version, which provides static pods via an
internal web server, we need to make sure that legacy static pods are
cleaned up; otherwise kubelet receives "two" versions of the static pods,
which makes it fail to run them.

The previous cleanup location wasn't working properly, as
`/etc/kubernetes/manifests` exists in the rootfs (and it's empty), while
the actual contents are in `/var`, and they appear only once the
respective overlay mount is done.

The controller tried to clean up on start, saw nothing (as it was looking
into the rootfs), and then moved on to its other work. As a result, when
the overlay was mounted, the static pods were still there, while the
controller would make its next cleanup attempt only after it fails, and
the next time it fails kubelet is already running and has already picked
up those stale definitions.

Fix all of that by moving the cleanup into the sequencer, after the overlayfs mount.

Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com>
This commit is contained in:
Andrey Smirnov 2022-10-27 23:09:47 +04:00
parent 6ee47bcc61
commit 7e50e24c01
No known key found for this signature in database
GPG Key ID: 7B26396447AB6DFD
3 changed files with 39 additions and 39 deletions

View File

@ -7,9 +7,6 @@ package k8s
import (
"context"
"fmt"
"os"
"path/filepath"
"strings"
"time"
"github.com/cosi-project/runtime/pkg/controller"
@ -20,7 +17,6 @@ import (
k8sadapter "github.com/talos-systems/talos/internal/app/machined/pkg/adapters/k8s"
"github.com/talos-systems/talos/pkg/kubernetes/kubelet"
"github.com/talos-systems/talos/pkg/machinery/constants"
"github.com/talos-systems/talos/pkg/machinery/resources/k8s"
"github.com/talos-systems/talos/pkg/machinery/resources/secrets"
"github.com/talos-systems/talos/pkg/machinery/resources/v1alpha1"
@ -78,10 +74,6 @@ func (ctrl *KubeletStaticPodController) Outputs() []controller.Output {
//
//nolint:gocyclo,cyclop
func (ctrl *KubeletStaticPodController) Run(ctx context.Context, r controller.Runtime, logger *zap.Logger) error {
if err := ctrl.cleanupLegacyStaticPodFiles(logger); err != nil {
logger.Warn("error cleaning up legacy static pod files: %w", zap.Error(err))
}
var kubeletClient *kubelet.Client
refreshTicker := time.NewTicker(15 * time.Second) // refresh kubelet pods status every 15 seconds
@ -172,37 +164,6 @@ func (ctrl *KubeletStaticPodController) Run(ctx context.Context, r controller.Ru
}
}
// cleanupLegacyStaticPodFiles deletes Talos-owned static pod manifests from the manifests directory.
//
// Only entries whose name starts with constants.TalosManifestPrefix are removed, and each removal is logged.
// The first failure (opening the directory, listing it, or removing a file) stops the cleanup and is
// returned wrapped with context.
func (ctrl *KubeletStaticPodController) cleanupLegacyStaticPodFiles(logger *zap.Logger) error {
	dir, openErr := os.Open(constants.ManifestsDirectory)
	if openErr != nil {
		return fmt.Errorf("error opening manifests directory: %w", openErr)
	}

	defer dir.Close() //nolint:errcheck

	// A non-positive count asks for every directory entry in a single call.
	names, listErr := dir.Readdirnames(0)
	if listErr != nil {
		return fmt.Errorf("error listing manifests: %w", listErr)
	}

	for _, name := range names {
		// Only files carrying the Talos prefix are owned by Talos; everything else is left alone.
		if strings.HasPrefix(name, constants.TalosManifestPrefix) {
			target := filepath.Join(constants.ManifestsDirectory, name)

			logger.Sugar().Infof("cleaning up legacy static pod file %q", target)

			if removeErr := os.Remove(target); removeErr != nil {
				return fmt.Errorf("error cleaning up legacy static pod file: %w", removeErr)
			}
		}
	}

	return nil
}
func (ctrl *KubeletStaticPodController) teardownStatuses(ctx context.Context, r controller.Runtime) error {
statuses, err := r.List(ctx, resource.NewMetadata(k8s.NamespaceName, k8s.StaticPodStatusType, "", resource.VersionUndefined))
if err != nil {

View File

@ -210,6 +210,9 @@ func (*Sequencer) Boot(r runtime.Runtime) []runtime.Phase {
r.State().Platform().Mode() != runtime.ModeContainer,
"overlay",
MountOverlayFilesystems,
).Append(
"legacyCleanup",
CleanupLegacyStaticPodFiles,
).Append(
"udevSetup",
WriteUdevRules,

View File

@ -2043,6 +2043,42 @@ func ForceCleanup(seq runtime.Sequence, data interface{}) (runtime.TaskExecution
}, "forceCleanup"
}
// CleanupLegacyStaticPodFiles removes legacy static pod files in the manifests directory.
//
// This is part of the transition to Talos 1.3.0, as Talos 1.3.0 serves static pods from an internal web server.
//
// The task has to run after the overlay filesystems are mounted: the manifests directory in the rootfs
// is empty, and the actual legacy manifests only become visible once the overlay is in place.
//
// Only files whose names start with constants.TalosManifestPrefix are removed; any other file in
// constants.ManifestsDirectory is left untouched. A missing manifests directory, or a manifest which
// disappears before it can be removed, is not an error, as in both cases there is nothing left to clean up.
// Any other failure is returned wrapped with context.
func CleanupLegacyStaticPodFiles(seq runtime.Sequence, data interface{}) (runtime.TaskExecutionFunc, string) {
	return func(ctx context.Context, logger *log.Logger, r runtime.Runtime) error {
		manifestDir, err := os.Open(constants.ManifestsDirectory)
		if err != nil {
			// No directory means no legacy manifests: don't fail the boot sequence over it.
			if os.IsNotExist(err) {
				logger.Printf("manifests directory %q does not exist, skipping legacy static pod cleanup", constants.ManifestsDirectory)

				return nil
			}

			return fmt.Errorf("error opening manifests directory: %w", err)
		}

		defer manifestDir.Close() //nolint:errcheck

		// A non-positive count asks for every directory entry in a single call.
		manifests, err := manifestDir.Readdirnames(0)
		if err != nil {
			return fmt.Errorf("error listing manifests: %w", err)
		}

		for _, manifest := range manifests {
			// skip manifests not owned by Talos
			if !strings.HasPrefix(manifest, constants.TalosManifestPrefix) {
				continue
			}

			podPath := filepath.Join(constants.ManifestsDirectory, manifest)

			logger.Printf("cleaning up legacy static pod file %q", podPath)

			// A file which is already gone is exactly the desired end state, so only report other failures.
			if err = os.Remove(podPath); err != nil && !os.IsNotExist(err) {
				return fmt.Errorf("error cleaning up legacy static pod file: %w", err)
			}
		}

		return nil
	}, "cleanupLegacyStaticPodFiles"
}
func pauseOnFailure(callback func(runtime.Sequence, interface{}) (runtime.TaskExecutionFunc, string),
timeout time.Duration,
) func(seq runtime.Sequence, data interface{}) (runtime.TaskExecutionFunc, string) {