From cf2ccc521f6a15b8b82bf5fbaab572f481f8edf7 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Thu, 23 Mar 2023 16:59:30 +0400 Subject: [PATCH] fix: always shutdown maintenance API service The problem was that `GracefulStop()` blocks until every running API call has finished, so it hangs forever if a call never finishes. So if there is a running streaming call, the maintenance service might hang until it is finished. The problem shows up with the 'Upgrade' API in the maintenance mode if there is a concurrent streaming API call, e.g.: 1. A Watch API call is running against maintenance mode. 2. An Upgrade API call is issued; it tries to run the MaintenanceUpgrade sequence, which tries to take over the Initialize sequence. The Initialize sequence is canceled, the maintenance API service context is canceled, but the service doesn't terminate, as it's stuck in `GracefulStop`. The sequence takeover times out because, even though the Initialize sequence has been canceled, it hasn't terminated yet. Sample log: ``` [talos] upgrade request received: "ghcr.io/siderolabs/installer:v1.3.3" [talos] upgrade failed: failed to acquire lock: timeout [talos] task loadConfig (1/1): failed: failed to receive config via maintenance service: maintenance service failed: context canceled [talos] phase config (6/7): failed [talos] initialize sequence: failed ``` Signed-off-by: Andrey Smirnov --- internal/app/maintenance/main.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/internal/app/maintenance/main.go b/internal/app/maintenance/main.go index e12e524d1..5e8bc4240 100644 --- a/internal/app/maintenance/main.go +++ b/internal/app/maintenance/main.go @@ -120,7 +120,12 @@ func Run(ctx context.Context, logger *log.Logger) ([]byte, error) { return nil, err } - defer server.GracefulStop() + defer func() { + shutdownCtx, shutdownCancel := context.WithTimeout(ctx, 5*time.Second) + defer shutdownCancel() + + factory.ServerGracefulStop(server, shutdownCtx) + }() go func() { //nolint:errcheck @@ -156,12 +161,7 @@ func Run(ctx context.Context, logger 
*log.Logger) ([]byte, error) { select { case cfg := <-cfgCh: - shutdownCtx, shutdownCancel := context.WithTimeout(ctx, 5*time.Second) - defer shutdownCancel() - - factory.ServerGracefulStop(server, shutdownCtx) - - return cfg, err + return cfg, nil case <-ctx.Done(): return nil, ctx.Err() }