fix: ignore completed pods in cluster health check

This fixes an error where the integration test becomes stuck with a message
like:

```
waiting for coredns to report ready: some pods are not ready: [coredns-868c687b7-g2z64]
```

After some random sequence of node restarts, one of the pods might become
"stuck" in the `Completed` state (as shown by `kubectl get pods`), blocking
the check, since such a pod will never become ready.
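
For illustration, the state described above can be observed directly through the Kubernetes API: the pod's phase is still `Running`, yet every container reports a terminated state. Below is a minimal client-go sketch of that observation; the `listCompletedPods` helper and its package are hypothetical and not part of this change.

```go
package example

import (
	"context"
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// listCompletedPods prints pods which `kubectl get pods` would render as
// 'Completed': phase is still Running, but every container has terminated.
// Hypothetical helper for illustration only.
func listCompletedPods(ctx context.Context, clientset *kubernetes.Clientset, namespace string) error {
	pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{})
	if err != nil {
		return err
	}

	for _, pod := range pods.Items {
		// only pods still reported as Running can show up as 'Completed'
		if pod.Status.Phase != v1.PodRunning || len(pod.Status.ContainerStatuses) == 0 {
			continue
		}

		allTerminated := true

		for _, cs := range pod.Status.ContainerStatuses {
			if cs.State.Terminated == nil {
				allTerminated = false

				break
			}
		}

		if allTerminated {
			fmt.Printf("%s/%s shows as Completed (phase %s)\n", pod.Namespace, pod.Name, pod.Status.Phase)
		}
	}

	return nil
}
```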

Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com>

@@ -275,10 +275,33 @@ func K8sPodReadyAssertion(ctx context.Context, cluster cluster.K8sProvider, name

```go
	var notReadyPods, readyPods []string

	for _, pod := range pods.Items {
		// skip deleted pods
		if pod.DeletionTimestamp != nil {
			continue
		}

		// skip failed pods
		if pod.Status.Phase == v1.PodFailed {
			continue
		}

		// skip pods which `kubectl get pods` marks as 'Completed':
		// * these pods have a phase 'Running', but all containers are terminated
		// * such pods appear after a graceful kubelet shutdown
		allContainersTerminated := true

		for _, containerStatus := range pod.Status.ContainerStatuses {
			if containerStatus.State.Terminated == nil {
				allContainersTerminated = false

				break
			}
		}

		if allContainersTerminated {
			continue
		}

		ready := false

		for _, cond := range pod.Status.Conditions {
```
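
The hunk cuts off at the start of the conditions loop. The usual way such a loop decides readiness is by checking the `PodReady` condition; a minimal sketch of that check follows, assuming the same `v1` (`k8s.io/api/core/v1`) import as above. The `isPodReady` helper is hypothetical and not part of this diff.

```go
// isPodReady reports whether the pod's PodReady condition is True -- the
// check the truncated loop above presumably performs.
// Hypothetical helper for illustration only.
func isPodReady(pod v1.Pod) bool {
	for _, cond := range pod.Status.Conditions {
		if cond.Type == v1.PodReady && cond.Status == v1.ConditionTrue {
			return true
		}
	}

	return false
}
```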