fix: ignore completed pods in cluster health check

This fixes an error where the integration test becomes stuck with a message
like:

```
waiting for coredns to report ready: some pods are not ready: [coredns-868c687b7-g2z64]
```

After some random sequence of node restarts, one of the pods might become
"stuck" in the `Completed` state (as shown by `kubectl get pods`), blocking
the check, since such a pod will never become ready.
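
For illustration, the state described above can be observed directly through the Kubernetes API: the pod's phase is still `Running`, yet every container reports a terminated state. Below is a minimal client-go sketch of that observation; the `listCompletedPods` helper and its package are hypothetical and not part of this change.

```go
package example

import (
	"context"
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// listCompletedPods prints pods which `kubectl get pods` would render as
// 'Completed': phase is still Running, but every container has terminated.
// Hypothetical helper for illustration only.
func listCompletedPods(ctx context.Context, clientset *kubernetes.Clientset, namespace string) error {
	pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{})
	if err != nil {
		return err
	}

	for _, pod := range pods.Items {
		// only pods still reported as Running can show up as 'Completed'
		if pod.Status.Phase != v1.PodRunning || len(pod.Status.ContainerStatuses) == 0 {
			continue
		}

		allTerminated := true

		for _, cs := range pod.Status.ContainerStatuses {
			if cs.State.Terminated == nil {
				allTerminated = false

				break
			}
		}

		if allTerminated {
			fmt.Printf("%s/%s shows as Completed (phase %s)\n", pod.Namespace, pod.Name, pod.Status.Phase)
		}
	}

	return nil
}
```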

Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com>

@@ -275,10 +275,33 @@ func K8sPodReadyAssertion(ctx context.Context, cluster cluster.K8sProvider, name

```go
	var notReadyPods, readyPods []string

	for _, pod := range pods.Items {
		// skip deleted pods
		if pod.DeletionTimestamp != nil {
			continue
		}

		// skip failed pods
		if pod.Status.Phase == v1.PodFailed {
			continue
		}

		// skip pods which `kubectl get pods` marks as 'Completed':
		// * these pods have a phase 'Running', but all containers are terminated
		// * such pods appear after a graceful kubelet shutdown
		allContainersTerminated := true

		for _, containerStatus := range pod.Status.ContainerStatuses {
			if containerStatus.State.Terminated == nil {
				allContainersTerminated = false

				break
			}
		}

		if allContainersTerminated {
			continue
		}

		ready := false

		for _, cond := range pod.Status.Conditions {
```
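
The hunk cuts off at the start of the conditions loop. The usual way such a loop decides readiness is by checking the `PodReady` condition; a minimal sketch of that check follows, assuming the same `v1` (`k8s.io/api/core/v1`) import as above. The `isPodReady` helper is hypothetical and not part of this diff.

```go
// isPodReady reports whether the pod's PodReady condition is True -- the
// check the truncated loop above presumably performs.
// Hypothetical helper for illustration only.
func isPodReady(pod v1.Pod) bool {
	for _, cond := range pod.Status.Conditions {
		if cond.Type == v1.PodReady && cond.Status == v1.ConditionTrue {
			return true
		}
	}

	return false
}
```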