fix: improve etcd leave on reset process

When removing a member from `etcd`, the server does a pre-check to make
sure the member is connected to a quorum of other members, and the
remove request might fail. Add a retry to wait for etcd to be fully
connected before giving up, as some parts of the reset flow have already run.

Also fix an issue which appears in the integration test when `reset` is
called early in the boot sequence, before local etcd has fully started.

Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com>
This commit is contained in:
Andrey Smirnov 2023-02-28 22:58:57 +04:00
parent 638dc9128f
commit 40e69af224
No known key found for this signature in database
GPG Key ID: 7B26396447AB6DFD
2 changed files with 20 additions and 2 deletions

View File

@ -14,6 +14,7 @@ import (
"time"
"github.com/cosi-project/runtime/pkg/state"
"github.com/siderolabs/go-retry/retry"
"go.etcd.io/etcd/api/v3/etcdserverpb"
"go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
"go.etcd.io/etcd/client/pkg/v3/transport"
@ -155,7 +156,19 @@ func (c *Client) LeaveCluster(ctx context.Context, st state.State) error {
return err
}
if err := c.RemoveMemberByMemberID(ctx, memberID); err != nil {
if err := retry.Constant(5*time.Minute, retry.WithUnits(10*time.Second)).RetryWithContext(ctx, func(ctx context.Context) error {
err := c.RemoveMemberByMemberID(ctx, memberID)
if err == nil {
return nil
}
if errors.Is(err, rpctypes.ErrUnhealthy) {
// unhealthy is returned when the member hasn't established connections with a quorum of other members
return retry.ExpectedError(err)
}
return err
}); err != nil {
return err
}

View File

@ -7,6 +7,7 @@ package etcd
import (
"context"
"fmt"
"time"
"github.com/cosi-project/runtime/pkg/safe"
"github.com/cosi-project/runtime/pkg/state"
@ -16,10 +17,14 @@ import (
// GetLocalMemberID gets the etcd member id of the local node via resources.
func GetLocalMemberID(ctx context.Context, s state.State) (uint64, error) {
member, err := safe.ReaderGet[*etcd.Member](
ctx, cancel := context.WithTimeout(ctx, 3*time.Minute)
defer cancel()
member, err := safe.StateWatchFor[*etcd.Member](
ctx,
s,
etcd.NewMember(etcd.NamespaceName, etcd.LocalMemberID).Metadata(),
state.WithEventTypes(state.Created),
)
if err != nil {
return 0, fmt.Errorf("failed to get local etcd member ID: %w", err)