fix: improve etcd leave on reset process
When removing a member from `etcd`, the server does a pre-check to make sure the member is connected to a quorum of other members, so the remove request might fail. Add a retry to wait for etcd to be fully connected before giving up, as some parts of the reset flow have already run. Also fix an issue which appears in the integration test when `reset` is called early in the boot sequence, before the local etcd has fully started. Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com>
This commit is contained in:
parent
638dc9128f
commit
40e69af224
@ -14,6 +14,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/cosi-project/runtime/pkg/state"
|
||||
"github.com/siderolabs/go-retry/retry"
|
||||
"go.etcd.io/etcd/api/v3/etcdserverpb"
|
||||
"go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
|
||||
"go.etcd.io/etcd/client/pkg/v3/transport"
|
||||
@ -155,7 +156,19 @@ func (c *Client) LeaveCluster(ctx context.Context, st state.State) error {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := c.RemoveMemberByMemberID(ctx, memberID); err != nil {
|
||||
if err := retry.Constant(5*time.Minute, retry.WithUnits(10*time.Second)).RetryWithContext(ctx, func(ctx context.Context) error {
|
||||
err := c.RemoveMemberByMemberID(ctx, memberID)
|
||||
if err == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
if errors.Is(err, rpctypes.ErrUnhealthy) {
|
||||
// unhealthy is returned when the member hasn't established connections with a quorum of other members
|
||||
return retry.ExpectedError(err)
|
||||
}
|
||||
|
||||
return err
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
|
@ -7,6 +7,7 @@ package etcd
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/cosi-project/runtime/pkg/safe"
|
||||
"github.com/cosi-project/runtime/pkg/state"
|
||||
@ -16,10 +17,14 @@ import (
|
||||
|
||||
// GetLocalMemberID gets the etcd member id of the local node via resources.
|
||||
func GetLocalMemberID(ctx context.Context, s state.State) (uint64, error) {
|
||||
member, err := safe.ReaderGet[*etcd.Member](
|
||||
ctx, cancel := context.WithTimeout(ctx, 3*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
member, err := safe.StateWatchFor[*etcd.Member](
|
||||
ctx,
|
||||
s,
|
||||
etcd.NewMember(etcd.NamespaceName, etcd.LocalMemberID).Metadata(),
|
||||
state.WithEventTypes(state.Created),
|
||||
)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("failed to get local etcd member ID: %w", err)
|
||||
|
Loading…
x
Reference in New Issue
Block a user