fix: work around a race in CNI setup (talosctl cluster create)
When provisioning VMs, each launch process sets up a CNI network, and from time to time CNI setup fails with something like: ``` error provisioning CNI network: plugin type="firewall" failed (add): running [/sbin/iptables -t filter -N CNI-ADMIN --wait]: exit status 4: iptables v1.8.10 (nf_tables) ``` This is a race condition in the CNI plugins, and it looks like there is no fix for it (see e.g. https://github.com/hashicorp/nomad/issues/8838). As a workaround, take a mutex around CNI operations to serialize them. CNI setup happens in different processes, so use a file-based mutex. Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
This commit is contained in:
parent
457507803d
commit
b2ad5dc5f8
1
go.mod
1
go.mod
@ -34,6 +34,7 @@ require (
|
||||
github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/azcertificates v1.1.0
|
||||
github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/azkeys v1.1.0
|
||||
github.com/BurntSushi/toml v1.3.2
|
||||
github.com/alexflint/go-filemutex v1.2.0
|
||||
github.com/aws/aws-sdk-go-v2/config v1.27.0
|
||||
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.15.0
|
||||
github.com/aws/aws-sdk-go-v2/service/kms v1.28.1
|
||||
|
2
go.sum
2
go.sum
@ -82,6 +82,8 @@ github.com/ProtonMail/gopenpgp/v2 v2.7.5 h1:STOY3vgES59gNgoOt2w0nyHBjKViB/qSg7Nj
|
||||
github.com/ProtonMail/gopenpgp/v2 v2.7.5/go.mod h1:IhkNEDaxec6NyzSI0PlxapinnwPVIESk8/76da3Ct3g=
|
||||
github.com/adrg/xdg v0.4.0 h1:RzRqFcjH4nE5C6oTAxhBtoE2IRyjBSa62SCbyPidvls=
|
||||
github.com/adrg/xdg v0.4.0/go.mod h1:N6ag73EX4wyxeaoeHctc1mas01KZgsj5tYiAIwqJE/E=
|
||||
github.com/alexflint/go-filemutex v1.2.0 h1:1v0TJPDtlhgpW4nJ+GvxCLSlUDC3+gW0CQQvlmfDR/s=
|
||||
github.com/alexflint/go-filemutex v1.2.0/go.mod h1:mYyQSWvw9Tx2/H2n9qXPb52tTYfE0pZAWcBq5mK025c=
|
||||
github.com/apparentlymart/go-cidr v1.1.0 h1:2mAhrMoF+nhXqxTzSZMUzDHkLjmIHC+Zzn4tdgBZjnU=
|
||||
github.com/apparentlymart/go-cidr v1.1.0/go.mod h1:EBcsNrHc3zQeuaeCeCtQruQm+n9/YjEn/vI25Lg7Gwc=
|
||||
github.com/armon/circbuf v0.0.0-20190214190532-5111143e8da2 h1:7Ip0wMmLHLRJdrloDxZfhMm0xrLXZS8+COSu2bXmEQs=
|
||||
|
@ -16,7 +16,9 @@ import (
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/alexflint/go-filemutex"
|
||||
"github.com/containernetworking/cni/libcni"
|
||||
"github.com/containernetworking/cni/pkg/types"
|
||||
types100 "github.com/containernetworking/cni/pkg/types/100"
|
||||
"github.com/containernetworking/plugins/pkg/ns"
|
||||
"github.com/containernetworking/plugins/pkg/testutils"
|
||||
@ -94,6 +96,39 @@ type tpm2Config struct {
|
||||
StateDir string
|
||||
}
|
||||
|
||||
// withCNIOperationLocked ensures that CNI operations don't run concurrently.
|
||||
//
|
||||
// There are race conditions in the CNI plugins that can cause a failure if called concurrently.
|
||||
func withCNIOperationLocked[T any](config *LaunchConfig, f func() (T, error)) (T, error) {
|
||||
var zeroT T
|
||||
|
||||
lock, err := filemutex.New(filepath.Join(config.StatePath, "cni.lock"))
|
||||
if err != nil {
|
||||
return zeroT, fmt.Errorf("failed to create CNI lock: %w", err)
|
||||
}
|
||||
|
||||
if err = lock.Lock(); err != nil {
|
||||
return zeroT, fmt.Errorf("failed to acquire CNI lock: %w", err)
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if err := lock.Close(); err != nil {
|
||||
log.Printf("failed to release CNI lock: %s", err)
|
||||
}
|
||||
}()
|
||||
|
||||
return f()
|
||||
}
|
||||
|
||||
// withCNIOperationLockedNoResult ensures that CNI operations don't run concurrently.
|
||||
func withCNIOperationLockedNoResult(config *LaunchConfig, f func() error) error {
|
||||
_, err := withCNIOperationLocked(config, func() (struct{}, error) {
|
||||
return struct{}{}, f()
|
||||
})
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
// withCNI creates network namespace, launches CNI and passes control to the next function
|
||||
// filling config with netNS and interface details.
|
||||
//
|
||||
@ -134,18 +169,33 @@ func withCNI(ctx context.Context, config *LaunchConfig, f func(config *LaunchCon
|
||||
}
|
||||
|
||||
// attempt to clean up network in case it was deployed previously
|
||||
err = cniConfig.DelNetworkList(ctx, config.NetworkConfig, &runtimeConf)
|
||||
err = withCNIOperationLockedNoResult(
|
||||
config,
|
||||
func() error {
|
||||
return cniConfig.DelNetworkList(ctx, config.NetworkConfig, &runtimeConf)
|
||||
},
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error deleting CNI network: %w", err)
|
||||
}
|
||||
|
||||
res, err := cniConfig.AddNetworkList(ctx, config.NetworkConfig, &runtimeConf)
|
||||
res, err := withCNIOperationLocked(
|
||||
config,
|
||||
func() (types.Result, error) {
|
||||
return cniConfig.AddNetworkList(ctx, config.NetworkConfig, &runtimeConf)
|
||||
},
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error provisioning CNI network: %w", err)
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if e := cniConfig.DelNetworkList(ctx, config.NetworkConfig, &runtimeConf); e != nil {
|
||||
if e := withCNIOperationLockedNoResult(
|
||||
config,
|
||||
func() error {
|
||||
return cniConfig.DelNetworkList(ctx, config.NetworkConfig, &runtimeConf)
|
||||
},
|
||||
); e != nil {
|
||||
log.Printf("error cleaning up CNI: %s", e)
|
||||
}
|
||||
}()
|
||||
|
Loading…
x
Reference in New Issue
Block a user