fix: workaround a race in CNI setup (talosctl cluster create)

When provisioning VMs, each launch process sets up CNI network, and from
time to time CNI setup fails with something like:

```
error provisioning CNI network: plugin type="firewall" failed (add): running [/sbin/iptables -t filter -N CNI-ADMIN --wait]: exit status 4: iptables v1.8.10 (nf_tables)
```

This a race condition in the CNI plugins, and it looks like there is no
fix for it (see e.g. https://github.com/hashicorp/nomad/issues/8838).

As a workaround, take a mutex around CNI operation to serialize them.
CNI setup happens in different processes, so use a file-based mutex.

Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
This commit is contained in:
Andrey Smirnov 2024-02-27 15:28:12 +04:00
parent 457507803d
commit b2ad5dc5f8
No known key found for this signature in database
GPG Key ID: FE042E3D4085A811
3 changed files with 56 additions and 3 deletions

1
go.mod
View File

@ -34,6 +34,7 @@ require (
github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/azcertificates v1.1.0
github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/azkeys v1.1.0
github.com/BurntSushi/toml v1.3.2
github.com/alexflint/go-filemutex v1.2.0
github.com/aws/aws-sdk-go-v2/config v1.27.0
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.15.0
github.com/aws/aws-sdk-go-v2/service/kms v1.28.1

2
go.sum
View File

@ -82,6 +82,8 @@ github.com/ProtonMail/gopenpgp/v2 v2.7.5 h1:STOY3vgES59gNgoOt2w0nyHBjKViB/qSg7Nj
github.com/ProtonMail/gopenpgp/v2 v2.7.5/go.mod h1:IhkNEDaxec6NyzSI0PlxapinnwPVIESk8/76da3Ct3g=
github.com/adrg/xdg v0.4.0 h1:RzRqFcjH4nE5C6oTAxhBtoE2IRyjBSa62SCbyPidvls=
github.com/adrg/xdg v0.4.0/go.mod h1:N6ag73EX4wyxeaoeHctc1mas01KZgsj5tYiAIwqJE/E=
github.com/alexflint/go-filemutex v1.2.0 h1:1v0TJPDtlhgpW4nJ+GvxCLSlUDC3+gW0CQQvlmfDR/s=
github.com/alexflint/go-filemutex v1.2.0/go.mod h1:mYyQSWvw9Tx2/H2n9qXPb52tTYfE0pZAWcBq5mK025c=
github.com/apparentlymart/go-cidr v1.1.0 h1:2mAhrMoF+nhXqxTzSZMUzDHkLjmIHC+Zzn4tdgBZjnU=
github.com/apparentlymart/go-cidr v1.1.0/go.mod h1:EBcsNrHc3zQeuaeCeCtQruQm+n9/YjEn/vI25Lg7Gwc=
github.com/armon/circbuf v0.0.0-20190214190532-5111143e8da2 h1:7Ip0wMmLHLRJdrloDxZfhMm0xrLXZS8+COSu2bXmEQs=

View File

@ -16,7 +16,9 @@ import (
"strconv"
"strings"
"github.com/alexflint/go-filemutex"
"github.com/containernetworking/cni/libcni"
"github.com/containernetworking/cni/pkg/types"
types100 "github.com/containernetworking/cni/pkg/types/100"
"github.com/containernetworking/plugins/pkg/ns"
"github.com/containernetworking/plugins/pkg/testutils"
@ -94,6 +96,39 @@ type tpm2Config struct {
StateDir string
}
// withCNIOperationLocked ensures that CNI operations don't run concurrently.
//
// There are race conditions in the CNI plugins that can cause a failure if called concurrently.
func withCNIOperationLocked[T any](config *LaunchConfig, f func() (T, error)) (T, error) {
var zeroT T
lock, err := filemutex.New(filepath.Join(config.StatePath, "cni.lock"))
if err != nil {
return zeroT, fmt.Errorf("failed to create CNI lock: %w", err)
}
if err = lock.Lock(); err != nil {
return zeroT, fmt.Errorf("failed to acquire CNI lock: %w", err)
}
defer func() {
if err := lock.Close(); err != nil {
log.Printf("failed to release CNI lock: %s", err)
}
}()
return f()
}
// withCNIOperationLockedNoResult ensures that CNI operations don't run concurrently.
func withCNIOperationLockedNoResult(config *LaunchConfig, f func() error) error {
_, err := withCNIOperationLocked(config, func() (struct{}, error) {
return struct{}{}, f()
})
return err
}
// withCNI creates network namespace, launches CNI and passes control to the next function
// filling config with netNS and interface details.
//
@ -134,18 +169,33 @@ func withCNI(ctx context.Context, config *LaunchConfig, f func(config *LaunchCon
}
// attempt to clean up network in case it was deployed previously
err = cniConfig.DelNetworkList(ctx, config.NetworkConfig, &runtimeConf)
err = withCNIOperationLockedNoResult(
config,
func() error {
return cniConfig.DelNetworkList(ctx, config.NetworkConfig, &runtimeConf)
},
)
if err != nil {
return fmt.Errorf("error deleting CNI network: %w", err)
}
res, err := cniConfig.AddNetworkList(ctx, config.NetworkConfig, &runtimeConf)
res, err := withCNIOperationLocked(
config,
func() (types.Result, error) {
return cniConfig.AddNetworkList(ctx, config.NetworkConfig, &runtimeConf)
},
)
if err != nil {
return fmt.Errorf("error provisioning CNI network: %w", err)
}
defer func() {
if e := cniConfig.DelNetworkList(ctx, config.NetworkConfig, &runtimeConf); e != nil {
if e := withCNIOperationLockedNoResult(
config,
func() error {
return cniConfig.DelNetworkList(ctx, config.NetworkConfig, &runtimeConf)
},
); e != nil {
log.Printf("error cleaning up CNI: %s", e)
}
}()