fix: audit and fix cgroup reservations
Fixes: #7081 Review all reservations and limits set, test under stress load (using both memory and CPU). The goal: system components (Talos itself) and runtime (kubelet, CRI) should survive under extreme resource starvation (workloads consuming all CPU/memory). Uses #9337 to visualize changes, but doesn't depend on it. Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com> (cherry picked from commit 6b15ca19cd1291b8a245d72d5153827945cad037)
This commit is contained in:
parent
c8dedbe116
commit
920d8c8297
1
go.mod
1
go.mod
@ -80,6 +80,7 @@ require (
|
||||
github.com/gizak/termui/v3 v3.1.0
|
||||
github.com/godbus/dbus/v5 v5.1.0
|
||||
github.com/golang/mock v1.6.0
|
||||
github.com/google/cadvisor v0.50.0
|
||||
github.com/google/go-containerregistry v0.20.2
|
||||
github.com/google/go-tpm v0.9.1
|
||||
github.com/google/nftables v0.2.0
|
||||
|
2
go.sum
2
go.sum
@ -286,6 +286,8 @@ github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6
|
||||
github.com/golang/snappy v0.0.3/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
|
||||
github.com/google/btree v1.1.2 h1:xf4v41cLI2Z6FxbKm+8Bu+m8ifhj15JuZ9sa0jZCMUU=
|
||||
github.com/google/btree v1.1.2/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4=
|
||||
github.com/google/cadvisor v0.50.0 h1:7w/hKIbJKBWqQsRTy+Hpj2vj+fnxrLXcEXFy+LW0Bsg=
|
||||
github.com/google/cadvisor v0.50.0/go.mod h1:VxCDwZalpFyENvmfabFqaIGsqNKLtDzE62a19rfVTB8=
|
||||
github.com/google/cel-go v0.21.0 h1:cl6uW/gxN+Hy50tNYvI691+sXxioCnstFzLp2WO4GCI=
|
||||
github.com/google/cel-go v0.21.0/go.mod h1:rHUlWCcBKgyEk+eV03RPdZUekPp6YcJwV0FxuUksYxc=
|
||||
github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I=
|
||||
|
@ -27,8 +27,10 @@ import (
|
||||
v1alpha1runtime "github.com/siderolabs/talos/internal/app/machined/pkg/runtime"
|
||||
"github.com/siderolabs/talos/internal/pkg/cgroup"
|
||||
"github.com/siderolabs/talos/pkg/argsbuilder"
|
||||
"github.com/siderolabs/talos/pkg/machinery/config/machine"
|
||||
"github.com/siderolabs/talos/pkg/machinery/constants"
|
||||
"github.com/siderolabs/talos/pkg/machinery/kubelet"
|
||||
"github.com/siderolabs/talos/pkg/machinery/resources/config"
|
||||
"github.com/siderolabs/talos/pkg/machinery/resources/k8s"
|
||||
)
|
||||
|
||||
@ -63,6 +65,12 @@ func (ctrl *KubeletSpecController) Inputs() []controller.Input {
|
||||
ID: optional.Some(k8s.KubeletID),
|
||||
Kind: controller.InputWeak,
|
||||
},
|
||||
{
|
||||
Namespace: config.NamespaceName,
|
||||
Type: config.MachineTypeType,
|
||||
ID: optional.Some(config.MachineTypeID),
|
||||
Kind: controller.InputWeak,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
@ -100,6 +108,15 @@ func (ctrl *KubeletSpecController) Run(ctx context.Context, r controller.Runtime
|
||||
|
||||
kubeletVersion := compatibility.VersionFromImageRef(cfgSpec.Image)
|
||||
|
||||
machineType, err := safe.ReaderGetByID[*config.MachineType](ctx, r, config.MachineTypeID)
|
||||
if err != nil {
|
||||
if state.IsNotFoundError(err) {
|
||||
continue
|
||||
}
|
||||
|
||||
return fmt.Errorf("error getting machine type: %w", err)
|
||||
}
|
||||
|
||||
nodename, err := safe.ReaderGetByID[*k8s.Nodename](ctx, r, k8s.NodenameID)
|
||||
if err != nil {
|
||||
if state.IsNotFoundError(err) {
|
||||
@ -173,7 +190,7 @@ func (ctrl *KubeletSpecController) Run(ctx context.Context, r controller.Runtime
|
||||
args["image-credential-provider-config"] = constants.KubeletCredentialProviderConfig
|
||||
}
|
||||
|
||||
kubeletConfig, err := NewKubeletConfiguration(cfgSpec, kubeletVersion)
|
||||
kubeletConfig, err := NewKubeletConfiguration(cfgSpec, kubeletVersion, machineType.MachineType())
|
||||
if err != nil {
|
||||
return fmt.Errorf("error creating kubelet configuration: %w", err)
|
||||
}
|
||||
@ -242,7 +259,7 @@ func prepareExtraConfig(extraConfig map[string]any) (*kubeletconfig.KubeletConfi
|
||||
// NewKubeletConfiguration builds kubelet configuration with defaults and overrides from extraConfig.
|
||||
//
|
||||
//nolint:gocyclo,cyclop
|
||||
func NewKubeletConfiguration(cfgSpec *k8s.KubeletConfigSpec, kubeletVersion compatibility.Version) (*kubeletconfig.KubeletConfiguration, error) {
|
||||
func NewKubeletConfiguration(cfgSpec *k8s.KubeletConfigSpec, kubeletVersion compatibility.Version, machineType machine.Type) (*kubeletconfig.KubeletConfiguration, error) {
|
||||
config, err := prepareExtraConfig(cfgSpec.ExtraConfig)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@ -333,10 +350,15 @@ func NewKubeletConfiguration(cfgSpec *k8s.KubeletConfigSpec, kubeletVersion comp
|
||||
if len(config.SystemReserved) == 0 {
|
||||
config.SystemReserved = map[string]string{
|
||||
"cpu": constants.KubeletSystemReservedCPU,
|
||||
"memory": constants.KubeletSystemReservedMemory,
|
||||
"pid": constants.KubeletSystemReservedPid,
|
||||
"ephemeral-storage": constants.KubeletSystemReservedEphemeralStorage,
|
||||
}
|
||||
|
||||
if machineType.IsControlPlane() {
|
||||
config.SystemReserved["memory"] = constants.KubeletSystemReservedMemoryControlPlane
|
||||
} else {
|
||||
config.SystemReserved["memory"] = constants.KubeletSystemReservedMemoryWorker
|
||||
}
|
||||
}
|
||||
|
||||
if config.Logging.Format == "" {
|
||||
|
@ -25,7 +25,9 @@ import (
|
||||
|
||||
"github.com/siderolabs/talos/internal/app/machined/pkg/controllers/ctest"
|
||||
k8sctrl "github.com/siderolabs/talos/internal/app/machined/pkg/controllers/k8s"
|
||||
"github.com/siderolabs/talos/pkg/machinery/config/machine"
|
||||
"github.com/siderolabs/talos/pkg/machinery/constants"
|
||||
"github.com/siderolabs/talos/pkg/machinery/resources/config"
|
||||
"github.com/siderolabs/talos/pkg/machinery/resources/k8s"
|
||||
)
|
||||
|
||||
@ -60,6 +62,10 @@ func (suite *KubeletSpecSuite) TestReconcileDefault() {
|
||||
|
||||
suite.Require().NoError(suite.State().Create(suite.Ctx(), nodename))
|
||||
|
||||
machineType := config.NewMachineType()
|
||||
machineType.SetMachineType(machine.TypeWorker)
|
||||
suite.Require().NoError(suite.State().Create(suite.Ctx(), machineType))
|
||||
|
||||
rtestutils.AssertResources(suite.Ctx(), suite.T(), suite.State(), []resource.ID{k8s.KubeletID}, func(kubeletSpec *k8s.KubeletSpec, asrt *assert.Assertions) {
|
||||
spec := kubeletSpec.TypedSpec()
|
||||
|
||||
@ -97,6 +103,10 @@ func (suite *KubeletSpecSuite) TestReconcileWithExplicitNodeIP() {
|
||||
|
||||
suite.Require().NoError(suite.State().Create(suite.Ctx(), nodename))
|
||||
|
||||
machineType := config.NewMachineType()
|
||||
machineType.SetMachineType(machine.TypeWorker)
|
||||
suite.Require().NoError(suite.State().Create(suite.Ctx(), machineType))
|
||||
|
||||
rtestutils.AssertResources(suite.Ctx(), suite.T(), suite.State(), []resource.ID{k8s.KubeletID}, func(kubeletSpec *k8s.KubeletSpec, asrt *assert.Assertions) {
|
||||
spec := kubeletSpec.TypedSpec()
|
||||
|
||||
@ -114,7 +124,7 @@ func (suite *KubeletSpecSuite) TestReconcileWithExplicitNodeIP() {
|
||||
})
|
||||
}
|
||||
|
||||
func (suite *KubeletSpecSuite) TestReconcileWithContainerRuntimeEnpointFlag() {
|
||||
func (suite *KubeletSpecSuite) TestReconcileWithContainerRuntimeEndpointFlag() {
|
||||
cfg := k8s.NewKubeletConfig(k8s.NamespaceName, k8s.KubeletID)
|
||||
cfg.TypedSpec().Image = "kubelet:v1.25.0"
|
||||
cfg.TypedSpec().ClusterDNS = []string{"10.96.0.10"}
|
||||
@ -128,6 +138,10 @@ func (suite *KubeletSpecSuite) TestReconcileWithContainerRuntimeEnpointFlag() {
|
||||
|
||||
suite.Require().NoError(suite.State().Create(suite.Ctx(), nodename))
|
||||
|
||||
machineType := config.NewMachineType()
|
||||
machineType.SetMachineType(machine.TypeWorker)
|
||||
suite.Require().NoError(suite.State().Create(suite.Ctx(), machineType))
|
||||
|
||||
rtestutils.AssertResources(suite.Ctx(), suite.T(), suite.State(), []resource.ID{k8s.KubeletID}, func(kubeletSpec *k8s.KubeletSpec, asrt *assert.Assertions) {
|
||||
spec := kubeletSpec.TypedSpec()
|
||||
|
||||
@ -180,6 +194,10 @@ func (suite *KubeletSpecSuite) TestReconcileWithExtraConfig() {
|
||||
|
||||
suite.Require().NoError(suite.State().Create(suite.Ctx(), nodeIP))
|
||||
|
||||
machineType := config.NewMachineType()
|
||||
machineType.SetMachineType(machine.TypeWorker)
|
||||
suite.Require().NoError(suite.State().Create(suite.Ctx(), machineType))
|
||||
|
||||
rtestutils.AssertResources(suite.Ctx(), suite.T(), suite.State(), []resource.ID{k8s.KubeletID}, func(kubeletSpec *k8s.KubeletSpec, asrt *assert.Assertions) {
|
||||
spec := kubeletSpec.TypedSpec()
|
||||
|
||||
@ -219,6 +237,10 @@ func (suite *KubeletSpecSuite) TestReconcileWithSkipNodeRegistration() {
|
||||
|
||||
suite.Require().NoError(suite.State().Create(suite.Ctx(), nodeIP))
|
||||
|
||||
machineType := config.NewMachineType()
|
||||
machineType.SetMachineType(machine.TypeWorker)
|
||||
suite.Require().NoError(suite.State().Create(suite.Ctx(), machineType))
|
||||
|
||||
rtestutils.AssertResources(suite.Ctx(), suite.T(), suite.State(), []resource.ID{k8s.KubeletID}, func(kubeletSpec *k8s.KubeletSpec, asrt *assert.Assertions) {
|
||||
spec := kubeletSpec.TypedSpec()
|
||||
|
||||
@ -307,7 +329,7 @@ func TestNewKubeletConfigurationFail(t *testing.T) {
|
||||
tt.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
_, err := k8sctrl.NewKubeletConfiguration(tt.cfgSpec, compatibility.VersionFromImageRef(""))
|
||||
_, err := k8sctrl.NewKubeletConfiguration(tt.cfgSpec, compatibility.VersionFromImageRef(""), machine.TypeWorker)
|
||||
require.Error(t, err)
|
||||
|
||||
assert.EqualError(t, err, tt.expectedErr)
|
||||
@ -352,7 +374,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
|
||||
FailSwapOn: pointer.To(false),
|
||||
SystemReserved: map[string]string{
|
||||
"cpu": constants.KubeletSystemReservedCPU,
|
||||
"memory": constants.KubeletSystemReservedMemory,
|
||||
"memory": constants.KubeletSystemReservedMemoryWorker,
|
||||
"pid": constants.KubeletSystemReservedPid,
|
||||
"ephemeral-storage": constants.KubeletSystemReservedEphemeralStorage,
|
||||
},
|
||||
@ -373,6 +395,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
|
||||
cfgSpec *k8s.KubeletConfigSpec
|
||||
kubeletVersion compatibility.Version
|
||||
expectedOverrides func(*kubeletconfig.KubeletConfiguration)
|
||||
machineType machine.Type
|
||||
}{
|
||||
{
|
||||
name: "override some",
|
||||
@ -389,6 +412,19 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
|
||||
kc.OOMScoreAdj = pointer.To[int32](-300)
|
||||
kc.EnableDebuggingHandlers = pointer.To(true)
|
||||
},
|
||||
machineType: machine.TypeWorker,
|
||||
},
|
||||
{
|
||||
name: "controlplane",
|
||||
cfgSpec: &k8s.KubeletConfigSpec{
|
||||
ClusterDNS: []string{"10.0.0.5"},
|
||||
ClusterDomain: "cluster.local",
|
||||
},
|
||||
kubeletVersion: compatibility.VersionFromImageRef("ghcr.io/siderolabs/kubelet:v1.29.0"),
|
||||
expectedOverrides: func(kc *kubeletconfig.KubeletConfiguration) {
|
||||
kc.SystemReserved["memory"] = constants.KubeletSystemReservedMemoryControlPlane
|
||||
},
|
||||
machineType: machine.TypeControlPlane,
|
||||
},
|
||||
{
|
||||
name: "disable graceful shutdown",
|
||||
@ -405,6 +441,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
|
||||
kc.ShutdownGracePeriod = metav1.Duration{}
|
||||
kc.ShutdownGracePeriodCriticalPods = metav1.Duration{}
|
||||
},
|
||||
machineType: machine.TypeWorker,
|
||||
},
|
||||
{
|
||||
name: "enable seccomp default",
|
||||
@ -417,6 +454,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
|
||||
expectedOverrides: func(kc *kubeletconfig.KubeletConfiguration) {
|
||||
kc.SeccompDefault = pointer.To(true)
|
||||
},
|
||||
machineType: machine.TypeWorker,
|
||||
},
|
||||
{
|
||||
name: "enable skipNodeRegistration",
|
||||
@ -430,6 +468,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
|
||||
kc.Authentication.Webhook.Enabled = pointer.To(false)
|
||||
kc.Authorization.Mode = kubeletconfig.KubeletAuthorizationModeAlwaysAllow
|
||||
},
|
||||
machineType: machine.TypeWorker,
|
||||
},
|
||||
{
|
||||
name: "disable manifests directory",
|
||||
@ -442,6 +481,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
|
||||
expectedOverrides: func(kc *kubeletconfig.KubeletConfiguration) {
|
||||
kc.StaticPodPath = ""
|
||||
},
|
||||
machineType: machine.TypeWorker,
|
||||
},
|
||||
{
|
||||
name: "enable local FS quota monitoring",
|
||||
@ -456,19 +496,20 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
|
||||
"LocalStorageCapacityIsolationFSQuotaMonitoring": true,
|
||||
}
|
||||
},
|
||||
machineType: machine.TypeWorker,
|
||||
},
|
||||
} {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
expected := defaultKubeletConfig
|
||||
tt.expectedOverrides(&expected)
|
||||
expected := defaultKubeletConfig.DeepCopy()
|
||||
tt.expectedOverrides(expected)
|
||||
|
||||
config, err := k8sctrl.NewKubeletConfiguration(tt.cfgSpec, tt.kubeletVersion)
|
||||
config, err := k8sctrl.NewKubeletConfiguration(tt.cfgSpec, tt.kubeletVersion, tt.machineType)
|
||||
|
||||
require.NoError(t, err)
|
||||
|
||||
assert.Equal(t, &expected, config)
|
||||
assert.Equal(t, expected, config)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -182,6 +182,9 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri
|
||||
Min: pointer.To[int64](constants.CgroupInitReservedMemory),
|
||||
Low: pointer.To[int64](constants.CgroupInitReservedMemory * 2),
|
||||
},
|
||||
CPU: &cgroup2.CPU{
|
||||
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupInitMillicores))),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
@ -191,15 +194,42 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri
|
||||
Min: pointer.To[int64](constants.CgroupSystemReservedMemory),
|
||||
Low: pointer.To[int64](constants.CgroupSystemReservedMemory * 2),
|
||||
},
|
||||
CPU: &cgroup2.CPU{
|
||||
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupSystemMillicores))),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: constants.CgroupSystemRuntime,
|
||||
resources: &cgroup2.Resources{},
|
||||
name: constants.CgroupSystemRuntime,
|
||||
resources: &cgroup2.Resources{
|
||||
Memory: &cgroup2.Memory{
|
||||
Min: pointer.To[int64](constants.CgroupSystemRuntimeReservedMemory),
|
||||
Low: pointer.To[int64](constants.CgroupSystemRuntimeReservedMemory * 2),
|
||||
},
|
||||
CPU: &cgroup2.CPU{
|
||||
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupSystemRuntimeMillicores))),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: constants.CgroupUdevd,
|
||||
resources: &cgroup2.Resources{},
|
||||
name: constants.CgroupUdevd,
|
||||
resources: &cgroup2.Resources{
|
||||
Memory: &cgroup2.Memory{
|
||||
Min: pointer.To[int64](constants.CgroupUdevdReservedMemory),
|
||||
Low: pointer.To[int64](constants.CgroupUdevdReservedMemory * 2),
|
||||
},
|
||||
CPU: &cgroup2.CPU{
|
||||
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupUdevdMillicores))),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: constants.CgroupPodRuntimeRoot,
|
||||
resources: &cgroup2.Resources{
|
||||
CPU: &cgroup2.CPU{
|
||||
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupPodRuntimeRootMillicores))),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: constants.CgroupPodRuntime,
|
||||
@ -208,6 +238,9 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri
|
||||
Min: pointer.To[int64](constants.CgroupPodRuntimeReservedMemory),
|
||||
Low: pointer.To[int64](constants.CgroupPodRuntimeReservedMemory * 2),
|
||||
},
|
||||
CPU: &cgroup2.CPU{
|
||||
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupPodRuntimeMillicores))),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
@ -217,14 +250,45 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri
|
||||
Min: pointer.To[int64](constants.CgroupKubeletReservedMemory),
|
||||
Low: pointer.To[int64](constants.CgroupKubeletReservedMemory * 2),
|
||||
},
|
||||
CPU: &cgroup2.CPU{
|
||||
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupKubeletMillicores))),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: constants.CgroupDashboard,
|
||||
resources: &cgroup2.Resources{
|
||||
Memory: &cgroup2.Memory{
|
||||
Min: pointer.To[int64](constants.CgroupDashboardReservedMemory),
|
||||
Low: pointer.To[int64](constants.CgroupDashboardLowMemory),
|
||||
Max: pointer.To[int64](constants.CgroupDashboardMaxMemory),
|
||||
},
|
||||
CPU: &cgroup2.CPU{
|
||||
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupDashboardMillicores))),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: constants.CgroupApid,
|
||||
resources: &cgroup2.Resources{
|
||||
Memory: &cgroup2.Memory{
|
||||
Min: pointer.To[int64](constants.CgroupApidReservedMemory),
|
||||
Low: pointer.To[int64](constants.CgroupApidReservedMemory * 2),
|
||||
Max: pointer.To[int64](constants.CgroupApidMaxMemory),
|
||||
},
|
||||
CPU: &cgroup2.CPU{
|
||||
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupApidMillicores))),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: constants.CgroupTrustd,
|
||||
resources: &cgroup2.Resources{
|
||||
Memory: &cgroup2.Memory{
|
||||
Min: pointer.To[int64](constants.CgroupTrustdReservedMemory),
|
||||
Low: pointer.To[int64](constants.CgroupTrustdReservedMemory * 2),
|
||||
Max: pointer.To[int64](constants.CgroupTrustdMaxMemory),
|
||||
},
|
||||
CPU: &cgroup2.CPU{
|
||||
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupTrustdMillicores))),
|
||||
},
|
||||
},
|
||||
},
|
||||
|
@ -13,18 +13,6 @@ import (
|
||||
specs "github.com/opencontainers/runtime-spec/specs-go"
|
||||
)
|
||||
|
||||
// WithMemoryLimit sets the linux resource memory limit field.
|
||||
func WithMemoryLimit(limit int64) oci.SpecOpts {
|
||||
return func(_ context.Context, _ oci.Client, _ *containers.Container, s *specs.Spec) error {
|
||||
s.Linux.Resources.Memory = &specs.LinuxMemory{
|
||||
Limit: &limit,
|
||||
// DisableOOMKiller: &disable,
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// WithRootfsPropagation sets the root filesystem propagation.
|
||||
func WithRootfsPropagation(rp string) oci.SpecOpts {
|
||||
return func(_ context.Context, _ oci.Client, _ *containers.Container, s *specs.Spec) error {
|
||||
|
@ -6,15 +6,18 @@
|
||||
package runner
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"time"
|
||||
|
||||
containerd "github.com/containerd/containerd/v2/client"
|
||||
ocicontainers "github.com/containerd/containerd/v2/core/containers"
|
||||
"github.com/containerd/containerd/v2/pkg/oci"
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
"github.com/siderolabs/gen/maps"
|
||||
"github.com/siderolabs/gen/optional"
|
||||
"github.com/siderolabs/go-pointer"
|
||||
|
||||
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime"
|
||||
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime/logging"
|
||||
@ -220,3 +223,20 @@ func WithUID(uid uint32) Option {
|
||||
args.UID = uid
|
||||
}
|
||||
}
|
||||
|
||||
// WithMemoryReservation sets the memory reservation limit as on OCI spec.
|
||||
func WithMemoryReservation(limit uint64) oci.SpecOpts {
|
||||
return func(_ context.Context, _ oci.Client, _ *ocicontainers.Container, s *oci.Spec) error {
|
||||
if s.Linux.Resources == nil {
|
||||
s.Linux.Resources = &specs.LinuxResources{}
|
||||
}
|
||||
|
||||
if s.Linux.Resources.Memory == nil {
|
||||
s.Linux.Resources.Memory = &specs.LinuxMemory{}
|
||||
}
|
||||
|
||||
s.Linux.Resources.Memory.Reservation = pointer.To(int64(limit))
|
||||
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
@ -12,6 +12,7 @@ import (
|
||||
"net"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/containerd/containerd/v2/pkg/cap"
|
||||
@ -164,6 +165,7 @@ func (o *APID) Runner(r runtime.Runtime) (runner.Runner, error) {
|
||||
|
||||
env := []string{
|
||||
constants.TcellMinimizeEnvironment,
|
||||
"GOMEMLIMIT=" + strconv.Itoa(constants.CgroupApidMaxMemory/5*4),
|
||||
}
|
||||
|
||||
for _, value := range environment.Get(r.Config()) {
|
||||
|
@ -8,6 +8,7 @@ package services
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strconv"
|
||||
|
||||
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime"
|
||||
"github.com/siderolabs/talos/internal/app/machined/pkg/system/events"
|
||||
@ -61,6 +62,7 @@ func (d *Dashboard) Runner(r runtime.Runtime) (runner.Runner, error) {
|
||||
runner.WithEnv([]string{
|
||||
"TERM=linux",
|
||||
constants.TcellMinimizeEnvironment,
|
||||
"GOMEMLIMIT=" + strconv.Itoa(constants.CgroupDashboardMaxMemory/5*4),
|
||||
}),
|
||||
runner.WithStdinFile(tty),
|
||||
runner.WithStdoutFile(tty),
|
||||
|
@ -36,6 +36,7 @@ import (
|
||||
"github.com/siderolabs/talos/internal/app/machined/pkg/system/runner"
|
||||
"github.com/siderolabs/talos/internal/app/machined/pkg/system/runner/containerd"
|
||||
"github.com/siderolabs/talos/internal/app/machined/pkg/system/runner/restart"
|
||||
"github.com/siderolabs/talos/internal/pkg/cgroup"
|
||||
"github.com/siderolabs/talos/internal/pkg/containers/image"
|
||||
"github.com/siderolabs/talos/internal/pkg/environment"
|
||||
"github.com/siderolabs/talos/internal/pkg/etcd"
|
||||
@ -224,6 +225,8 @@ func (e *Etcd) Runner(r runtime.Runtime) (runner.Runner, error) {
|
||||
oci.WithHostNamespace(specs.NetworkNamespace),
|
||||
oci.WithMounts(mounts),
|
||||
oci.WithUser(fmt.Sprintf("%d:%d", constants.EtcdUserID, constants.EtcdUserID)),
|
||||
runner.WithMemoryReservation(constants.CgroupEtcdReservedMemory),
|
||||
oci.WithCPUShares(uint64(cgroup.MilliCoresToShares(constants.CgroupEtcdMillicores))),
|
||||
),
|
||||
runner.WithOOMScoreAdj(-998),
|
||||
),
|
||||
|
@ -12,6 +12,7 @@ import (
|
||||
"net"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
|
||||
"github.com/containerd/containerd/v2/pkg/cap"
|
||||
"github.com/containerd/containerd/v2/pkg/oci"
|
||||
@ -142,7 +143,10 @@ func (t *Trustd) Runner(r runtime.Runtime) (runner.Runner, error) {
|
||||
}
|
||||
|
||||
env := environment.Get(r.Config())
|
||||
env = append(env, constants.TcellMinimizeEnvironment)
|
||||
env = append(env,
|
||||
constants.TcellMinimizeEnvironment,
|
||||
"GOMEMLIMIT="+strconv.Itoa(constants.CgroupTrustdMaxMemory/5*4),
|
||||
)
|
||||
|
||||
if debug.RaceEnabled {
|
||||
env = append(env, "GORACE=halt_on_error=1")
|
||||
@ -156,7 +160,6 @@ func (t *Trustd) Runner(r runtime.Runtime) (runner.Runner, error) {
|
||||
runner.WithEnv(env),
|
||||
runner.WithCgroupPath(constants.CgroupTrustd),
|
||||
runner.WithOCISpecOpts(
|
||||
containerd.WithMemoryLimit(int64(1000000*512)),
|
||||
oci.WithDroppedCapabilities(cap.Known()),
|
||||
oci.WithHostNamespace(specs.NetworkNamespace),
|
||||
oci.WithMounts(mounts),
|
||||
|
52
internal/pkg/cgroup/cpu.go
Normal file
52
internal/pkg/cgroup/cpu.go
Normal file
@ -0,0 +1,52 @@
|
||||
// This Source Code Form is subject to the terms of the Mozilla Public
|
||||
// License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
package cgroup
|
||||
|
||||
import (
|
||||
"runtime"
|
||||
"sync"
|
||||
|
||||
"github.com/google/cadvisor/utils/sysfs"
|
||||
"github.com/google/cadvisor/utils/sysinfo"
|
||||
)
|
||||
|
||||
var availableCPUCores = sync.OnceValue(func() int {
|
||||
_, cores, err := sysinfo.GetNodesInfo(sysfs.NewRealSysFs())
|
||||
if err != nil || cores < 1 {
|
||||
return runtime.NumCPU()
|
||||
}
|
||||
|
||||
return cores
|
||||
})
|
||||
|
||||
// MilliCores represents a CPU value in milli-cores.
|
||||
type MilliCores uint
|
||||
|
||||
// AvailableMilliCores returns the number of available CPU cores in milli-cores.
|
||||
func AvailableMilliCores() MilliCores {
|
||||
return MilliCores(availableCPUCores()) * 1000
|
||||
}
|
||||
|
||||
// CPUShare represents a CPU share value.
|
||||
type CPUShare uint64
|
||||
|
||||
// MilliCoresToShares converts milli-cores to CPU shares.
|
||||
func MilliCoresToShares(milliCores MilliCores) CPUShare {
|
||||
return CPUShare(milliCores) * 1024 / 1000
|
||||
}
|
||||
|
||||
// SharesToCPUWeight converts CPU shares to CPU weight.
|
||||
func SharesToCPUWeight(shares CPUShare) uint64 {
|
||||
return uint64((((shares - 2) * 9999) / 262142) + 1)
|
||||
}
|
||||
|
||||
// MillicoresToCPUWeight converts milli-cores to CPU weight.
|
||||
//
|
||||
// It limits millicores to available CPU cores.
|
||||
func MillicoresToCPUWeight(requested MilliCores) uint64 {
|
||||
requested = min(requested, AvailableMilliCores())
|
||||
|
||||
return SharesToCPUWeight(MilliCoresToShares(requested))
|
||||
}
|
38
internal/pkg/cgroup/cpu_test.go
Normal file
38
internal/pkg/cgroup/cpu_test.go
Normal file
@ -0,0 +1,38 @@
|
||||
// This Source Code Form is subject to the terms of the Mozilla Public
|
||||
// License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
package cgroup_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
|
||||
"github.com/siderolabs/talos/internal/pkg/cgroup"
|
||||
)
|
||||
|
||||
func TestAvailableMillicores(t *testing.T) {
|
||||
t.Logf("Available CPU milli-cores: %d", cgroup.AvailableMilliCores())
|
||||
|
||||
assert.GreaterOrEqual(t, cgroup.AvailableMilliCores(), cgroup.MilliCores(1000))
|
||||
}
|
||||
|
||||
func TestMillicoresToShares(t *testing.T) {
|
||||
assert.Equal(t, cgroup.CPUShare(102), cgroup.MilliCoresToShares(100))
|
||||
assert.Equal(t, cgroup.CPUShare(1024), cgroup.MilliCoresToShares(1000))
|
||||
assert.Equal(t, cgroup.CPUShare(2560), cgroup.MilliCoresToShares(2500))
|
||||
}
|
||||
|
||||
func TestSharesToCPUWeight(t *testing.T) {
|
||||
assert.Equal(t, uint64(4), cgroup.SharesToCPUWeight(102))
|
||||
assert.Equal(t, uint64(79), cgroup.SharesToCPUWeight(2048))
|
||||
assert.Equal(t, uint64(313), cgroup.SharesToCPUWeight(8192))
|
||||
}
|
||||
|
||||
func TestMillicoresToCPUWeight(t *testing.T) {
|
||||
// depends on number of CPUs available, but for < 1000 millicores it should be same result
|
||||
assert.Equal(t, uint64(4), cgroup.MillicoresToCPUWeight(100))
|
||||
assert.Equal(t, uint64(20), cgroup.MillicoresToCPUWeight(500))
|
||||
assert.Equal(t, uint64(39), cgroup.MillicoresToCPUWeight(1000))
|
||||
}
|
@ -32,7 +32,7 @@ func CGroupMountPoints() (mountpoints *Points, err error) {
|
||||
func cgroupMountPointsV2() (mountpoints *Points, err error) {
|
||||
cgroups := NewMountPoints()
|
||||
|
||||
cgroups.Set("cgroup2", NewMountPoint("cgroup", constants.CgroupMountPath, "cgroup2", unix.MS_NOSUID|unix.MS_NODEV|unix.MS_NOEXEC|unix.MS_RELATIME, "nsdelegate"))
|
||||
cgroups.Set("cgroup2", NewMountPoint("cgroup", constants.CgroupMountPath, "cgroup2", unix.MS_NOSUID|unix.MS_NODEV|unix.MS_NOEXEC|unix.MS_RELATIME, "nsdelegate,memory_recursiveprot"))
|
||||
|
||||
return cgroups, nil
|
||||
}
|
||||
|
@ -402,8 +402,11 @@ const (
|
||||
// KubeletSystemReservedCPU cpu system reservation value for kubelet kubeconfig.
|
||||
KubeletSystemReservedCPU = "50m"
|
||||
|
||||
// KubeletSystemReservedMemory memory system reservation value for kubelet kubeconfig.
|
||||
KubeletSystemReservedMemory = "192Mi"
|
||||
// KubeletSystemReservedMemoryControlPlane is the memory system reservation value for the kubelet configuration (controlplane nodes).
|
||||
KubeletSystemReservedMemoryControlPlane = "512Mi"
|
||||
|
||||
// KubeletSystemReservedMemoryWorker is the memory system reservation value for the kubelet configuration (worker nodes).
|
||||
KubeletSystemReservedMemoryWorker = "384Mi"
|
||||
|
||||
// KubeletSystemReservedPid pid system reservation value for kubelet kubeconfig.
|
||||
KubeletSystemReservedPid = "100"
|
||||
@ -672,50 +675,104 @@ const (
|
||||
// CgroupInitReservedMemory is the hard memory protection for the init process.
|
||||
CgroupInitReservedMemory = 96 * 1024 * 1024
|
||||
|
||||
// CgroupInitMillicores is the CPU weight for the init process.
|
||||
CgroupInitMillicores = 2000
|
||||
|
||||
// CgroupSystem is the cgroup name for system processes.
|
||||
CgroupSystem = "/system"
|
||||
|
||||
// CgroupSystemMillicores is the CPU weight for the system cgroup.
|
||||
CgroupSystemMillicores = 1500
|
||||
|
||||
// CgroupSystemReservedMemory is the hard memory protection for the system processes.
|
||||
CgroupSystemReservedMemory = 96 * 1024 * 1024
|
||||
|
||||
// CgroupSystemRuntime is the cgroup name for containerd runtime processes.
|
||||
CgroupSystemRuntime = CgroupSystem + "/runtime"
|
||||
|
||||
// CgroupSystemRuntimeReservedMemory is the hard memory protection for the system containerd process.
|
||||
CgroupSystemRuntimeReservedMemory = 48 * 1024 * 1024
|
||||
|
||||
// CgroupSystemRuntimeMillicores is the CPU weight for the system containerd process.
|
||||
CgroupSystemRuntimeMillicores = 500
|
||||
|
||||
// CgroupApid is the cgroup name for apid runtime processes.
|
||||
CgroupApid = CgroupSystem + "/apid"
|
||||
|
||||
// CgroupApidReservedMemory is the hard memory protection for the apid processes.
|
||||
CgroupApidReservedMemory = 16 * 1024 * 1024
|
||||
|
||||
// CgroupApidMaxMemory is the hard memory limit for the apid process.
|
||||
CgroupApidMaxMemory = 40 * 1024 * 1024
|
||||
|
||||
// CgroupApidMillicores is the CPU weight for the apid process.
|
||||
CgroupApidMillicores = 500
|
||||
|
||||
// CgroupTrustd is the cgroup name for trustd runtime processes.
|
||||
CgroupTrustd = CgroupSystem + "/trustd"
|
||||
|
||||
// CgroupTrustdReservedMemory is the hard memory protection for the trustd processes.
|
||||
CgroupTrustdReservedMemory = 8 * 1024 * 1024
|
||||
|
||||
// CgroupTrustdMaxMemory is the hard memory limit for the trustd process.
|
||||
CgroupTrustdMaxMemory = 24 * 1024 * 1024
|
||||
|
||||
// CgroupTrustdMillicores is the CPU weight for the trustd process.
|
||||
CgroupTrustdMillicores = 250
|
||||
|
||||
// CgroupUdevd is the cgroup name for udevd runtime processes.
|
||||
CgroupUdevd = CgroupSystem + "/udevd"
|
||||
|
||||
// CgroupUdevdReservedMemory is the hard memory protection for the udevd processes.
|
||||
CgroupUdevdReservedMemory = 8 * 1024 * 1024
|
||||
|
||||
// CgroupUdevdMillicores is the CPU weight for the udevd process.
|
||||
CgroupUdevdMillicores = 250
|
||||
|
||||
// CgroupExtensions is the cgroup name for system extension processes.
|
||||
CgroupExtensions = CgroupSystem + "/extensions"
|
||||
|
||||
// CgroupDashboard is the cgroup name for dashboard process.
|
||||
CgroupDashboard = CgroupSystem + "/dashboard"
|
||||
|
||||
// CgroupPodRuntimeRoot is the cgroup containing Kubernetes runtime components.
|
||||
CgroupPodRuntimeRoot = "/podruntime"
|
||||
|
||||
// CgroupPodRuntimeRootMillicores is the CPU weight for the pod runtime cgroup.
|
||||
CgroupPodRuntimeRootMillicores = 4000
|
||||
|
||||
// CgroupPodRuntime is the cgroup name for kubernetes containerd runtime processes.
|
||||
CgroupPodRuntime = "/podruntime/runtime"
|
||||
CgroupPodRuntime = CgroupPodRuntimeRoot + "/runtime"
|
||||
|
||||
// CgroupPodRuntimeMillicores is the CPU share (in millicores, converted to a CPU weight) for the kubernetes containerd runtime cgroup.
|
||||
CgroupPodRuntimeMillicores = 1000
|
||||
|
||||
// CgroupPodRuntimeReservedMemory is the hard memory protection for the cri runtime processes.
|
||||
CgroupPodRuntimeReservedMemory = 128 * 1024 * 1024
|
||||
CgroupPodRuntimeReservedMemory = 196 * 1024 * 1024
|
||||
|
||||
// CgroupEtcd is the cgroup name for etcd process.
|
||||
CgroupEtcd = "/podruntime/etcd"
|
||||
CgroupEtcd = CgroupPodRuntimeRoot + "/etcd"
|
||||
|
||||
// CgroupEtcdReservedMemory is the soft memory protection for the etcd processes.
|
||||
CgroupEtcdReservedMemory = 256 * 1024 * 1024
|
||||
|
||||
// CgroupEtcdMillicores is the CPU weight for the etcd process.
|
||||
CgroupEtcdMillicores = 2000
|
||||
|
||||
// CgroupKubelet is the cgroup name for kubelet process.
|
||||
CgroupKubelet = "/podruntime/kubelet"
|
||||
CgroupKubelet = CgroupPodRuntimeRoot + "/kubelet"
|
||||
|
||||
// CgroupKubeletReservedMemory is the hard memory protection for the kubelet processes.
|
||||
CgroupKubeletReservedMemory = 64 * 1024 * 1024
|
||||
CgroupKubeletReservedMemory = 96 * 1024 * 1024
|
||||
|
||||
// CgroupDashboardReservedMemory is the hard memory protection for the dashboard process.
|
||||
CgroupDashboardReservedMemory = 85 * 1024 * 1024
|
||||
// CgroupKubeletMillicores is the CPU weight for the kubelet process.
|
||||
CgroupKubeletMillicores = 1000
|
||||
|
||||
// CgroupDashboardLowMemory is the low memory value for the dashboard process.
|
||||
CgroupDashboardLowMemory = 100 * 1024 * 1024
|
||||
// CgroupDashboardMaxMemory is the hard memory limit for the dashboard process.
|
||||
CgroupDashboardMaxMemory = 196 * 1024 * 1024
|
||||
|
||||
// CgroupDashboardMillicores is the CPU weight for the dashboard process.
|
||||
CgroupDashboardMillicores = 200
|
||||
|
||||
// FlannelCNI is the string to use Talos-managed Flannel CNI (default).
|
||||
FlannelCNI = "flannel"
|
||||
|
Loading…
x
Reference in New Issue
Block a user