fix: audit and fix cgroup reservations

Fixes: #7081

Review all reservations and limits set, test under stress load (using
both memory and CPU).

The goal: system components (Talos itself) and runtime (kubelet, CRI)
should survive under extreme resource starvation (workloads consuming
all CPU/memory).

Uses #9337 to visualize changes, but doesn't depend on it.

Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
(cherry picked from commit 6b15ca19cd1291b8a245d72d5153827945cad037)
This commit is contained in:
Andrey Smirnov 2024-09-19 21:23:16 +04:00
parent c8dedbe116
commit 920d8c8297
No known key found for this signature in database
GPG Key ID: FE042E3D4085A811
15 changed files with 337 additions and 42 deletions

1
go.mod
View File

@ -80,6 +80,7 @@ require (
github.com/gizak/termui/v3 v3.1.0
github.com/godbus/dbus/v5 v5.1.0
github.com/golang/mock v1.6.0
github.com/google/cadvisor v0.50.0
github.com/google/go-containerregistry v0.20.2
github.com/google/go-tpm v0.9.1
github.com/google/nftables v0.2.0

2
go.sum
View File

@ -286,6 +286,8 @@ github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6
github.com/golang/snappy v0.0.3/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/google/btree v1.1.2 h1:xf4v41cLI2Z6FxbKm+8Bu+m8ifhj15JuZ9sa0jZCMUU=
github.com/google/btree v1.1.2/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4=
github.com/google/cadvisor v0.50.0 h1:7w/hKIbJKBWqQsRTy+Hpj2vj+fnxrLXcEXFy+LW0Bsg=
github.com/google/cadvisor v0.50.0/go.mod h1:VxCDwZalpFyENvmfabFqaIGsqNKLtDzE62a19rfVTB8=
github.com/google/cel-go v0.21.0 h1:cl6uW/gxN+Hy50tNYvI691+sXxioCnstFzLp2WO4GCI=
github.com/google/cel-go v0.21.0/go.mod h1:rHUlWCcBKgyEk+eV03RPdZUekPp6YcJwV0FxuUksYxc=
github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I=

View File

@ -27,8 +27,10 @@ import (
v1alpha1runtime "github.com/siderolabs/talos/internal/app/machined/pkg/runtime"
"github.com/siderolabs/talos/internal/pkg/cgroup"
"github.com/siderolabs/talos/pkg/argsbuilder"
"github.com/siderolabs/talos/pkg/machinery/config/machine"
"github.com/siderolabs/talos/pkg/machinery/constants"
"github.com/siderolabs/talos/pkg/machinery/kubelet"
"github.com/siderolabs/talos/pkg/machinery/resources/config"
"github.com/siderolabs/talos/pkg/machinery/resources/k8s"
)
@ -63,6 +65,12 @@ func (ctrl *KubeletSpecController) Inputs() []controller.Input {
ID: optional.Some(k8s.KubeletID),
Kind: controller.InputWeak,
},
{
Namespace: config.NamespaceName,
Type: config.MachineTypeType,
ID: optional.Some(config.MachineTypeID),
Kind: controller.InputWeak,
},
}
}
@ -100,6 +108,15 @@ func (ctrl *KubeletSpecController) Run(ctx context.Context, r controller.Runtime
kubeletVersion := compatibility.VersionFromImageRef(cfgSpec.Image)
machineType, err := safe.ReaderGetByID[*config.MachineType](ctx, r, config.MachineTypeID)
if err != nil {
if state.IsNotFoundError(err) {
continue
}
return fmt.Errorf("error getting machine type: %w", err)
}
nodename, err := safe.ReaderGetByID[*k8s.Nodename](ctx, r, k8s.NodenameID)
if err != nil {
if state.IsNotFoundError(err) {
@ -173,7 +190,7 @@ func (ctrl *KubeletSpecController) Run(ctx context.Context, r controller.Runtime
args["image-credential-provider-config"] = constants.KubeletCredentialProviderConfig
}
kubeletConfig, err := NewKubeletConfiguration(cfgSpec, kubeletVersion)
kubeletConfig, err := NewKubeletConfiguration(cfgSpec, kubeletVersion, machineType.MachineType())
if err != nil {
return fmt.Errorf("error creating kubelet configuration: %w", err)
}
@ -242,7 +259,7 @@ func prepareExtraConfig(extraConfig map[string]any) (*kubeletconfig.KubeletConfi
// NewKubeletConfiguration builds kubelet configuration with defaults and overrides from extraConfig.
//
//nolint:gocyclo,cyclop
func NewKubeletConfiguration(cfgSpec *k8s.KubeletConfigSpec, kubeletVersion compatibility.Version) (*kubeletconfig.KubeletConfiguration, error) {
func NewKubeletConfiguration(cfgSpec *k8s.KubeletConfigSpec, kubeletVersion compatibility.Version, machineType machine.Type) (*kubeletconfig.KubeletConfiguration, error) {
config, err := prepareExtraConfig(cfgSpec.ExtraConfig)
if err != nil {
return nil, err
@ -333,10 +350,15 @@ func NewKubeletConfiguration(cfgSpec *k8s.KubeletConfigSpec, kubeletVersion comp
if len(config.SystemReserved) == 0 {
config.SystemReserved = map[string]string{
"cpu": constants.KubeletSystemReservedCPU,
"memory": constants.KubeletSystemReservedMemory,
"pid": constants.KubeletSystemReservedPid,
"ephemeral-storage": constants.KubeletSystemReservedEphemeralStorage,
}
if machineType.IsControlPlane() {
config.SystemReserved["memory"] = constants.KubeletSystemReservedMemoryControlPlane
} else {
config.SystemReserved["memory"] = constants.KubeletSystemReservedMemoryWorker
}
}
if config.Logging.Format == "" {

View File

@ -25,7 +25,9 @@ import (
"github.com/siderolabs/talos/internal/app/machined/pkg/controllers/ctest"
k8sctrl "github.com/siderolabs/talos/internal/app/machined/pkg/controllers/k8s"
"github.com/siderolabs/talos/pkg/machinery/config/machine"
"github.com/siderolabs/talos/pkg/machinery/constants"
"github.com/siderolabs/talos/pkg/machinery/resources/config"
"github.com/siderolabs/talos/pkg/machinery/resources/k8s"
)
@ -60,6 +62,10 @@ func (suite *KubeletSpecSuite) TestReconcileDefault() {
suite.Require().NoError(suite.State().Create(suite.Ctx(), nodename))
machineType := config.NewMachineType()
machineType.SetMachineType(machine.TypeWorker)
suite.Require().NoError(suite.State().Create(suite.Ctx(), machineType))
rtestutils.AssertResources(suite.Ctx(), suite.T(), suite.State(), []resource.ID{k8s.KubeletID}, func(kubeletSpec *k8s.KubeletSpec, asrt *assert.Assertions) {
spec := kubeletSpec.TypedSpec()
@ -97,6 +103,10 @@ func (suite *KubeletSpecSuite) TestReconcileWithExplicitNodeIP() {
suite.Require().NoError(suite.State().Create(suite.Ctx(), nodename))
machineType := config.NewMachineType()
machineType.SetMachineType(machine.TypeWorker)
suite.Require().NoError(suite.State().Create(suite.Ctx(), machineType))
rtestutils.AssertResources(suite.Ctx(), suite.T(), suite.State(), []resource.ID{k8s.KubeletID}, func(kubeletSpec *k8s.KubeletSpec, asrt *assert.Assertions) {
spec := kubeletSpec.TypedSpec()
@ -114,7 +124,7 @@ func (suite *KubeletSpecSuite) TestReconcileWithExplicitNodeIP() {
})
}
func (suite *KubeletSpecSuite) TestReconcileWithContainerRuntimeEnpointFlag() {
func (suite *KubeletSpecSuite) TestReconcileWithContainerRuntimeEndpointFlag() {
cfg := k8s.NewKubeletConfig(k8s.NamespaceName, k8s.KubeletID)
cfg.TypedSpec().Image = "kubelet:v1.25.0"
cfg.TypedSpec().ClusterDNS = []string{"10.96.0.10"}
@ -128,6 +138,10 @@ func (suite *KubeletSpecSuite) TestReconcileWithContainerRuntimeEnpointFlag() {
suite.Require().NoError(suite.State().Create(suite.Ctx(), nodename))
machineType := config.NewMachineType()
machineType.SetMachineType(machine.TypeWorker)
suite.Require().NoError(suite.State().Create(suite.Ctx(), machineType))
rtestutils.AssertResources(suite.Ctx(), suite.T(), suite.State(), []resource.ID{k8s.KubeletID}, func(kubeletSpec *k8s.KubeletSpec, asrt *assert.Assertions) {
spec := kubeletSpec.TypedSpec()
@ -180,6 +194,10 @@ func (suite *KubeletSpecSuite) TestReconcileWithExtraConfig() {
suite.Require().NoError(suite.State().Create(suite.Ctx(), nodeIP))
machineType := config.NewMachineType()
machineType.SetMachineType(machine.TypeWorker)
suite.Require().NoError(suite.State().Create(suite.Ctx(), machineType))
rtestutils.AssertResources(suite.Ctx(), suite.T(), suite.State(), []resource.ID{k8s.KubeletID}, func(kubeletSpec *k8s.KubeletSpec, asrt *assert.Assertions) {
spec := kubeletSpec.TypedSpec()
@ -219,6 +237,10 @@ func (suite *KubeletSpecSuite) TestReconcileWithSkipNodeRegistration() {
suite.Require().NoError(suite.State().Create(suite.Ctx(), nodeIP))
machineType := config.NewMachineType()
machineType.SetMachineType(machine.TypeWorker)
suite.Require().NoError(suite.State().Create(suite.Ctx(), machineType))
rtestutils.AssertResources(suite.Ctx(), suite.T(), suite.State(), []resource.ID{k8s.KubeletID}, func(kubeletSpec *k8s.KubeletSpec, asrt *assert.Assertions) {
spec := kubeletSpec.TypedSpec()
@ -307,7 +329,7 @@ func TestNewKubeletConfigurationFail(t *testing.T) {
tt.name, func(t *testing.T) {
t.Parallel()
_, err := k8sctrl.NewKubeletConfiguration(tt.cfgSpec, compatibility.VersionFromImageRef(""))
_, err := k8sctrl.NewKubeletConfiguration(tt.cfgSpec, compatibility.VersionFromImageRef(""), machine.TypeWorker)
require.Error(t, err)
assert.EqualError(t, err, tt.expectedErr)
@ -352,7 +374,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
FailSwapOn: pointer.To(false),
SystemReserved: map[string]string{
"cpu": constants.KubeletSystemReservedCPU,
"memory": constants.KubeletSystemReservedMemory,
"memory": constants.KubeletSystemReservedMemoryWorker,
"pid": constants.KubeletSystemReservedPid,
"ephemeral-storage": constants.KubeletSystemReservedEphemeralStorage,
},
@ -373,6 +395,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
cfgSpec *k8s.KubeletConfigSpec
kubeletVersion compatibility.Version
expectedOverrides func(*kubeletconfig.KubeletConfiguration)
machineType machine.Type
}{
{
name: "override some",
@ -389,6 +412,19 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
kc.OOMScoreAdj = pointer.To[int32](-300)
kc.EnableDebuggingHandlers = pointer.To(true)
},
machineType: machine.TypeWorker,
},
{
name: "controlplane",
cfgSpec: &k8s.KubeletConfigSpec{
ClusterDNS: []string{"10.0.0.5"},
ClusterDomain: "cluster.local",
},
kubeletVersion: compatibility.VersionFromImageRef("ghcr.io/siderolabs/kubelet:v1.29.0"),
expectedOverrides: func(kc *kubeletconfig.KubeletConfiguration) {
kc.SystemReserved["memory"] = constants.KubeletSystemReservedMemoryControlPlane
},
machineType: machine.TypeControlPlane,
},
{
name: "disable graceful shutdown",
@ -405,6 +441,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
kc.ShutdownGracePeriod = metav1.Duration{}
kc.ShutdownGracePeriodCriticalPods = metav1.Duration{}
},
machineType: machine.TypeWorker,
},
{
name: "enable seccomp default",
@ -417,6 +454,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
expectedOverrides: func(kc *kubeletconfig.KubeletConfiguration) {
kc.SeccompDefault = pointer.To(true)
},
machineType: machine.TypeWorker,
},
{
name: "enable skipNodeRegistration",
@ -430,6 +468,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
kc.Authentication.Webhook.Enabled = pointer.To(false)
kc.Authorization.Mode = kubeletconfig.KubeletAuthorizationModeAlwaysAllow
},
machineType: machine.TypeWorker,
},
{
name: "disable manifests directory",
@ -442,6 +481,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
expectedOverrides: func(kc *kubeletconfig.KubeletConfiguration) {
kc.StaticPodPath = ""
},
machineType: machine.TypeWorker,
},
{
name: "enable local FS quota monitoring",
@ -456,19 +496,20 @@ func TestNewKubeletConfigurationMerge(t *testing.T) {
"LocalStorageCapacityIsolationFSQuotaMonitoring": true,
}
},
machineType: machine.TypeWorker,
},
} {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
expected := defaultKubeletConfig
tt.expectedOverrides(&expected)
expected := defaultKubeletConfig.DeepCopy()
tt.expectedOverrides(expected)
config, err := k8sctrl.NewKubeletConfiguration(tt.cfgSpec, tt.kubeletVersion)
config, err := k8sctrl.NewKubeletConfiguration(tt.cfgSpec, tt.kubeletVersion, tt.machineType)
require.NoError(t, err)
assert.Equal(t, &expected, config)
assert.Equal(t, expected, config)
})
}
}

View File

@ -182,6 +182,9 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri
Min: pointer.To[int64](constants.CgroupInitReservedMemory),
Low: pointer.To[int64](constants.CgroupInitReservedMemory * 2),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupInitMillicores))),
},
},
},
{
@ -191,15 +194,42 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri
Min: pointer.To[int64](constants.CgroupSystemReservedMemory),
Low: pointer.To[int64](constants.CgroupSystemReservedMemory * 2),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupSystemMillicores))),
},
},
},
{
name: constants.CgroupSystemRuntime,
resources: &cgroup2.Resources{},
name: constants.CgroupSystemRuntime,
resources: &cgroup2.Resources{
Memory: &cgroup2.Memory{
Min: pointer.To[int64](constants.CgroupSystemRuntimeReservedMemory),
Low: pointer.To[int64](constants.CgroupSystemRuntimeReservedMemory * 2),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupSystemRuntimeMillicores))),
},
},
},
{
name: constants.CgroupUdevd,
resources: &cgroup2.Resources{},
name: constants.CgroupUdevd,
resources: &cgroup2.Resources{
Memory: &cgroup2.Memory{
Min: pointer.To[int64](constants.CgroupUdevdReservedMemory),
Low: pointer.To[int64](constants.CgroupUdevdReservedMemory * 2),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupUdevdMillicores))),
},
},
},
{
name: constants.CgroupPodRuntimeRoot,
resources: &cgroup2.Resources{
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupPodRuntimeRootMillicores))),
},
},
},
{
name: constants.CgroupPodRuntime,
@ -208,6 +238,9 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri
Min: pointer.To[int64](constants.CgroupPodRuntimeReservedMemory),
Low: pointer.To[int64](constants.CgroupPodRuntimeReservedMemory * 2),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupPodRuntimeMillicores))),
},
},
},
{
@ -217,14 +250,45 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri
Min: pointer.To[int64](constants.CgroupKubeletReservedMemory),
Low: pointer.To[int64](constants.CgroupKubeletReservedMemory * 2),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupKubeletMillicores))),
},
},
},
{
name: constants.CgroupDashboard,
resources: &cgroup2.Resources{
Memory: &cgroup2.Memory{
Min: pointer.To[int64](constants.CgroupDashboardReservedMemory),
Low: pointer.To[int64](constants.CgroupDashboardLowMemory),
Max: pointer.To[int64](constants.CgroupDashboardMaxMemory),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupDashboardMillicores))),
},
},
},
{
name: constants.CgroupApid,
resources: &cgroup2.Resources{
Memory: &cgroup2.Memory{
Min: pointer.To[int64](constants.CgroupApidReservedMemory),
Low: pointer.To[int64](constants.CgroupApidReservedMemory * 2),
Max: pointer.To[int64](constants.CgroupApidMaxMemory),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupApidMillicores))),
},
},
},
{
name: constants.CgroupTrustd,
resources: &cgroup2.Resources{
Memory: &cgroup2.Memory{
Min: pointer.To[int64](constants.CgroupTrustdReservedMemory),
Low: pointer.To[int64](constants.CgroupTrustdReservedMemory * 2),
Max: pointer.To[int64](constants.CgroupTrustdMaxMemory),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupTrustdMillicores))),
},
},
},

View File

@ -13,18 +13,6 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
)
// WithMemoryLimit sets the linux resource memory limit field.
//
// The limit is a hard cap in bytes; exceeding it triggers the kernel OOM killer
// inside the container's cgroup.
func WithMemoryLimit(limit int64) oci.SpecOpts {
	return func(_ context.Context, _ oci.Client, _ *containers.Container, s *specs.Spec) error {
		// guard against a sparse spec: Linux/Resources may not be populated yet
		if s.Linux == nil {
			s.Linux = &specs.Linux{}
		}

		if s.Linux.Resources == nil {
			s.Linux.Resources = &specs.LinuxResources{}
		}

		s.Linux.Resources.Memory = &specs.LinuxMemory{
			Limit: &limit,
		}

		return nil
	}
}
// WithRootfsPropagation sets the root filesystem propagation.
func WithRootfsPropagation(rp string) oci.SpecOpts {
return func(_ context.Context, _ oci.Client, _ *containers.Container, s *specs.Spec) error {

View File

@ -6,15 +6,18 @@
package runner
import (
"context"
"fmt"
"io"
"time"
containerd "github.com/containerd/containerd/v2/client"
ocicontainers "github.com/containerd/containerd/v2/core/containers"
"github.com/containerd/containerd/v2/pkg/oci"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/siderolabs/gen/maps"
"github.com/siderolabs/gen/optional"
"github.com/siderolabs/go-pointer"
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime"
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime/logging"
@ -220,3 +223,20 @@ func WithUID(uid uint32) Option {
args.UID = uid
}
}
// WithMemoryReservation sets the memory reservation limit in the OCI spec.
//
// The reservation is a soft limit (memory.low in cgroup v2 terms): the kernel
// tries to protect this much memory for the container under memory pressure.
func WithMemoryReservation(limit uint64) oci.SpecOpts {
	return func(_ context.Context, _ oci.Client, _ *ocicontainers.Container, s *oci.Spec) error {
		// guard against a sparse spec: Linux/Resources/Memory may not be populated yet
		if s.Linux == nil {
			s.Linux = &specs.Linux{}
		}

		if s.Linux.Resources == nil {
			s.Linux.Resources = &specs.LinuxResources{}
		}

		if s.Linux.Resources.Memory == nil {
			s.Linux.Resources.Memory = &specs.LinuxMemory{}
		}

		s.Linux.Resources.Memory.Reservation = pointer.To(int64(limit))

		return nil
	}
}

View File

@ -12,6 +12,7 @@ import (
"net"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/containerd/containerd/v2/pkg/cap"
@ -164,6 +165,7 @@ func (o *APID) Runner(r runtime.Runtime) (runner.Runner, error) {
env := []string{
constants.TcellMinimizeEnvironment,
"GOMEMLIMIT=" + strconv.Itoa(constants.CgroupApidMaxMemory/5*4),
}
for _, value := range environment.Get(r.Config()) {

View File

@ -8,6 +8,7 @@ package services
import (
"context"
"fmt"
"strconv"
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime"
"github.com/siderolabs/talos/internal/app/machined/pkg/system/events"
@ -61,6 +62,7 @@ func (d *Dashboard) Runner(r runtime.Runtime) (runner.Runner, error) {
runner.WithEnv([]string{
"TERM=linux",
constants.TcellMinimizeEnvironment,
"GOMEMLIMIT=" + strconv.Itoa(constants.CgroupDashboardMaxMemory/5*4),
}),
runner.WithStdinFile(tty),
runner.WithStdoutFile(tty),

View File

@ -36,6 +36,7 @@ import (
"github.com/siderolabs/talos/internal/app/machined/pkg/system/runner"
"github.com/siderolabs/talos/internal/app/machined/pkg/system/runner/containerd"
"github.com/siderolabs/talos/internal/app/machined/pkg/system/runner/restart"
"github.com/siderolabs/talos/internal/pkg/cgroup"
"github.com/siderolabs/talos/internal/pkg/containers/image"
"github.com/siderolabs/talos/internal/pkg/environment"
"github.com/siderolabs/talos/internal/pkg/etcd"
@ -224,6 +225,8 @@ func (e *Etcd) Runner(r runtime.Runtime) (runner.Runner, error) {
oci.WithHostNamespace(specs.NetworkNamespace),
oci.WithMounts(mounts),
oci.WithUser(fmt.Sprintf("%d:%d", constants.EtcdUserID, constants.EtcdUserID)),
runner.WithMemoryReservation(constants.CgroupEtcdReservedMemory),
oci.WithCPUShares(uint64(cgroup.MilliCoresToShares(constants.CgroupEtcdMillicores))),
),
runner.WithOOMScoreAdj(-998),
),

View File

@ -12,6 +12,7 @@ import (
"net"
"os"
"path/filepath"
"strconv"
"github.com/containerd/containerd/v2/pkg/cap"
"github.com/containerd/containerd/v2/pkg/oci"
@ -142,7 +143,10 @@ func (t *Trustd) Runner(r runtime.Runtime) (runner.Runner, error) {
}
env := environment.Get(r.Config())
env = append(env, constants.TcellMinimizeEnvironment)
env = append(env,
constants.TcellMinimizeEnvironment,
"GOMEMLIMIT="+strconv.Itoa(constants.CgroupTrustdMaxMemory/5*4),
)
if debug.RaceEnabled {
env = append(env, "GORACE=halt_on_error=1")
@ -156,7 +160,6 @@ func (t *Trustd) Runner(r runtime.Runtime) (runner.Runner, error) {
runner.WithEnv(env),
runner.WithCgroupPath(constants.CgroupTrustd),
runner.WithOCISpecOpts(
containerd.WithMemoryLimit(int64(1000000*512)),
oci.WithDroppedCapabilities(cap.Known()),
oci.WithHostNamespace(specs.NetworkNamespace),
oci.WithMounts(mounts),

View File

@ -0,0 +1,52 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
package cgroup
import (
"runtime"
"sync"
"github.com/google/cadvisor/utils/sysfs"
"github.com/google/cadvisor/utils/sysinfo"
)
// availableCPUCores caches (computed once) the number of physical CPU cores as
// reported by cadvisor's sysinfo, falling back to runtime.NumCPU() when the
// sysfs probe fails or reports a nonsensical value.
var availableCPUCores = sync.OnceValue(func() int {
	if _, cores, err := sysinfo.GetNodesInfo(sysfs.NewRealSysFs()); err == nil && cores >= 1 {
		return cores
	}

	return runtime.NumCPU()
})
// MilliCores represents a CPU value in milli-cores.
type MilliCores uint

// AvailableMilliCores returns the number of available CPU cores in milli-cores.
//
// One physical core equals 1000 milli-cores.
func AvailableMilliCores() MilliCores {
	cores := availableCPUCores()

	return MilliCores(cores) * 1000
}
// CPUShare represents a CPU share value.
type CPUShare uint64

// MilliCoresToShares converts milli-cores to CPU shares.
//
// 1000 milli-cores (one core) maps to the conventional default of 1024 shares.
func MilliCoresToShares(milliCores MilliCores) CPUShare {
	shares := uint64(milliCores) * 1024 / 1000

	return CPUShare(shares)
}
// SharesToCPUWeight converts CPU shares to cgroup v2 CPU weight.
//
// It uses the standard kernel/Kubernetes mapping of shares [2..262144]
// onto weight [1..10000]. Input is clamped to that range: shares below 2
// would otherwise underflow the unsigned subtraction and produce a huge
// weight, while shares above 262144 would produce a weight the kernel
// rejects (> 10000).
func SharesToCPUWeight(shares CPUShare) uint64 {
	const (
		minShares = 2
		maxShares = 262144
	)

	if shares < minShares {
		shares = minShares
	}

	if shares > maxShares {
		shares = maxShares
	}

	return uint64((((shares - 2) * 9999) / 262142) + 1)
}
// MillicoresToCPUWeight converts milli-cores to CPU weight.
//
// The request is capped at the machine's available milli-cores before
// conversion, so a reservation can never exceed the actual CPU capacity.
func MillicoresToCPUWeight(requested MilliCores) uint64 {
	if available := AvailableMilliCores(); requested > available {
		requested = available
	}

	return SharesToCPUWeight(MilliCoresToShares(requested))
}

View File

@ -0,0 +1,38 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
package cgroup_test
import (
"testing"
"github.com/stretchr/testify/assert"
"github.com/siderolabs/talos/internal/pkg/cgroup"
)
func TestAvailableMillicores(t *testing.T) {
	available := cgroup.AvailableMilliCores()

	t.Logf("Available CPU milli-cores: %d", available)

	// any machine running the tests has at least one full core
	assert.GreaterOrEqual(t, available, cgroup.MilliCores(1000))
}
func TestMillicoresToShares(t *testing.T) {
assert.Equal(t, cgroup.CPUShare(102), cgroup.MilliCoresToShares(100))
assert.Equal(t, cgroup.CPUShare(1024), cgroup.MilliCoresToShares(1000))
assert.Equal(t, cgroup.CPUShare(2560), cgroup.MilliCoresToShares(2500))
}
func TestSharesToCPUWeight(t *testing.T) {
assert.Equal(t, uint64(4), cgroup.SharesToCPUWeight(102))
assert.Equal(t, uint64(79), cgroup.SharesToCPUWeight(2048))
assert.Equal(t, uint64(313), cgroup.SharesToCPUWeight(8192))
}
func TestMillicoresToCPUWeight(t *testing.T) {
// depends on number of CPUs available, but for < 1000 millicores it should be same result
assert.Equal(t, uint64(4), cgroup.MillicoresToCPUWeight(100))
assert.Equal(t, uint64(20), cgroup.MillicoresToCPUWeight(500))
assert.Equal(t, uint64(39), cgroup.MillicoresToCPUWeight(1000))
}

View File

@ -32,7 +32,7 @@ func CGroupMountPoints() (mountpoints *Points, err error) {
func cgroupMountPointsV2() (mountpoints *Points, err error) {
cgroups := NewMountPoints()
cgroups.Set("cgroup2", NewMountPoint("cgroup", constants.CgroupMountPath, "cgroup2", unix.MS_NOSUID|unix.MS_NODEV|unix.MS_NOEXEC|unix.MS_RELATIME, "nsdelegate"))
cgroups.Set("cgroup2", NewMountPoint("cgroup", constants.CgroupMountPath, "cgroup2", unix.MS_NOSUID|unix.MS_NODEV|unix.MS_NOEXEC|unix.MS_RELATIME, "nsdelegate,memory_recursiveprot"))
return cgroups, nil
}

View File

@ -402,8 +402,11 @@ const (
// KubeletSystemReservedCPU cpu system reservation value for kubelet kubeconfig.
KubeletSystemReservedCPU = "50m"
// KubeletSystemReservedMemory memory system reservation value for kubelet kubeconfig.
KubeletSystemReservedMemory = "192Mi"
// KubeletSystemReservedMemoryControlPlane memory system reservation value for kubelet kubeconfig (controlplane nodes).
KubeletSystemReservedMemoryControlPlane = "512Mi"
// KubeletSystemReservedMemoryWorker memory system reservation value for kubelet kubeconfig (worker nodes).
KubeletSystemReservedMemoryWorker = "384Mi"
// KubeletSystemReservedPid pid system reservation value for kubelet kubeconfig.
KubeletSystemReservedPid = "100"
@ -672,50 +675,104 @@ const (
// CgroupInitReservedMemory is the hard memory protection for the init process.
CgroupInitReservedMemory = 96 * 1024 * 1024
// CgroupInitMillicores is the CPU weight for the init process.
CgroupInitMillicores = 2000
// CgroupSystem is the cgroup name for system processes.
CgroupSystem = "/system"
// CgroupSystemMillicores is the CPU weight for the system cgroup.
CgroupSystemMillicores = 1500
// CgroupSystemReservedMemory is the hard memory protection for the system processes.
CgroupSystemReservedMemory = 96 * 1024 * 1024
// CgroupSystemRuntime is the cgroup name for containerd runtime processes.
CgroupSystemRuntime = CgroupSystem + "/runtime"
// CgroupSystemRuntimeReservedMemory is the hard memory protection for the system containerd process.
CgroupSystemRuntimeReservedMemory = 48 * 1024 * 1024
// CgroupSystemRuntimeMillicores is the CPU weight for the system containerd process.
CgroupSystemRuntimeMillicores = 500
// CgroupApid is the cgroup name for apid runtime processes.
CgroupApid = CgroupSystem + "/apid"
// CgroupApidReservedMemory is the hard memory protection for the apid processes.
CgroupApidReservedMemory = 16 * 1024 * 1024
// CgroupApidMaxMemory is the hard memory limit for the apid process.
CgroupApidMaxMemory = 40 * 1024 * 1024
// CgroupApidMillicores is the CPU weight for the apid process.
CgroupApidMillicores = 500
// CgroupTrustd is the cgroup name for trustd runtime processes.
CgroupTrustd = CgroupSystem + "/trustd"
// CgroupTrustdReservedMemory is the hard memory protection for the trustd processes.
CgroupTrustdReservedMemory = 8 * 1024 * 1024
// CgroupTrustdMaxMemory is the hard memory limit for the trustd process.
CgroupTrustdMaxMemory = 24 * 1024 * 1024
// CgroupTrustdMillicores is the CPU weight for the trustd process.
CgroupTrustdMillicores = 250
// CgroupUdevd is the cgroup name for udevd runtime processes.
CgroupUdevd = CgroupSystem + "/udevd"
// CgroupUdevdReservedMemory is the hard memory protection for the udevd processes.
CgroupUdevdReservedMemory = 8 * 1024 * 1024
// CgroupUdevdMillicores is the CPU weight for the udevd process.
CgroupUdevdMillicores = 250
// CgroupExtensions is the cgroup name for system extension processes.
CgroupExtensions = CgroupSystem + "/extensions"
// CgroupDashboard is the cgroup name for dashboard process.
CgroupDashboard = CgroupSystem + "/dashboard"
// CgroupPodRuntimeRoot is the cgroup containing Kubernetes runtime components.
CgroupPodRuntimeRoot = "/podruntime"
// CgroupPodRuntimeRootMillicores is the CPU weight for the pod runtime cgroup.
CgroupPodRuntimeRootMillicores = 4000
// CgroupPodRuntime is the cgroup name for kubernetes containerd runtime processes.
CgroupPodRuntime = "/podruntime/runtime"
CgroupPodRuntime = CgroupPodRuntimeRoot + "/runtime"
// CgroupPodRuntimeMillicores is the CPU weight for the pod runtime cgroup.
CgroupPodRuntimeMillicores = 1000
// CgroupPodRuntimeReservedMemory is the hard memory protection for the cri runtime processes.
CgroupPodRuntimeReservedMemory = 128 * 1024 * 1024
CgroupPodRuntimeReservedMemory = 196 * 1024 * 1024
// CgroupEtcd is the cgroup name for etcd process.
CgroupEtcd = "/podruntime/etcd"
CgroupEtcd = CgroupPodRuntimeRoot + "/etcd"
// CgroupEtcdReservedMemory is the soft memory protection for the etcd processes.
CgroupEtcdReservedMemory = 256 * 1024 * 1024
// CgroupEtcdMillicores is the CPU weight for the etcd process.
CgroupEtcdMillicores = 2000
// CgroupKubelet is the cgroup name for kubelet process.
CgroupKubelet = "/podruntime/kubelet"
CgroupKubelet = CgroupPodRuntimeRoot + "/kubelet"
// CgroupKubeletReservedMemory is the hard memory protection for the kubelet processes.
CgroupKubeletReservedMemory = 64 * 1024 * 1024
CgroupKubeletReservedMemory = 96 * 1024 * 1024
// CgroupDashboardReservedMemory is the hard memory protection for the dashboard process.
CgroupDashboardReservedMemory = 85 * 1024 * 1024
// CgroupKubeletMillicores is the CPU weight for the kubelet process.
CgroupKubeletMillicores = 1000
// CgroupDashboardLowMemory is the low memory value for the dashboard process.
CgroupDashboardLowMemory = 100 * 1024 * 1024
// CgroupDashboardMaxMemory is the hard memory limit for the dashboard process.
CgroupDashboardMaxMemory = 196 * 1024 * 1024
// CgroupDashboardMillicores is the CPU weight for the dashboard process.
CgroupDashboardMillicores = 200
// FlannelCNI is the string to use Talos-managed Flannel CNI (default).
FlannelCNI = "flannel"