fix: stalls in local Docker cluster boot
The problem was triggered by the udevd trigger service; the root cause is not yet clear, but the workaround is to disable it in container mode. Also implement CPU/memory limits for `osctl cluster create`, apply default values, and bump the defaults used in CI/CD. Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
This commit is contained in:
parent
aadbad44f0
commit
ae54f7e40d
@ -9,6 +9,7 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"math/big"
|
||||
"net"
|
||||
"sync"
|
||||
|
||||
@ -18,6 +19,7 @@ import (
|
||||
"github.com/docker/docker/api/types/network"
|
||||
"github.com/docker/docker/client"
|
||||
"github.com/hashicorp/go-multierror"
|
||||
"github.com/pkg/errors"
|
||||
"github.com/spf13/cobra"
|
||||
"github.com/talos-systems/talos/cmd/osctl/cmd/cluster/pkg/node"
|
||||
"github.com/talos-systems/talos/cmd/osctl/pkg/client/config"
|
||||
@ -27,11 +29,13 @@ import (
|
||||
)
|
||||
|
||||
var (
|
||||
clusterName string
|
||||
image string
|
||||
networkMTU string
|
||||
workers int
|
||||
masters int
|
||||
clusterName string
|
||||
image string
|
||||
networkMTU string
|
||||
workers int
|
||||
masters int
|
||||
clusterCpus string
|
||||
clusterMemory int
|
||||
)
|
||||
|
||||
const baseNetwork = "10.5.0.%d"
|
||||
@ -79,6 +83,12 @@ func create() (err error) {
|
||||
helpers.Fatalf("number of masters can't be less than 1")
|
||||
}
|
||||
|
||||
nanoCPUs, err := parseCPUShare()
|
||||
if err != nil {
|
||||
helpers.Fatalf("error parsing --cpus: %s", err)
|
||||
}
|
||||
memory := int64(clusterMemory) * 1024 * 1024
|
||||
|
||||
// Ensure the image is present.
|
||||
|
||||
if err = ensureImageExists(ctx, cli, image); err != nil {
|
||||
@ -112,10 +122,12 @@ func create() (err error) {
|
||||
requests := make([]*node.Request, masters)
|
||||
for i := range requests {
|
||||
requests[i] = &node.Request{
|
||||
Input: *input,
|
||||
Image: image,
|
||||
Name: fmt.Sprintf("master-%d", i+1),
|
||||
IP: net.ParseIP(ips[i]),
|
||||
Input: *input,
|
||||
Image: image,
|
||||
Name: fmt.Sprintf("master-%d", i+1),
|
||||
IP: net.ParseIP(ips[i]),
|
||||
Memory: memory,
|
||||
NanoCPUs: nanoCPUs,
|
||||
}
|
||||
|
||||
if i == 0 {
|
||||
@ -134,10 +146,12 @@ func create() (err error) {
|
||||
requests = []*node.Request{}
|
||||
for i := 1; i <= workers; i++ {
|
||||
r := &node.Request{
|
||||
Type: generate.TypeJoin,
|
||||
Input: *input,
|
||||
Image: image,
|
||||
Name: fmt.Sprintf("worker-%d", i),
|
||||
Type: generate.TypeJoin,
|
||||
Input: *input,
|
||||
Image: image,
|
||||
Name: fmt.Sprintf("worker-%d", i),
|
||||
Memory: memory,
|
||||
NanoCPUs: nanoCPUs,
|
||||
}
|
||||
requests = append(requests, r)
|
||||
}
|
||||
@ -331,11 +345,25 @@ func saveConfig(input *generate.Input) (err error) {
|
||||
return c.Save(talosconfig)
|
||||
}
|
||||
|
||||
func parseCPUShare() (int64, error) {
|
||||
cpu, ok := new(big.Rat).SetString(clusterCpus)
|
||||
if !ok {
|
||||
return 0, errors.Errorf("failed to parsing as a rational number: %s", clusterCpus)
|
||||
}
|
||||
nano := cpu.Mul(cpu, big.NewRat(1e9, 1))
|
||||
if !nano.IsInt() {
|
||||
return 0, errors.New("value is too precise")
|
||||
}
|
||||
return nano.Num().Int64(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
clusterUpCmd.Flags().StringVar(&image, "image", "docker.io/autonomy/talos:"+version.Tag, "the image to use")
|
||||
clusterUpCmd.Flags().StringVar(&networkMTU, "mtu", "1500", "MTU of the docker bridge network")
|
||||
clusterUpCmd.Flags().IntVar(&workers, "workers", 1, "the number of workers to create")
|
||||
clusterUpCmd.Flags().IntVar(&masters, "masters", 3, "the number of masters to create")
|
||||
clusterUpCmd.Flags().StringVar(&clusterCpus, "cpus", "1.5", "the share of CPUs as fraction (each container)")
|
||||
clusterUpCmd.Flags().IntVar(&clusterMemory, "memory", 1024, "the limit on memory usage in MB (each container)")
|
||||
clusterCmd.PersistentFlags().StringVar(&clusterName, "name", "talos_default", "the name of the cluster")
|
||||
clusterCmd.AddCommand(clusterUpCmd)
|
||||
clusterCmd.AddCommand(clusterDownCmd)
|
||||
|
@ -25,6 +25,11 @@ type Request struct {
|
||||
Image string
|
||||
Name string
|
||||
IP net.IP
|
||||
|
||||
// Share of CPUs, in 1e-9 fractions
|
||||
NanoCPUs int64
|
||||
// Memory limit in bytes
|
||||
Memory int64
|
||||
}
|
||||
|
||||
// NewNode creates a node as a container.
|
||||
@ -60,6 +65,10 @@ func NewNode(clusterName string, req *Request) (err error) {
|
||||
hostConfig := &container.HostConfig{
|
||||
Privileged: true,
|
||||
SecurityOpt: []string{"seccomp:unconfined"},
|
||||
Resources: container.Resources{
|
||||
NanoCPUs: req.NanoCPUs,
|
||||
Memory: req.Memory,
|
||||
},
|
||||
}
|
||||
|
||||
// Ensure that the container is created in the talos network.
|
||||
|
@ -27,7 +27,7 @@ run() {
|
||||
k8s.gcr.io/hyperkube:${KUBERNETES_VERSION} -c "${1}"
|
||||
}
|
||||
|
||||
${OSCTL} cluster create --name integration --image ${TALOS_IMG} --mtu 1440
|
||||
${OSCTL} cluster create --name integration --image ${TALOS_IMG} --mtu 1440 --cpus 4.0
|
||||
${OSCTL} config target 10.5.0.2
|
||||
|
||||
## Fetch kubeconfig
|
||||
|
@ -23,17 +23,19 @@ func NewServicesTask() phase.Task {
|
||||
|
||||
// RuntimeFunc returns the runtime function.
|
||||
func (task *Services) RuntimeFunc(mode runtime.Mode) phase.RuntimeFunc {
|
||||
return task.runtime
|
||||
return func(platform platform.Platform, data *userdata.UserData) error {
|
||||
return task.runtime(data, mode)
|
||||
}
|
||||
}
|
||||
|
||||
func (task *Services) runtime(platform platform.Platform, data *userdata.UserData) (err error) {
|
||||
task.startSystemServices(data)
|
||||
func (task *Services) runtime(data *userdata.UserData, mode runtime.Mode) (err error) {
|
||||
task.startSystemServices(data, mode)
|
||||
task.startKubernetesServices(data)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (task *Services) startSystemServices(data *userdata.UserData) {
|
||||
func (task *Services) startSystemServices(data *userdata.UserData, mode runtime.Mode) {
|
||||
svcs := system.Services(data)
|
||||
// Start the services common to all nodes.
|
||||
svcs.Load(
|
||||
@ -41,10 +43,18 @@ func (task *Services) startSystemServices(data *userdata.UserData) {
|
||||
&services.Networkd{},
|
||||
&services.Containerd{},
|
||||
&services.Udevd{},
|
||||
&services.UdevdTrigger{},
|
||||
&services.OSD{},
|
||||
&services.NTPd{},
|
||||
)
|
||||
|
||||
if mode != runtime.Container {
|
||||
// udevd-trigger causes stalls and unresponsiveness when running in local (container) mode
|
||||
// TODO: investigate root cause, but workaround for now is to skip it in container mode
|
||||
svcs.Load(
|
||||
&services.UdevdTrigger{},
|
||||
)
|
||||
}
|
||||
|
||||
// Start the services common to all master nodes.
|
||||
if data.Services.Kubeadm.IsControlPlane() {
|
||||
svcs.Load(
|
||||
|
Loading…
Reference in New Issue
Block a user