fix: stalls in local Docker cluster boot

The stalls were triggered by the udevd trigger service; the root cause is not
yet clear, but the workaround is to disable it in container mode.

Implement CPU/memory limits for `osctl cluster create`, apply defaults, and
bump the limits used in CI.

Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
Andrey Smirnov, 2019-08-10 00:25:20 +03:00
parent aadbad44f0
commit ae54f7e40d

4 changed files with 66 additions and 19 deletions
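
For reference (not part of the diff): with the defaults introduced below, `--cpus 1.5` is parsed as a rational number and scaled by 1e9, giving 1,500,000,000 NanoCPUs (Docker counts one full CPU as 1e9 NanoCPUs), and `--memory 1024` is interpreted as MB and converted to 1024 × 1024 × 1024 = 1,073,741,824 bytes.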

@@ -9,6 +9,7 @@ import (
 	"fmt"
 	"io"
 	"io/ioutil"
+	"math/big"
 	"net"
 	"sync"
@@ -18,6 +19,7 @@ import (
 	"github.com/docker/docker/api/types/network"
 	"github.com/docker/docker/client"
 	"github.com/hashicorp/go-multierror"
+	"github.com/pkg/errors"
 	"github.com/spf13/cobra"
 	"github.com/talos-systems/talos/cmd/osctl/cmd/cluster/pkg/node"
 	"github.com/talos-systems/talos/cmd/osctl/pkg/client/config"
@@ -27,11 +29,13 @@ import (
 )
 
 var (
-	clusterName string
-	image       string
-	networkMTU  string
-	workers     int
-	masters     int
+	clusterName   string
+	image         string
+	networkMTU    string
+	workers       int
+	masters       int
+	clusterCpus   string
+	clusterMemory int
 )
 
 const baseNetwork = "10.5.0.%d"
@@ -79,6 +83,12 @@ func create() (err error) {
 		helpers.Fatalf("number of masters can't be less than 1")
 	}
 
+	nanoCPUs, err := parseCPUShare()
+	if err != nil {
+		helpers.Fatalf("error parsing --cpus: %s", err)
+	}
+
+	memory := int64(clusterMemory) * 1024 * 1024
+
 	// Ensure the image is present.
 	if err = ensureImageExists(ctx, cli, image); err != nil {
@@ -112,10 +122,12 @@ func create() (err error) {
 	requests := make([]*node.Request, masters)
 	for i := range requests {
 		requests[i] = &node.Request{
-			Input: *input,
-			Image: image,
-			Name:  fmt.Sprintf("master-%d", i+1),
-			IP:    net.ParseIP(ips[i]),
+			Input:    *input,
+			Image:    image,
+			Name:     fmt.Sprintf("master-%d", i+1),
+			IP:       net.ParseIP(ips[i]),
+			Memory:   memory,
+			NanoCPUs: nanoCPUs,
 		}
 
 		if i == 0 {
@@ -134,10 +146,12 @@ func create() (err error) {
 	requests = []*node.Request{}
 	for i := 1; i <= workers; i++ {
 		r := &node.Request{
-			Type:  generate.TypeJoin,
-			Input: *input,
-			Image: image,
-			Name:  fmt.Sprintf("worker-%d", i),
+			Type:     generate.TypeJoin,
+			Input:    *input,
+			Image:    image,
+			Name:     fmt.Sprintf("worker-%d", i),
+			Memory:   memory,
+			NanoCPUs: nanoCPUs,
 		}
 
 		requests = append(requests, r)
 	}
@@ -331,11 +345,25 @@ func saveConfig(input *generate.Input) (err error) {
 	return c.Save(talosconfig)
 }
 
+func parseCPUShare() (int64, error) {
+	cpu, ok := new(big.Rat).SetString(clusterCpus)
+	if !ok {
+		return 0, errors.Errorf("failed to parse as a rational number: %s", clusterCpus)
+	}
+
+	nano := cpu.Mul(cpu, big.NewRat(1e9, 1))
+	if !nano.IsInt() {
+		return 0, errors.New("value is too precise")
+	}
+
+	return nano.Num().Int64(), nil
+}
+
 func init() {
 	clusterUpCmd.Flags().StringVar(&image, "image", "docker.io/autonomy/talos:"+version.Tag, "the image to use")
 	clusterUpCmd.Flags().StringVar(&networkMTU, "mtu", "1500", "MTU of the docker bridge network")
 	clusterUpCmd.Flags().IntVar(&workers, "workers", 1, "the number of workers to create")
 	clusterUpCmd.Flags().IntVar(&masters, "masters", 3, "the number of masters to create")
+	clusterUpCmd.Flags().StringVar(&clusterCpus, "cpus", "1.5", "the share of CPUs as fraction (each container)")
+	clusterUpCmd.Flags().IntVar(&clusterMemory, "memory", 1024, "the limit on memory usage in MB (each container)")
 	clusterCmd.PersistentFlags().StringVar(&clusterName, "name", "talos_default", "the name of the cluster")
 	clusterCmd.AddCommand(clusterUpCmd)
 	clusterCmd.AddCommand(clusterDownCmd)
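
The `parseCPUShare` conversion above is self-contained enough to lift out. Here is a minimal standalone sketch of the same big.Rat technique (the `parseShare` name and error strings are illustrative, not from the commit):

package main

import (
	"fmt"
	"math/big"
)

// parseShare converts a decimal CPU share such as "1.5" into Docker
// NanoCPUs (one full CPU == 1e9 NanoCPUs), rejecting values that are
// too precise to represent as an integer number of NanoCPUs.
func parseShare(s string) (int64, error) {
	cpu, ok := new(big.Rat).SetString(s)
	if !ok {
		return 0, fmt.Errorf("failed to parse %q as a rational number", s)
	}
	nano := cpu.Mul(cpu, big.NewRat(1e9, 1))
	if !nano.IsInt() {
		return 0, fmt.Errorf("value is too precise")
	}
	return nano.Num().Int64(), nil
}

func main() {
	n, err := parseShare("1.5")
	if err != nil {
		panic(err)
	}
	fmt.Println(n) // prints 1500000000
}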

@@ -25,6 +25,11 @@ type Request struct {
 	Image string
 	Name  string
 	IP    net.IP
+
+	// NanoCPUs is the share of CPUs, in units of 1e-9 CPUs.
+	NanoCPUs int64
+	// Memory is the memory limit, in bytes.
+	Memory int64
 }
 
 // NewNode creates a node as a container.
@@ -60,6 +65,10 @@ func NewNode(clusterName string, req *Request) (err error) {
 	hostConfig := &container.HostConfig{
 		Privileged:  true,
 		SecurityOpt: []string{"seccomp:unconfined"},
+		Resources: container.Resources{
+			NanoCPUs: req.NanoCPUs,
+			Memory:   req.Memory,
+		},
 	}
 
 	// Ensure that the container is created in the talos network.
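
For context, in the Docker API `NanoCPUs` is the CPU quota in units of 1e-9 CPUs and `Memory` is a hard limit in bytes; both map onto the corresponding cgroup limits. A small sketch with illustrative literal values (the real code fills these from the flags):

package main

import (
	"fmt"

	"github.com/docker/docker/api/types/container"
)

func main() {
	// 1.5 CPUs and 1024 MB, expressed the way the Docker API expects them.
	res := container.Resources{
		NanoCPUs: 1500000000,         // 1.5 * 1e9
		Memory:   1024 * 1024 * 1024, // 1024 MB in bytes
	}
	fmt.Printf("cpus=%.1f memory=%d bytes\n", float64(res.NanoCPUs)/1e9, res.Memory)
}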

@@ -27,7 +27,7 @@ run() {
 	  k8s.gcr.io/hyperkube:${KUBERNETES_VERSION} -c "${1}"
 }
 
-${OSCTL} cluster create --name integration --image ${TALOS_IMG} --mtu 1440
+${OSCTL} cluster create --name integration --image ${TALOS_IMG} --mtu 1440 --cpus 4.0
 ${OSCTL} config target 10.5.0.2
 
 ## Fetch kubeconfig

@@ -23,17 +23,19 @@ func NewServicesTask() phase.Task {
 
 // RuntimeFunc returns the runtime function.
 func (task *Services) RuntimeFunc(mode runtime.Mode) phase.RuntimeFunc {
-	return task.runtime
+	return func(platform platform.Platform, data *userdata.UserData) error {
+		return task.runtime(data, mode)
+	}
 }
 
-func (task *Services) runtime(platform platform.Platform, data *userdata.UserData) (err error) {
-	task.startSystemServices(data)
+func (task *Services) runtime(data *userdata.UserData, mode runtime.Mode) (err error) {
+	task.startSystemServices(data, mode)
 	task.startKubernetesServices(data)
 
 	return nil
 }
 
-func (task *Services) startSystemServices(data *userdata.UserData) {
+func (task *Services) startSystemServices(data *userdata.UserData, mode runtime.Mode) {
 	svcs := system.Services(data)
 
 	// Start the services common to all nodes.
 	svcs.Load(
@@ -41,10 +43,18 @@ func (task *Services) startSystemServices(data *userdata.UserData) {
 		&services.Networkd{},
 		&services.Containerd{},
 		&services.Udevd{},
-		&services.UdevdTrigger{},
 		&services.OSD{},
 		&services.NTPd{},
 	)
 
+	if mode != runtime.Container {
+		// udevd-trigger causes stalls/unresponsiveness when running in container (local Docker) mode.
+		// TODO: investigate the root cause; the workaround for now is to skip it in container mode.
+		svcs.Load(
+			&services.UdevdTrigger{},
+		)
+	}
+
 	// Start the services common to all master nodes.
 	if data.Services.Kubeadm.IsControlPlane() {
 		svcs.Load(