feat: add network chaos to qemu development environment
Add flags for configuring the qemu bridge interface with chaos options: - network-chaos-enabled - network-jitter - network-latency - network-packet-loss - network-packet-reorder - network-packet-corrupt - network-bandwidth These flags are used in /pkg/provision/providers/vm/network.go at the end of the CreateNetwork function to first see if the network-chaos-enabled flag is set, and then check if bandwidth is set. This will allow developers to simulate clusters having a degraded WAN connection in the development environment and testing pipelines. If bandwidth is not set, it will then enable the other options. - Note that if bandwidth is set, the other options such as jitter, latency, packet loss, reordering and corruption will not be used. This is for two reasons: - Restriction the bandwidth can often intoduce many of the other issues being set by the other options. - Setting the bandwidth uses a separate queuing discipline (Token Bucket Filter) from the other options (Network Emulator) and requires a much more complex configuration using a Heirarchial Token Bucket Filter which cannot be configured at a granular enough level using the vishvananda/netlink library. Adding both queuing disciplines to the same interface may be an option to look into in the future, but would take more extensive testing and control over many more variables which I believe is out of the scope of this PR. It is also possible to add custom profiles, but will also take more research to develop common scenarios which combine different options in a realistic manner. Signed-off-by: Christian Rolland <christian.rolland@siderolabs.com> Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com>
This commit is contained in:
parent
47986cb79e
commit
e6dde8ffc5
@ -450,7 +450,13 @@ local integration_cilium_strict = Step('e2e-cilium-strict', target='e2e-qemu', p
|
||||
WITH_CONFIG_PATCH: '[{"op": "add", "path": "/cluster/network", "value": {"cni": {"name": "none"}}}, {"op": "add", "path": "/cluster/proxy", "value": {"disabled": true}}]',
|
||||
IMAGE_REGISTRY: local_registry,
|
||||
});
|
||||
local integration_canal_reset = Step('e2e-canal-reset', target='e2e-qemu', privileged=true, depends_on=[load_artifacts], environment={
|
||||
|
||||
local integration_network_chaos = Step('e2e-network-chaos', target='e2e-qemu', privileged=true, depends_on=[load_artifacts], environment={
|
||||
SHORT_INTEGRATION_TEST: 'yes',
|
||||
WITH_NETWORK_CHAOS: 'true',
|
||||
REGISTRY: local_registry,
|
||||
});
|
||||
local integration_canal_reset = Step('e2e-canal-reset', target='e2e-qemu', privileged=true, depends_on=[integration_network_chaos], environment={
|
||||
INTEGRATION_TEST_RUN: 'TestIntegration/api.ResetSuite/TestResetWithSpec',
|
||||
CUSTOM_CNI_URL: 'https://raw.githubusercontent.com/projectcalico/calico/v3.25.0/manifests/canal.yaml',
|
||||
REGISTRY: local_registry,
|
||||
@ -548,6 +554,7 @@ local integration_pipelines = [
|
||||
Pipeline('integration-provision-1', default_pipeline_steps + [integration_provision_tests_prepare, integration_provision_tests_track_1]) + integration_trigger(['integration-provision', 'integration-provision-1']),
|
||||
Pipeline('integration-provision-2', default_pipeline_steps + [integration_provision_tests_prepare, integration_provision_tests_track_2]) + integration_trigger(['integration-provision', 'integration-provision-2']),
|
||||
Pipeline('integration-misc', default_pipeline_steps + [
|
||||
integration_network_chaos,
|
||||
integration_canal_reset,
|
||||
integration_bios_cgroupsv1,
|
||||
integration_disk_image,
|
||||
|
@ -154,6 +154,13 @@ var (
|
||||
controlPlanePort int
|
||||
dhcpSkipHostname bool
|
||||
skipBootPhaseFinishedCheck bool
|
||||
networkChaos bool
|
||||
jitter time.Duration
|
||||
latency time.Duration
|
||||
packetLoss float64
|
||||
packetReorder float64
|
||||
packetCorrupt float64
|
||||
bandwidth int
|
||||
)
|
||||
|
||||
// createCmd represents the cluster up command.
|
||||
@ -269,6 +276,13 @@ func create(ctx context.Context, flags *pflag.FlagSet) (err error) {
|
||||
}
|
||||
}
|
||||
|
||||
// Validate network chaos flags
|
||||
if !networkChaos {
|
||||
if jitter != 0 || latency != 0 || packetLoss != 0 || packetReorder != 0 || packetCorrupt != 0 || bandwidth != 0 {
|
||||
return fmt.Errorf("network chaos flags can only be used with --with-network-chaos")
|
||||
}
|
||||
}
|
||||
|
||||
provisioner, err := providers.Factory(ctx, provisionerName)
|
||||
if err != nil {
|
||||
return err
|
||||
@ -296,6 +310,13 @@ func create(ctx context.Context, flags *pflag.FlagSet) (err error) {
|
||||
},
|
||||
DHCPSkipHostname: dhcpSkipHostname,
|
||||
DockerDisableIPv6: dockerDisableIPv6,
|
||||
NetworkChaos: networkChaos,
|
||||
Jitter: jitter,
|
||||
Latency: latency,
|
||||
PacketLoss: packetLoss,
|
||||
PacketReorder: packetReorder,
|
||||
PacketCorrupt: packetCorrupt,
|
||||
Bandwidth: bandwidth,
|
||||
},
|
||||
|
||||
Image: nodeImage,
|
||||
@ -945,6 +966,15 @@ func init() {
|
||||
createCmd.Flags().IntVar(&controlPlanePort, controlPlanePortFlag, constants.DefaultControlPlanePort, "control plane port (load balancer and local API port)")
|
||||
createCmd.Flags().BoolVar(&dhcpSkipHostname, "disable-dhcp-hostname", false, "skip announcing hostname via DHCP (QEMU only)")
|
||||
createCmd.Flags().BoolVar(&skipBootPhaseFinishedCheck, "skip-boot-phase-finished-check", false, "skip waiting for node to finish boot phase")
|
||||
createCmd.Flags().BoolVar(&networkChaos, "with-network-chaos", false, "enable to use network chaos parameters when creating a qemu cluster")
|
||||
createCmd.Flags().DurationVar(&jitter, "with-network-jitter", 0, "specify jitter on the bridge interface when creating a qemu cluster")
|
||||
createCmd.Flags().DurationVar(&latency, "with-network-latency", 0, "specify latency on the bridge interface when creating a qemu cluster")
|
||||
createCmd.Flags().Float64Var(&packetLoss, "with-network-packet-loss", 0.0, "specify percent of packet loss on the bridge interface when creating a qemu cluster. e.g. 50% = 0.50 (default: 0.0)")
|
||||
createCmd.Flags().Float64Var(&packetReorder, "with-network-packet-reorder", 0.0,
|
||||
"specify percent of reordered packets on the bridge interface when creating a qemu cluster. e.g. 50% = 0.50 (default: 0.0)")
|
||||
createCmd.Flags().Float64Var(&packetCorrupt, "with-network-packet-corrupt", 0.0,
|
||||
"specify percent of corrupt packets on the bridge interface when creating a qemu cluster. e.g. 50% = 0.50 (default: 0.0)")
|
||||
createCmd.Flags().IntVar(&bandwidth, "with-network-bandwidth", 0, "specify bandwidth restriction (in kbps) on the bridge interface when creating a qemu cluster")
|
||||
|
||||
Cmd.AddCommand(createCmd)
|
||||
}
|
||||
|
@ -80,6 +80,14 @@ case "${DISABLE_DHCP_HOSTNAME:-false}" in
|
||||
;;
|
||||
esac
|
||||
|
||||
case "${WITH_NETWORK_CHAOS:-false}" in
|
||||
false)
|
||||
;;
|
||||
*)
|
||||
QEMU_FLAGS="${QEMU_FLAGS} --with-network-chaos --with-network-packet-loss=0.01 --with-network-latency=15ms --with-network-jitter=5ms"
|
||||
;;
|
||||
esac
|
||||
|
||||
case "${USE_DISK_IMAGE:-false}" in
|
||||
false)
|
||||
DISK_IMAGE_FLAG=
|
||||
|
@ -54,8 +54,8 @@ func (s *APIBootstrapper) Bootstrap(ctx context.Context, out io.Writer) error {
|
||||
|
||||
fmt.Fprintln(out, "waiting for API")
|
||||
|
||||
err = retry.Constant(5*time.Minute, retry.WithUnits(500*time.Millisecond)).RetryWithContext(nodeCtx, func(nodeCtx context.Context) error {
|
||||
retryCtx, cancel := context.WithTimeout(nodeCtx, 500*time.Millisecond)
|
||||
err = retry.Constant(10*time.Minute, retry.WithUnits(500*time.Millisecond)).RetryWithContext(nodeCtx, func(nodeCtx context.Context) error {
|
||||
retryCtx, cancel := context.WithTimeout(nodeCtx, 2*time.Second)
|
||||
defer cancel()
|
||||
|
||||
if _, err = cli.Version(retryCtx); err != nil {
|
||||
@ -72,7 +72,7 @@ func (s *APIBootstrapper) Bootstrap(ctx context.Context, out io.Writer) error {
|
||||
fmt.Fprintln(out, "bootstrapping cluster")
|
||||
|
||||
return retry.Constant(backoff.DefaultConfig.MaxDelay, retry.WithUnits(100*time.Millisecond)).RetryWithContext(nodeCtx, func(nodeCtx context.Context) error {
|
||||
retryCtx, cancel := context.WithTimeout(nodeCtx, 500*time.Millisecond)
|
||||
retryCtx, cancel := context.WithTimeout(nodeCtx, 2*time.Second)
|
||||
defer cancel()
|
||||
|
||||
if err = cli.Bootstrap(retryCtx, &machineapi.BootstrapRequest{}); err != nil {
|
||||
|
@ -49,7 +49,7 @@ func (p *provisioner) Create(ctx context.Context, request provision.ClusterReque
|
||||
|
||||
fmt.Fprintln(options.LogWriter, "creating network", request.Network.Name)
|
||||
|
||||
if err = p.CreateNetwork(ctx, state, request.Network); err != nil {
|
||||
if err = p.CreateNetwork(ctx, state, request.Network, options); err != nil {
|
||||
return nil, fmt.Errorf("unable to provision CNI network: %w", err)
|
||||
}
|
||||
|
||||
|
@ -22,6 +22,7 @@ import (
|
||||
"github.com/jsimonetti/rtnetlink"
|
||||
"github.com/siderolabs/gen/slices"
|
||||
sideronet "github.com/siderolabs/net"
|
||||
"github.com/vishvananda/netlink"
|
||||
|
||||
"github.com/siderolabs/talos/pkg/provision"
|
||||
)
|
||||
@ -30,8 +31,8 @@ import (
|
||||
// so that interface name is defined by network name, and different networks have
|
||||
// different bridge interfaces.
|
||||
//
|
||||
//nolint:gocyclo
|
||||
func (p *Provisioner) CreateNetwork(ctx context.Context, state *State, network provision.NetworkRequest) error {
|
||||
//nolint:gocyclo,cyclop
|
||||
func (p *Provisioner) CreateNetwork(ctx context.Context, state *State, network provision.NetworkRequest, options provision.Options) error {
|
||||
networkNameHash := sha256.Sum256([]byte(network.Name))
|
||||
state.BridgeName = fmt.Sprintf("%s%s", "talos", hex.EncodeToString(networkNameHash[:])[:8])
|
||||
|
||||
@ -129,6 +130,82 @@ func (p *Provisioner) CreateNetwork(ctx context.Context, state *State, network p
|
||||
return fmt.Errorf("error parsing VM CNI config: %w", err)
|
||||
}
|
||||
|
||||
// configure bridge interface with network chaos if flag is set
|
||||
if network.NetworkChaos {
|
||||
if err = p.configureNetworkChaos(network, state, options); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
//nolint:gocyclo
|
||||
func (p *Provisioner) configureNetworkChaos(network provision.NetworkRequest, state *State, options provision.Options) error {
|
||||
if (network.Bandwidth != 0) && (network.Latency != 0 || network.Jitter != 0 || network.PacketLoss != 0 || network.PacketReorder != 0 || network.PacketCorrupt != 0) {
|
||||
return fmt.Errorf("bandwidth and other chaos options cannot be used together")
|
||||
}
|
||||
|
||||
link, err := netlink.LinkByName(state.BridgeName)
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not get link: %v", err)
|
||||
}
|
||||
|
||||
fmt.Fprintln(options.LogWriter, "network chaos enabled on interface:", state.BridgeName)
|
||||
|
||||
if network.Bandwidth != 0 {
|
||||
fmt.Fprintf(options.LogWriter, " bandwidth: %4d kbps\n", network.Bandwidth)
|
||||
|
||||
rate := network.Bandwidth * 1000 / 8
|
||||
|
||||
buffer := rate / 10
|
||||
|
||||
limit := buffer * 5
|
||||
|
||||
qdisc := &netlink.Tbf{
|
||||
QdiscAttrs: netlink.QdiscAttrs{
|
||||
LinkIndex: link.Attrs().Index,
|
||||
Handle: netlink.MakeHandle(1, 0),
|
||||
Parent: netlink.HANDLE_ROOT,
|
||||
},
|
||||
Rate: uint64(rate),
|
||||
Buffer: uint32(buffer),
|
||||
Limit: uint32(limit),
|
||||
}
|
||||
|
||||
if err := netlink.QdiscAdd(qdisc); err != nil {
|
||||
return fmt.Errorf("could not add netem qdisc: %v", err)
|
||||
}
|
||||
} else {
|
||||
packetLoss := network.PacketLoss * 100
|
||||
packetReorder := network.PacketReorder * 100
|
||||
packetCorrupt := network.PacketCorrupt * 100
|
||||
|
||||
fmt.Fprintf(options.LogWriter, " jitter: %4dms\n", network.Jitter.Milliseconds())
|
||||
fmt.Fprintf(options.LogWriter, " latency: %4dms\n", network.Latency.Milliseconds())
|
||||
fmt.Fprintf(options.LogWriter, " packet loss: %4v%%\n", packetLoss)
|
||||
fmt.Fprintf(options.LogWriter, " packet reordering: %4v%%\n", packetReorder)
|
||||
fmt.Fprintf(options.LogWriter, " packet corruption: %4v%%\n", packetCorrupt)
|
||||
|
||||
qdisc := netlink.NewNetem(
|
||||
netlink.QdiscAttrs{
|
||||
LinkIndex: link.Attrs().Index,
|
||||
Handle: netlink.MakeHandle(1, 0),
|
||||
Parent: netlink.HANDLE_ROOT,
|
||||
},
|
||||
netlink.NetemQdiscAttrs{
|
||||
Jitter: uint32(network.Jitter / 1000),
|
||||
Latency: uint32(network.Latency / 1000),
|
||||
Loss: float32(packetLoss),
|
||||
ReorderProb: float32(packetReorder),
|
||||
CorruptProb: float32(packetCorrupt),
|
||||
},
|
||||
)
|
||||
if err := netlink.QdiscAdd(qdisc); err != nil {
|
||||
return fmt.Errorf("could not add netem qdisc: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -7,6 +7,7 @@ package provision
|
||||
import (
|
||||
"fmt"
|
||||
"net/netip"
|
||||
"time"
|
||||
|
||||
"github.com/siderolabs/go-procfs/procfs"
|
||||
|
||||
@ -62,6 +63,15 @@ type NetworkRequest struct {
|
||||
|
||||
// Docker-specific parameters.
|
||||
DockerDisableIPv6 bool
|
||||
|
||||
// Network chaos parameters.
|
||||
NetworkChaos bool
|
||||
Jitter time.Duration
|
||||
Latency time.Duration
|
||||
PacketLoss float64
|
||||
PacketReorder float64
|
||||
PacketCorrupt float64
|
||||
Bandwidth int
|
||||
}
|
||||
|
||||
// NodeRequests is a list of NodeRequest.
|
||||
|
@ -156,6 +156,13 @@ talosctl cluster create [flags]
|
||||
--with-debug enable debug in Talos config to send service logs to the console
|
||||
--with-init-node create the cluster with an init node
|
||||
--with-kubespan enable KubeSpan system
|
||||
--with-network-bandwidth int specify bandwidth restriction (in kbps) on the bridge interface when creating a qemu cluster
|
||||
--with-network-chaos enable to use network chaos parameters when creating a qemu cluster
|
||||
--with-network-jitter duration specify jitter on the bridge interface when creating a qemu cluster
|
||||
--with-network-latency duration specify latency on the bridge interface when creating a qemu cluster
|
||||
--with-network-packet-corrupt float specify percent of corrupt packets on the bridge interface when creating a qemu cluster. e.g. 50% = 0.50 (default: 0.0)
|
||||
--with-network-packet-loss float specify percent of packet loss on the bridge interface when creating a qemu cluster. e.g. 50% = 0.50 (default: 0.0)
|
||||
--with-network-packet-reorder float specify percent of reordered packets on the bridge interface when creating a qemu cluster. e.g. 50% = 0.50 (default: 0.0)
|
||||
--with-secureboot enforce secure boot
|
||||
--with-tpm2 enable TPM2 emulation support using swtpm
|
||||
--with-uefi enable UEFI on x86_64 architecture (default true)
|
||||
|
Loading…
x
Reference in New Issue
Block a user