feat: add network chaos to qemu development environment

Add flags for configuring the qemu bridge interface with chaos options:
- network-chaos-enabled
- network-jitter
- network-latency
- network-packet-loss
- network-packet-reorder
- network-packet-corrupt
- network-bandwidth

These flags are used in /pkg/provision/providers/vm/network.go at the end of the CreateNetwork function to first see if the network-chaos-enabled flag is set, and then check if bandwidth is set.  This will allow developers to simulate clusters having a degraded WAN connection in the development environment and testing pipelines.

If bandwidth is not set, it will then enable the other options.
- Note that if bandwidth is set, the other options such as jitter, latency, packet loss, reordering and corruption will not be used.  This is for two reasons:
	- Restriction the bandwidth can often intoduce many of the other issues being set by the other options.
	- Setting the bandwidth uses a separate queuing discipline (Token Bucket Filter) from the other options (Network Emulator) and requires a much more complex configuration using a Heirarchial Token Bucket Filter which cannot be configured at a granular enough level using the vishvananda/netlink library.

Adding both queuing disciplines to the same interface may be an option to look into in the future, but would take more extensive testing and control over many more variables which I believe is out of the scope of this PR.  It is also possible to add custom profiles, but will also take more research to develop common scenarios which combine different options in a realistic manner.

Signed-off-by: Christian Rolland <christian.rolland@siderolabs.com>
Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com>
This commit is contained in:
Christian Rolland 2023-06-01 07:22:44 -04:00 committed by Andrey Smirnov
parent 47986cb79e
commit e6dde8ffc5
No known key found for this signature in database
GPG Key ID: 7B26396447AB6DFD
8 changed files with 146 additions and 7 deletions

View File

@ -450,7 +450,13 @@ local integration_cilium_strict = Step('e2e-cilium-strict', target='e2e-qemu', p
WITH_CONFIG_PATCH: '[{"op": "add", "path": "/cluster/network", "value": {"cni": {"name": "none"}}}, {"op": "add", "path": "/cluster/proxy", "value": {"disabled": true}}]',
IMAGE_REGISTRY: local_registry,
});
local integration_canal_reset = Step('e2e-canal-reset', target='e2e-qemu', privileged=true, depends_on=[load_artifacts], environment={
local integration_network_chaos = Step('e2e-network-chaos', target='e2e-qemu', privileged=true, depends_on=[load_artifacts], environment={
SHORT_INTEGRATION_TEST: 'yes',
WITH_NETWORK_CHAOS: 'true',
REGISTRY: local_registry,
});
local integration_canal_reset = Step('e2e-canal-reset', target='e2e-qemu', privileged=true, depends_on=[integration_network_chaos], environment={
INTEGRATION_TEST_RUN: 'TestIntegration/api.ResetSuite/TestResetWithSpec',
CUSTOM_CNI_URL: 'https://raw.githubusercontent.com/projectcalico/calico/v3.25.0/manifests/canal.yaml',
REGISTRY: local_registry,
@ -548,6 +554,7 @@ local integration_pipelines = [
Pipeline('integration-provision-1', default_pipeline_steps + [integration_provision_tests_prepare, integration_provision_tests_track_1]) + integration_trigger(['integration-provision', 'integration-provision-1']),
Pipeline('integration-provision-2', default_pipeline_steps + [integration_provision_tests_prepare, integration_provision_tests_track_2]) + integration_trigger(['integration-provision', 'integration-provision-2']),
Pipeline('integration-misc', default_pipeline_steps + [
integration_network_chaos,
integration_canal_reset,
integration_bios_cgroupsv1,
integration_disk_image,

View File

@ -154,6 +154,13 @@ var (
controlPlanePort int
dhcpSkipHostname bool
skipBootPhaseFinishedCheck bool
networkChaos bool
jitter time.Duration
latency time.Duration
packetLoss float64
packetReorder float64
packetCorrupt float64
bandwidth int
)
// createCmd represents the cluster up command.
@ -269,6 +276,13 @@ func create(ctx context.Context, flags *pflag.FlagSet) (err error) {
}
}
// Validate network chaos flags
if !networkChaos {
if jitter != 0 || latency != 0 || packetLoss != 0 || packetReorder != 0 || packetCorrupt != 0 || bandwidth != 0 {
return fmt.Errorf("network chaos flags can only be used with --with-network-chaos")
}
}
provisioner, err := providers.Factory(ctx, provisionerName)
if err != nil {
return err
@ -296,6 +310,13 @@ func create(ctx context.Context, flags *pflag.FlagSet) (err error) {
},
DHCPSkipHostname: dhcpSkipHostname,
DockerDisableIPv6: dockerDisableIPv6,
NetworkChaos: networkChaos,
Jitter: jitter,
Latency: latency,
PacketLoss: packetLoss,
PacketReorder: packetReorder,
PacketCorrupt: packetCorrupt,
Bandwidth: bandwidth,
},
Image: nodeImage,
@ -945,6 +966,15 @@ func init() {
createCmd.Flags().IntVar(&controlPlanePort, controlPlanePortFlag, constants.DefaultControlPlanePort, "control plane port (load balancer and local API port)")
createCmd.Flags().BoolVar(&dhcpSkipHostname, "disable-dhcp-hostname", false, "skip announcing hostname via DHCP (QEMU only)")
createCmd.Flags().BoolVar(&skipBootPhaseFinishedCheck, "skip-boot-phase-finished-check", false, "skip waiting for node to finish boot phase")
createCmd.Flags().BoolVar(&networkChaos, "with-network-chaos", false, "enable to use network chaos parameters when creating a qemu cluster")
createCmd.Flags().DurationVar(&jitter, "with-network-jitter", 0, "specify jitter on the bridge interface when creating a qemu cluster")
createCmd.Flags().DurationVar(&latency, "with-network-latency", 0, "specify latency on the bridge interface when creating a qemu cluster")
createCmd.Flags().Float64Var(&packetLoss, "with-network-packet-loss", 0.0, "specify percent of packet loss on the bridge interface when creating a qemu cluster. e.g. 50% = 0.50 (default: 0.0)")
createCmd.Flags().Float64Var(&packetReorder, "with-network-packet-reorder", 0.0,
"specify percent of reordered packets on the bridge interface when creating a qemu cluster. e.g. 50% = 0.50 (default: 0.0)")
createCmd.Flags().Float64Var(&packetCorrupt, "with-network-packet-corrupt", 0.0,
"specify percent of corrupt packets on the bridge interface when creating a qemu cluster. e.g. 50% = 0.50 (default: 0.0)")
createCmd.Flags().IntVar(&bandwidth, "with-network-bandwidth", 0, "specify bandwidth restriction (in kbps) on the bridge interface when creating a qemu cluster")
Cmd.AddCommand(createCmd)
}

View File

@ -80,6 +80,14 @@ case "${DISABLE_DHCP_HOSTNAME:-false}" in
;;
esac
case "${WITH_NETWORK_CHAOS:-false}" in
false)
;;
*)
QEMU_FLAGS="${QEMU_FLAGS} --with-network-chaos --with-network-packet-loss=0.01 --with-network-latency=15ms --with-network-jitter=5ms"
;;
esac
case "${USE_DISK_IMAGE:-false}" in
false)
DISK_IMAGE_FLAG=

View File

@ -54,8 +54,8 @@ func (s *APIBootstrapper) Bootstrap(ctx context.Context, out io.Writer) error {
fmt.Fprintln(out, "waiting for API")
err = retry.Constant(5*time.Minute, retry.WithUnits(500*time.Millisecond)).RetryWithContext(nodeCtx, func(nodeCtx context.Context) error {
retryCtx, cancel := context.WithTimeout(nodeCtx, 500*time.Millisecond)
err = retry.Constant(10*time.Minute, retry.WithUnits(500*time.Millisecond)).RetryWithContext(nodeCtx, func(nodeCtx context.Context) error {
retryCtx, cancel := context.WithTimeout(nodeCtx, 2*time.Second)
defer cancel()
if _, err = cli.Version(retryCtx); err != nil {
@ -72,7 +72,7 @@ func (s *APIBootstrapper) Bootstrap(ctx context.Context, out io.Writer) error {
fmt.Fprintln(out, "bootstrapping cluster")
return retry.Constant(backoff.DefaultConfig.MaxDelay, retry.WithUnits(100*time.Millisecond)).RetryWithContext(nodeCtx, func(nodeCtx context.Context) error {
retryCtx, cancel := context.WithTimeout(nodeCtx, 500*time.Millisecond)
retryCtx, cancel := context.WithTimeout(nodeCtx, 2*time.Second)
defer cancel()
if err = cli.Bootstrap(retryCtx, &machineapi.BootstrapRequest{}); err != nil {

View File

@ -49,7 +49,7 @@ func (p *provisioner) Create(ctx context.Context, request provision.ClusterReque
fmt.Fprintln(options.LogWriter, "creating network", request.Network.Name)
if err = p.CreateNetwork(ctx, state, request.Network); err != nil {
if err = p.CreateNetwork(ctx, state, request.Network, options); err != nil {
return nil, fmt.Errorf("unable to provision CNI network: %w", err)
}

View File

@ -22,6 +22,7 @@ import (
"github.com/jsimonetti/rtnetlink"
"github.com/siderolabs/gen/slices"
sideronet "github.com/siderolabs/net"
"github.com/vishvananda/netlink"
"github.com/siderolabs/talos/pkg/provision"
)
@ -30,8 +31,8 @@ import (
// so that interface name is defined by network name, and different networks have
// different bridge interfaces.
//
//nolint:gocyclo
func (p *Provisioner) CreateNetwork(ctx context.Context, state *State, network provision.NetworkRequest) error {
//nolint:gocyclo,cyclop
func (p *Provisioner) CreateNetwork(ctx context.Context, state *State, network provision.NetworkRequest, options provision.Options) error {
networkNameHash := sha256.Sum256([]byte(network.Name))
state.BridgeName = fmt.Sprintf("%s%s", "talos", hex.EncodeToString(networkNameHash[:])[:8])
@ -129,6 +130,82 @@ func (p *Provisioner) CreateNetwork(ctx context.Context, state *State, network p
return fmt.Errorf("error parsing VM CNI config: %w", err)
}
// configure bridge interface with network chaos if flag is set
if network.NetworkChaos {
if err = p.configureNetworkChaos(network, state, options); err != nil {
return err
}
}
return nil
}
//nolint:gocyclo
func (p *Provisioner) configureNetworkChaos(network provision.NetworkRequest, state *State, options provision.Options) error {
if (network.Bandwidth != 0) && (network.Latency != 0 || network.Jitter != 0 || network.PacketLoss != 0 || network.PacketReorder != 0 || network.PacketCorrupt != 0) {
return fmt.Errorf("bandwidth and other chaos options cannot be used together")
}
link, err := netlink.LinkByName(state.BridgeName)
if err != nil {
return fmt.Errorf("could not get link: %v", err)
}
fmt.Fprintln(options.LogWriter, "network chaos enabled on interface:", state.BridgeName)
if network.Bandwidth != 0 {
fmt.Fprintf(options.LogWriter, " bandwidth: %4d kbps\n", network.Bandwidth)
rate := network.Bandwidth * 1000 / 8
buffer := rate / 10
limit := buffer * 5
qdisc := &netlink.Tbf{
QdiscAttrs: netlink.QdiscAttrs{
LinkIndex: link.Attrs().Index,
Handle: netlink.MakeHandle(1, 0),
Parent: netlink.HANDLE_ROOT,
},
Rate: uint64(rate),
Buffer: uint32(buffer),
Limit: uint32(limit),
}
if err := netlink.QdiscAdd(qdisc); err != nil {
return fmt.Errorf("could not add netem qdisc: %v", err)
}
} else {
packetLoss := network.PacketLoss * 100
packetReorder := network.PacketReorder * 100
packetCorrupt := network.PacketCorrupt * 100
fmt.Fprintf(options.LogWriter, " jitter: %4dms\n", network.Jitter.Milliseconds())
fmt.Fprintf(options.LogWriter, " latency: %4dms\n", network.Latency.Milliseconds())
fmt.Fprintf(options.LogWriter, " packet loss: %4v%%\n", packetLoss)
fmt.Fprintf(options.LogWriter, " packet reordering: %4v%%\n", packetReorder)
fmt.Fprintf(options.LogWriter, " packet corruption: %4v%%\n", packetCorrupt)
qdisc := netlink.NewNetem(
netlink.QdiscAttrs{
LinkIndex: link.Attrs().Index,
Handle: netlink.MakeHandle(1, 0),
Parent: netlink.HANDLE_ROOT,
},
netlink.NetemQdiscAttrs{
Jitter: uint32(network.Jitter / 1000),
Latency: uint32(network.Latency / 1000),
Loss: float32(packetLoss),
ReorderProb: float32(packetReorder),
CorruptProb: float32(packetCorrupt),
},
)
if err := netlink.QdiscAdd(qdisc); err != nil {
return fmt.Errorf("could not add netem qdisc: %v", err)
}
}
return nil
}

View File

@ -7,6 +7,7 @@ package provision
import (
"fmt"
"net/netip"
"time"
"github.com/siderolabs/go-procfs/procfs"
@ -62,6 +63,15 @@ type NetworkRequest struct {
// Docker-specific parameters.
DockerDisableIPv6 bool
// Network chaos parameters.
NetworkChaos bool
Jitter time.Duration
Latency time.Duration
PacketLoss float64
PacketReorder float64
PacketCorrupt float64
Bandwidth int
}
// NodeRequests is a list of NodeRequest.

View File

@ -156,6 +156,13 @@ talosctl cluster create [flags]
--with-debug enable debug in Talos config to send service logs to the console
--with-init-node create the cluster with an init node
--with-kubespan enable KubeSpan system
--with-network-bandwidth int specify bandwidth restriction (in kbps) on the bridge interface when creating a qemu cluster
--with-network-chaos enable to use network chaos parameters when creating a qemu cluster
--with-network-jitter duration specify jitter on the bridge interface when creating a qemu cluster
--with-network-latency duration specify latency on the bridge interface when creating a qemu cluster
--with-network-packet-corrupt float specify percent of corrupt packets on the bridge interface when creating a qemu cluster. e.g. 50% = 0.50 (default: 0.0)
--with-network-packet-loss float specify percent of packet loss on the bridge interface when creating a qemu cluster. e.g. 50% = 0.50 (default: 0.0)
--with-network-packet-reorder float specify percent of reordered packets on the bridge interface when creating a qemu cluster. e.g. 50% = 0.50 (default: 0.0)
--with-secureboot enforce secure boot
--with-tpm2 enable TPM2 emulation support using swtpm
--with-uefi enable UEFI on x86_64 architecture (default true)