From b90d0f83b20f54fd52e9f932b5ce24871a9e7dcb Mon Sep 17 00:00:00 2001 From: Yu Watanabe Date: Mon, 8 Nov 2021 06:34:43 +0900 Subject: [PATCH] network/netdev: add support to create IPoIB subinterface --- man/systemd.netdev.xml | 46 +++++++ man/systemd.network.xml | 5 +- src/network/meson.build | 2 + src/network/netdev/ipoib.c | 119 ++++++++++++++++++ src/network/netdev/ipoib.h | 28 +++++ src/network/netdev/netdev-gperf.gperf | 4 + src/network/netdev/netdev.c | 45 +++++-- src/network/netdev/netdev.h | 4 +- src/network/netdev/veth.c | 2 +- src/network/networkd-network-gperf.gperf | 1 + src/network/networkd-network.c | 1 + .../fuzz/fuzz-netdev-parser/directives.netdev | 4 + .../fuzz-network-parser/directives.network | 1 + 13 files changed, 248 insertions(+), 14 deletions(-) create mode 100644 src/network/netdev/ipoib.c create mode 100644 src/network/netdev/ipoib.h diff --git a/man/systemd.netdev.xml b/man/systemd.netdev.xml index e4e7e611e7..255b85f404 100644 --- a/man/systemd.netdev.xml +++ b/man/systemd.netdev.xml @@ -189,6 +189,9 @@ batadv B.A.T.M.A.N. Advanced is a routing protocol for multi-hop mobile ad-hoc networks which operates on layer 2. + + ipoib + An IP over Infiniband subinterface. @@ -2125,6 +2128,49 @@ + + [IPoIB] Section Options + The [IPoIB] section only applies for netdevs of kind ipoib and accepts the + following keys: + + + + PartitionKey= + + Takes an integer in the range 1…0xffff, except for 0x8000. Defaults to unset, and the + kernel's default is used. + + + + + Mode= + + Takes one of the special values datagram or + connected. Defaults to unset, and the kernel's default is used. + + When datagram, the Infiniband unreliable datagram (UD) transport is + used, and so the interface MTU is equal to the IB L2 MTU minus the IPoIB encapsulation + header (4 bytes). For example, in a typical IB fabric with a 2K MTU, the IPoIB MTU will be + 2048 - 4 = 2044 bytes. + + When connected, the Infiniband reliable connected (RC) transport is + used. 
Connected mode takes advantage of the connected nature of the IB transport and allows + an MTU up to the maximal IP packet size of 64K, which reduces the number of IP packets needed + for handling large UDP datagrams, TCP segments, etc., and increases the performance for large + messages. + + + + + IgnoreUserspaceMulticastGroups= + + Takes a boolean value. When true, the kernel ignores multicast groups handled by + userspace. Defaults to unset, and the kernel's default is used. + + + + + Examples diff --git a/man/systemd.network.xml b/man/systemd.network.xml index 50367ecdcd..1de7bb0538 100644 --- a/man/systemd.network.xml +++ b/man/systemd.network.xml @@ -902,6 +902,7 @@ Table=1234 + IPoIB= IPVLAN= IPVTAP= L2TP= @@ -913,8 +914,8 @@ Table=1234 VXLAN= Xfrm= - The name of an IPVLAN, IPVTAP, L2TP, MACsec, MACVLAN, MACVTAP, tunnel, VLAN, VXLAN, or - Xfrm to be created on the link. See + The name of an IPoIB, IPVLAN, IPVTAP, L2TP, MACsec, MACVLAN, MACVTAP, tunnel, VLAN, + VXLAN, or Xfrm to be created on the link. See systemd.netdev5. This option may be specified more than once. 
diff --git a/src/network/meson.build b/src/network/meson.build index cfa16a8ecf..c1cf227ffc 100644 --- a/src/network/meson.build +++ b/src/network/meson.build @@ -13,6 +13,8 @@ sources = files(''' netdev/dummy.h netdev/ifb.c netdev/ifb.h + netdev/ipoib.c + netdev/ipoib.h netdev/ipvlan.c netdev/ipvlan.h netdev/macvlan.c diff --git a/src/network/netdev/ipoib.c b/src/network/netdev/ipoib.c new file mode 100644 index 0000000000..b341001bc4 --- /dev/null +++ b/src/network/netdev/ipoib.c @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "ipoib.h" +#include "parse-util.h" +#include "string-table.h" + +assert_cc((int) IP_OVER_INFINIBAND_MODE_DATAGRAM == (int) IPOIB_MODE_DATAGRAM); +assert_cc((int) IP_OVER_INFINIBAND_MODE_CONNECTED == (int) IPOIB_MODE_CONNECTED); + +static void netdev_ipoib_init(NetDev *netdev) { + IPoIB *ipoib; + + assert(netdev); + + ipoib = IPOIB(netdev); + + assert(ipoib); + + ipoib->mode = _IP_OVER_INFINIBAND_MODE_INVALID; + ipoib->umcast = -1; +} + +static int netdev_ipoib_fill_message_create(NetDev *netdev, Link *link, sd_netlink_message *m) { + IPoIB *ipoib; + int r; + + assert(netdev); + assert(link); + assert(m); + + ipoib = IPOIB(netdev); + + assert(ipoib); + + if (ipoib->pkey > 0) { + r = sd_netlink_message_append_u16(m, IFLA_IPOIB_PKEY, ipoib->pkey); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not append IFLA_IPOIB_PKEY attribute: %m"); + } + + if (ipoib->mode >= 0) { + r = sd_netlink_message_append_u16(m, IFLA_IPOIB_MODE, ipoib->mode); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not append IFLA_IPOIB_MODE attribute: %m"); + } + + if (ipoib->umcast >= 0) { + r = sd_netlink_message_append_u16(m, IFLA_IPOIB_UMCAST, ipoib->umcast); + if (r < 0) + return log_netdev_error_errno(netdev, r, "Could not append IFLA_IPOIB_UMCAST attribute: %m"); + } + + return 0; +} + +static const char * const ipoib_mode_table[_IP_OVER_INFINIBAND_MODE_MAX] = { + 
[IP_OVER_INFINIBAND_MODE_DATAGRAM] = "datagram", + [IP_OVER_INFINIBAND_MODE_CONNECTED] = "connected", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(ipoib_mode, IPoIBMode); +DEFINE_CONFIG_PARSE_ENUM(config_parse_ipoib_mode, ipoib_mode, IPoIBMode, "Failed to parse IPoIB mode"); + +int config_parse_ipoib_pkey( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint16_t u, *pkey = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + if (isempty(rvalue)) { + *pkey = 0; /* 0 means unset. */ + return 0; + } + + r = safe_atou16(rvalue, &u); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, + "Failed to parse IPoIB pkey '%s', ignoring assignment: %m", + rvalue); + return 0; + } + if (u == 0 || u == 0x8000) { + log_syntax(unit, LOG_WARNING, filename, line, 0, + "IPoIB pkey cannot be 0 nor 0x8000, ignoring assignment: %s", + rvalue); + return 0; + } + + *pkey = u; + return 0; +} + + +const NetDevVTable ipoib_vtable = { + .object_size = sizeof(IPoIB), + .sections = NETDEV_COMMON_SECTIONS "IPoIB\0", + .init = netdev_ipoib_init, + .fill_message_create = netdev_ipoib_fill_message_create, + .create_type = NETDEV_CREATE_STACKED, + .iftype = ARPHRD_INFINIBAND, + .generate_mac = true, +}; diff --git a/src/network/netdev/ipoib.h b/src/network/netdev/ipoib.h new file mode 100644 index 0000000000..d2f5d9350f --- /dev/null +++ b/src/network/netdev/ipoib.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include + +#include "conf-parser.h" +#include "netdev.h" + +typedef enum IPoIBMode { + IP_OVER_INFINIBAND_MODE_DATAGRAM, + IP_OVER_INFINIBAND_MODE_CONNECTED, + _IP_OVER_INFINIBAND_MODE_MAX, + _IP_OVER_INFINIBAND_MODE_INVALID = -EINVAL, +} IPoIBMode; + +typedef struct IPoIB { + NetDev meta; + + uint16_t pkey; + IPoIBMode mode; + int umcast; 
+} IPoIB; + +DEFINE_NETDEV_CAST(IPOIB, IPoIB); +extern const NetDevVTable ipoib_vtable; + +CONFIG_PARSER_PROTOTYPE(config_parse_ipoib_pkey); +CONFIG_PARSER_PROTOTYPE(config_parse_ipoib_mode); diff --git a/src/network/netdev/netdev-gperf.gperf b/src/network/netdev/netdev-gperf.gperf index 37a0d9fa5d..a948ec2c8a 100644 --- a/src/network/netdev/netdev-gperf.gperf +++ b/src/network/netdev/netdev-gperf.gperf @@ -11,6 +11,7 @@ _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"") #include "conf-parser.h" #include "fou-tunnel.h" #include "geneve.h" +#include "ipoib.h" #include "ipvlan.h" #include "l2tp-tunnel.h" #include "macsec.h" @@ -253,3 +254,6 @@ BatmanAdvanced.GatewayBandwidthUp, config_parse_badadv_bandwidth, BatmanAdvanced.HopPenalty, config_parse_uint8, 0, offsetof(BatmanAdvanced, hop_penalty) BatmanAdvanced.OriginatorIntervalSec, config_parse_sec, 0, offsetof(BatmanAdvanced, originator_interval) BatmanAdvanced.RoutingAlgorithm, config_parse_batadv_routing_algorithm, 0, offsetof(BatmanAdvanced, routing_algorithm) +IPoIB.PartitionKey, config_parse_ipoib_pkey, 0, offsetof(IPoIB, pkey) +IPoIB.Mode, config_parse_ipoib_mode, 0, offsetof(IPoIB, mode) +IPoIB.IgnoreUserspaceMulticastGroups, config_parse_tristate, 0, offsetof(IPoIB, umcast) diff --git a/src/network/netdev/netdev.c b/src/network/netdev/netdev.c index 6c6b4c3068..f673292594 100644 --- a/src/network/netdev/netdev.c +++ b/src/network/netdev/netdev.c @@ -18,6 +18,7 @@ #include "fou-tunnel.h" #include "geneve.h" #include "ifb.h" +#include "ipoib.h" #include "ipvlan.h" #include "l2tp-tunnel.h" #include "list.h" @@ -64,6 +65,7 @@ const NetDevVTable * const netdev_vtable[_NETDEV_KIND_MAX] = { [NETDEV_KIND_IP6GRETAP] = &ip6gretap_vtable, [NETDEV_KIND_IP6TNL] = &ip6tnl_vtable, [NETDEV_KIND_IPIP] = &ipip_vtable, + [NETDEV_KIND_IPOIB] = &ipoib_vtable, [NETDEV_KIND_IPVLAN] = &ipvlan_vtable, [NETDEV_KIND_IPVTAP] = &ipvtap_vtable, [NETDEV_KIND_L2TP] = &l2tptnl_vtable, @@ -103,6 +105,7 @@ static const char* const 
netdev_kind_table[_NETDEV_KIND_MAX] = { [NETDEV_KIND_IP6GRETAP] = "ip6gretap", [NETDEV_KIND_IP6TNL] = "ip6tnl", [NETDEV_KIND_IPIP] = "ipip", + [NETDEV_KIND_IPOIB] = "ipoib", [NETDEV_KIND_IPVLAN] = "ipvlan", [NETDEV_KIND_IPVTAP] = "ipvtap", [NETDEV_KIND_L2TP] = "l2tp", @@ -393,6 +396,7 @@ int netdev_set_ifindex(NetDev *netdev, sd_netlink_message *message) { int netdev_generate_hw_addr( NetDev *netdev, + Link *parent, const char *name, const struct hw_addr_data *hw_addr, struct hw_addr_data *ret) { @@ -419,7 +423,7 @@ int netdev_generate_hw_addr( if (!NETDEV_VTABLE(netdev)->generate_mac) goto finalize; - if (NETDEV_VTABLE(netdev)->iftype != ARPHRD_ETHER) + if (!IN_SET(NETDEV_VTABLE(netdev)->iftype, ARPHRD_ETHER, ARPHRD_INFINIBAND)) goto finalize; r = net_get_unique_predictable_data_from_name(name, &HASH_KEY, &result); @@ -430,21 +434,42 @@ int netdev_generate_hw_addr( } a.length = arphrd_to_hw_addr_len(NETDEV_VTABLE(netdev)->iftype); - assert(a.length <= sizeof(result)); - memcpy(a.bytes, &result, a.length); - if (ether_addr_is_null(&a.ether) || ether_addr_is_broadcast(&a.ether)) { - log_netdev_warning_errno(netdev, SYNTHETIC_ERRNO(EINVAL), - "Failed to generate persistent MAC address, ignoring: %m"); - a = HW_ADDR_NULL; - goto finalize; + switch (NETDEV_VTABLE(netdev)->iftype) { + case ARPHRD_ETHER: + assert(a.length <= sizeof(result)); + memcpy(a.bytes, &result, a.length); + + if (ether_addr_is_null(&a.ether) || ether_addr_is_broadcast(&a.ether)) { + log_netdev_warning_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "Failed to generate persistent MAC address, ignoring: %m"); + a = HW_ADDR_NULL; + goto finalize; + } + + break; + case ARPHRD_INFINIBAND: + if (result == 0) { + log_netdev_warning_errno(netdev, SYNTHETIC_ERRNO(EINVAL), + "Failed to generate persistent MAC address: %m"); + goto finalize; + } + + assert(a.length >= sizeof(result)); + memzero(a.bytes, a.length - sizeof(result)); + memcpy(a.bytes + a.length - sizeof(result), &result, sizeof(result)); + break; + 
default: + assert_not_reached(); } + } else { a = *hw_addr; warn_invalid = true; } - r = net_verify_hardware_address(name, warn_invalid, NETDEV_VTABLE(netdev)->iftype, NULL, &a); + r = net_verify_hardware_address(name, warn_invalid, NETDEV_VTABLE(netdev)->iftype, + parent ? &parent->hw_addr : NULL, &a); if (r < 0) return r; @@ -481,7 +506,7 @@ static int netdev_create(NetDev *netdev, Link *link, link_netlink_message_handle if (r < 0) return log_netdev_error_errno(netdev, r, "Could not append IFLA_IFNAME, attribute: %m"); - r = netdev_generate_hw_addr(netdev, netdev->ifname, &netdev->hw_addr, &hw_addr); + r = netdev_generate_hw_addr(netdev, link, netdev->ifname, &netdev->hw_addr, &hw_addr); if (r < 0) return r; diff --git a/src/network/netdev/netdev.h b/src/network/netdev/netdev.h index b226cf20a7..c7262f550a 100644 --- a/src/network/netdev/netdev.h +++ b/src/network/netdev/netdev.h @@ -22,6 +22,7 @@ "-Bridge\0" \ "-FooOverUDP\0" \ "-GENEVE\0" \ + "-IPoIB\0" \ "-IPVLAN\0" \ "-IPVTAP\0" \ "-L2TP\0" \ @@ -60,6 +61,7 @@ typedef enum NetDevKind { NETDEV_KIND_IP6GRETAP, NETDEV_KIND_IP6TNL, NETDEV_KIND_IPIP, + NETDEV_KIND_IPOIB, NETDEV_KIND_IPVLAN, NETDEV_KIND_IPVTAP, NETDEV_KIND_L2TP, @@ -201,7 +203,7 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(NetDev*, netdev_unref); bool netdev_is_managed(NetDev *netdev); int netdev_get(Manager *manager, const char *name, NetDev **ret); int netdev_set_ifindex(NetDev *netdev, sd_netlink_message *newlink); -int netdev_generate_hw_addr(NetDev *netdev, const char *name, +int netdev_generate_hw_addr(NetDev *netdev, Link *link, const char *name, const struct hw_addr_data *hw_addr, struct hw_addr_data *ret); int netdev_join(NetDev *netdev, Link *link, link_netlink_message_handler_t cb); diff --git a/src/network/netdev/veth.c b/src/network/netdev/veth.c index 5dd8586a3a..c946e81fc0 100644 --- a/src/network/netdev/veth.c +++ b/src/network/netdev/veth.c @@ -32,7 +32,7 @@ static int netdev_veth_fill_message_create(NetDev *netdev, Link *link, sd_netlin return 
log_netdev_error_errno(netdev, r, "Failed to add netlink interface name: %m"); } - r = netdev_generate_hw_addr(netdev, v->ifname_peer, &v->hw_addr_peer, &hw_addr); + r = netdev_generate_hw_addr(netdev, NULL, v->ifname_peer, &v->hw_addr_peer, &hw_addr); if (r < 0) return r; diff --git a/src/network/networkd-network-gperf.gperf b/src/network/networkd-network-gperf.gperf index df9721a9bc..4ac58a26ad 100644 --- a/src/network/networkd-network-gperf.gperf +++ b/src/network/networkd-network-gperf.gperf @@ -87,6 +87,7 @@ Network.BatmanAdvanced, config_parse_ifname, Network.Bond, config_parse_ifname, 0, offsetof(Network, bond_name) Network.Bridge, config_parse_ifname, 0, offsetof(Network, bridge_name) Network.VRF, config_parse_ifname, 0, offsetof(Network, vrf_name) +Network.IPoIB, config_parse_stacked_netdev, NETDEV_KIND_IPOIB, offsetof(Network, stacked_netdev_names) Network.IPVLAN, config_parse_stacked_netdev, NETDEV_KIND_IPVLAN, offsetof(Network, stacked_netdev_names) Network.IPVTAP, config_parse_stacked_netdev, NETDEV_KIND_IPVTAP, offsetof(Network, stacked_netdev_names) Network.L2TP, config_parse_stacked_netdev, NETDEV_KIND_L2TP, offsetof(Network, stacked_netdev_names) diff --git a/src/network/networkd-network.c b/src/network/networkd-network.c index 443222f610..7640429f46 100644 --- a/src/network/networkd-network.c +++ b/src/network/networkd-network.c @@ -852,6 +852,7 @@ int config_parse_stacked_netdev( assert(rvalue); assert(data); assert(IN_SET(kind, + NETDEV_KIND_IPOIB, NETDEV_KIND_IPVLAN, NETDEV_KIND_IPVTAP, NETDEV_KIND_L2TP, diff --git a/test/fuzz/fuzz-netdev-parser/directives.netdev b/test/fuzz/fuzz-netdev-parser/directives.netdev index e34d16af11..f5fa2418fe 100644 --- a/test/fuzz/fuzz-netdev-parser/directives.netdev +++ b/test/fuzz/fuzz-netdev-parser/directives.netdev @@ -241,3 +241,7 @@ GatewayBandwithUp= GatewayBandwidthDown= GatewayBandwidthUp= RoutingAlgorithm= +[IPoIB] +PartitionKey= +Mode= +IgnoreUserspaceMulticastGroups= diff --git 
a/test/fuzz/fuzz-network-parser/directives.network b/test/fuzz/fuzz-network-parser/directives.network index 5b5a4f8c60..68cf1ba691 100644 --- a/test/fuzz/fuzz-network-parser/directives.network +++ b/test/fuzz/fuzz-network-parser/directives.network @@ -242,6 +242,7 @@ IgnoreCarrierLoss= KeepConfiguration= DHCPv6PrefixDelegation= BatmanAdvanced= +IPoIB= [IPv6Prefix] Prefix= OnLink=