diff --git a/Documentation/networking/netvsc.txt b/Documentation/networking/netvsc.txt index 4ddb4e4b0426..fa8d86356791 100644 --- a/Documentation/networking/netvsc.txt +++ b/Documentation/networking/netvsc.txt @@ -21,11 +21,23 @@ Features -------------------- Hyper-V supports receive side scaling. For TCP, packets are distributed among available queues based on IP address and port - number. Current versions of Hyper-V host, only distribute UDP - packets based on the IP source and destination address. - The port number is not used as part of the hash value for UDP. - Fragmented IP packets are not distributed between queues; - all fragmented packets arrive on the first channel. + number. + + For UDP, we can switch UDP hash level between L3 and L4 by ethtool + command. UDP over IPv4 and v6 can be set differently. The default + hash level is L4. We currently only allow switching TX hash level + from within the guests. + + On Azure, fragmented UDP packets have high loss rate with L4 + hashing. Using L3 hashing is recommended in this case. + + For example, for UDP over IPv4 on eth0: + To include UDP port numbers in hasing: + ethtool -N eth0 rx-flow-hash udp4 sdfn + To exclude UDP port numbers in hasing: + ethtool -N eth0 rx-flow-hash udp4 sd + To show UDP hash level: + ethtool -n eth0 rx-flow-hash udp4 Generic Receive Offload, aka GRO -------------------------------- diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index 9198dd1240ed..ff1c0c8d5e0d 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -720,6 +720,8 @@ struct net_device_context { u32 tx_send_table[VRSS_SEND_TAB_SIZE]; /* Ethtool settings */ + bool udp4_l4_hash; + bool udp6_l4_hash; u8 duplex; u32 speed; struct netvsc_ethtool_stats eth_stats; diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index b33f0507c373..c0c4c9195a3f 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -190,10 +190,12 @@ static void *init_ppi_data(struct rndis_message *msg, u32 ppi_size, return ppi; } -/* Azure hosts don't support non-TCP port numbers in hashing yet. We compute - * hash for non-TCP traffic with only IP numbers. +/* Azure hosts don't support non-TCP port numbers in hashing for fragmented + * packets. We can use ethtool to change UDP hash level when necessary. */ -static inline u32 netvsc_get_hash(struct sk_buff *skb, struct sock *sk) +static inline u32 netvsc_get_hash( + struct sk_buff *skb, + const struct net_device_context *ndc) { struct flow_keys flow; u32 hash; @@ -204,7 +206,11 @@ static inline u32 netvsc_get_hash(struct sk_buff *skb, struct sock *sk) if (!skb_flow_dissect_flow_keys(skb, &flow, 0)) return 0; - if (flow.basic.ip_proto == IPPROTO_TCP) { + if (flow.basic.ip_proto == IPPROTO_TCP || + (flow.basic.ip_proto == IPPROTO_UDP && + ((flow.basic.n_proto == htons(ETH_P_IP) && ndc->udp4_l4_hash) || + (flow.basic.n_proto == htons(ETH_P_IPV6) && + ndc->udp6_l4_hash)))) { return skb_get_hash(skb); } else { if (flow.basic.n_proto == htons(ETH_P_IP)) @@ -227,7 +233,7 @@ static inline int netvsc_get_tx_queue(struct net_device *ndev, struct sock *sk = skb->sk; int q_idx; - q_idx = ndc->tx_send_table[netvsc_get_hash(skb, sk) & + q_idx = ndc->tx_send_table[netvsc_get_hash(skb, ndc) & (VRSS_SEND_TAB_SIZE - 1)]; /* If queue index changed record the new value */ @@ -891,6 +897,9 @@ static void netvsc_init_settings(struct net_device *dev) { struct net_device_context *ndc = netdev_priv(dev); + ndc->udp4_l4_hash = true; + ndc->udp6_l4_hash = true; + ndc->speed = SPEED_UNKNOWN; ndc->duplex = DUPLEX_FULL; } @@ -1228,7 +1237,7 @@ static void netvsc_get_strings(struct net_device *dev, u32 stringset, u8 *data) } static int -netvsc_get_rss_hash_opts(struct netvsc_device *nvdev, +netvsc_get_rss_hash_opts(struct net_device_context *ndc, struct ethtool_rxnfc *info) { info->data = RXH_IP_SRC | RXH_IP_DST; @@ -1237,9 +1246,20 @@ netvsc_get_rss_hash_opts(struct netvsc_device *nvdev, case TCP_V4_FLOW: case TCP_V6_FLOW: info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; - /* fallthrough */ + break; + case UDP_V4_FLOW: + if (ndc->udp4_l4_hash) + info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; + + break; + case UDP_V6_FLOW: + if (ndc->udp6_l4_hash) + info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; + + break; + case IPV4_FLOW: case IPV6_FLOW: break; @@ -1267,11 +1287,51 @@ netvsc_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info, return 0; case ETHTOOL_GRXFH: - return netvsc_get_rss_hash_opts(nvdev, info); + return netvsc_get_rss_hash_opts(ndc, info); } return -EOPNOTSUPP; } +static int netvsc_set_rss_hash_opts(struct net_device_context *ndc, + struct ethtool_rxnfc *info) +{ + if (info->data == (RXH_IP_SRC | RXH_IP_DST | + RXH_L4_B_0_1 | RXH_L4_B_2_3)) { + if (info->flow_type == UDP_V4_FLOW) + ndc->udp4_l4_hash = true; + else if (info->flow_type == UDP_V6_FLOW) + ndc->udp6_l4_hash = true; + else + return -EOPNOTSUPP; + + return 0; + } + + if (info->data == (RXH_IP_SRC | RXH_IP_DST)) { + if (info->flow_type == UDP_V4_FLOW) + ndc->udp4_l4_hash = false; + else if (info->flow_type == UDP_V6_FLOW) + ndc->udp6_l4_hash = false; + else + return -EOPNOTSUPP; + + return 0; + } + + return -EOPNOTSUPP; +} + +static int +netvsc_set_rxnfc(struct net_device *ndev, struct ethtool_rxnfc *info) +{ + struct net_device_context *ndc = netdev_priv(ndev); + + if (info->cmd == ETHTOOL_SRXFH) + return netvsc_set_rss_hash_opts(ndc, info); + + return -EOPNOTSUPP; +} + #ifdef CONFIG_NET_POLL_CONTROLLER static void netvsc_poll_controller(struct net_device *dev) { @@ -1470,6 +1530,7 @@ static const struct ethtool_ops ethtool_ops = { .set_channels = netvsc_set_channels, .get_ts_info = ethtool_op_get_ts_info, .get_rxnfc = netvsc_get_rxnfc, + .set_rxnfc = netvsc_set_rxnfc, .get_rxfh_key_size = netvsc_get_rxfh_key_size, .get_rxfh_indir_size = netvsc_rss_indir_size, .get_rxfh = netvsc_get_rxfh,