diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index a553d4e4a0fb..783675a730e5 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1363,6 +1363,13 @@ flowlabel_reflect - BOOLEAN FALSE: disabled Default: FALSE +fib_multipath_hash_policy - INTEGER + Controls which hash policy to use for multipath routes. + Default: 0 (Layer 3) + Possible values: + 0 - Layer 3 (source and destination addresses plus flow label) + 1 - Layer 4 (standard 5-tuple) + anycast_src_echo_reply - BOOLEAN Controls the use of anycast addresses as source addresses for ICMPv6 echo reply diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 3ae32d1ddd27..915bbd867b61 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1334,7 +1334,7 @@ static bool validate_ipv6_net_dev(struct net_device *net_dev, IPV6_ADDR_LINKLOCAL; struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr, &src_addr->sin6_addr, net_dev->ifindex, - strict); + NULL, strict); bool ret; if (!rt) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 69f16c605b9d..a8a578610a7b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -2430,7 +2430,8 @@ static int mlxsw_sp_router_netevent_event(struct notifier_block *nb, mlxsw_core_schedule_work(&net_work->work); mlxsw_sp_port_dev_put(mlxsw_sp_port); break; - case NETEVENT_MULTIPATH_HASH_UPDATE: + case NETEVENT_IPV4_MPATH_HASH_UPDATE: + case NETEVENT_IPV6_MPATH_HASH_UPDATE: net = ptr; if (!net_eq(net, &init_net)) @@ -7030,13 +7031,25 @@ static void mlxsw_sp_mp4_hash_init(char *recr2_pl) static void mlxsw_sp_mp6_hash_init(char *recr2_pl) { + bool only_l3 = !init_net.ipv6.sysctl.multipath_hash_policy; + mlxsw_sp_mp_hash_header_set(recr2_pl, MLXSW_REG_RECR2_IPV6_EN_NOT_TCP_NOT_UDP); mlxsw_sp_mp_hash_header_set(recr2_pl, MLXSW_REG_RECR2_IPV6_EN_TCP_UDP); mlxsw_reg_recr2_ipv6_sip_enable(recr2_pl); mlxsw_reg_recr2_ipv6_dip_enable(recr2_pl); - mlxsw_sp_mp_hash_field_set(recr2_pl, MLXSW_REG_RECR2_IPV6_FLOW_LABEL); mlxsw_sp_mp_hash_field_set(recr2_pl, MLXSW_REG_RECR2_IPV6_NEXT_HEADER); + if (only_l3) { + mlxsw_sp_mp_hash_field_set(recr2_pl, + MLXSW_REG_RECR2_IPV6_FLOW_LABEL); + } else { + mlxsw_sp_mp_hash_header_set(recr2_pl, + MLXSW_REG_RECR2_TCP_UDP_EN_IPV6); + mlxsw_sp_mp_hash_field_set(recr2_pl, + MLXSW_REG_RECR2_TCP_UDP_SPORT); + mlxsw_sp_mp_hash_field_set(recr2_pl, + MLXSW_REG_RECR2_TCP_UDP_DPORT); + } } static int mlxsw_sp_mp_hash_init(struct mlxsw_sp *mlxsw_sp) diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 17daebd19e65..1a8132eb2a3e 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -817,7 +817,8 @@ struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb, }; skb_dst_drop(skb); - dst = ip6_route_input_lookup(dev_net(sdev), sdev, &fl6, flags); + dst = ip6_route_input_lookup(dev_net(sdev), sdev, &fl6, + skb, flags); skb_dst_set(skb, dst); break; } diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index e459e601c57f..c6be49d3a9eb 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -941,6 +941,7 @@ static struct rt6_info *vrf_ip6_route_lookup(struct net *net, const struct net_device *dev, struct flowi6 *fl6, int ifindex, + const struct sk_buff *skb, int flags) { struct net_vrf *vrf = netdev_priv(dev); @@ -959,7 +960,7 @@ static struct rt6_info *vrf_ip6_route_lookup(struct net *net, if (!table) return NULL; - return ip6_pol_route(net, table, ifindex, fl6, flags); + return ip6_pol_route(net, table, ifindex, fl6, skb, flags); } static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev, @@ -977,7 +978,7 @@ static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev, struct net *net = dev_net(vrf_dev); struct rt6_info *rt6; - rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex, + rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex, skb, RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_IFACE); if (unlikely(!rt6)) return; @@ -1110,7 +1111,7 @@ static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, if (!ipv6_addr_any(&fl6->saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; - rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, flags); + rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, NULL, flags); if (rt) dst = &rt->dst; diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 1c9e17c11953..e5cfcfc7dd93 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -47,6 +47,7 @@ struct fib_rule { struct fib_lookup_arg { void *lookup_ptr; + const void *lookup_data; void *result; struct fib_rule *rule; u32 table; diff --git a/include/net/flow.h b/include/net/flow.h index 64e7ee9cb980..8ce21793094e 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -222,20 +222,4 @@ static inline unsigned int flow_key_size(u16 family) __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys); -static inline __u32 get_hash_from_flowi6(const struct flowi6 *fl6) -{ - struct flow_keys keys; - - return __get_hash_from_flowi6(fl6, &keys); -} - -__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys); - -static inline __u32 get_hash_from_flowi4(const struct flowi4 *fl4) -{ - struct flow_keys keys; - - return __get_hash_from_flowi4(fl4, &keys); -} - #endif diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 8d906a35b534..5e86fd9dc857 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -350,7 +350,8 @@ struct fib6_table { typedef struct rt6_info *(*pol_lookup_t)(struct net *, struct fib6_table *, - struct flowi6 *, int); + struct flowi6 *, + const struct sk_buff *, int); struct fib6_entry_notifier_info { struct fib_notifier_info info; /* must be first */ @@ -364,6 +365,7 @@ struct fib6_entry_notifier_info { struct fib6_table *fib6_get_table(struct net *net, u32 id); struct fib6_table *fib6_new_table(struct net *net, u32 id); struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, + const struct sk_buff *skb, int flags, pol_lookup_t lookup); struct fib6_node *fib6_lookup(struct fib6_node *root, diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index da2bde5fda8f..ce2abc0ff102 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -75,7 +75,8 @@ static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt) void ip6_route_input(struct sk_buff *skb); struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, - struct flowi6 *fl6, int flags); + struct flowi6 *fl6, + const struct sk_buff *skb, int flags); struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags); @@ -88,9 +89,10 @@ static inline struct dst_entry *ip6_route_output(struct net *net, } struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, - int flags); + const struct sk_buff *skb, int flags); struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, - int ifindex, struct flowi6 *fl6, int flags); + int ifindex, struct flowi6 *fl6, + const struct sk_buff *skb, int flags); void ip6_route_init_special_entries(void); int ip6_route_init(void); @@ -126,9 +128,10 @@ static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, } struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, - const struct in6_addr *saddr, int oif, int flags); -u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb, - struct flow_keys *hkeys); + const struct in6_addr *saddr, int oif, + const struct sk_buff *skb, int flags); +u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, + const struct sk_buff *skb, struct flow_keys *hkeys); struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 8812582a94d5..7c7522e8585b 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -395,7 +395,7 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local); int fib_sync_up(struct net_device *dev, unsigned int nh_flags); #ifdef CONFIG_IP_ROUTE_MULTIPATH -int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4, +int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys); #endif void fib_select_multipath(struct fib_result *res, int hash); diff --git a/include/net/netevent.h b/include/net/netevent.h index 40e7bab68490..d9918261701c 100644 --- a/include/net/netevent.h +++ b/include/net/netevent.h @@ -26,7 +26,8 @@ enum netevent_notif_type { NETEVENT_NEIGH_UPDATE = 1, /* arg is struct neighbour ptr */ NETEVENT_REDIRECT, /* arg is struct netevent_redirect ptr */ NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */ - NETEVENT_MULTIPATH_HASH_UPDATE, /* arg is struct net ptr */ + NETEVENT_IPV4_MPATH_HASH_UPDATE, /* arg is struct net ptr */ + NETEVENT_IPV6_MPATH_HASH_UPDATE, /* arg is struct net ptr */ }; int register_netevent_notifier(struct notifier_block *nb); diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index e286fda09fcf..5b51110435fc 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -28,6 +28,7 @@ struct netns_sysctl_ipv6 { int ip6_rt_gc_elasticity; int ip6_rt_mtu_expires; int ip6_rt_min_advmss; + int multipath_hash_policy; int flowlabel_consistency; int auto_flowlabels; int icmpv6_time; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 559db9ea8d86..d29f09bc5ff9 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1341,22 +1341,6 @@ __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys) } EXPORT_SYMBOL(__get_hash_from_flowi6); -__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys) -{ - memset(keys, 0, sizeof(*keys)); - - keys->addrs.v4addrs.src = fl4->saddr; - keys->addrs.v4addrs.dst = fl4->daddr; - keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; - keys->ports.src = fl4->fl4_sport; - keys->ports.dst = fl4->fl4_dport; - keys->keyid.keyid = fl4->fl4_gre_key; - keys->basic.ip_proto = fl4->flowi4_proto; - - return flow_hash_from_keys(keys); -} -EXPORT_SYMBOL(__get_hash_from_flowi4); - static const struct flow_dissector_key flow_keys_dissector_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 181b0d8d589c..e7c602c600ac 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1770,7 +1770,7 @@ void fib_select_path(struct net *net, struct fib_result *res, #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi->fib_nhs > 1) { - int h = fib_multipath_hash(res->fi, fl4, skb, NULL); + int h = fib_multipath_hash(net, fl4, skb, NULL); fib_select_multipath(res, h); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3bb686dac273..6a7b3cba3972 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1748,44 +1748,45 @@ static void ip_multipath_l3_keys(const struct sk_buff *skb, struct flow_keys *hash_keys) { const struct iphdr *outer_iph = ip_hdr(skb); + const struct iphdr *key_iph = outer_iph; const struct iphdr *inner_iph; const struct icmphdr *icmph; struct iphdr _inner_iph; struct icmphdr _icmph; - hash_keys->addrs.v4addrs.src = outer_iph->saddr; - hash_keys->addrs.v4addrs.dst = outer_iph->daddr; if (likely(outer_iph->protocol != IPPROTO_ICMP)) - return; + goto out; if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0)) - return; + goto out; icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph), &_icmph); if (!icmph) - return; + goto out; if (icmph->type != ICMP_DEST_UNREACH && icmph->type != ICMP_REDIRECT && icmph->type != ICMP_TIME_EXCEEDED && icmph->type != ICMP_PARAMETERPROB) - return; + goto out; inner_iph = skb_header_pointer(skb, outer_iph->ihl * 4 + sizeof(_icmph), sizeof(_inner_iph), &_inner_iph); if (!inner_iph) - return; - hash_keys->addrs.v4addrs.src = inner_iph->saddr; - hash_keys->addrs.v4addrs.dst = inner_iph->daddr; + goto out; + + key_iph = inner_iph; +out: + hash_keys->addrs.v4addrs.src = key_iph->saddr; + hash_keys->addrs.v4addrs.dst = key_iph->daddr; } /* if skb is set it will be used and fl4 can be NULL */ -int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4, +int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys) { - struct net *net = fi->fib_net; struct flow_keys hash_keys; u32 mhash; @@ -1809,24 +1810,20 @@ int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4, /* short-circuit if we already have L4 hash present */ if (skb->l4_hash) return skb_get_hash_raw(skb) >> 1; + memset(&hash_keys, 0, sizeof(hash_keys)); - if (flkeys) { - hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; - hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; - hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; - hash_keys.ports.src = flkeys->ports.src; - hash_keys.ports.dst = flkeys->ports.dst; - hash_keys.basic.ip_proto = flkeys->basic.ip_proto; - } else { + if (!flkeys) { skb_flow_dissect_flow_keys(skb, &keys, flag); - hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; - hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; - hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; - hash_keys.ports.src = keys.ports.src; - hash_keys.ports.dst = keys.ports.dst; - hash_keys.basic.ip_proto = keys.basic.ip_proto; + flkeys = &keys; } + + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; + hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; + hash_keys.ports.src = flkeys->ports.src; + hash_keys.ports.dst = flkeys->ports.dst; + hash_keys.basic.ip_proto = flkeys->basic.ip_proto; } else { memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; @@ -1852,7 +1849,7 @@ static int ip_mkroute_input(struct sk_buff *skb, { #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && res->fi->fib_nhs > 1) { - int h = fib_multipath_hash(res->fi, NULL, skb, hkeys); + int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); fib_select_multipath(res, h); } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 89683d868b37..011de9a20ec6 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -400,7 +400,7 @@ static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) - call_netevent_notifiers(NETEVENT_MULTIPATH_HASH_UPDATE, net); + call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net); return ret; } diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index d7d0abc7fd0e..c61718dba2e6 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -78,7 +78,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) if (ifindex == 0) { struct rt6_info *rt; - rt = rt6_lookup(net, addr, NULL, 0, 0); + rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); if (rt) { dev = rt->dst.dev; ip6_rt_put(rt); diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 04e5f523e50f..00ef9467f3c0 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -61,11 +61,13 @@ unsigned int fib6_rules_seq_read(struct net *net) } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, + const struct sk_buff *skb, int flags, pol_lookup_t lookup) { if (net->ipv6.fib6_has_custom_rules) { struct fib_lookup_arg arg = { .lookup_ptr = lookup, + .lookup_data = skb, .flags = FIB_LOOKUP_NOREF, }; @@ -80,11 +82,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, } else { struct rt6_info *rt; - rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, flags); + rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, skb, flags); if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN) return &rt->dst; ip6_rt_put(rt); - rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, flags); + rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error != -EAGAIN) return &rt->dst; ip6_rt_put(rt); @@ -130,7 +132,7 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, goto out; } - rt = lookup(net, table, flp6, flags); + rt = lookup(net, table, flp6, arg->lookup_data, flags); if (rt != net->ipv6.ip6_null_entry) { struct fib6_rule *r = (struct fib6_rule *)rule; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index b0778d323b6e..6f84668be6ea 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -522,7 +522,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, fl6.fl6_icmp_type = type; fl6.fl6_icmp_code = code; fl6.flowi6_uid = sock_net_uid(net, NULL); - fl6.mp_hash = rt6_multipath_hash(&fl6, skb, NULL); + fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); sk = icmpv6_xmit_lock(net); @@ -629,7 +629,8 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, skb_pull(skb2, nhs); skb_reset_network_header(skb2); - rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, 0); + rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, + skb, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index cab95cf3b39f..2f995e9e3050 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -299,11 +299,12 @@ struct fib6_table *fib6_get_table(struct net *net, u32 id) } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, + const struct sk_buff *skb, int flags, pol_lookup_t lookup) { struct rt6_info *rt; - rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, flags); + rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { ip6_rt_put(rt); rt = net->ipv6.ip6_null_entry; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 4f150a394387..83c7766c8c75 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1053,7 +1053,7 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, - p->link, strict); + p->link, NULL, strict); if (!rt) return; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 869e2e6750f7..1124f310df5a 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -679,7 +679,7 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, /* Try to guess incoming interface */ rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, - NULL, 0, 0); + NULL, 0, skb2, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; @@ -1444,7 +1444,7 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, - p->link, strict); + p->link, NULL, strict); if (!rt) return; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index c617ea17faa8..a482b854eeea 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -645,7 +645,7 @@ static void vti6_link_config(struct ip6_tnl *t) (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)); struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, - p->link, strict); + p->link, NULL, strict); if (rt) tdev = rt->dst.dev; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index d9bb933dd5c4..d1a0cefac273 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -165,7 +165,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) if (ifindex == 0) { struct rt6_info *rt; - rt = rt6_lookup(net, addr, NULL, 0, 0); + rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); if (rt) { dev = rt->dst.dev; ip6_rt_put(rt); @@ -254,7 +254,7 @@ static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net, struct inet6_dev *idev = NULL; if (ifindex == 0) { - struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, 0); + struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, NULL, 0); if (rt) { dev = rt->dst.dev; diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c index 94deb69bbbda..910a27318f58 100644 --- a/net/ipv6/netfilter/ip6t_rpfilter.c +++ b/net/ipv6/netfilter/ip6t_rpfilter.c @@ -53,7 +53,7 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb, lookup_flags |= RT6_LOOKUP_F_IFACE; } - rt = (void *) ip6_route_lookup(net, &fl6, lookup_flags); + rt = (void *)ip6_route_lookup(net, &fl6, skb, lookup_flags); if (rt->dst.error) goto out; diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index cc5174c7254c..3230b3d7b11b 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -181,7 +181,8 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, *dest = 0; again: - rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, lookup_flags); + rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, pkt->skb, + lookup_flags); if (rt->dst.error) goto put_rt_err; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e2bb40824c85..f0ae58424c45 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -450,8 +450,10 @@ static bool rt6_check_expired(const struct rt6_info *rt) return false; } -static struct rt6_info *rt6_multipath_select(struct rt6_info *match, +static struct rt6_info *rt6_multipath_select(const struct net *net, + struct rt6_info *match, struct flowi6 *fl6, int oif, + const struct sk_buff *skb, int strict) { struct rt6_info *sibling, *next_sibling; @@ -460,7 +462,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, * case it will always be non-zero. Otherwise now is the time to do it. */ if (!fl6->mp_hash) - fl6->mp_hash = rt6_multipath_hash(fl6, NULL, NULL); + fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound)) return match; @@ -914,7 +916,9 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, - struct flowi6 *fl6, int flags) + struct flowi6 *fl6, + const struct sk_buff *skb, + int flags) { struct rt6_info *rt, *rt_cache; struct fib6_node *fn; @@ -929,8 +933,8 @@ restart: rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) - rt = rt6_multipath_select(rt, fl6, - fl6->flowi6_oif, flags); + rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif, + skb, flags); } if (rt == net->ipv6.ip6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); @@ -954,14 +958,15 @@ restart: } struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, - int flags) + const struct sk_buff *skb, int flags) { - return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup); + return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup); } EXPORT_SYMBOL_GPL(ip6_route_lookup); struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, - const struct in6_addr *saddr, int oif, int strict) + const struct in6_addr *saddr, int oif, + const struct sk_buff *skb, int strict) { struct flowi6 fl6 = { .flowi6_oif = oif, @@ -975,7 +980,7 @@ struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, flags |= RT6_LOOKUP_F_HAS_SADDR; } - dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup); + dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup); if (dst->error == 0) return (struct rt6_info *) dst; @@ -1647,7 +1652,8 @@ void rt6_age_exceptions(struct rt6_info *rt, } struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, - int oif, struct flowi6 *fl6, int flags) + int oif, struct flowi6 *fl6, + const struct sk_buff *skb, int flags) { struct fib6_node *fn, *saved_fn; struct rt6_info *rt, *rt_cache; @@ -1669,7 +1675,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, redo_rt6_select: rt = rt6_select(net, fn, oif, strict); if (rt->rt6i_nsiblings) - rt = rt6_multipath_select(rt, fl6, oif, strict); + rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict); if (rt == net->ipv6.ip6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) @@ -1768,20 +1774,25 @@ uncached_rt_out: } EXPORT_SYMBOL_GPL(ip6_pol_route); -static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, - struct flowi6 *fl6, int flags) +static struct rt6_info *ip6_pol_route_input(struct net *net, + struct fib6_table *table, + struct flowi6 *fl6, + const struct sk_buff *skb, + int flags) { - return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags); + return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags); } struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, - struct flowi6 *fl6, int flags) + struct flowi6 *fl6, + const struct sk_buff *skb, + int flags) { if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) flags |= RT6_LOOKUP_F_IFACE; - return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input); + return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input); } EXPORT_SYMBOL_GPL(ip6_route_input_lookup); @@ -1815,8 +1826,6 @@ static void ip6_multipath_l3_keys(const struct sk_buff *skb, key_iph = inner_iph; _flkeys = NULL; out: - memset(keys, 0, sizeof(*keys)); - keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (_flkeys) { keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src; keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst; @@ -1831,17 +1840,60 @@ out: } /* if skb is set it will be used and fl6 can be NULL */ -u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb, - struct flow_keys *flkeys) +u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, + const struct sk_buff *skb, struct flow_keys *flkeys) { struct flow_keys hash_keys; + u32 mhash; - if (skb) { - ip6_multipath_l3_keys(skb, &hash_keys, flkeys); - return flow_hash_from_keys(&hash_keys) >> 1; + switch (net->ipv6.sysctl.multipath_hash_policy) { + case 0: + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + if (skb) { + ip6_multipath_l3_keys(skb, &hash_keys, flkeys); + } else { + hash_keys.addrs.v6addrs.src = fl6->saddr; + hash_keys.addrs.v6addrs.dst = fl6->daddr; + hash_keys.tags.flow_label = (__force u32)fl6->flowlabel; + hash_keys.basic.ip_proto = fl6->flowi6_proto; + } + break; + case 1: + if (skb) { + unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; + struct flow_keys keys; + + /* short-circuit if we already have L4 hash present */ + if (skb->l4_hash) + return skb_get_hash_raw(skb) >> 1; + + memset(&hash_keys, 0, sizeof(hash_keys)); + + if (!flkeys) { + skb_flow_dissect_flow_keys(skb, &keys, flag); + flkeys = &keys; + } + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; + hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; + hash_keys.ports.src = flkeys->ports.src; + hash_keys.ports.dst = flkeys->ports.dst; + hash_keys.basic.ip_proto = flkeys->basic.ip_proto; + } else { + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + hash_keys.addrs.v6addrs.src = fl6->saddr; + hash_keys.addrs.v6addrs.dst = fl6->daddr; + hash_keys.ports.src = fl6->fl6_sport; + hash_keys.ports.dst = fl6->fl6_dport; + hash_keys.basic.ip_proto = fl6->flowi6_proto; + } + break; } + mhash = flow_hash_from_keys(&hash_keys); - return get_hash_from_flowi6(fl6) >> 1; + return mhash >> 1; } void ip6_route_input(struct sk_buff *skb) @@ -1868,15 +1920,19 @@ void ip6_route_input(struct sk_buff *skb) flkeys = &_flkeys; if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) - fl6.mp_hash = rt6_multipath_hash(&fl6, skb, flkeys); + fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); skb_dst_drop(skb); - skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags)); + skb_dst_set(skb, + ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); } -static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table, - struct flowi6 *fl6, int flags) +static struct rt6_info *ip6_pol_route_output(struct net *net, + struct fib6_table *table, + struct flowi6 *fl6, + const struct sk_buff *skb, + int flags) { - return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags); + return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); } struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, @@ -1904,7 +1960,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, else if (sk) flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); - return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output); + return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); } EXPORT_SYMBOL_GPL(ip6_route_output_flags); @@ -2153,6 +2209,7 @@ struct ip6rd_flowi { static struct rt6_info *__ip6_route_redirect(struct net *net, struct fib6_table *table, struct flowi6 *fl6, + const struct sk_buff *skb, int flags) { struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; @@ -2226,8 +2283,9 @@ out: }; static struct dst_entry *ip6_route_redirect(struct net *net, - const struct flowi6 *fl6, - const struct in6_addr *gateway) + const struct flowi6 *fl6, + const struct sk_buff *skb, + const struct in6_addr *gateway) { int flags = RT6_LOOKUP_F_HAS_SADDR; struct ip6rd_flowi rdfl; @@ -2235,7 +2293,7 @@ static struct dst_entry *ip6_route_redirect(struct net *net, rdfl.fl6 = *fl6; rdfl.gateway = *gateway; - return fib6_rule_lookup(net, &rdfl.fl6, + return fib6_rule_lookup(net, &rdfl.fl6, skb, flags, __ip6_route_redirect); } @@ -2255,7 +2313,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, fl6.flowlabel = ip6_flowinfo(iph); fl6.flowi6_uid = uid; - dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr); + dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); rt6_do_redirect(dst, NULL, skb); dst_release(dst); } @@ -2277,7 +2335,7 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, fl6.saddr = iph->daddr; fl6.flowi6_uid = sock_net_uid(net, NULL); - dst = ip6_route_redirect(net, &fl6, &iph->saddr); + dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); rt6_do_redirect(dst, NULL, skb); dst_release(dst); } @@ -2479,7 +2537,7 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net, flags |= RT6_LOOKUP_F_HAS_SADDR; flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; - rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags); + rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); /* if table lookup failed, fall back to full lookup */ if (rt == net->ipv6.ip6_null_entry) { @@ -2542,7 +2600,7 @@ static int ip6_route_check_nh(struct net *net, } if (!grt) - grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); + grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); if (!grt) goto out; @@ -4607,7 +4665,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!ipv6_addr_any(&fl6.saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; - dst = ip6_route_input_lookup(net, dev, &fl6, flags); + dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); rcu_read_unlock(); } else { diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index ba3767ef5e93..45722327375a 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -161,7 +161,7 @@ static void lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH; if (!tbl_id) { - dst = ip6_route_input_lookup(net, skb->dev, &fl6, flags); + dst = ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags); } else { struct fib6_table *table; @@ -169,7 +169,7 @@ static void lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, if (!table) goto out; - rt = ip6_pol_route(net, table, 0, &fl6, flags); + rt = ip6_pol_route(net, table, 0, &fl6, skb, flags); dst = &rt->dst; } diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 262f791f1b9b..966c42af92f4 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -16,14 +16,31 @@ #include #include #include +#include #ifdef CONFIG_NETLABEL #include #endif +static int zero; static int one = 1; static int auto_flowlabels_min; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; +static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct net *net; + int ret; + + net = container_of(table->data, struct net, + ipv6.sysctl.multipath_hash_policy); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net); + + return ret; +} static struct ctl_table ipv6_table_template[] = { { @@ -126,6 +143,15 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "fib_multipath_hash_policy", + .data = &init_net.ipv6.sysctl.multipath_hash_policy, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_rt6_multipath_hash_policy, + .extra1 = &zero, + .extra2 = &one, + }, { } }; @@ -190,6 +216,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[11].data = &net->ipv6.sysctl.max_hbh_opts_cnt; ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len; ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len; + ipv6_table[14].data = &net->ipv6.sysctl.multipath_hash_policy, ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) diff --git a/tools/testing/selftests/net/forwarding/router_multipath.sh b/tools/testing/selftests/net/forwarding/router_multipath.sh index 55595305a604..3bc351008db6 100755 --- a/tools/testing/selftests/net/forwarding/router_multipath.sh +++ b/tools/testing/selftests/net/forwarding/router_multipath.sh @@ -235,6 +235,45 @@ multipath4_test() sysctl -q -w net.ipv4.fib_multipath_hash_policy=$hash_policy } +multipath6_l4_test() +{ + local desc="$1" + local weight_rp12=$2 + local weight_rp13=$3 + local t0_rp12 t0_rp13 t1_rp12 t1_rp13 + local packets_rp12 packets_rp13 + local hash_policy + + # Transmit multiple flows from h1 to h2 and make sure they are + # distributed between both multipath links (rp12 and rp13) + # according to the configured weights. + hash_policy=$(sysctl -n net.ipv6.fib_multipath_hash_policy) + sysctl -q -w net.ipv6.fib_multipath_hash_policy=1 + + ip route replace 2001:db8:2::/64 vrf vrf-r1 \ + nexthop via fe80:2::22 dev $rp12 weight $weight_rp12 \ + nexthop via fe80:3::23 dev $rp13 weight $weight_rp13 + + t0_rp12=$(link_stats_tx_packets_get $rp12) + t0_rp13=$(link_stats_tx_packets_get $rp13) + + $MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \ + -d 1msec -t udp "sp=1024,dp=0-32768" + + t1_rp12=$(link_stats_tx_packets_get $rp12) + t1_rp13=$(link_stats_tx_packets_get $rp13) + + let "packets_rp12 = $t1_rp12 - $t0_rp12" + let "packets_rp13 = $t1_rp13 - $t0_rp13" + multipath_eval "$desc" $weight_rp12 $weight_rp13 $packets_rp12 $packets_rp13 + + ip route replace 2001:db8:2::/64 vrf vrf-r1 \ + nexthop via fe80:2::22 dev $rp12 \ + nexthop via fe80:3::23 dev $rp13 + + sysctl -q -w net.ipv6.fib_multipath_hash_policy=$hash_policy +} + multipath6_test() { local desc="$1" @@ -278,6 +317,11 @@ multipath_test() multipath6_test "ECMP" 1 1 multipath6_test "Weighted MP 2:1" 2 1 multipath6_test "Weighted MP 11:45" 11 45 + + log_info "Running IPv6 L4 hash multipath tests" + multipath6_l4_test "ECMP" 1 1 + multipath6_l4_test "Weighted MP 2:1" 2 1 + multipath6_l4_test "Weighted MP 11:45" 11 45 } setup_prepare()