From d7dedee184e775f77d321cfa1c660a7680cf6588 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 9 Jan 2018 16:40:25 +0200 Subject: [PATCH 1/4] ipv6: Calculate hash thresholds for IPv6 nexthops Before we convert IPv6 to use hash-threshold instead of modulo-N, we first need each nexthop to store its region boundary in the hash function's output space. The boundary is calculated by dividing the output space equally between the different active nexthops. That is, nexthops that are not dead or linkdown. The boundaries are rebalanced whenever a nexthop is added or removed to a multipath route and whenever a nexthop becomes active or inactive. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 + include/net/ip6_route.h | 7 +++ net/ipv6/ip6_fib.c | 8 +--- net/ipv6/route.c | 96 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 106 insertions(+), 6 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index ddf53dd1e948..97cd05d87780 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -149,6 +149,7 @@ struct rt6_info { */ struct list_head rt6i_siblings; unsigned int rt6i_nsiblings; + atomic_t rt6i_nh_upper_bound; atomic_t rt6i_ref; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 34cd3b0c6ded..27d23a65f3cd 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -66,6 +66,12 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } +static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt) +{ + return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == + RTF_GATEWAY; +} + void ip6_route_input(struct sk_buff *skb); struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, @@ -171,6 +177,7 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); void rt6_sync_up(struct net_device *dev, unsigned int nh_flags); void rt6_disable_ip(struct net_device *dev, unsigned long event); void rt6_sync_down_dev(struct net_device *dev, unsigned long event); +void rt6_multipath_rebalance(struct rt6_info *rt); static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) { diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index b5f19703fca6..e31118f417b4 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -796,12 +796,6 @@ insert_above: return ln; } -static bool rt6_qualify_for_ecmp(struct rt6_info *rt) -{ - return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == - RTF_GATEWAY; -} - static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc) { int i; @@ -991,6 +985,7 @@ next_iter: rt6i_nsiblings++; } BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings); + rt6_multipath_rebalance(temp_sibling); } /* @@ -1672,6 +1667,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, sibling->rt6i_nsiblings--; rt->rt6i_nsiblings = 0; list_del_init(&rt->rt6i_siblings); + rt6_multipath_rebalance(next_sibling); } /* Adjust walkers */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1054b059747f..ced2c9bed10b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3481,6 +3481,99 @@ struct arg_netdev_event { }; }; +static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt) +{ + struct rt6_info *iter; + struct fib6_node *fn; + + fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + iter = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + while (iter) { + if (iter->rt6i_metric == rt->rt6i_metric && + rt6_qualify_for_ecmp(iter)) + return iter; + iter = rcu_dereference_protected(iter->rt6_next, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + } + + return NULL; +} + +static bool rt6_is_dead(const struct rt6_info *rt) +{ + if (rt->rt6i_nh_flags & RTNH_F_DEAD || + (rt->rt6i_nh_flags & RTNH_F_LINKDOWN && + rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) + return true; + + return false; +} + +static int rt6_multipath_total_weight(const struct rt6_info *rt) +{ + struct rt6_info *iter; + int total = 0; + + if (!rt6_is_dead(rt)) + total++; + + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) { + if (!rt6_is_dead(iter)) + total++; + } + + return total; +} + +static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total) +{ + int upper_bound = -1; + + if (!rt6_is_dead(rt)) { + (*weight)++; + upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, + total) - 1; + } + atomic_set(&rt->rt6i_nh_upper_bound, upper_bound); +} + +static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total) +{ + struct rt6_info *iter; + int weight = 0; + + rt6_upper_bound_set(rt, &weight, total); + + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + rt6_upper_bound_set(iter, &weight, total); +} + +void rt6_multipath_rebalance(struct rt6_info *rt) +{ + struct rt6_info *first; + int total; + + /* In case the entire multipath route was marked for flushing, + * then there is no need to rebalance upon the removal of every + * sibling route. + */ + if (!rt->rt6i_nsiblings || rt->should_flush) + return; + + /* During lookup routes are evaluated in order, so we need to + * make sure upper bounds are assigned from the first sibling + * onwards. + */ + first = rt6_multipath_first_sibling(rt); + if (WARN_ON_ONCE(!first)) + return; + + total = rt6_multipath_total_weight(first); + rt6_multipath_upper_bound_set(first, total); +} + static int fib6_ifup(struct rt6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; @@ -3489,6 +3582,7 @@ static int fib6_ifup(struct rt6_info *rt, void *p_arg) if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) { rt->rt6i_nh_flags &= ~arg->nh_flags; fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt); + rt6_multipath_rebalance(rt); } return 0; @@ -3588,6 +3682,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | RTNH_F_LINKDOWN); fib6_update_sernum(rt); + rt6_multipath_rebalance(rt); } return -2; case NETDEV_CHANGE: @@ -3595,6 +3690,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) break; rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; + rt6_multipath_rebalance(rt); break; } From 7696c06a189c0f1f4d0a7e49e28d10e1050ec529 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 9 Jan 2018 16:40:26 +0200 Subject: [PATCH 2/4] ipv6: Use a 31-bit multipath hash The hash thresholds assigned to IPv6 nexthops are in the range of [-1, 2^31 - 1], where a negative value is assigned to nexthops that should not be considered during multipath selection. Therefore, in a similar fashion to IPv4, we need to use the upper 31-bits of the multipath hash for multipath selection. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ced2c9bed10b..09e8e10b101d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1833,10 +1833,10 @@ u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb) if (skb) { ip6_multipath_l3_keys(skb, &hash_keys); - return flow_hash_from_keys(&hash_keys); + return flow_hash_from_keys(&hash_keys) >> 1; } - return get_hash_from_flowi6(fl6); + return get_hash_from_flowi6(fl6) >> 1; } void ip6_route_input(struct sk_buff *skb) From 3d709f69a3e749f4d1c195dab499df8ab66e25a8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 9 Jan 2018 16:40:27 +0200 Subject: [PATCH 3/4] ipv6: Use hash-threshold instead of modulo-N Now that each nexthop stores its region boundary in the multipath hash function's output space, we can use hash-threshold instead of modulo-N in multipath selection. This reduces the number of checks we need to perform during lookup, as dead and linkdown nexthops are assigned a negative region boundary. In addition, in contrast to modulo-N, only flows near region boundaries are affected when a nexthop is added or removed. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 09e8e10b101d..7837b8c754a3 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -455,7 +455,6 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, int strict) { struct rt6_info *sibling, *next_sibling; - int route_choosen; /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it. @@ -463,28 +462,19 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, if (!fl6->mp_hash) fl6->mp_hash = rt6_multipath_hash(fl6, NULL); - route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1); - /* Don't change the route, if route_choosen == 0 - * (siblings does not include ourself) - */ - if (route_choosen) - list_for_each_entry_safe(sibling, next_sibling, - &match->rt6i_siblings, rt6i_siblings) { - route_choosen--; - if (route_choosen == 0) { - struct inet6_dev *idev = sibling->rt6i_idev; + if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound)) + return match; + + list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings, + rt6i_siblings) { + if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound)) + continue; + if (rt6_score_route(sibling, oif, strict) < 0) + break; + match = sibling; + break; + } - if (sibling->rt6i_nh_flags & RTNH_F_DEAD) - break; - if (sibling->rt6i_nh_flags & RTNH_F_LINKDOWN && - idev->cnf.ignore_routes_with_linkdown) - break; - if (rt6_score_route(sibling, oif, strict) < 0) - break; - match = sibling; - break; - } - } return match; } From 398958ae48f44bb036d0fa9829cd489270bf1fc2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 9 Jan 2018 16:40:28 +0200 Subject: [PATCH 4/4] ipv6: Add support for non-equal-cost multipath The use of hash-threshold instead of modulo-N makes it trivial to add support for non-equal-cost multipath. Instead of dividing the multipath hash function's output space equally between the nexthops, each nexthop is assigned a region size which is proportional to its weight. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 + net/ipv6/route.c | 11 +++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 97cd05d87780..34ec321d6a03 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -171,6 +171,7 @@ struct rt6_info { u32 rt6i_metric; u32 rt6i_pmtu; /* more non-fragment space at head required */ + int rt6i_nh_weight; unsigned short rt6i_nfheader_len; u8 rt6i_protocol; u8 exception_bucket_flushed:1, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7837b8c754a3..1076ae0ea9d5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2594,6 +2594,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, #endif rt->rt6i_metric = cfg->fc_metric; + rt->rt6i_nh_weight = 1; /* We cannot add true routes via loopback here, they would result in kernel looping; promote them to reject routes @@ -3507,11 +3508,11 @@ static int rt6_multipath_total_weight(const struct rt6_info *rt) int total = 0; if (!rt6_is_dead(rt)) - total++; + total += rt->rt6i_nh_weight; list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) { if (!rt6_is_dead(iter)) - total++; + total += iter->rt6i_nh_weight; } return total; @@ -3522,7 +3523,7 @@ static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total) int upper_bound = -1; if (!rt6_is_dead(rt)) { - (*weight)++; + *weight += rt->rt6i_nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, total) - 1; } @@ -4024,6 +4025,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, goto cleanup; } + rt->rt6i_nh_weight = rtnh->rtnh_hops + 1; + err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { dst_release_immediate(&rt->dst); @@ -4246,7 +4249,7 @@ static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) if (!rtnh) goto nla_put_failure; - rtnh->rtnh_hops = 0; + rtnh->rtnh_hops = rt->rt6i_nh_weight - 1; rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0; if (rt6_nexthop_info(skb, rt, &flags, true) < 0)