diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 9375324aa8e1..5cdc37c34830 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1560,6 +1560,18 @@ skip_notify_on_dev_down - BOOLEAN on userspace caches to track link events and evict routes. Default: false (generate message) +nexthop_compat_mode - BOOLEAN + New nexthop API provides a means for managing nexthops independent of + prefixes. Backwards compatibilty with old route format is enabled by + default which means route dumps and notifications contain the new + nexthop attribute but also the full, expanded nexthop definition. + Further, updates or deletes of a nexthop configuration generate route + notifications for each fib entry using the nexthop. Once a system + understands the new API, this sysctl can be disabled to achieve full + performance benefits of the new API by disabling the nexthop expansion + and extraneous notifications. + Default: true (backward compat mode) + IPv6 Fragmentation: ip6frag_high_thresh - INTEGER diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 9947eb1e9eb6..e525f003e619 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -123,7 +123,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); int ip6_ins_rt(struct net *net, struct fib6_info *f6i); -int ip6_del_rt(struct net *net, struct fib6_info *f6i); +int ip6_del_rt(struct net *net, struct fib6_info *f6i, bool skip_notify); void rt6_flush_exceptions(struct fib6_info *f6i); void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args, diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 3e7d2c0e79ca..a5f7c12c326a 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -48,7 +48,7 @@ struct ipv6_stub { struct netlink_ext_ack *extack); void (*fib6_nh_release)(struct fib6_nh *fib6_nh); void (*fib6_update_sernum)(struct net *net, struct fib6_info *rt); - int (*ip6_del_rt)(struct net *net, struct fib6_info *rt); + int (*ip6_del_rt)(struct net *net, struct fib6_info *rt, bool skip_notify); void (*fib6_rt_update)(struct net *net, struct fib6_info *rt, struct nl_info *info); diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 154b8f01499b..5acdb4d414c4 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -111,6 +111,8 @@ struct netns_ipv4 { int sysctl_tcp_early_demux; int sysctl_udp_early_demux; + int sysctl_nexthop_compat_mode; + int sysctl_fwmark_reflect; int sysctl_tcp_fwmark_accept; #ifdef CONFIG_NET_L3_MASTER_DEV diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index c618e242490f..6177c4ba0037 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1835,6 +1835,7 @@ static __net_init int inet_init_net(struct net *net) net->ipv4.sysctl_ip_early_demux = 1; net->ipv4.sysctl_udp_early_demux = 1; net->ipv4.sysctl_tcp_early_demux = 1; + net->ipv4.sysctl_nexthop_compat_mode = 1; #ifdef CONFIG_SYSCTL net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; #endif diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 55ca2e521828..e53871e4a097 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1780,6 +1780,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; if (nexthop_is_blackhole(fi->nh)) rtm->rtm_type = RTN_BLACKHOLE; + if (!fi->fib_net->ipv4.sysctl_nexthop_compat_mode) + goto offload; } if (nhs == 1) { @@ -1805,6 +1807,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; } +offload: if (fri->offload) rtm->rtm_flags |= RTM_F_OFFLOAD; if (fri->trap) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index fdfca534d094..3957364d556c 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -784,7 +784,8 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { /* __ip6_del_rt does a release, so do a hold here */ fib6_info_hold(f6i); - ipv6_stub->ip6_del_rt(net, f6i); + ipv6_stub->ip6_del_rt(net, f6i, + !net->ipv4.sysctl_nexthop_compat_mode); } } @@ -1041,7 +1042,7 @@ out: if (!rc) { nh_base_seq_inc(net); nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo); - if (replace_notify) + if (replace_notify && net->ipv4.sysctl_nexthop_compat_mode) nexthop_replace_notify(net, new_nh, &cfg->nlinfo); } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 81b267e990a1..95ad71e76cc3 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -710,6 +710,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_tcp_early_demux }, + { + .procname = "nexthop_compat_mode", + .data = &init_net.ipv4.sysctl_nexthop_compat_mode, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { .procname = "ip_default_ttl", .data = &init_net.ipv4.sysctl_ip_default_ttl, diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 27b4fb6e452b..2c4f20ec1e2a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1238,7 +1238,7 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, ifp->idev->dev, 0, RTF_DEFAULT, true); if (f6i) { if (del_rt) - ip6_del_rt(dev_net(ifp->idev->dev), f6i); + ip6_del_rt(dev_net(ifp->idev->dev), f6i, false); else { if (!(f6i->fib6_flags & RTF_EXPIRES)) fib6_set_expires(f6i, expires); @@ -2718,7 +2718,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) if (rt) { /* Autoconf prefix route */ if (valid_lft == 0) { - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); rt = NULL; } else if (addrconf_finite_timeout(rt_expires)) { /* not infinity */ @@ -3813,7 +3813,7 @@ restart: spin_unlock_bh(&ifa->lock); if (rt) - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); if (state != INET6_IFADDR_STATE_DEAD) { __ipv6_ifa_notify(RTM_DELADDR, ifa); @@ -4652,7 +4652,7 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, prio = ifp->rt_priority ? : IP6_RT_PRIO_ADDRCONF; if (f6i->fib6_metric != prio) { /* delete old one */ - ip6_del_rt(dev_net(ifp->idev->dev), f6i); + ip6_del_rt(dev_net(ifp->idev->dev), f6i, false); /* add new one */ addrconf_prefix_route(modify_peer ? &ifp->peer_addr : &ifp->addr, @@ -6073,10 +6073,10 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) ifp->idev->dev, 0, 0, false); if (rt) - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); } if (ifp->rt) { - ip6_del_rt(net, ifp->rt); + ip6_del_rt(net, ifp->rt, false); ifp->rt = NULL; } rt_genid_bump_ipv6(net); diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index ea00ce3d4117..9ebf3fe0d2b1 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -185,7 +185,8 @@ static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, return -EAFNOSUPPORT; } -static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt) +static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt, + bool skip_notify) { return -EAFNOSUPPORT; } diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index fed91ab7ec46..893261230ffc 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -364,7 +364,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) ipv6_del_acaddr_hash(aca); addrconf_leave_solict(idev, &aca->aca_addr); - ip6_del_rt(dev_net(idev->dev), aca->aca_rt); + ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false); aca_put(aca); return 0; @@ -393,7 +393,7 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev) addrconf_leave_solict(idev, &aca->aca_addr); - ip6_del_rt(dev_net(idev->dev), aca->aca_rt); + ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false); aca_put(aca); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 1ecd4e9b0bdf..2d09c4da03ee 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1302,7 +1302,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) } } if (rt && lifetime == 0) { - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); rt = NULL; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 310cbddaa533..803212aae4ca 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -984,7 +984,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, gwaddr, dev); if (rt && !lifetime) { - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); rt = NULL; } @@ -3729,9 +3729,12 @@ out: return err; } -int ip6_del_rt(struct net *net, struct fib6_info *rt) +int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify) { - struct nl_info info = { .nl_net = net }; + struct nl_info info = { + .nl_net = net, + .skip_notify = skip_notify + }; return __ip6_del_rt(rt, &info); } @@ -4252,7 +4255,7 @@ restart: (!idev || idev->cnf.accept_ra != 2) && fib6_info_hold_safe(rt)) { rcu_read_unlock(); - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); goto restart; } } @@ -5554,7 +5557,8 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, if (nexthop_is_blackhole(rt->nh)) rtm->rtm_type = RTN_BLACKHOLE; - if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) + if (net->ipv4.sysctl_nexthop_compat_mode && + rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) goto nla_put_failure; rtm->rtm_flags |= nh_flags; diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index b785241127df..dd0e5fec6367 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -19,8 +19,8 @@ ret=0 ksft_skip=4 # all tests in this script. Can be overridden with -t option -IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime" -IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime" +IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_compat_mode" +IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_compat_mode" ALL_TESTS="basic ${IPV4_TESTS} ${IPV6_TESTS}" TESTS="${ALL_TESTS}" @@ -253,6 +253,33 @@ check_route6() check_output "${out}" "${expected}" } +start_ip_monitor() +{ + local mtype=$1 + + # start the monitor in the background + tmpfile=`mktemp /var/run/nexthoptestXXX` + mpid=`($IP monitor $mtype > $tmpfile & echo $!) 2>/dev/null` + sleep 0.2 + echo "$mpid $tmpfile" +} + +stop_ip_monitor() +{ + local mpid=$1 + local tmpfile=$2 + local el=$3 + + # check the monitor results + kill $mpid + lines=`wc -l $tmpfile | cut "-d " -f1` + test $lines -eq $el + rc=$? + rm -rf $tmpfile + + return $rc +} + ################################################################################ # basic operations (add, delete, replace) on nexthops and nexthop groups # @@ -883,6 +910,173 @@ ipv4_fcnal_runtime() log_test $? 0 "IPv4 route with MPLS encap, v6 gw - check" } +sysctl_nexthop_compat_mode_check() +{ + local sysctlname="net.ipv4.nexthop_compat_mode" + local lprefix=$1 + + IPE="ip netns exec me" + + $IPE sysctl -q $sysctlname 2>&1 >/dev/null + if [ $? -ne 0 ]; then + echo "SKIP: kernel lacks nexthop compat mode sysctl control" + return $ksft_skip + fi + + out=$($IPE sysctl $sysctlname 2>/dev/null) + log_test $? 0 "$lprefix default nexthop compat mode check" + check_output "${out}" "$sysctlname = 1" +} + +sysctl_nexthop_compat_mode_set() +{ + local sysctlname="net.ipv4.nexthop_compat_mode" + local mode=$1 + local lprefix=$2 + + IPE="ip netns exec me" + + out=$($IPE sysctl -w $sysctlname=$mode) + log_test $? 0 "$lprefix set compat mode - $mode" + check_output "${out}" "net.ipv4.nexthop_compat_mode = $mode" +} + +ipv6_compat_mode() +{ + local rc + + echo + echo "IPv6 nexthop api compat mode test" + echo "--------------------------------" + + sysctl_nexthop_compat_mode_check "IPv6" + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop add id 122 group 62/63" + ipmout=$(start_ip_monitor route) + + run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 122" + # route add notification should contain expanded nexthops + stop_ip_monitor $ipmout 3 + log_test $? 0 "IPv6 compat mode on - route add notification" + + # route dump should contain expanded nexthops + check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 122 metric 1024 pref medium nexthop via 2001:db8:91::2 dev veth1 weight 1 nexthop via 2001:db8:91::3 dev veth1 weight 1" + log_test $? 0 "IPv6 compat mode on - route dump" + + # change in nexthop group should generate route notification + run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1" + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 62/64" + stop_ip_monitor $ipmout 3 + + log_test $? 0 "IPv6 compat mode on - nexthop change" + + # set compat mode off + sysctl_nexthop_compat_mode_set 0 "IPv6" + + run_cmd "$IP -6 ro del 2001:db8:101::1/128 nhid 122" + + run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop add id 122 group 62/63" + ipmout=$(start_ip_monitor route) + + run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 122" + # route add notification should not contain expanded nexthops + stop_ip_monitor $ipmout 1 + log_test $? 0 "IPv6 compat mode off - route add notification" + + # route dump should not contain expanded nexthops + check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 122 metric 1024 pref medium" + log_test $? 0 "IPv6 compat mode off - route dump" + + # change in nexthop group should not generate route notification + run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1" + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 62/64" + stop_ip_monitor $ipmout 0 + log_test $? 0 "IPv6 compat mode off - nexthop change" + + # nexthop delete should not generate route notification + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop del id 122" + stop_ip_monitor $ipmout 0 + log_test $? 0 "IPv6 compat mode off - nexthop delete" + + # set compat mode back on + sysctl_nexthop_compat_mode_set 1 "IPv6" +} + +ipv4_compat_mode() +{ + local rc + + echo + echo "IPv4 nexthop api compat mode" + echo "----------------------------" + + sysctl_nexthop_compat_mode_check "IPv4" + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + run_cmd "$IP nexthop add id 21 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 22 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 122 group 21/22" + ipmout=$(start_ip_monitor route) + + run_cmd "$IP ro add 172.16.101.1/32 nhid 122" + stop_ip_monitor $ipmout 3 + + # route add notification should contain expanded nexthops + log_test $? 0 "IPv4 compat mode on - route add notification" + + # route dump should contain expanded nexthops + check_route "172.16.101.1" "172.16.101.1 nhid 122 nexthop via 172.16.1.2 dev veth1 weight 1 nexthop via 172.16.1.2 dev veth1 weight 1" + log_test $? 0 "IPv4 compat mode on - route dump" + + # change in nexthop group should generate route notification + run_cmd "$IP nexthop add id 23 via 172.16.1.3 dev veth1" + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 21/23" + stop_ip_monitor $ipmout 3 + log_test $? 0 "IPv4 compat mode on - nexthop change" + + sysctl_nexthop_compat_mode_set 0 "IPv4" + + # cleanup + run_cmd "$IP ro del 172.16.101.1/32 nhid 122" + + ipmout=$(start_ip_monitor route) + run_cmd "$IP ro add 172.16.101.1/32 nhid 122" + stop_ip_monitor $ipmout 1 + # route add notification should not contain expanded nexthops + log_test $? 0 "IPv4 compat mode off - route add notification" + + # route dump should not contain expanded nexthops + check_route "172.16.101.1" "172.16.101.1 nhid 122" + log_test $? 0 "IPv4 compat mode off - route dump" + + # change in nexthop group should not generate route notification + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 21/22" + stop_ip_monitor $ipmout 0 + log_test $? 0 "IPv4 compat mode off - nexthop change" + + # nexthop delete should not generate route notification + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop del id 122" + stop_ip_monitor $ipmout 0 + log_test $? 0 "IPv4 compat mode off - nexthop delete" + + sysctl_nexthop_compat_mode_set 1 "IPv4" +} + basic() { echo