diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 360b12e61850..323c94f1845b 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -173,6 +173,9 @@ struct fib6_info { refcount_t fib6_ref; unsigned long expires; + + struct hlist_node gc_link; + struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] @@ -241,12 +244,18 @@ static inline bool fib6_requires_src(const struct fib6_info *rt) return rt->fib6_src.plen > 0; } +/* The callers should hold f6i->fib6_table->tb6_lock if a route has ever + * been added to a table before. + */ static inline void fib6_clean_expires(struct fib6_info *f6i) { f6i->fib6_flags &= ~RTF_EXPIRES; f6i->expires = 0; } +/* The callers should hold f6i->fib6_table->tb6_lock if a route has ever + * been added to a table before. + */ static inline void fib6_set_expires(struct fib6_info *f6i, unsigned long expires) { @@ -327,8 +336,10 @@ static inline bool fib6_info_hold_safe(struct fib6_info *f6i) static inline void fib6_info_release(struct fib6_info *f6i) { - if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) + if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) { + DEBUG_NET_WARN_ON_ONCE(!hlist_unhashed(&f6i->gc_link)); call_rcu(&f6i->rcu, fib6_info_destroy_rcu); + } } enum fib6_walk_state { @@ -382,6 +393,7 @@ struct fib6_table { struct inet_peer_base tb6_peers; unsigned int flags; unsigned int fib_seq; + struct hlist_head tb6_gc_hlist; /* GC candidates */ #define RT6_TABLE_HAS_DFLT_ROUTER BIT(0) }; @@ -498,6 +510,38 @@ void fib6_gc_cleanup(void); int fib6_init(void); +/* Add the route to the gc list if it is not already there + * + * The callers should hold f6i->fib6_table->tb6_lock. + */ +static inline void fib6_add_gc_list(struct fib6_info *f6i) +{ + /* If fib6_node is null, the f6i is not in (or removed from) the + * table. + * + * There is a gap between finding the f6i from the table and + * calling this function without the protection of the tb6_lock. + * This check makes sure the f6i is not added to the gc list when + * it is not on the table. + */ + if (!rcu_dereference_protected(f6i->fib6_node, + lockdep_is_held(&f6i->fib6_table->tb6_lock))) + return; + + if (hlist_unhashed(&f6i->gc_link)) + hlist_add_head(&f6i->gc_link, &f6i->fib6_table->tb6_gc_hlist); +} + +/* Remove the route from the gc list if it is on the list. + * + * The callers should hold f6i->fib6_table->tb6_lock. + */ +static inline void fib6_remove_gc_list(struct fib6_info *f6i) +{ + if (!hlist_unhashed(&f6i->gc_link)) + hlist_del_init(&f6i->gc_link); +} + struct ipv6_route_iter { struct seq_net_private p; struct fib6_walker w; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 28b065790261..52a51c69aa9d 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -170,7 +170,8 @@ struct fib6_info *rt6_get_dflt_router(struct net *net, struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref, - u32 defrtr_usr_metric); + u32 defrtr_usr_metric, + int lifetime); void rt6_purge_dflt_routers(struct net *net); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d63f5d063f07..ca1b719323c0 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1255,6 +1255,7 @@ static void cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt, bool del_peer) { + struct fib6_table *table; struct fib6_info *f6i; f6i = addrconf_get_prefix_route(del_peer ? &ifp->peer_addr : &ifp->addr, @@ -1264,8 +1265,15 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, if (del_rt) ip6_del_rt(dev_net(ifp->idev->dev), f6i, false); else { - if (!(f6i->fib6_flags & RTF_EXPIRES)) + if (!(f6i->fib6_flags & RTF_EXPIRES)) { + table = f6i->fib6_table; + spin_lock_bh(&table->tb6_lock); + fib6_set_expires(f6i, expires); + fib6_add_gc_list(f6i); + + spin_unlock_bh(&table->tb6_lock); + } fib6_info_release(f6i); } } @@ -2706,6 +2714,7 @@ EXPORT_SYMBOL_GPL(addrconf_prefix_rcv_add_addr); void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) { struct prefix_info *pinfo; + struct fib6_table *table; __u32 valid_lft; __u32 prefered_lft; int addr_type, err; @@ -2782,11 +2791,20 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) if (valid_lft == 0) { ip6_del_rt(net, rt, false); rt = NULL; - } else if (addrconf_finite_timeout(rt_expires)) { - /* not infinity */ - fib6_set_expires(rt, jiffies + rt_expires); } else { - fib6_clean_expires(rt); + table = rt->fib6_table; + spin_lock_bh(&table->tb6_lock); + + if (addrconf_finite_timeout(rt_expires)) { + /* not infinity */ + fib6_set_expires(rt, jiffies + rt_expires); + fib6_add_gc_list(rt); + } else { + fib6_clean_expires(rt); + fib6_remove_gc_list(rt); + } + + spin_unlock_bh(&table->tb6_lock); } } else if (valid_lft) { clock_t expires = 0; @@ -4741,6 +4759,7 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, u32 flags, bool modify_peer) { + struct fib6_table *table; struct fib6_info *f6i; u32 prio; @@ -4761,10 +4780,18 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, ifp->rt_priority, ifp->idev->dev, expires, flags, GFP_KERNEL); } else { - if (!expires) + table = f6i->fib6_table; + spin_lock_bh(&table->tb6_lock); + + if (!(flags & RTF_EXPIRES)) { fib6_clean_expires(f6i); - else + fib6_remove_gc_list(f6i); + } else { fib6_set_expires(f6i, expires); + fib6_add_gc_list(f6i); + } + + spin_unlock_bh(&table->tb6_lock); fib6_info_release(f6i); } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 38a0348b1d17..805bbf26b3ef 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -160,6 +160,8 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); + INIT_HLIST_NODE(&f6i->gc_link); + return f6i; } @@ -246,6 +248,7 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); + INIT_HLIST_HEAD(&table->tb6_gc_hlist); } return table; @@ -1055,6 +1058,9 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, lockdep_is_held(&table->tb6_lock)); } } + + fib6_clean_expires(rt); + fib6_remove_gc_list(rt); } /* @@ -1115,10 +1121,13 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, rt->fib6_nsiblings = 0; if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; - if (!(rt->fib6_flags & RTF_EXPIRES)) + if (!(rt->fib6_flags & RTF_EXPIRES)) { fib6_clean_expires(iter); - else + fib6_remove_gc_list(iter); + } else { fib6_set_expires(iter, rt->expires); + fib6_add_gc_list(iter); + } if (rt->fib6_pmtu) fib6_metric_set(iter, RTAX_MTU, @@ -1477,6 +1486,10 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, if (rt->nh) list_add(&rt->nh_list, &rt->nh->f6i_list); __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); + + if (rt->fib6_flags & RTF_EXPIRES) + fib6_add_gc_list(rt); + fib6_start_gc(info->nl_net, rt); } @@ -2280,9 +2293,8 @@ static void fib6_flush_trees(struct net *net) * Garbage collection */ -static int fib6_age(struct fib6_info *rt, void *arg) +static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args) { - struct fib6_gc_args *gc_args = arg; unsigned long now = jiffies; /* @@ -2307,6 +2319,42 @@ static int fib6_age(struct fib6_info *rt, void *arg) return 0; } +static void fib6_gc_table(struct net *net, + struct fib6_table *tb6, + struct fib6_gc_args *gc_args) +{ + struct fib6_info *rt; + struct hlist_node *n; + struct nl_info info = { + .nl_net = net, + .skip_notify = false, + }; + + hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link) + if (fib6_age(rt, gc_args) == -1) + fib6_del(rt, &info); +} + +static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args) +{ + struct fib6_table *table; + struct hlist_head *head; + unsigned int h; + + rcu_read_lock(); + for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { + head = &net->ipv6.fib_table_hash[h]; + hlist_for_each_entry_rcu(table, head, tb6_hlist) { + spin_lock_bh(&table->tb6_lock); + + fib6_gc_table(net, table, gc_args); + + spin_unlock_bh(&table->tb6_lock); + } + } + rcu_read_unlock(); +} + void fib6_run_gc(unsigned long expires, struct net *net, bool force) { struct fib6_gc_args gc_args; @@ -2322,7 +2370,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) net->ipv6.sysctl.ip6_rt_gc_interval; gc_args.more = 0; - fib6_clean_all(net, fib6_age, &gc_args); + fib6_gc_all(net, &gc_args); now = jiffies; net->ipv6.ip6_rt_last_gc = now; @@ -2382,6 +2430,7 @@ static int __net_init fib6_net_init(struct net *net) net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); + INIT_HLIST_HEAD(&net->ipv6.fib6_main_tbl->tb6_gc_hlist); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl), @@ -2394,6 +2443,7 @@ static int __net_init fib6_net_init(struct net *net) net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); + INIT_HLIST_HEAD(&net->ipv6.fib6_local_tbl->tb6_gc_hlist); #endif fib6_tables_init(net); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index a19999b30bc0..73cb31afe935 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1237,6 +1237,7 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) struct ndisc_options ndopts; struct fib6_info *rt = NULL; struct inet6_dev *in6_dev; + struct fib6_table *table; u32 defrtr_usr_metric; unsigned int pref = 0; __u32 old_if_flags; @@ -1382,7 +1383,8 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) neigh_release(neigh); rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr, - skb->dev, pref, defrtr_usr_metric); + skb->dev, pref, defrtr_usr_metric, + lifetime); if (!rt) { ND_PRINTK(0, err, "RA: %s failed to add default route\n", @@ -1409,8 +1411,15 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) inet6_rt_notify(RTM_NEWROUTE, rt, &nlinfo, NLM_F_REPLACE); } - if (rt) + if (rt) { + table = rt->fib6_table; + spin_lock_bh(&table->tb6_lock); + fib6_set_expires(rt, jiffies + (HZ * lifetime)); + fib6_add_gc_list(rt); + + spin_unlock_bh(&table->tb6_lock); + } if (in6_dev->cnf.accept_ra_min_hop_limit < 256 && ra_msg->icmph.icmp6_hop_limit) { if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 63b4c6056582..707d65bc9c0e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -931,6 +931,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, struct net *net = dev_net(dev); struct route_info *rinfo = (struct route_info *) opt; struct in6_addr prefix_buf, *prefix; + struct fib6_table *table; unsigned int pref; unsigned long lifetime; struct fib6_info *rt; @@ -989,10 +990,18 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); if (rt) { - if (!addrconf_finite_timeout(lifetime)) + table = rt->fib6_table; + spin_lock_bh(&table->tb6_lock); + + if (!addrconf_finite_timeout(lifetime)) { fib6_clean_expires(rt); - else + fib6_remove_gc_list(rt); + } else { fib6_set_expires(rt, jiffies + HZ * lifetime); + fib6_add_gc_list(rt); + } + + spin_unlock_bh(&table->tb6_lock); fib6_info_release(rt); } @@ -3765,8 +3774,6 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, if (cfg->fc_flags & RTF_EXPIRES) fib6_set_expires(rt, jiffies + clock_t_to_jiffies(cfg->fc_expires)); - else - fib6_clean_expires(rt); if (cfg->fc_protocol == RTPROT_UNSPEC) cfg->fc_protocol = RTPROT_BOOT; @@ -4355,7 +4362,8 @@ struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref, - u32 defrtr_usr_metric) + u32 defrtr_usr_metric, + int lifetime) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, @@ -4368,6 +4376,7 @@ struct fib6_info *rt6_add_dflt_router(struct net *net, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, + .fc_expires = jiffies_to_clock_t(lifetime * HZ), }; cfg.fc_gateway = *gwaddr; diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index b3ecccbbfcd2..3ec1050e47a2 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -743,6 +743,43 @@ fib_notify_test() cleanup &> /dev/null } +# Create a new dummy_10 to remove all associated routes. +reset_dummy_10() +{ + $IP link del dev dummy_10 + + $IP link add dummy_10 type dummy + $IP link set dev dummy_10 up + $IP -6 address add 2001:10::1/64 dev dummy_10 +} + +check_rt_num() +{ + local expected=$1 + local num=$2 + + if [ $num -ne $expected ]; then + echo "FAIL: Expected $expected routes, got $num" + ret=1 + else + ret=0 + fi +} + +check_rt_num_clean() +{ + local expected=$1 + local num=$2 + + if [ $num -ne $expected ]; then + log_test 1 0 "expected $expected routes, got $num" + set +e + cleanup &> /dev/null + return 1 + fi + return 0 +} + fib6_gc_test() { setup @@ -751,7 +788,7 @@ fib6_gc_test() echo "Fib6 garbage collection test" set -e - EXPIRE=3 + EXPIRE=5 # Check expiration of routes every $EXPIRE seconds (GC) $NS_EXEC sysctl -wq net.ipv6.route.gc_interval=$EXPIRE @@ -763,44 +800,114 @@ fib6_gc_test() $NS_EXEC sysctl -wq net.ipv6.route.flush=1 # Temporary routes - for i in $(seq 1 1000); do + for i in $(seq 1 5); do # Expire route after $EXPIRE seconds $IP -6 route add 2001:20::$i \ via 2001:10::2 dev dummy_10 expires $EXPIRE done - sleep $(($EXPIRE * 2)) - N_EXP_SLEEP=$($IP -6 route list |grep expires|wc -l) - if [ $N_EXP_SLEEP -ne 0 ]; then - echo "FAIL: expected 0 routes with expires, got $N_EXP_SLEEP" - ret=1 - else - ret=0 - fi + sleep $(($EXPIRE * 2 + 1)) + $NS_EXEC sysctl -wq net.ipv6.route.flush=1 + check_rt_num 0 $($IP -6 route list |grep expires|wc -l) + log_test $ret 0 "ipv6 route garbage collection" + + reset_dummy_10 # Permanent routes - for i in $(seq 1 5000); do + for i in $(seq 1 5); do $IP -6 route add 2001:30::$i \ via 2001:10::2 dev dummy_10 done # Temporary routes - for i in $(seq 1 1000); do + for i in $(seq 1 5); do # Expire route after $EXPIRE seconds $IP -6 route add 2001:20::$i \ via 2001:10::2 dev dummy_10 expires $EXPIRE done - sleep $(($EXPIRE * 2)) - N_EXP_SLEEP=$($IP -6 route list |grep expires|wc -l) - if [ $N_EXP_SLEEP -ne 0 ]; then - echo "FAIL: expected 0 routes with expires," \ - "got $N_EXP_SLEEP (5000 permanent routes)" - ret=1 - else - ret=0 + sleep $(($EXPIRE * 2 + 1)) + check_rt_num 0 $($IP -6 route list |grep expires|wc -l) + log_test $ret 0 "ipv6 route garbage collection (with permanent routes)" + + reset_dummy_10 + + # Permanent routes + for i in $(seq 1 5); do + $IP -6 route add 2001:20::$i \ + via 2001:10::2 dev dummy_10 + done + # Replace with temporary routes + for i in $(seq 1 5); do + # Expire route after $EXPIRE seconds + $IP -6 route replace 2001:20::$i \ + via 2001:10::2 dev dummy_10 expires $EXPIRE + done + check_rt_num_clean 5 $($IP -6 route list |grep expires|wc -l) || return + # Wait for GC + sleep $(($EXPIRE * 2 + 1)) + $NS_EXEC sysctl -wq net.ipv6.route.flush=1 + check_rt_num 0 $($IP -6 route list |grep expires|wc -l) + log_test $ret 0 "ipv6 route garbage collection (replace with expires)" + + reset_dummy_10 + + # Temporary routes + for i in $(seq 1 5); do + # Expire route after $EXPIRE seconds + $IP -6 route add 2001:20::$i \ + via 2001:10::2 dev dummy_10 expires $EXPIRE + done + # Replace with permanent routes + for i in $(seq 1 5); do + $IP -6 route replace 2001:20::$i \ + via 2001:10::2 dev dummy_10 + done + check_rt_num_clean 0 $($IP -6 route list |grep expires|wc -l) || return + + # Wait for GC + sleep $(($EXPIRE * 2 + 1)) + + check_rt_num 5 $($IP -6 route list |grep -v expires|grep 2001:20::|wc -l) + log_test $ret 0 "ipv6 route garbage collection (replace with permanent)" + + # ra6 is required for the next test. (ipv6toolkit) + if [ ! -x "$(command -v ra6)" ]; then + echo "SKIP: ra6 not found." + set +e + cleanup &> /dev/null + return fi - set +e + # Delete dummy_10 and remove all routes + $IP link del dev dummy_10 - log_test $ret 0 "ipv6 route garbage collection" + # Create a pair of veth devices to send a RA message from one + # device to another. + $IP link add veth1 type veth peer name veth2 + $IP link set dev veth1 up + $IP link set dev veth2 up + $IP -6 address add 2001:10::1/64 dev veth1 nodad + $IP -6 address add 2001:10::2/64 dev veth2 nodad + + # Make veth1 ready to receive RA messages. + $NS_EXEC sysctl -wq net.ipv6.conf.veth1.accept_ra=2 + + # Send a RA message with a route from veth2 to veth1. + $NS_EXEC ra6 -i veth2 -d 2001:10::1 -t $EXPIRE + + # Wait for the RA message. + sleep 1 + + # systemd may mess up the test. You syould make sure that + # systemd-networkd.service and systemd-networkd.socket are stopped. + check_rt_num_clean 1 $($IP -6 route list|grep expires|wc -l) || return + + # Wait for GC + sleep $(($EXPIRE * 2 + 1)) + + $NS_EXEC sysctl -wq net.ipv6.route.flush=1 + check_rt_num 0 $($IP -6 route list |grep expires|wc -l) + log_test $ret 0 "ipv6 route garbage collection (RA message)" + + set +e cleanup &> /dev/null }