From 911362c70df5b766c243dc297fadeaced786ffd8 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 12 Feb 2016 15:43:53 +0100 Subject: [PATCH 1/7] net: add dst_cache support This patch add a generic, lockless dst cache implementation. The need for lock is avoided updating the dst cache fields only in per cpu scope, and requiring that the cache manipulation functions are invoked with the local bh disabled. The refresh_ts and reset_ts fields are used to ensure the cache consistency in case of cuncurrent cache update (dst_cache_set*) and reset operation (dst_cache_reset). Consider the following scenario: CPU1: CPU2: dst_cache_reset() dst_cache_set() The dst entry set passed to dst_cache_set() should not be used for later dst cache lookup, because it's obtained using old configuration values. Since the refresh_ts is updated only on dst_cache lookup, the cached value in the above scenario will be discarded on the next lookup. Signed-off-by: Paolo Abeni Suggested-and-acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/dst_cache.h | 97 +++++++++++++++++++++++ net/Kconfig | 4 + net/core/Makefile | 1 + net/core/dst_cache.c | 168 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 270 insertions(+) create mode 100644 include/net/dst_cache.h create mode 100644 net/core/dst_cache.c diff --git a/include/net/dst_cache.h b/include/net/dst_cache.h new file mode 100644 index 000000000000..151accae708b --- /dev/null +++ b/include/net/dst_cache.h @@ -0,0 +1,97 @@ +#ifndef _NET_DST_CACHE_H +#define _NET_DST_CACHE_H + +#include +#include +#if IS_ENABLED(CONFIG_IPV6) +#include +#endif + +struct dst_cache { + struct dst_cache_pcpu __percpu *cache; + unsigned long reset_ts; +}; + +/** + * dst_cache_get - perform cache lookup + * @dst_cache: the cache + * + * The caller should use dst_cache_get_ip4() if it need to retrieve the + * source address to be used when xmitting to the cached dst. + * local BH must be disabled. + */ +struct dst_entry *dst_cache_get(struct dst_cache *dst_cache); + +/** + * dst_cache_get_ip4 - perform cache lookup and fetch ipv4 source address + * @dst_cache: the cache + * @saddr: return value for the retrieved source address + * + * local BH must be disabled. + */ +struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr); + +/** + * dst_cache_set_ip4 - store the ipv4 dst into the cache + * @dst_cache: the cache + * @dst: the entry to be cached + * @saddr: the source address to be stored inside the cache + * + * local BH must be disabled. + */ +void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, + __be32 saddr); + +#if IS_ENABLED(CONFIG_IPV6) + +/** + * dst_cache_set_ip6 - store the ipv6 dst into the cache + * @dst_cache: the cache + * @dst: the entry to be cached + * @saddr: the source address to be stored inside the cache + * + * local BH must be disabled. + */ +void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, + const struct in6_addr *addr); + +/** + * dst_cache_get_ip6 - perform cache lookup and fetch ipv6 source address + * @dst_cache: the cache + * @saddr: return value for the retrieved source address + * + * local BH must be disabled. + */ +struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, + struct in6_addr *saddr); +#endif + +/** + * dst_cache_reset - invalidate the cache contents + * @dst_cache: the cache + * + * This do not free the cached dst to avoid races and contentions. + * the dst will be freed on later cache lookup. + */ +static inline void dst_cache_reset(struct dst_cache *dst_cache) +{ + dst_cache->reset_ts = jiffies; +} + +/** + * dst_cache_init - initialize the cache, allocating the required storage + * @dst_cache: the cache + * @gfp: allocation flags + */ +int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp); + +/** + * dst_cache_destroy - empty the cache and free the allocated storage + * @dst_cache: the cache + * + * No synchronization is enforced: it must be called only when the cache + * is unsed. + */ +void dst_cache_destroy(struct dst_cache *dst_cache); + +#endif diff --git a/net/Kconfig b/net/Kconfig index 174354618f8a..b80efecfc1a0 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -392,6 +392,10 @@ config LWTUNNEL weight tunnel endpoint. Tunnel encapsulation parameters are stored with light weight tunnel state associated with fib routes. +config DST_CACHE + bool "dst cache" + default n + endif # if NET # Used by archs to tell that they support BPF_JIT diff --git a/net/core/Makefile b/net/core/Makefile index 0b835de04de3..7a8fb8aef992 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -24,3 +24,4 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o +obj-$(CONFIG_DST_CACHE) += dst_cache.o diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c new file mode 100644 index 000000000000..3938f3f38d69 --- /dev/null +++ b/net/core/dst_cache.c @@ -0,0 +1,168 @@ +/* + * net/core/dst_cache.c - dst entry cache + * + * Copyright (c) 2016 Paolo Abeni + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#if IS_ENABLED(CONFIG_IPV6) +#include +#endif +#include + +struct dst_cache_pcpu { + unsigned long refresh_ts; + struct dst_entry *dst; + u32 cookie; + union { + struct in_addr in_saddr; + struct in6_addr in6_saddr; + }; +}; + +void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache, + struct dst_entry *dst, u32 cookie) +{ + dst_release(dst_cache->dst); + if (dst) + dst_hold(dst); + + dst_cache->cookie = cookie; + dst_cache->dst = dst; +} + +struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, + struct dst_cache_pcpu *idst) +{ + struct dst_entry *dst; + + dst = idst->dst; + if (!dst) + goto fail; + + /* the cache already hold a dst reference; it can't go away */ + dst_hold(dst); + + if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) || + (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) { + dst_cache_per_cpu_dst_set(idst, NULL, 0); + dst_release(dst); + goto fail; + } + return dst; + +fail: + idst->refresh_ts = jiffies; + return NULL; +} + +struct dst_entry *dst_cache_get(struct dst_cache *dst_cache) +{ + if (!dst_cache->cache) + return NULL; + + return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); +} +EXPORT_SYMBOL_GPL(dst_cache_get); + +struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) +{ + struct dst_cache_pcpu *idst; + struct dst_entry *dst; + + if (!dst_cache->cache) + return NULL; + + idst = this_cpu_ptr(dst_cache->cache); + dst = dst_cache_per_cpu_get(dst_cache, idst); + if (!dst) + return NULL; + + *saddr = idst->in_saddr.s_addr; + return container_of(dst, struct rtable, dst); +} +EXPORT_SYMBOL_GPL(dst_cache_get_ip4); + +void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, + __be32 saddr) +{ + struct dst_cache_pcpu *idst; + + if (!dst_cache->cache) + return; + + idst = this_cpu_ptr(dst_cache->cache); + dst_cache_per_cpu_dst_set(idst, dst, 0); + idst->in_saddr.s_addr = saddr; +} +EXPORT_SYMBOL_GPL(dst_cache_set_ip4); + +#if IS_ENABLED(CONFIG_IPV6) +void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, + const struct in6_addr *addr) +{ + struct dst_cache_pcpu *idst; + + if (!dst_cache->cache) + return; + + idst = this_cpu_ptr(dst_cache->cache); + dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst, + rt6_get_cookie((struct rt6_info *)dst)); + idst->in6_saddr = *addr; +} +EXPORT_SYMBOL_GPL(dst_cache_set_ip6); + +struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, + struct in6_addr *saddr) +{ + struct dst_cache_pcpu *idst; + struct dst_entry *dst; + + if (!dst_cache->cache) + return NULL; + + idst = this_cpu_ptr(dst_cache->cache); + dst = dst_cache_per_cpu_get(dst_cache, idst); + if (!dst) + return NULL; + + *saddr = idst->in6_saddr; + return dst; +} +EXPORT_SYMBOL_GPL(dst_cache_get_ip6); +#endif + +int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp) +{ + dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu, + gfp | __GFP_ZERO); + if (!dst_cache->cache) + return -ENOMEM; + + dst_cache_reset(dst_cache); + return 0; +} +EXPORT_SYMBOL_GPL(dst_cache_init); + +void dst_cache_destroy(struct dst_cache *dst_cache) +{ + int i; + + if (!dst_cache->cache) + return; + + for_each_possible_cpu(i) + dst_release(per_cpu_ptr(dst_cache->cache, i)->dst); + + free_percpu(dst_cache->cache); +} +EXPORT_SYMBOL_GPL(dst_cache_destroy); From 607f725f6f7d5ec3759fbc16224afb60e2152a5b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 12 Feb 2016 15:43:54 +0100 Subject: [PATCH 2/7] net: replace dst_cache ip6_tunnel implementation with the generic one This also fix a potential race into the existing tunnel code, which could lead to the wrong dst to be permanenty cached: CPU1: CPU2: dst = ip6_route_output(...) dst_cache_reset() // no effect, // the cache is empty dst_cache_set() // the wrong dst // is permanenty stored // into the cache With the new dst implementation the above race is not possible since the first cache lookup after dst_cache_reset will fail due to the timestamp check Signed-off-by: Paolo Abeni Suggested-and-acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 14 +----- net/ipv6/Kconfig | 1 + net/ipv6/ip6_gre.c | 12 ++--- net/ipv6/ip6_tunnel.c | 103 +++------------------------------------ net/ipv6/ip6_vti.c | 2 +- 5 files changed, 16 insertions(+), 116 deletions(-) diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 0d0ce0b2d870..499a707765ea 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -6,6 +6,7 @@ #include #include #include +#include #define IP6TUNNEL_ERR_TIMEO (30*HZ) @@ -33,12 +34,6 @@ struct __ip6_tnl_parm { __be32 o_key; }; -struct ip6_tnl_dst { - seqlock_t lock; - struct dst_entry __rcu *dst; - u32 cookie; -}; - /* IPv6 tunnel */ struct ip6_tnl { struct ip6_tnl __rcu *next; /* next tunnel in list */ @@ -46,7 +41,7 @@ struct ip6_tnl { struct net *net; /* netns for packet i/o */ struct __ip6_tnl_parm parms; /* tunnel configuration parameters */ struct flowi fl; /* flowi template for xmit */ - struct ip6_tnl_dst __percpu *dst_cache; /* cached dst */ + struct dst_cache dst_cache; /* cached dst */ int err_count; unsigned long err_time; @@ -66,11 +61,6 @@ struct ipv6_tlv_tnl_enc_lim { __u8 encap_limit; /* tunnel encapsulation limit */ } __packed; -struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t); -int ip6_tnl_dst_init(struct ip6_tnl *t); -void ip6_tnl_dst_destroy(struct ip6_tnl *t); -void ip6_tnl_dst_reset(struct ip6_tnl *t); -void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst); int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr); int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 40c897515ddc..11e875ffd7ac 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -207,6 +207,7 @@ config IPV6_NDISC_NODETYPE config IPV6_TUNNEL tristate "IPv6: IP-in-IPv6 tunnel (RFC2473)" select INET6_TUNNEL + select DST_CACHE ---help--- Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in RFC 2473. diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index f37f18b6b40c..a94e50602813 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -360,7 +360,7 @@ static void ip6gre_tunnel_uninit(struct net_device *dev) struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id); ip6gre_tunnel_unlink(ign, t); - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); dev_put(dev); } @@ -633,7 +633,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, } if (!fl6->flowi6_mark) - dst = ip6_tnl_dst_get(tunnel); + dst = dst_cache_get(&tunnel->dst_cache); if (!dst) { dst = ip6_route_output(net, NULL, fl6); @@ -702,7 +702,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, } if (!fl6->flowi6_mark && ndst) - ip6_tnl_dst_set(tunnel, ndst); + dst_cache_set_ip6(&tunnel->dst_cache, ndst, &fl6->saddr); skb_dst_set(skb, dst); proto = NEXTHDR_GRE; @@ -1009,7 +1009,7 @@ static int ip6gre_tnl_change(struct ip6_tnl *t, t->parms.o_key = p->o_key; t->parms.i_flags = p->i_flags; t->parms.o_flags = p->o_flags; - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); ip6gre_tnl_link_config(t, set_mtu); return 0; } @@ -1219,7 +1219,7 @@ static void ip6gre_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - ip6_tnl_dst_destroy(t); + dst_cache_destroy(&t->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -1257,7 +1257,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) if (!dev->tstats) return -ENOMEM; - ret = ip6_tnl_dst_init(tunnel); + ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (ret) { free_percpu(dev->tstats); dev->tstats = NULL; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 137fca42aaa6..3f3aabd2f07b 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -122,97 +122,6 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) return &dev->stats; } -/* - * Locking : hash tables are protected by RCU and RTNL - */ - -static void ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, - struct dst_entry *dst) -{ - write_seqlock_bh(&idst->lock); - dst_release(rcu_dereference_protected( - idst->dst, - lockdep_is_held(&idst->lock.lock))); - if (dst) { - dst_hold(dst); - idst->cookie = rt6_get_cookie((struct rt6_info *)dst); - } else { - idst->cookie = 0; - } - rcu_assign_pointer(idst->dst, dst); - write_sequnlock_bh(&idst->lock); -} - -struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t) -{ - struct ip6_tnl_dst *idst; - struct dst_entry *dst; - unsigned int seq; - u32 cookie; - - idst = raw_cpu_ptr(t->dst_cache); - - rcu_read_lock(); - do { - seq = read_seqbegin(&idst->lock); - dst = rcu_dereference(idst->dst); - cookie = idst->cookie; - } while (read_seqretry(&idst->lock, seq)); - - if (dst && !atomic_inc_not_zero(&dst->__refcnt)) - dst = NULL; - rcu_read_unlock(); - - if (dst && dst->obsolete && !dst->ops->check(dst, cookie)) { - ip6_tnl_per_cpu_dst_set(idst, NULL); - dst_release(dst); - dst = NULL; - } - return dst; -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_get); - -void ip6_tnl_dst_reset(struct ip6_tnl *t) -{ - int i; - - for_each_possible_cpu(i) - ip6_tnl_per_cpu_dst_set(per_cpu_ptr(t->dst_cache, i), NULL); -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset); - -void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst) -{ - ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), dst); - -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_set); - -void ip6_tnl_dst_destroy(struct ip6_tnl *t) -{ - if (!t->dst_cache) - return; - - ip6_tnl_dst_reset(t); - free_percpu(t->dst_cache); -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_destroy); - -int ip6_tnl_dst_init(struct ip6_tnl *t) -{ - int i; - - t->dst_cache = alloc_percpu(struct ip6_tnl_dst); - if (!t->dst_cache) - return -ENOMEM; - - for_each_possible_cpu(i) - seqlock_init(&per_cpu_ptr(t->dst_cache, i)->lock); - - return 0; -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_init); - /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses * @remote: the address of the tunnel exit-point @@ -329,7 +238,7 @@ static void ip6_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - ip6_tnl_dst_destroy(t); + dst_cache_destroy(&t->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -462,7 +371,7 @@ ip6_tnl_dev_uninit(struct net_device *dev) RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); else ip6_tnl_unlink(ip6n, t); - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); dev_put(dev); } @@ -1069,7 +978,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } else if (!fl6->flowi6_mark) - dst = ip6_tnl_dst_get(t); + dst = dst_cache_get(&t->dst_cache); if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr)) goto tx_err_link_failure; @@ -1133,7 +1042,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, } if (!fl6->flowi6_mark && ndst) - ip6_tnl_dst_set(t, ndst); + dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); skb_dst_set(skb, dst); skb->transport_header = skb->network_header; @@ -1366,7 +1275,7 @@ ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) t->parms.flowinfo = p->flowinfo; t->parms.link = p->link; t->parms.proto = p->proto; - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); ip6_tnl_link_config(t); return 0; } @@ -1637,7 +1546,7 @@ ip6_tnl_dev_init_gen(struct net_device *dev) if (!dev->tstats) return -ENOMEM; - ret = ip6_tnl_dst_init(t); + ret = dst_cache_init(&t->dst_cache, GFP_KERNEL); if (ret) { free_percpu(dev->tstats); dev->tstats = NULL; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 0a8610b33d79..d90a11f14040 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -640,7 +640,7 @@ vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) t->parms.i_key = p->i_key; t->parms.o_key = p->o_key; t->parms.proto = p->proto; - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); vti6_link_config(t); return 0; } From e09acddf873bf775b208b452a4c3a3fd26fa9427 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 12 Feb 2016 15:43:55 +0100 Subject: [PATCH 3/7] ip_tunnel: replace dst_cache with generic implementation The current ip_tunnel cache implementation is prone to a race that will cause the wrong dst to be cached on cuncurrent dst cache miss and ip tunnel update via netlink. Replacing with the generic implementation fix the issue. Signed-off-by: Paolo Abeni Suggested-and-acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 9 ++--- net/ipv4/Kconfig | 1 + net/ipv4/ip_tunnel.c | 78 +++++++--------------------------------- net/ipv6/sit.c | 17 ++++----- 4 files changed, 25 insertions(+), 80 deletions(-) diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index bc439f32baa9..fd36936d85a6 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -13,6 +13,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include @@ -85,11 +86,6 @@ struct ip_tunnel_prl_entry { struct rcu_head rcu_head; }; -struct ip_tunnel_dst { - struct dst_entry __rcu *dst; - __be32 saddr; -}; - struct metadata_dst; struct ip_tunnel { @@ -108,7 +104,7 @@ struct ip_tunnel { int tun_hlen; /* Precalculated header length */ int mlink; - struct ip_tunnel_dst __percpu *dst_cache; + struct dst_cache dst_cache; struct ip_tunnel_parm parms; @@ -247,7 +243,6 @@ int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p); void ip_tunnel_setup(struct net_device *dev, int net_id); -void ip_tunnel_dst_reset_all(struct ip_tunnel *t); int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap); diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 775824720b6b..395d82754626 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -186,6 +186,7 @@ config NET_IPGRE_DEMUX config NET_IP_TUNNEL tristate + select DST_CACHE default n config NET_IPGRE diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index c7bd72e9b544..4569da7dfa88 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -68,61 +68,6 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote) IP_TNL_HASH_BITS); } -static void __tunnel_dst_set(struct ip_tunnel_dst *idst, - struct dst_entry *dst, __be32 saddr) -{ - struct dst_entry *old_dst; - - dst_clone(dst); - old_dst = xchg((__force struct dst_entry **)&idst->dst, dst); - dst_release(old_dst); - idst->saddr = saddr; -} - -static noinline void tunnel_dst_set(struct ip_tunnel *t, - struct dst_entry *dst, __be32 saddr) -{ - __tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr); -} - -static void tunnel_dst_reset(struct ip_tunnel *t) -{ - tunnel_dst_set(t, NULL, 0); -} - -void ip_tunnel_dst_reset_all(struct ip_tunnel *t) -{ - int i; - - for_each_possible_cpu(i) - __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0); -} -EXPORT_SYMBOL(ip_tunnel_dst_reset_all); - -static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, - u32 cookie, __be32 *saddr) -{ - struct ip_tunnel_dst *idst; - struct dst_entry *dst; - - rcu_read_lock(); - idst = raw_cpu_ptr(t->dst_cache); - dst = rcu_dereference(idst->dst); - if (dst && !atomic_inc_not_zero(&dst->__refcnt)) - dst = NULL; - if (dst) { - if (!dst->obsolete || dst->ops->check(dst, cookie)) { - *saddr = idst->saddr; - } else { - tunnel_dst_reset(t); - dst_release(dst); - dst = NULL; - } - } - rcu_read_unlock(); - return (struct rtable *)dst; -} - static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p, __be16 flags, __be32 key) { @@ -381,7 +326,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev) if (!IS_ERR(rt)) { tdev = rt->dst.dev; - tunnel_dst_set(tunnel, &rt->dst, fl4.saddr); + dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, + fl4.saddr); ip_rt_put(rt); } if (dev->type != ARPHRD_ETHER) @@ -729,7 +675,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) goto tx_error; - rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL; + rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) : + NULL; if (!rt) { rt = ip_route_output_key(tunnel->net, &fl4); @@ -739,7 +686,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } if (connected) - tunnel_dst_set(tunnel, &rt->dst, fl4.saddr); + dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, + fl4.saddr); } if (rt->dst.dev == dev) { @@ -836,7 +784,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, if (set_mtu) dev->mtu = mtu; } - ip_tunnel_dst_reset_all(t); + dst_cache_reset(&t->dst_cache); netdev_state_change(dev); } @@ -961,7 +909,7 @@ static void ip_tunnel_dev_free(struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); gro_cells_destroy(&tunnel->gro_cells); - free_percpu(tunnel->dst_cache); + dst_cache_destroy(&tunnel->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -1155,15 +1103,15 @@ int ip_tunnel_init(struct net_device *dev) if (!dev->tstats) return -ENOMEM; - tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); - if (!tunnel->dst_cache) { + err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); + if (err) { free_percpu(dev->tstats); - return -ENOMEM; + return err; } err = gro_cells_init(&tunnel->gro_cells, dev); if (err) { - free_percpu(tunnel->dst_cache); + dst_cache_destroy(&tunnel->dst_cache); free_percpu(dev->tstats); return err; } @@ -1193,7 +1141,7 @@ void ip_tunnel_uninit(struct net_device *dev) if (itn->fb_tunnel_dev != dev) ip_tunnel_del(itn, netdev_priv(dev)); - ip_tunnel_dst_reset_all(tunnel); + dst_cache_reset(&tunnel->dst_cache); } EXPORT_SYMBOL_GPL(ip_tunnel_uninit); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 9a6b407f5840..0625ac6356b5 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -475,7 +475,7 @@ static void ipip6_tunnel_uninit(struct net_device *dev) ipip6_tunnel_unlink(sitn, tunnel); ipip6_tunnel_del_prl(tunnel, NULL); } - ip_tunnel_dst_reset_all(tunnel); + dst_cache_reset(&tunnel->dst_cache); dev_put(dev); } @@ -1093,7 +1093,7 @@ static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p) t->parms.link = p->link; ipip6_tunnel_bind_dev(t->dev); } - ip_tunnel_dst_reset_all(t); + dst_cache_reset(&t->dst_cache); netdev_state_change(t->dev); } @@ -1124,7 +1124,7 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, t->ip6rd.relay_prefix = relay_prefix; t->ip6rd.prefixlen = ip6rd->prefixlen; t->ip6rd.relay_prefixlen = ip6rd->relay_prefixlen; - ip_tunnel_dst_reset_all(t); + dst_cache_reset(&t->dst_cache); netdev_state_change(t->dev); return 0; } @@ -1278,7 +1278,7 @@ ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL); break; } - ip_tunnel_dst_reset_all(t); + dst_cache_reset(&t->dst_cache); netdev_state_change(dev); break; @@ -1339,7 +1339,7 @@ static void ipip6_dev_free(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - free_percpu(tunnel->dst_cache); + dst_cache_destroy(&tunnel->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -1372,6 +1372,7 @@ static void ipip6_tunnel_setup(struct net_device *dev) static int ipip6_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); + int err; tunnel->dev = dev; tunnel->net = dev_net(dev); @@ -1382,10 +1383,10 @@ static int ipip6_tunnel_init(struct net_device *dev) if (!dev->tstats) return -ENOMEM; - tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); - if (!tunnel->dst_cache) { + err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); + if (err) { free_percpu(dev->tstats); - return -ENOMEM; + return err; } return 0; From 0c1d70af924b966cc71e9e48920b2b635441aa50 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 12 Feb 2016 15:43:56 +0100 Subject: [PATCH 4/7] net: use dst_cache for vxlan device In case of UDP traffic with datagram length below MTU this give about 3% performance increase when tunneling over ipv4 and about 70% when tunneling over ipv6. Signed-off-by: Paolo Abeni Suggested-and-acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 55 ++++++++++++++++++++++++++++++++++++++------- include/net/vxlan.h | 1 + 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 0a23c64379d6..ad673037bd73 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -480,6 +480,8 @@ static int vxlan_fdb_replace(struct vxlan_fdb *f, rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list); if (!rd) return 0; + + dst_cache_reset(&rd->dst_cache); rd->remote_ip = *ip; rd->remote_port = port; rd->remote_vni = vni; @@ -501,6 +503,12 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, rd = kmalloc(sizeof(*rd), GFP_ATOMIC); if (rd == NULL) return -ENOBUFS; + + if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) { + kfree(rd); + return -ENOBUFS; + } + rd->remote_ip = *ip; rd->remote_port = port; rd->remote_vni = vni; @@ -749,8 +757,10 @@ static void vxlan_fdb_free(struct rcu_head *head) struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu); struct vxlan_rdst *rd, *nd; - list_for_each_entry_safe(rd, nd, &f->remotes, list) + list_for_each_entry_safe(rd, nd, &f->remotes, list) { + dst_cache_destroy(&rd->dst_cache); kfree(rd); + } kfree(f); } @@ -1754,11 +1764,24 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst, static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, struct sk_buff *skb, int oif, u8 tos, - __be32 daddr, __be32 *saddr) + __be32 daddr, __be32 *saddr, + struct dst_cache *dst_cache, + struct ip_tunnel_info *info) { struct rtable *rt = NULL; + bool use_cache = false; struct flowi4 fl4; + /* when the ip_tunnel_info is availble, the tos used for lookup is + * packet independent, so we can use the cache + */ + if (dst_cache && !skb->mark && (!tos || info)) { + use_cache = true; + rt = dst_cache_get_ip4(dst_cache, saddr); + if (rt) + return rt; + } + memset(&fl4, 0, sizeof(fl4)); fl4.flowi4_oif = oif; fl4.flowi4_tos = RT_TOS(tos); @@ -1768,8 +1791,11 @@ static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, fl4.saddr = vxlan->cfg.saddr.sin.sin_addr.s_addr; rt = ip_route_output_key(vxlan->net, &fl4); - if (!IS_ERR(rt)) + if (!IS_ERR(rt)) { *saddr = fl4.saddr; + if (use_cache) + dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr); + } return rt; } @@ -1777,12 +1803,21 @@ static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, struct sk_buff *skb, int oif, const struct in6_addr *daddr, - struct in6_addr *saddr) + struct in6_addr *saddr, + struct dst_cache *dst_cache) { + bool use_cache = false; struct dst_entry *ndst; struct flowi6 fl6; int err; + if (dst_cache && !skb->mark) { + use_cache = true; + ndst = dst_cache_get_ip6(dst_cache, saddr); + if (ndst) + return ndst; + } + memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = oif; fl6.daddr = *daddr; @@ -1797,6 +1832,8 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, return ERR_PTR(err); *saddr = fl6.saddr; + if (use_cache) + dst_cache_set_ip6(dst_cache, ndst, saddr); return ndst; } #endif @@ -1938,7 +1975,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, rt = vxlan_get_route(vxlan, skb, rdst ? rdst->remote_ifindex : 0, tos, - dst->sin.sin_addr.s_addr, &saddr); + dst->sin.sin_addr.s_addr, &saddr, + rdst ? &rdst->dst_cache : NULL, info); if (IS_ERR(rt)) { netdev_dbg(dev, "no route to %pI4\n", &dst->sin.sin_addr.s_addr); @@ -1990,7 +2028,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, ndst = vxlan6_get_route(vxlan, skb, rdst ? rdst->remote_ifindex : 0, - &dst->sin6.sin6_addr, &saddr); + &dst->sin6.sin6_addr, &saddr, + rdst ? &rdst->dst_cache : NULL); if (IS_ERR(ndst)) { netdev_dbg(dev, "no route to %pI6\n", &dst->sin6.sin6_addr); @@ -2331,7 +2370,7 @@ static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) return -EINVAL; rt = vxlan_get_route(vxlan, skb, 0, info->key.tos, info->key.u.ipv4.dst, - &info->key.u.ipv4.src); + &info->key.u.ipv4.src, NULL, info); if (IS_ERR(rt)) return PTR_ERR(rt); ip_rt_put(rt); @@ -2343,7 +2382,7 @@ static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) return -EINVAL; ndst = vxlan6_get_route(vxlan, skb, 0, &info->key.u.ipv6.dst, - &info->key.u.ipv6.src); + &info->key.u.ipv6.src, NULL); if (IS_ERR(ndst)) return PTR_ERR(ndst); dst_release(ndst); diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 25bd919c9ef0..b314e4af89c5 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -148,6 +148,7 @@ struct vxlan_rdst { u32 remote_ifindex; struct list_head list; struct rcu_head rcu; + struct dst_cache dst_cache; }; struct vxlan_config { From d71785ffc7e7cae3fbdc4ea8a9d05b7a1c59f7b8 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 12 Feb 2016 15:43:57 +0100 Subject: [PATCH 5/7] net: add dst_cache to ovs vxlan lwtunnel In case of UDP traffic with datagram length below MTU this give about 2% performance increase when tunneling over ipv4 and about 60% when tunneling over ipv6 Signed-off-by: Paolo Abeni Suggested-and-acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 15 ++++++++------- include/net/dst_metadata.h | 1 + include/net/ip_tunnels.h | 3 +++ net/core/dst.c | 10 +++++++++- net/openvswitch/Kconfig | 1 + net/openvswitch/flow_netlink.c | 6 ++++++ 6 files changed, 28 insertions(+), 8 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index ad673037bd73..ee1206d9f8df 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1775,7 +1775,7 @@ static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, /* when the ip_tunnel_info is availble, the tos used for lookup is * packet independent, so we can use the cache */ - if (dst_cache && !skb->mark && (!tos || info)) { + if (!skb->mark && (!tos || info)) { use_cache = true; rt = dst_cache_get_ip4(dst_cache, saddr); if (rt) @@ -1806,13 +1806,11 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, struct in6_addr *saddr, struct dst_cache *dst_cache) { - bool use_cache = false; struct dst_entry *ndst; struct flowi6 fl6; int err; - if (dst_cache && !skb->mark) { - use_cache = true; + if (!skb->mark) { ndst = dst_cache_get_ip6(dst_cache, saddr); if (ndst) return ndst; @@ -1832,7 +1830,7 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, return ERR_PTR(err); *saddr = fl6.saddr; - if (use_cache) + if (!skb->mark) dst_cache_set_ip6(dst_cache, ndst, saddr); return ndst; } @@ -1886,6 +1884,7 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, struct vxlan_rdst *rdst, bool did_rsc) { + struct dst_cache *dst_cache; struct ip_tunnel_info *info; struct vxlan_dev *vxlan = netdev_priv(dev); struct sock *sk; @@ -1910,6 +1909,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port; vni = rdst->remote_vni; dst = &rdst->remote_ip; + dst_cache = &rdst->dst_cache; } else { if (!info) { WARN_ONCE(1, "%s: Missing encapsulation instructions\n", @@ -1924,6 +1924,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, else remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst; dst = &remote_ip; + dst_cache = &info->dst_cache; } if (vxlan_addr_any(dst)) { @@ -1976,7 +1977,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, rt = vxlan_get_route(vxlan, skb, rdst ? rdst->remote_ifindex : 0, tos, dst->sin.sin_addr.s_addr, &saddr, - rdst ? &rdst->dst_cache : NULL, info); + dst_cache, info); if (IS_ERR(rt)) { netdev_dbg(dev, "no route to %pI4\n", &dst->sin.sin_addr.s_addr); @@ -2029,7 +2030,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, ndst = vxlan6_get_route(vxlan, skb, rdst ? rdst->remote_ifindex : 0, &dst->sin6.sin6_addr, &saddr, - rdst ? &rdst->dst_cache : NULL); + dst_cache); if (IS_ERR(ndst)) { netdev_dbg(dev, "no route to %pI6\n", &dst->sin6.sin6_addr); diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 30a56ab2ccfb..84b833af6882 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -62,6 +62,7 @@ static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a, sizeof(a->u.tun_info) + a->u.tun_info.options_len); } +void metadata_dst_free(struct metadata_dst *); struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags); struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index fd36936d85a6..87408ab80856 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -58,6 +58,9 @@ struct ip_tunnel_key { struct ip_tunnel_info { struct ip_tunnel_key key; +#ifdef CONFIG_DST_CACHE + struct dst_cache dst_cache; +#endif u8 options_len; u8 mode; }; diff --git a/net/core/dst.c b/net/core/dst.c index a1656e3b8d72..b5cbbe07f786 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -265,7 +265,7 @@ again: lwtstate_put(dst->lwtstate); if (dst->flags & DST_METADATA) - kfree(dst); + metadata_dst_free((struct metadata_dst *)dst); else kmem_cache_free(dst->ops->kmem_cachep, dst); @@ -395,6 +395,14 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) } EXPORT_SYMBOL_GPL(metadata_dst_alloc); +void metadata_dst_free(struct metadata_dst *md_dst) +{ +#ifdef CONFIG_DST_CACHE + dst_cache_destroy(&md_dst->u.tun_info.dst_cache); +#endif + kfree(md_dst); +} + struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags) { int cpu; diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index d143aa9f6654..cd5fd9d728a7 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -10,6 +10,7 @@ config OPENVSWITCH select LIBCRC32C select MPLS select NET_MPLS_GSO + select DST_CACHE ---help--- Open vSwitch is a multilayer Ethernet switch targeted at virtualized environments. In addition to supporting a variety of features diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index d1bd4a45ca2d..58b8efc23668 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1959,6 +1959,12 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (!tun_dst) return -ENOMEM; + err = dst_cache_init(&tun_dst->u.tun_info.dst_cache, GFP_KERNEL); + if (err) { + dst_release((struct dst_entry *)tun_dst); + return err; + } + a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, sizeof(*ovs_tun), log); if (IS_ERR(a)) { From 468dfffcd762cbb2777ec5a76bc21e3748ebf47e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 12 Feb 2016 15:43:58 +0100 Subject: [PATCH 6/7] geneve: add dst caching support use generic dst implementation for both plain geneve devices and lwtunnels. In case of UDP traffic with datagram length below MTU this give about 2% performance increase for plain geneve tunnel over ipv4, about 65% performance increase for ipv6 tunnel. Signed-off-by: Paolo Abeni Suggested-and-Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- drivers/net/geneve.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 0b14ac3b8d11..6f208132a574 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -72,6 +72,7 @@ struct geneve_dev { bool collect_md; struct gro_cells gro_cells; u32 flags; + struct dst_cache dst_cache; }; /* Geneve device flags */ @@ -297,6 +298,13 @@ static int geneve_init(struct net_device *dev) return err; } + err = dst_cache_init(&geneve->dst_cache, GFP_KERNEL); + if (err) { + free_percpu(dev->tstats); + gro_cells_destroy(&geneve->gro_cells); + return err; + } + return 0; } @@ -304,6 +312,7 @@ static void geneve_uninit(struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); + dst_cache_destroy(&geneve->dst_cache); gro_cells_destroy(&geneve->gro_cells); free_percpu(dev->tstats); } @@ -753,7 +762,9 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb, struct ip_tunnel_info *info) { struct geneve_dev *geneve = netdev_priv(dev); + struct dst_cache *dst_cache; struct rtable *rt = NULL; + bool use_cache = true; __u8 tos; memset(fl4, 0, sizeof(*fl4)); @@ -764,16 +775,26 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb, fl4->daddr = info->key.u.ipv4.dst; fl4->saddr = info->key.u.ipv4.src; fl4->flowi4_tos = RT_TOS(info->key.tos); + dst_cache = &info->dst_cache; } else { tos = geneve->tos; if (tos == 1) { const struct iphdr *iip = ip_hdr(skb); tos = ip_tunnel_get_dsfield(iip, skb); + use_cache = false; } fl4->flowi4_tos = RT_TOS(tos); fl4->daddr = geneve->remote.sin.sin_addr.s_addr; + dst_cache = &geneve->dst_cache; + } + + use_cache = use_cache && !skb->mark; + if (use_cache) { + rt = dst_cache_get_ip4(dst_cache, &fl4->saddr); + if (rt) + return rt; } rt = ip_route_output_key(geneve->net, fl4); @@ -786,6 +807,8 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb, ip_rt_put(rt); return ERR_PTR(-ELOOP); } + if (use_cache) + dst_cache_set_ip4(dst_cache, &rt->dst, fl4->saddr); return rt; } @@ -798,6 +821,8 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb, struct geneve_dev *geneve = netdev_priv(dev); struct geneve_sock *gs6 = geneve->sock6; struct dst_entry *dst = NULL; + struct dst_cache *dst_cache; + bool use_cache = true; __u8 prio; memset(fl6, 0, sizeof(*fl6)); @@ -808,16 +833,26 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb, fl6->daddr = info->key.u.ipv6.dst; fl6->saddr = info->key.u.ipv6.src; fl6->flowi6_tos = RT_TOS(info->key.tos); + dst_cache = &info->dst_cache; } else { prio = geneve->tos; if (prio == 1) { const struct iphdr *iip = ip_hdr(skb); prio = ip_tunnel_get_dsfield(iip, skb); + use_cache = false; } fl6->flowi6_tos = RT_TOS(prio); fl6->daddr = geneve->remote.sin6.sin6_addr; + dst_cache = &geneve->dst_cache; + } + + use_cache = use_cache && !skb->mark; + if (use_cache) { + dst = dst_cache_get_ip6(dst_cache, &fl6->saddr); + if (dst) + return dst; } if (ipv6_stub->ipv6_dst_lookup(geneve->net, gs6->sock->sk, &dst, fl6)) { @@ -830,6 +865,8 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb, return ERR_PTR(-ELOOP); } + if (use_cache) + dst_cache_set_ip6(dst_cache, dst, &fl6->saddr); return dst; } #endif @@ -1272,6 +1309,8 @@ static int geneve_configure(struct net *net, struct net_device *dev, return -EPERM; } + dst_cache_reset(&geneve->dst_cache); + err = register_netdevice(dev); if (err) return err; From 3c1cb4d2604c03779a6c9485204e2a80be6c28f0 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 12 Feb 2016 15:43:59 +0100 Subject: [PATCH 7/7] net/ipv4: add dst cache support for gre lwtunnels In case of UDP traffic with datagram length below MTU this gives about 4% performance increase Signed-off-by: Paolo Abeni Suggested-and-Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 65748db44285..917c2c1bfadd 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -540,9 +540,16 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) goto err_free_skb; key = &tun_info->key; - rt = gre_get_rt(skb, dev, &fl, key); - if (IS_ERR(rt)) - goto err_free_skb; + rt = !skb->mark ? dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr) : + NULL; + if (!rt) { + rt = gre_get_rt(skb, dev, &fl, key); + if (IS_ERR(rt)) + goto err_free_skb; + if (!skb->mark) + dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, + fl.saddr); + } tunnel_hlen = ip_gre_calc_hlen(key->tun_flags);