2005-04-17 02:20:36 +04:00
/*
* INET An implementation of the TCP / IP protocol suite for the LINUX
* operating system . INET is implemented using the BSD Socket
* interface as the means of communication with the user level .
*
* Definitions for the IP router .
*
* Version : @ ( # ) route . h 1.0 .4 05 / 27 / 93
*
2005-05-06 03:16:16 +04:00
* Authors : Ross Biro
2005-04-17 02:20:36 +04:00
* Fred N . van Kempen , < waltje @ uWalt . NL . Mugnet . ORG >
* Fixes :
* Alan Cox : Reformatted . Added ip_rt_local ( )
* Alan Cox : Support for TCP parameters .
* Alexey Kuznetsov : Major changes for new routing code .
* Mike McLagan : Routing by source
* Robert Olsson : Added rt_cache statistics
*
* This program is free software ; you can redistribute it and / or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation ; either version
* 2 of the License , or ( at your option ) any later version .
*/
# ifndef _ROUTE_H
# define _ROUTE_H
# include <net/dst.h>
# include <net/inetpeer.h>
# include <net/flow.h>
2008-10-01 18:35:39 +04:00
# include <net/inet_sock.h>
2005-04-17 02:20:36 +04:00
# include <linux/in_route.h>
# include <linux/rtnetlink.h>
# include <linux/route.h>
# include <linux/ip.h>
# include <linux/cache.h>
2006-08-05 10:12:42 +04:00
# include <linux/security.h>
2005-04-17 02:20:36 +04:00
# define RTO_ONLINK 0x01
# define RT_CONN_FLAGS(sk) (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE))
struct fib_nh ;
struct inet_peer ;
net: Implement read-only protection and COW'ing of metrics.
Routing metrics are now copy-on-write.
Initially a route entry points its metrics at a read-only location.
If a routing table entry exists, it will point there. Else it will
point at the all zero metric place-holder called 'dst_default_metrics'.
The writeability state of the metrics is stored in the low bits of the
metrics pointer, we have two bits left to spare if we want to store
more states.
For the initial implementation, COW is implemented simply via kmalloc.
However future enhancements will change this to place the writable
metrics somewhere else, in order to increase sharing. Very likely
this "somewhere else" will be the inetpeer cache.
Note also that this means that metrics updates may transiently fail
if we cannot COW the metrics successfully.
But even by itself, this patch should decrease memory usage and
increase cache locality especially for routing workloads. In those
cases the read-only metric copies stay in place and never get written
to.
TCP workloads where metrics get updated, and those rare cases where
PMTU triggers occur, will take a very slight performance hit. But
that hit will be alleviated when the long-term writable metrics
move to a more sharable location.
Since the metrics storage went from a u32 array of RTAX_MAX entries to
what is essentially a pointer, some retooling of the dst_entry layout
was necessary.
Most importantly, we need to preserve the alignment of the reference
count so that it doesn't share cache lines with the read-mostly state,
as per Eric Dumazet's alignment assertion checks.
The only non-trivial bit here is the move of the 'flags' member into
the writeable cacheline. This is OK since we are always accessing the
flags around the same moment when we made a modification to the
reference count.
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-01-27 07:51:05 +03:00
/* Forward declaration; struct rtable stores a pointer to this type. */
struct fib_info ;
2009-11-03 06:26:03 +03:00
/*
 * IPv4 route entry. The embedded dst_entry is the first member;
 * ip_rt_put() releases a route by calling dst_release() on it.
 * NOTE(review): the dated lines and commit-message paragraphs interleaved
 * with the members are revision-history annotations, not C code.
 */
struct rtable {
2010-06-11 10:31:35 +04:00
struct dst_entry dst ;
2005-04-17 02:20:36 +04:00
2011-03-05 08:47:09 +03:00
/* Lookup key. */
__be32 rt_key_dst ;
__be32 rt_key_src ;
2007-02-10 03:19:26 +03:00
[IPV4] route cache: Introduce rt_genid for smooth cache invalidation
Current ip route cache implementation is not suited to large caches.
We can consume a lot of CPU when cache must be invalidated, since we
currently need to evict all cache entries, and this eviction is
sometimes asynchronous. min_delay & max_delay can somewhat control this
asynchronism behavior, but whole thing is a kludge, regularly triggering
infamous soft lockup messages. When entries are still in use, this also
consumes a lot of ram, filling dst_garbage.list.
A better scheme is to use a generation identifier on each entry,
so that cache invalidation can be performed by changing the table
identifier, without having to scan all entries.
No more delayed flushing, no more stalling when secret_interval expires.
Invalidated entries will then be freed at GC time (controled by
ip_rt_gc_timeout or stress), or when an invalidated entry is found
in a chain when an insert is done.
Thus we keep a normal equilibrium.
This patch :
- renames rt_hash_rnd to rt_genid (and makes it an atomic_t)
- Adds a new rt_genid field to 'struct rtable' (filling a hole on 64bit)
- Checks entry->rt_genid at appropriate places :
2008-02-01 04:05:09 +03:00
/*
 * Generation id of this entry. Per the history text above, the cache is
 * invalidated by changing the table identifier, so an entry whose id no
 * longer matches is treated as stale.
 */
int rt_genid ;
2005-04-17 02:20:36 +04:00
unsigned rt_flags ;
__u16 rt_type ;
2011-05-04 06:45:15 +04:00
/* NOTE(review): presumably the TOS component of the lookup key - confirm. */
__u8 rt_key_tos ;
2005-04-17 02:20:36 +04:00
2006-09-27 08:26:42 +04:00
__be32 rt_dst ; /* Path destination */
__be32 rt_src ; /* Path source */
2011-04-08 01:04:08 +04:00
/*
 * Zero for output routes, non-zero for input routes; this is the field
 * tested by rt_is_input_route() and rt_is_output_route().
 */
int rt_route_iif ;
2005-04-17 02:20:36 +04:00
/* Value returned by inet_iif() for an skb attached to this route. */
int rt_iif ;
2011-03-05 08:47:09 +03:00
int rt_oif ;
__u32 rt_mark ;
2005-04-17 02:20:36 +04:00
/* Info on neighbour */
2006-09-27 08:26:42 +04:00
__be32 rt_gateway ;
2005-04-17 02:20:36 +04:00
/* Miscellaneous cached information */
2006-09-27 08:26:42 +04:00
__be32 rt_spec_dst ; /* RFC1122 specific destination */
inet: Create a mechanism for upward inetpeer propagation into routes.
If we didn't have a routing cache, we would not be able to properly
propagate certain kinds of dynamic path attributes, for example
PMTU information and redirects.
The reason is that if we didn't have a routing cache, then there would
be no way to lookup all of the active cached routes hanging off of
sockets, tunnels, IPSEC bundles, etc.
Consider the case where we created a cached route, but no inetpeer
entry existed and also we were not asked to pre-COW the route metrics
and therefore did not force the creation a new inetpeer entry.
If we later get a PMTU message, or a redirect, and store this
information in a new inetpeer entry, there is no way to teach that
cached route about the newly existing inetpeer entry.
The facilities implemented here handle this problem.
First we create a generation ID. When we create a cached route of any
kind, we remember the generation ID at the time of attachment. Any
time we force-create an inetpeer entry in response to new path
information, we bump that generation ID.
The dst_ops->check() callback is where the knowledge of this event
is propagated. If the global generation ID does not equal the one
stored in the cached route, and the cached route has not attached
to an inetpeer yet, we look it up and attach if one is found. Now
that we've updated the cached route's information, we update the
route's generation ID too.
This clears the way for implementing PMTU and redirects directly in
the inetpeer cache. There is absolutely no need to consult cached
route information in order to maintain this information.
At this point nothing bumps the inetpeer genids, that comes in the
later changes which handle PMTUs and redirects using inetpeers.
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-02-08 07:38:06 +03:00
/*
 * Per the history text above: the inetpeer generation id remembered when
 * the cached route was created, compared against the global id in the
 * dst_ops->check() callback.
 */
u32 rt_peer_genid ;
2005-04-17 02:20:36 +04:00
struct inet_peer * peer ; /* long-living peer info */
net: Implement read-only protection and COW'ing of metrics.
Routing metrics are now copy-on-write.
Initially a route entry points it's metrics at a read-only location.
If a routing table entry exists, it will point there. Else it will
point at the all zero metric place-holder called 'dst_default_metrics'.
The writeability state of the metrics is stored in the low bits of the
metrics pointer, we have two bits left to spare if we want to store
more states.
For the initial implementation, COW is implemented simply via kmalloc.
However future enhancements will change this to place the writable
metrics somewhere else, in order to increase sharing. Very likely
this "somewhere else" will be the inetpeer cache.
Note also that this means that metrics updates may transiently fail
if we cannot COW the metrics successfully.
But even by itself, this patch should decrease memory usage and
increase cache locality especially for routing workloads. In those
cases the read-only metric copies stay in place and never get written
to.
TCP workloads where metrics get updated, and those rare cases where
PMTU triggers occur, will take a very slight performance hit. But
that hit will be alleviated when the long-term writable metrics
move to a more sharable location.
Since the metrics storage went from a u32 array of RTAX_MAX entries to
what is essentially a pointer, some retooling of the dst_entry layout
was necessary.
Most importantly, we need to preserve the alignment of the reference
count so that it doesn't share cache lines with the read-mostly state,
as per Eric Dumazet's alignment assertion checks.
The only non-trivial bit here is the move of the 'flags' member into
the writeable cacheline. This is OK since we are always accessing the
flags around the same moment when we made a modification to the
reference count.
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-01-27 07:51:05 +03:00
struct fib_info * fi ; /* for client ref to shared metrics */
2005-04-17 02:20:36 +04:00
} ;
2010-11-12 04:07:48 +03:00
static inline bool rt_is_input_route ( struct rtable * rt )
{
2011-04-08 01:04:08 +04:00
return rt - > rt_route_iif ! = 0 ;
2010-11-12 04:07:48 +03:00
}
static inline bool rt_is_output_route ( struct rtable * rt )
{
2011-04-08 01:04:08 +04:00
return rt - > rt_route_iif = = 0 ;
2010-11-12 04:07:48 +03:00
}
2009-11-03 06:26:03 +03:00
/*
 * Four 32-bit accounting counters. This header also declares a per-CPU
 * pointer named ip_rt_acct to objects of this type.
 * NOTE(review): the o_ and i_ prefixes presumably mean output and input -
 * confirm against the code that updates them.
 */
struct ip_rt_acct {
2005-04-17 02:20:36 +04:00
__u32 o_bytes ;
__u32 o_packets ;
__u32 i_bytes ;
__u32 i_packets ;
} ;
2009-11-03 06:26:03 +03:00
/*
 * Route cache statistics: a flat set of unsigned int counters.
 * NOTE(review): judging by the names only, in_ and out_ counters track
 * input and output lookups and gc_ counters track garbage collection -
 * confirm against the code that increments them. Member order may be
 * relied upon by consumers of this structure, so do not reorder.
 */
struct rt_cache_stat {
2005-04-17 02:20:36 +04:00
unsigned int in_hit ;
unsigned int in_slow_tot ;
unsigned int in_slow_mc ;
unsigned int in_no_route ;
unsigned int in_brd ;
unsigned int in_martian_dst ;
unsigned int in_martian_src ;
unsigned int out_hit ;
unsigned int out_slow_tot ;
unsigned int out_slow_mc ;
unsigned int gc_total ;
unsigned int gc_ignored ;
unsigned int gc_goal_miss ;
unsigned int gc_dst_overflow ;
unsigned int in_hlist_search ;
unsigned int out_hlist_search ;
} ;
2010-02-16 18:20:26 +03:00
/* Per-CPU pointer to the accounting counters. */
extern struct ip_rt_acct __percpu * ip_rt_acct ;
2005-04-17 02:20:36 +04:00
struct in_device ;
/*
 * Routing entry points implemented outside this header.
 * NOTE(review): their exact semantics are not visible here; consult the
 * definitions before relying on anything beyond the prototypes.
 */
extern int ip_rt_init ( void ) ;
2006-09-27 08:25:43 +04:00
extern void ip_rt_redirect ( __be32 old_gw , __be32 dst , __be32 new_gw ,
__be32 src , struct net_device * dev ) ;
2008-07-06 06:00:44 +04:00
extern void rt_cache_flush ( struct net * net , int how ) ;
2010-12-20 08:11:20 +03:00
extern void rt_cache_flush_batch ( struct net * net ) ;
2011-04-29 01:48:42 +04:00
/*
 * Output route lookup on a flow key. ip_route_connect() tests the result
 * with IS_ERR(), so a failure is reported as an error pointer, not NULL.
 */
extern struct rtable * __ip_route_output_key ( struct net * , struct flowi4 * flp ) ;
2011-03-12 09:12:47 +03:00
/*
 * Output route lookup on a flow key with an optional socket; the inline
 * wrappers in this header pass either a real socket or NULL for sk.
 */
extern struct rtable * ip_route_output_flow ( struct net * , struct flowi4 * flp ,
2011-03-03 01:31:35 +03:00
struct sock * sk ) ;
2011-03-02 01:59:04 +03:00
extern struct dst_entry * ipv4_blackhole_route ( struct net * net , struct dst_entry * dst_orig ) ;
2010-05-10 15:32:55 +04:00
2011-03-12 09:12:47 +03:00
/*
 * ip_route_output_key - output route lookup without a socket.
 * @net: network namespace, passed through unchanged
 * @flp: flow key, passed through unchanged
 *
 * Thin wrapper: calls ip_route_output_flow() with a NULL socket and
 * returns its result as-is.
 */
static inline struct rtable *ip_route_output_key(struct net *net, struct flowi4 *flp)
{
	struct sock *no_sock = NULL;

	return ip_route_output_flow(net, flp, no_sock);
}
2011-03-12 08:00:52 +03:00
/*
 * ip_route_output - output route lookup from individual key components.
 * @net:   network namespace for the lookup
 * @daddr: destination address stored in the key
 * @saddr: source address stored in the key
 * @tos:   TOS value stored in the key
 * @oif:   output interface index stored in the key
 *
 * Builds a flow key on the stack (all members not named below are
 * zero-initialised) and returns the result of ip_route_output_key().
 */
static inline struct rtable *ip_route_output(struct net *net, __be32 daddr,
					     __be32 saddr, u8 tos, int oif)
{
	struct flowi4 key = {
		.daddr		= daddr,
		.saddr		= saddr,
		.flowi4_tos	= tos,
		.flowi4_oif	= oif,
	};

	return ip_route_output_key(net, &key);
}
2011-05-04 07:25:42 +04:00
/*
 * ip_route_output_ports - output route lookup keyed on addresses and ports.
 * @net:   network namespace for the lookup
 * @fl4:   caller-provided flow key; initialised here
 * @sk:    socket the lookup is for, or NULL
 * @daddr: destination address
 * @saddr: source address
 * @dport: destination port
 * @sport: source port
 * @proto: protocol number
 * @tos:   TOS value
 * @oif:   output interface index
 *
 * Without a socket the key is initialised with a mark of 0 and no flow
 * flags. With a socket, the mark comes from sk->sk_mark, the flow flags
 * from inet_sk_flowi_flags(), and the flow is additionally passed to
 * security_sk_classify_flow(). Returns the result of
 * ip_route_output_flow().
 */
static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi4 *fl4,
						   struct sock *sk,
						   __be32 daddr, __be32 saddr,
						   __be16 dport, __be16 sport,
						   __u8 proto, __u8 tos, int oif)
{
	if (!sk) {
		flowi4_init_output(fl4, oif, 0, tos,
				   RT_SCOPE_UNIVERSE, proto,
				   0,
				   daddr, saddr, dport, sport);
	} else {
		flowi4_init_output(fl4, oif, sk->sk_mark, tos,
				   RT_SCOPE_UNIVERSE, proto,
				   inet_sk_flowi_flags(sk),
				   daddr, saddr, dport, sport);
		security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
	}

	return ip_route_output_flow(net, fl4, sk);
}
2011-05-04 23:33:34 +04:00
/*
 * ip_route_output_gre - output route lookup for a GRE flow.
 * @net:     network namespace for the lookup
 * @fl4:     caller-provided flow key; fully overwritten here
 * @daddr:   destination address
 * @saddr:   source address
 * @gre_key: GRE key stored in the flow key
 * @tos:     TOS value
 * @oif:     output interface index
 *
 * Zeroes the whole key, fills in the fields below with the protocol fixed
 * to IPPROTO_GRE, and returns the result of ip_route_output_key().
 */
static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 *fl4,
						 __be32 daddr, __be32 saddr,
						 __be32 gre_key, __u8 tos, int oif)
{
	memset(fl4, 0, sizeof *fl4);

	/* Assignment order is kept as-is in case any of these names alias. */
	fl4->flowi4_oif   = oif;
	fl4->daddr        = daddr;
	fl4->saddr        = saddr;
	fl4->flowi4_tos   = tos;
	fl4->flowi4_proto = IPPROTO_GRE;
	fl4->fl4_gre_key  = gre_key;

	return ip_route_output_key(net, fl4);
}
2010-05-10 15:32:55 +04:00
/*
 * Common input-route lookup behind ip_route_input() and
 * ip_route_input_noref(), which pass noref = false and noref = true
 * respectively.
 * NOTE(review): noref presumably selects whether the skb takes a reference
 * on the route - confirm against the definition.
 */
extern int ip_route_input_common ( struct sk_buff * skb , __be32 dst , __be32 src ,
u8 tos , struct net_device * devin , bool noref ) ;
/*
 * ip_route_input - input route lookup for a received packet.
 * @skb:   packet being routed
 * @dst:   destination address
 * @src:   source address
 * @tos:   TOS value
 * @devin: device the lookup is performed for
 *
 * Calls ip_route_input_common() with noref = false and returns its result.
 */
static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
				 u8 tos, struct net_device *devin)
{
	const bool noref = false;

	return ip_route_input_common(skb, dst, src, tos, devin, noref);
}
/*
 * ip_route_input_noref - input route lookup, "noref" variant.
 * @skb:   packet being routed
 * @dst:   destination address
 * @src:   source address
 * @tos:   TOS value
 * @devin: device the lookup is performed for
 *
 * Calls ip_route_input_common() with noref = true and returns its result.
 */
static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src,
				       u8 tos, struct net_device *devin)
{
	const bool noref = true;

	return ip_route_input_common(skb, dst, src, tos, devin, noref);
}
2011-04-22 08:53:02 +04:00
/*
 * Further routing helpers implemented outside this header.
 * NOTE(review): only the prototypes are visible here; consult the
 * definitions for semantics, locking and return-value conventions.
 */
extern unsigned short ip_rt_frag_needed ( struct net * net , const struct iphdr * iph ,
unsigned short new_mtu , struct net_device * dev ) ;
2005-04-17 02:20:36 +04:00
extern void ip_rt_send_redirect ( struct sk_buff * skb ) ;
2008-01-10 14:25:28 +03:00
extern unsigned inet_addr_type ( struct net * net , __be32 addr ) ;
extern unsigned inet_dev_addr_type ( struct net * net , const struct net_device * dev , __be32 addr ) ;
2005-04-17 02:20:36 +04:00
extern void ip_rt_multicast_event ( struct in_device * ) ;
2008-01-10 14:29:53 +03:00
extern int ip_rt_ioctl ( struct net * , unsigned int cmd , void __user * arg ) ;
2011-05-14 01:29:41 +04:00
extern void ip_rt_get_source ( u8 * src , struct sk_buff * skb , struct rtable * rt ) ;
2005-04-17 02:20:36 +04:00
extern int ip_rt_dump ( struct sk_buff * skb , struct netlink_callback * cb ) ;
2005-11-23 01:47:37 +03:00
/* Forward declaration for the two prototypes that follow. */
struct in_ifaddr ;
extern void fib_add_ifaddr ( struct in_ifaddr * ) ;
2011-03-19 15:13:49 +03:00
extern void fib_del_ifaddr ( struct in_ifaddr * , struct in_ifaddr * ) ;
2005-11-23 01:47:37 +03:00
2005-04-17 02:20:36 +04:00
static inline void ip_rt_put ( struct rtable * rt )
{
if ( rt )
2010-06-11 10:31:35 +04:00
dst_release ( & rt - > dst ) ;
2005-04-17 02:20:36 +04:00
}
/* IPTOS_TOS_MASK with its two lowest bits cleared. */
# define IPTOS_RT_MASK (IPTOS_TOS_MASK & ~3)
2007-07-10 02:32:57 +04:00
/* 16-entry lookup table read by rt_tos2priority(); defined elsewhere. */
extern const __u8 ip_tos2prio [ 16 ] ;
2005-04-17 02:20:36 +04:00
/*
 * rt_tos2priority - map a TOS byte to a priority value.
 * @tos: TOS byte
 *
 * Indexes ip_tos2prio[] with IPTOS_TOS(tos) shifted right by one bit and
 * returns the table entry converted to char.
 */
static inline char rt_tos2priority(u8 tos)
{
	const int slot = IPTOS_TOS(tos) >> 1;

	return ip_tos2prio[slot];
}
2011-04-27 00:28:44 +04:00
/* ip_route_connect() and ip_route_newports() work in tandem whilst
 * binding a socket for a new outgoing connection.
 *
 * In order to use IPSEC properly, we must, in the end, have a
 * route that was looked up using all available keys including source
 * and destination ports.
 *
 * However, if a source port needs to be allocated (the user specified
 * a wildcard source port) we need to obtain addressing information
 * in order to perform that allocation.
 *
 * So ip_route_connect() looks up a route using wildcarded source and
 * destination ports in the key, simply so that we can get a pair of
 * addresses to use for port allocation.
 *
 * Later, once the ports are allocated, ip_route_newports() will make
 * another route lookup if needed to make sure we catch any IPSEC
 * rules keyed on the port information.
 *
 * The callers allocate the flow key on their stack, and must pass in
 * the same flowi4 object to both the ip_route_connect() and the
 * ip_route_newports() calls.
 */
/*
 * ip_route_connect_init - initialise the flow key for an outgoing connection.
 * @fl4:       flow key to initialise
 * @dst:       destination address
 * @src:       source address
 * @tos:       TOS value
 * @oif:       output interface index
 * @protocol:  protocol number
 * @sport:     source port
 * @dport:     destination port
 * @sk:        socket being connected; dereferenced, so must not be NULL
 * @can_sleep: whether FLOWI_FLAG_CAN_SLEEP is set in the key
 *
 * Flow flags are accumulated as follows:
 *   - FLOWI_FLAG_CAN_SLEEP       when @can_sleep is true
 *   - FLOWI_FLAG_PRECOW_METRICS  when @protocol is IPPROTO_TCP
 *   - FLOWI_FLAG_ANYSRC          when the inet socket is marked transparent
 * The mark is taken from sk->sk_mark and the scope is RT_SCOPE_UNIVERSE.
 */
static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 src,
					 u32 tos, int oif, u8 protocol,
					 __be16 sport, __be16 dport,
					 struct sock *sk, bool can_sleep)
{
	__u8 flags = 0;

	if (can_sleep)
		flags |= FLOWI_FLAG_CAN_SLEEP;
	if (protocol == IPPROTO_TCP)
		flags |= FLOWI_FLAG_PRECOW_METRICS;
	if (inet_sk(sk)->transparent)
		flags |= FLOWI_FLAG_ANYSRC;

	flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
			   protocol, flags, dst, src, dport, sport);
}
/*
 * ip_route_connect - look up the route for a new outgoing connection.
 * @fl4:       caller-owned flow key; initialised here via
 *             ip_route_connect_init()
 * @dst:       destination address, or zero
 * @src:       source address, or zero
 * @tos:       TOS value
 * @oif:       output interface index
 * @protocol:  protocol number
 * @sport:     source port
 * @dport:     destination port
 * @sk:        socket being connected; must not be NULL
 * @can_sleep: forwarded to ip_route_connect_init()
 *
 * When either address is zero, a preliminary __ip_route_output_key()
 * lookup is made first. An error pointer from that lookup is returned
 * immediately; otherwise that route is dropped again with ip_rt_put().
 * NOTE(review): the preliminary lookup presumably fills the addresses in
 * @fl4 as a side effect - confirm in __ip_route_output_key().
 *
 * The flow is then passed to security_sk_classify_flow() and the result
 * of ip_route_output_flow() is returned.
 */
static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
					      __be32 dst, __be32 src, u32 tos,
					      int oif, u8 protocol,
					      __be16 sport, __be16 dport,
					      struct sock *sk, bool can_sleep)
{
	struct net *net = sock_net(sk);

	ip_route_connect_init(fl4, dst, src, tos, oif, protocol,
			      sport, dport, sk, can_sleep);

	if (!(dst && src)) {
		struct rtable *probe = __ip_route_output_key(net, fl4);

		if (IS_ERR(probe))
			return probe;
		ip_rt_put(probe);
	}

	security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
	return ip_route_output_flow(net, fl4, sk);
}
2011-04-27 00:28:44 +04:00
/*
 * ip_route_newports - redo the route lookup once the final ports are known.
 * @fl4:        the flow key previously used for this connection
 * @rt:         route obtained from the earlier lookup
 * @orig_sport: source port used for the earlier lookup
 * @orig_dport: destination port used for the earlier lookup
 * @sport:      final source port
 * @dport:      final destination port
 * @sk:         socket being connected
 *
 * If neither port changed, @rt is returned untouched. Otherwise the new
 * ports are stored in @fl4, @rt is released with ip_rt_put(), the flow is
 * passed to security_sk_classify_flow(), and the result of a fresh
 * ip_route_output_flow() lookup is returned.
 */
static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable *rt,
					       __be16 orig_sport, __be16 orig_dport,
					       __be16 sport, __be16 dport,
					       struct sock *sk)
{
	if (sport == orig_sport && dport == orig_dport)
		return rt;

	fl4->fl4_dport = dport;
	fl4->fl4_sport = sport;
	ip_rt_put(rt);
	security_sk_classify_flow(sk, flowi4_to_flowi(fl4));

	return ip_route_output_flow(sock_net(sk), fl4, sk);
}
2011-05-19 02:42:43 +04:00
/* Implemented elsewhere; rt_get_peer() calls it with create = 0. */
extern void rt_bind_peer(struct rtable *rt, __be32 daddr, int create);

/*
 * rt_get_peer - return the inet_peer attached to a route.
 * @rt:    route to query; must not be NULL
 * @daddr: address forwarded to rt_bind_peer() when no peer is attached yet
 *
 * If rt->peer is not set, rt_bind_peer() is called with create = 0 first.
 * Returns rt->peer as it stands afterwards, which may still be NULL.
 */
static inline struct inet_peer *rt_get_peer(struct rtable *rt, __be32 daddr)
{
	if (!rt->peer)
		rt_bind_peer(rt, daddr, 0);

	return rt->peer;
}
2008-10-01 18:33:10 +04:00
static inline int inet_iif ( const struct sk_buff * skb )
{
2009-06-02 09:14:27 +04:00
return skb_rtable ( skb ) - > rt_iif ;
2008-10-01 18:33:10 +04:00
}
2010-12-13 08:55:08 +03:00
/* Fallback hop limit used by ip4_dst_hoplimit(); defined elsewhere. */
extern int sysctl_ip_default_ttl;

/*
 * ip4_dst_hoplimit - hop limit to use for a destination.
 * @dst: destination entry whose RTAX_HOPLIMIT metric is read
 *
 * Returns the raw RTAX_HOPLIMIT metric when it is non-zero, otherwise
 * sysctl_ip_default_ttl.
 */
static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
{
	int metric = dst_metric_raw(dst, RTAX_HOPLIMIT);

	return metric ? metric : sysctl_ip_default_ttl;
}
2005-04-17 02:20:36 +04:00
# endif /* _ROUTE_H */