2005-04-16 15:20:36 -07:00
/*
* IPVS An implementation of the IP virtual server support for the
* LINUX operating system . IPVS is now implemented as a module
* over the Netfilter framework . IPVS can be used to build a
* high - performance and highly available server based on a
* cluster of servers .
*
* Version : $ Id : ip_vs_core . c , v 1.34 2003 / 05 / 10 03 : 05 : 23 wensong Exp $
*
* Authors : Wensong Zhang < wensong @ linuxvirtualserver . org >
* Peter Kese < peter . kese @ ijs . si >
* Julian Anastasov < ja @ ssi . bg >
*
* This program is free software ; you can redistribute it and / or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation ; either version
* 2 of the License , or ( at your option ) any later version .
*
* The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese ,
* with changes / fixes from Julian Anastasov , Lars Marowsky - Bree , Horms
* and others .
*
* Changes:
* Paul `Rusty' Russell properly handle non-linear skbs
* Harald Welte don't use nfcache
*
*/
# include <linux/module.h>
# include <linux/kernel.h>
# include <linux/ip.h>
# include <linux/tcp.h>
# include <linux/icmp.h>
# include <net/ip.h>
# include <net/tcp.h>
# include <net/udp.h>
# include <net/icmp.h> /* for icmp_send */
# include <net/route.h>
# include <linux/netfilter.h>
# include <linux/netfilter_ipv4.h>
# include <net/ip_vs.h>
/*
 * Symbols exported from IPVS for use by other kernel modules.
 * ip_vs_tcp_conn_listen is exported only when CONFIG_IP_VS_PROTO_TCP is
 * set, and ip_vs_get_debug_level only when CONFIG_IP_VS_DEBUG is set.
 */
EXPORT_SYMBOL ( register_ip_vs_scheduler ) ;
EXPORT_SYMBOL ( unregister_ip_vs_scheduler ) ;
EXPORT_SYMBOL ( ip_vs_skb_replace ) ;
EXPORT_SYMBOL ( ip_vs_proto_name ) ;
EXPORT_SYMBOL ( ip_vs_conn_new ) ;
EXPORT_SYMBOL ( ip_vs_conn_in_get ) ;
EXPORT_SYMBOL ( ip_vs_conn_out_get ) ;
# ifdef CONFIG_IP_VS_PROTO_TCP
EXPORT_SYMBOL ( ip_vs_tcp_conn_listen ) ;
# endif
EXPORT_SYMBOL ( ip_vs_conn_put ) ;
# ifdef CONFIG_IP_VS_DEBUG
EXPORT_SYMBOL ( ip_vs_get_debug_level ) ;
# endif
EXPORT_SYMBOL ( ip_vs_make_skb_writable ) ;
/*
 * ID used in ICMP lookups: expands to the echo.id member of the ICMP
 * header's 'un' union.  'icmph' must be a pointer to the ICMP header.
 */
# define icmp_id(icmph) (((icmph)->un).echo.id)
/*
 *	Return a printable name for an IP protocol number.
 *
 *	IP, UDP, TCP and ICMP map to constant strings.  Any other value is
 *	formatted into a function-local static buffer: the returned pointer
 *	must not be freed, and its contents are overwritten by the next call
 *	that reaches the default branch.
 */
const char *ip_vs_proto_name(unsigned proto)
{
	static char buf[20];

	switch (proto) {
	case IPPROTO_IP:
		return " IP ";
	case IPPROTO_UDP:
		return " UDP ";
	case IPPROTO_TCP:
		return " TCP ";
	case IPPROTO_ICMP:
		return " ICMP ";
	default:
		/*
		 * proto is unsigned, so print it with %u (a %d conversion
		 * shows large values as negative numbers), and bound the
		 * write to the size of the static buffer.
		 */
		snprintf(buf, sizeof(buf), " IP_%u ", proto);
		return buf;
	}
}
/*
 *	Initialise every bucket of a hash table as an empty list.
 *	@table: array of list heads to initialise
 *	@rows:  number of entries in @table; nothing is done when <= 0
 */
void ip_vs_init_hash_table(struct list_head *table, int rows)
{
	int idx;

	/* walk from the last bucket down to the first */
	for (idx = rows - 1; idx >= 0; idx--)
		INIT_LIST_HEAD(&table[idx]);
}
static inline void
ip_vs_in_stats ( struct ip_vs_conn * cp , struct sk_buff * skb )
{
struct ip_vs_dest * dest = cp - > dest ;
if ( dest & & ( dest - > flags & IP_VS_DEST_F_AVAILABLE ) ) {
spin_lock ( & dest - > stats . lock ) ;
dest - > stats . inpkts + + ;
dest - > stats . inbytes + = skb - > len ;
spin_unlock ( & dest - > stats . lock ) ;
spin_lock ( & dest - > svc - > stats . lock ) ;
dest - > svc - > stats . inpkts + + ;
dest - > svc - > stats . inbytes + = skb - > len ;
spin_unlock ( & dest - > svc - > stats . lock ) ;
spin_lock ( & ip_vs_stats . lock ) ;
ip_vs_stats . inpkts + + ;
ip_vs_stats . inbytes + = skb - > len ;
spin_unlock ( & ip_vs_stats . lock ) ;
}
}
static inline void
ip_vs_out_stats ( struct ip_vs_conn * cp , struct sk_buff * skb )
{
struct ip_vs_dest * dest = cp - > dest ;
if ( dest & & ( dest - > flags & IP_VS_DEST_F_AVAILABLE ) ) {
spin_lock ( & dest - > stats . lock ) ;
dest - > stats . outpkts + + ;
dest - > stats . outbytes + = skb - > len ;
spin_unlock ( & dest - > stats . lock ) ;
spin_lock ( & dest - > svc - > stats . lock ) ;
dest - > svc - > stats . outpkts + + ;
dest - > svc - > stats . outbytes + = skb - > len ;
spin_unlock ( & dest - > svc - > stats . lock ) ;
spin_lock ( & ip_vs_stats . lock ) ;
ip_vs_stats . outpkts + + ;
ip_vs_stats . outbytes + = skb - > len ;
spin_unlock ( & ip_vs_stats . lock ) ;
}
}
static inline void
ip_vs_conn_stats ( struct ip_vs_conn * cp , struct ip_vs_service * svc )
{
spin_lock ( & cp - > dest - > stats . lock ) ;
cp - > dest - > stats . conns + + ;
spin_unlock ( & cp - > dest - > stats . lock ) ;
spin_lock ( & svc - > stats . lock ) ;
svc - > stats . conns + + ;
spin_unlock ( & svc - > stats . lock ) ;
spin_lock ( & ip_vs_stats . lock ) ;
ip_vs_stats . conns + + ;
spin_unlock ( & ip_vs_stats . lock ) ;
}
static inline int
ip_vs_set_state ( struct ip_vs_conn * cp , int direction ,
const struct sk_buff * skb ,
struct ip_vs_protocol * pp )
{
if ( unlikely ( ! pp - > state_transition ) )
return 0 ;
return pp - > state_transition ( cp , direction , skb , pp ) ;
}
int ip_vs_make_skb_writable ( struct sk_buff * * pskb , int writable_len )
{
struct sk_buff * skb = * pskb ;
/* skb is already used, better copy skb and its payload */
if ( unlikely ( skb_shared ( skb ) | | skb - > sk ) )
goto copy_skb ;
/* skb data is already used, copy it */
if ( unlikely ( skb_cloned ( skb ) ) )
goto copy_data ;
return pskb_may_pull ( skb , writable_len ) ;
copy_data :
if ( unlikely ( writable_len > skb - > len ) )
return 0 ;
return ! pskb_expand_head ( skb , 0 , 0 , GFP_ATOMIC ) ;
copy_skb :
if ( unlikely ( writable_len > skb - > len ) )
return 0 ;
skb = skb_copy ( skb , GFP_ATOMIC ) ;
if ( ! skb )
return 0 ;
BUG_ON ( skb_is_nonlinear ( skb ) ) ;
/* Rest of kernel will get very unhappy if we pass it a
suddenly - orphaned skbuff */
if ( ( * pskb ) - > sk )
skb_set_owner_w ( skb , ( * pskb ) - > sk ) ;
kfree_skb ( * pskb ) ;
* pskb = skb ;
return 1 ;
}
/*
* IPVS persistent scheduling function
* It creates a connection entry according to its template if exists ,
* or selects a server and creates a connection entry plus a template .
* Locking : we are svc user ( svc - > refcnt ) , so we hold all dests too
* Protocols supported : TCP , UDP
*/
static struct ip_vs_conn *
ip_vs_sched_persist ( struct ip_vs_service * svc ,
const struct sk_buff * skb ,
2006-09-28 14:29:52 -07:00
__be16 ports [ 2 ] )
2005-04-16 15:20:36 -07:00
{
struct ip_vs_conn * cp = NULL ;
2007-04-20 22:47:35 -07:00
struct iphdr * iph = ip_hdr ( skb ) ;
2005-04-16 15:20:36 -07:00
struct ip_vs_dest * dest ;
struct ip_vs_conn * ct ;
2006-09-28 14:29:52 -07:00
__be16 dport ; /* destination port to forward */
__be32 snet ; /* source network of the client, after masking */
2005-04-16 15:20:36 -07:00
/* Mask saddr with the netmask to adjust template granularity */
snet = iph - > saddr & svc - > netmask ;
IP_VS_DBG ( 6 , " p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u "
" mnet %u.%u.%u.%u \n " ,
NIPQUAD ( iph - > saddr ) , ntohs ( ports [ 0 ] ) ,
NIPQUAD ( iph - > daddr ) , ntohs ( ports [ 1 ] ) ,
NIPQUAD ( snet ) ) ;
/*
* As far as we know , FTP is a very complicated network protocol , and
* it uses control connection and data connections . For active FTP ,
* FTP server initialize data connection to the client , its source port
* is often 20. For passive FTP , FTP server tells the clients the port
* that it passively listens to , and the client issues the data
* connection . In the tunneling or direct routing mode , the load
* balancer is on the client - to - server half of connection , the port
* number is unknown to the load balancer . So , a conn template like
* < caddr , 0 , vaddr , 0 , daddr , 0 > is created for persistent FTP
* service , and a template like < caddr , 0 , vaddr , vport , daddr , dport >
* is created for other persistent services .
*/
if ( ports [ 1 ] = = svc - > port ) {
/* Check if a template already exists */
if ( svc - > port ! = FTPPORT )
2005-09-14 21:08:51 -07:00
ct = ip_vs_ct_in_get ( iph - > protocol , snet , 0 ,
2005-04-16 15:20:36 -07:00
iph - > daddr , ports [ 1 ] ) ;
else
2005-09-14 21:08:51 -07:00
ct = ip_vs_ct_in_get ( iph - > protocol , snet , 0 ,
2005-04-16 15:20:36 -07:00
iph - > daddr , 0 ) ;
if ( ! ct | | ! ip_vs_check_template ( ct ) ) {
/*
* No template found or the dest of the connection
* template is not available .
*/
dest = svc - > scheduler - > schedule ( svc , skb ) ;
if ( dest = = NULL ) {
IP_VS_DBG ( 1 , " p-schedule: no dest found. \n " ) ;
return NULL ;
}
/*
* Create a template like < protocol , caddr , 0 ,
* vaddr , vport , daddr , dport > for non - ftp service ,
* and < protocol , caddr , 0 , vaddr , 0 , daddr , 0 >
* for ftp service .
*/
if ( svc - > port ! = FTPPORT )
ct = ip_vs_conn_new ( iph - > protocol ,
snet , 0 ,
iph - > daddr ,
ports [ 1 ] ,
dest - > addr , dest - > port ,
2005-09-14 21:08:51 -07:00
IP_VS_CONN_F_TEMPLATE ,
2005-04-16 15:20:36 -07:00
dest ) ;
else
ct = ip_vs_conn_new ( iph - > protocol ,
snet , 0 ,
iph - > daddr , 0 ,
dest - > addr , 0 ,
2005-09-14 21:08:51 -07:00
IP_VS_CONN_F_TEMPLATE ,
2005-04-16 15:20:36 -07:00
dest ) ;
if ( ct = = NULL )
return NULL ;
ct - > timeout = svc - > timeout ;
} else {
/* set destination with the found template */
dest = ct - > dest ;
}
dport = dest - > port ;
} else {
/*
* Note : persistent fwmark - based services and persistent
* port zero service are handled here .
* fwmark template : < IPPROTO_IP , caddr , 0 , fwmark , 0 , daddr , 0 >
* port zero template : < protocol , caddr , 0 , vaddr , 0 , daddr , 0 >
*/
if ( svc - > fwmark )
2005-09-14 21:08:51 -07:00
ct = ip_vs_ct_in_get ( IPPROTO_IP , snet , 0 ,
2005-04-16 15:20:36 -07:00
htonl ( svc - > fwmark ) , 0 ) ;
else
2005-09-14 21:08:51 -07:00
ct = ip_vs_ct_in_get ( iph - > protocol , snet , 0 ,
2005-04-16 15:20:36 -07:00
iph - > daddr , 0 ) ;
if ( ! ct | | ! ip_vs_check_template ( ct ) ) {
/*
* If it is not persistent port zero , return NULL ,
* otherwise create a connection template .
*/
if ( svc - > port )
return NULL ;
dest = svc - > scheduler - > schedule ( svc , skb ) ;
if ( dest = = NULL ) {
IP_VS_DBG ( 1 , " p-schedule: no dest found. \n " ) ;
return NULL ;
}
/*
* Create a template according to the service
*/
if ( svc - > fwmark )
ct = ip_vs_conn_new ( IPPROTO_IP ,
snet , 0 ,
htonl ( svc - > fwmark ) , 0 ,
dest - > addr , 0 ,
2005-09-14 21:08:51 -07:00
IP_VS_CONN_F_TEMPLATE ,
2005-04-16 15:20:36 -07:00
dest ) ;
else
ct = ip_vs_conn_new ( iph - > protocol ,
snet , 0 ,
iph - > daddr , 0 ,
dest - > addr , 0 ,
2005-09-14 21:08:51 -07:00
IP_VS_CONN_F_TEMPLATE ,
2005-04-16 15:20:36 -07:00
dest ) ;
if ( ct = = NULL )
return NULL ;
ct - > timeout = svc - > timeout ;
} else {
/* set destination with the found template */
dest = ct - > dest ;
}
dport = ports [ 1 ] ;
}
/*
* Create a new connection according to the template
*/
cp = ip_vs_conn_new ( iph - > protocol ,
iph - > saddr , ports [ 0 ] ,
iph - > daddr , ports [ 1 ] ,
dest - > addr , dport ,
0 ,
dest ) ;
if ( cp = = NULL ) {
ip_vs_conn_put ( ct ) ;
return NULL ;
}
/*
* Add its control
*/
ip_vs_control_add ( cp , ct ) ;
ip_vs_conn_put ( ct ) ;
ip_vs_conn_stats ( cp , svc ) ;
return cp ;
}
/*
* IPVS main scheduling function
* It selects a server according to the virtual service , and
* creates a connection entry .
* Protocols supported : TCP , UDP
*/
struct ip_vs_conn *
ip_vs_schedule ( struct ip_vs_service * svc , const struct sk_buff * skb )
{
struct ip_vs_conn * cp = NULL ;
2007-04-20 22:47:35 -07:00
struct iphdr * iph = ip_hdr ( skb ) ;
2005-04-16 15:20:36 -07:00
struct ip_vs_dest * dest ;
2006-09-28 14:29:52 -07:00
__be16 _ports [ 2 ] , * pptr ;
2005-04-16 15:20:36 -07:00
pptr = skb_header_pointer ( skb , iph - > ihl * 4 ,
sizeof ( _ports ) , _ports ) ;
if ( pptr = = NULL )
return NULL ;
/*
* Persistent service
*/
if ( svc - > flags & IP_VS_SVC_F_PERSISTENT )
return ip_vs_sched_persist ( svc , skb , pptr ) ;
/*
* Non - persistent service
*/
if ( ! svc - > fwmark & & pptr [ 1 ] ! = svc - > port ) {
if ( ! svc - > port )
IP_VS_ERR ( " Schedule: port zero only supported "
" in persistent services, "
" check your ipvs configuration \n " ) ;
return NULL ;
}
dest = svc - > scheduler - > schedule ( svc , skb ) ;
if ( dest = = NULL ) {
IP_VS_DBG ( 1 , " Schedule: no dest found. \n " ) ;
return NULL ;
}
/*
* Create a connection entry .
*/
cp = ip_vs_conn_new ( iph - > protocol ,
iph - > saddr , pptr [ 0 ] ,
iph - > daddr , pptr [ 1 ] ,
dest - > addr , dest - > port ? dest - > port : pptr [ 1 ] ,
0 ,
dest ) ;
if ( cp = = NULL )
return NULL ;
IP_VS_DBG ( 6 , " Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
2006-01-03 14:22:59 -08:00
" d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d \n " ,
2005-04-16 15:20:36 -07:00
ip_vs_fwd_tag ( cp ) ,
NIPQUAD ( cp - > caddr ) , ntohs ( cp - > cport ) ,
NIPQUAD ( cp - > vaddr ) , ntohs ( cp - > vport ) ,
NIPQUAD ( cp - > daddr ) , ntohs ( cp - > dport ) ,
cp - > flags , atomic_read ( & cp - > refcnt ) ) ;
ip_vs_conn_stats ( cp , svc ) ;
return cp ;
}
/*
* Pass or drop the packet .
* Called by ip_vs_in , when the virtual service is available but
* no destination is available for a new connection .
*/
int ip_vs_leave ( struct ip_vs_service * svc , struct sk_buff * skb ,
struct ip_vs_protocol * pp )
{
2006-09-28 14:29:52 -07:00
__be16 _ports [ 2 ] , * pptr ;
2007-04-20 22:47:35 -07:00
struct iphdr * iph = ip_hdr ( skb ) ;
2005-04-16 15:20:36 -07:00
pptr = skb_header_pointer ( skb , iph - > ihl * 4 ,
sizeof ( _ports ) , _ports ) ;
if ( pptr = = NULL ) {
ip_vs_service_put ( svc ) ;
return NF_DROP ;
}
/* if it is fwmark-based service, the cache_bypass sysctl is up
and the destination is RTN_UNICAST ( and not local ) , then create
a cache_bypass connection entry */
if ( sysctl_ip_vs_cache_bypass & & svc - > fwmark
& & ( inet_addr_type ( iph - > daddr ) = = RTN_UNICAST ) ) {
int ret , cs ;
struct ip_vs_conn * cp ;
ip_vs_service_put ( svc ) ;
/* create a new connection entry */
IP_VS_DBG ( 6 , " ip_vs_leave: create a cache_bypass entry \n " ) ;
cp = ip_vs_conn_new ( iph - > protocol ,
iph - > saddr , pptr [ 0 ] ,
iph - > daddr , pptr [ 1 ] ,
0 , 0 ,
IP_VS_CONN_F_BYPASS ,
NULL ) ;
if ( cp = = NULL )
return NF_DROP ;
/* statistics */
ip_vs_in_stats ( cp , skb ) ;
/* set state */
cs = ip_vs_set_state ( cp , IP_VS_DIR_INPUT , skb , pp ) ;
/* transmit the first SYN packet */
ret = cp - > packet_xmit ( skb , cp , pp ) ;
/* do not touch skb anymore */
atomic_inc ( & cp - > in_pkts ) ;
ip_vs_conn_put ( cp ) ;
return ret ;
}
/*
* When the virtual ftp service is presented , packets destined
* for other services on the VIP may get here ( except services
* listed in the ipvs table ) , pass the packets , because it is
* not ipvs job to decide to drop the packets .
*/
if ( ( svc - > port = = FTPPORT ) & & ( pptr [ 1 ] ! = FTPPORT ) ) {
ip_vs_service_put ( svc ) ;
return NF_ACCEPT ;
}
ip_vs_service_put ( svc ) ;
/*
* Notify the client that the destination is unreachable , and
* release the socket buffer .
* Since it is in IP layer , the TCP socket is not actually
* created , the TCP RST packet cannot be sent , instead that
* ICMP_PORT_UNREACH is sent here no matter it is TCP / UDP . - - WZ
*/
icmp_send ( skb , ICMP_DEST_UNREACH , ICMP_PORT_UNREACH , 0 ) ;
return NF_DROP ;
}
/*
* It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING
* chain , and is used for VS / NAT .
* It detects packets for VS / NAT connections and sends the packets
* immediately . This can avoid that iptable_nat mangles the packets
* for VS / NAT .
*/
static unsigned int ip_vs_post_routing ( unsigned int hooknum ,
struct sk_buff * * pskb ,
const struct net_device * in ,
const struct net_device * out ,
int ( * okfn ) ( struct sk_buff * ) )
{
2005-08-09 19:24:19 -07:00
if ( ! ( ( * pskb ) - > ipvs_property ) )
2005-04-16 15:20:36 -07:00
return NF_ACCEPT ;
/* The packet was sent from IPVS, exit this chain */
2006-01-05 12:20:40 -08:00
return NF_STOP ;
2005-04-16 15:20:36 -07:00
}
2006-11-14 21:37:50 -08:00
__sum16 ip_vs_checksum_complete ( struct sk_buff * skb , int offset )
2005-04-16 15:20:36 -07:00
{
2006-11-14 21:24:49 -08:00
return csum_fold ( skb_checksum ( skb , offset , skb - > len - offset , 0 ) ) ;
2005-04-16 15:20:36 -07:00
}
/*
 *	Hand the skb to ip_defrag().  When a packet comes back, its IP
 *	header checksum is recomputed before it is returned; a NULL
 *	result from ip_defrag() is passed on to the caller.
 */
static inline struct sk_buff *
ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
{
	struct sk_buff *whole = ip_defrag(skb, user);

	if (!whole)
		return NULL;

	ip_send_check(ip_hdr(whole));
	return whole;
}
/*
* Packet has been made sufficiently writable in caller
* - inout : 1 = in - > out , 0 = out - > in
*/
void ip_vs_nat_icmp ( struct sk_buff * skb , struct ip_vs_protocol * pp ,
struct ip_vs_conn * cp , int inout )
{
2007-04-20 22:47:35 -07:00
struct iphdr * iph = ip_hdr ( skb ) ;
2005-04-16 15:20:36 -07:00
unsigned int icmp_offset = iph - > ihl * 4 ;
2007-04-10 20:50:43 -07:00
struct icmphdr * icmph = ( struct icmphdr * ) ( skb_network_header ( skb ) +
icmp_offset ) ;
2005-04-16 15:20:36 -07:00
struct iphdr * ciph = ( struct iphdr * ) ( icmph + 1 ) ;
if ( inout ) {
iph - > saddr = cp - > vaddr ;
ip_send_check ( iph ) ;
ciph - > daddr = cp - > vaddr ;
ip_send_check ( ciph ) ;
} else {
iph - > daddr = cp - > daddr ;
ip_send_check ( iph ) ;
ciph - > saddr = cp - > daddr ;
ip_send_check ( ciph ) ;
}
/* the TCP/UDP port */
if ( IPPROTO_TCP = = ciph - > protocol | | IPPROTO_UDP = = ciph - > protocol ) {
2006-09-28 14:29:52 -07:00
__be16 * ports = ( void * ) ciph + ciph - > ihl * 4 ;
2005-04-16 15:20:36 -07:00
if ( inout )
ports [ 1 ] = cp - > vport ;
else
ports [ 0 ] = cp - > dport ;
}
/* And finally the ICMP checksum */
icmph - > checksum = 0 ;
icmph - > checksum = ip_vs_checksum_complete ( skb , icmp_offset ) ;
skb - > ip_summed = CHECKSUM_UNNECESSARY ;
if ( inout )
IP_VS_DBG_PKT ( 11 , pp , skb , ( void * ) ciph - ( void * ) iph ,
" Forwarding altered outgoing ICMP " ) ;
else
IP_VS_DBG_PKT ( 11 , pp , skb , ( void * ) ciph - ( void * ) iph ,
" Forwarding altered incoming ICMP " ) ;
}
/*
* Handle ICMP messages in the inside - to - outside direction ( outgoing ) .
* Find any that might be relevant , check against existing connections ,
* forward to the right destination host if relevant .
* Currently handles error types - unreachable , quench , ttl exceeded .
* ( Only used in VS / NAT )
*/
static int ip_vs_out_icmp ( struct sk_buff * * pskb , int * related )
{
struct sk_buff * skb = * pskb ;
struct iphdr * iph ;
struct icmphdr _icmph , * ic ;
struct iphdr _ciph , * cih ; /* The ip header contained within the ICMP */
struct ip_vs_conn * cp ;
struct ip_vs_protocol * pp ;
unsigned int offset , ihl , verdict ;
* related = 1 ;
/* reassemble IP fragments */
2007-04-20 22:47:35 -07:00
if ( ip_hdr ( skb ) - > frag_off & htons ( IP_MF | IP_OFFSET ) ) {
2005-04-16 15:20:36 -07:00
skb = ip_vs_gather_frags ( skb , IP_DEFRAG_VS_OUT ) ;
if ( ! skb )
return NF_STOLEN ;
* pskb = skb ;
}
2007-04-20 22:47:35 -07:00
iph = ip_hdr ( skb ) ;
2005-04-16 15:20:36 -07:00
offset = ihl = iph - > ihl * 4 ;
ic = skb_header_pointer ( skb , offset , sizeof ( _icmph ) , & _icmph ) ;
if ( ic = = NULL )
return NF_DROP ;
IP_VS_DBG ( 12 , " Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u \n " ,
ic - > type , ntohs ( icmp_id ( ic ) ) ,
NIPQUAD ( iph - > saddr ) , NIPQUAD ( iph - > daddr ) ) ;
/*
* Work through seeing if this is for us .
* These checks are supposed to be in an order that means easy
* things are checked first to speed up processing . . . . however
* this means that some packets will manage to get a long way
* down this stack and then be rejected , but that ' s life .
*/
if ( ( ic - > type ! = ICMP_DEST_UNREACH ) & &
( ic - > type ! = ICMP_SOURCE_QUENCH ) & &
( ic - > type ! = ICMP_TIME_EXCEEDED ) ) {
* related = 0 ;
return NF_ACCEPT ;
}
/* Now find the contained IP header */
offset + = sizeof ( _icmph ) ;
cih = skb_header_pointer ( skb , offset , sizeof ( _ciph ) , & _ciph ) ;
if ( cih = = NULL )
return NF_ACCEPT ; /* The packet looks wrong, ignore */
pp = ip_vs_proto_get ( cih - > protocol ) ;
if ( ! pp )
return NF_ACCEPT ;
/* Is the embedded protocol header present? */
2007-03-07 14:19:10 +09:00
if ( unlikely ( cih - > frag_off & htons ( IP_OFFSET ) & &
2005-04-16 15:20:36 -07:00
pp - > dont_defrag ) )
return NF_ACCEPT ;
IP_VS_DBG_PKT ( 11 , pp , skb , offset , " Checking outgoing ICMP for " ) ;
offset + = cih - > ihl * 4 ;
/* The embedded headers contain source and dest in reverse order */
cp = pp - > conn_out_get ( skb , pp , cih , offset , 1 ) ;
if ( ! cp )
return NF_ACCEPT ;
verdict = NF_DROP ;
if ( IP_VS_FWD_METHOD ( cp ) ! = 0 ) {
IP_VS_ERR ( " shouldn't reach here, because the box is on the "
" half connection in the tun/dr module. \n " ) ;
}
/* Ensure the checksum is correct */
2007-04-09 11:59:39 -07:00
if ( ! skb_csum_unnecessary ( skb ) & & ip_vs_checksum_complete ( skb , ihl ) ) {
2005-04-16 15:20:36 -07:00
/* Failed checksum! */
IP_VS_DBG ( 1 , " Forward ICMP: failed checksum from %d.%d.%d.%d! \n " ,
NIPQUAD ( iph - > saddr ) ) ;
goto out ;
}
if ( IPPROTO_TCP = = cih - > protocol | | IPPROTO_UDP = = cih - > protocol )
offset + = 2 * sizeof ( __u16 ) ;
if ( ! ip_vs_make_skb_writable ( pskb , offset ) )
goto out ;
skb = * pskb ;
ip_vs_nat_icmp ( skb , pp , cp , 1 ) ;
/* do the statistics and put it back */
ip_vs_out_stats ( cp , skb ) ;
2005-08-09 19:24:19 -07:00
skb - > ipvs_property = 1 ;
2005-04-16 15:20:36 -07:00
verdict = NF_ACCEPT ;
out :
__ip_vs_conn_put ( cp ) ;
return verdict ;
}
static inline int is_tcp_reset ( const struct sk_buff * skb )
{
struct tcphdr _tcph , * th ;
2007-03-12 20:09:15 -03:00
th = skb_header_pointer ( skb , ip_hdrlen ( skb ) , sizeof ( _tcph ) , & _tcph ) ;
2005-04-16 15:20:36 -07:00
if ( th = = NULL )
return 0 ;
return th - > rst ;
}
/*
* It is hooked at the NF_IP_FORWARD chain , used only for VS / NAT .
* Check if outgoing packet belongs to the established ip_vs_conn ,
* rewrite addresses of the packet and send it on its way . . .
*/
static unsigned int
ip_vs_out ( unsigned int hooknum , struct sk_buff * * pskb ,
const struct net_device * in , const struct net_device * out ,
int ( * okfn ) ( struct sk_buff * ) )
{
struct sk_buff * skb = * pskb ;
struct iphdr * iph ;
struct ip_vs_protocol * pp ;
struct ip_vs_conn * cp ;
int ihl ;
EnterFunction ( 11 ) ;
2005-08-09 19:24:19 -07:00
if ( skb - > ipvs_property )
2005-04-16 15:20:36 -07:00
return NF_ACCEPT ;
2007-04-20 22:47:35 -07:00
iph = ip_hdr ( skb ) ;
2005-04-16 15:20:36 -07:00
if ( unlikely ( iph - > protocol = = IPPROTO_ICMP ) ) {
int related , verdict = ip_vs_out_icmp ( pskb , & related ) ;
if ( related )
return verdict ;
skb = * pskb ;
2007-04-20 22:47:35 -07:00
iph = ip_hdr ( skb ) ;
2005-04-16 15:20:36 -07:00
}
pp = ip_vs_proto_get ( iph - > protocol ) ;
if ( unlikely ( ! pp ) )
return NF_ACCEPT ;
/* reassemble IP fragments */
2007-03-07 14:19:10 +09:00
if ( unlikely ( iph - > frag_off & htons ( IP_MF | IP_OFFSET ) & &
2005-04-16 15:20:36 -07:00
! pp - > dont_defrag ) ) {
skb = ip_vs_gather_frags ( skb , IP_DEFRAG_VS_OUT ) ;
if ( ! skb )
return NF_STOLEN ;
2007-04-20 22:47:35 -07:00
iph = ip_hdr ( skb ) ;
2005-04-16 15:20:36 -07:00
* pskb = skb ;
}
ihl = iph - > ihl < < 2 ;
/*
* Check if the packet belongs to an existing entry
*/
cp = pp - > conn_out_get ( skb , pp , iph , ihl , 0 ) ;
if ( unlikely ( ! cp ) ) {
if ( sysctl_ip_vs_nat_icmp_send & &
( pp - > protocol = = IPPROTO_TCP | |
pp - > protocol = = IPPROTO_UDP ) ) {
2006-09-28 14:29:52 -07:00
__be16 _ports [ 2 ] , * pptr ;
2005-04-16 15:20:36 -07:00
pptr = skb_header_pointer ( skb , ihl ,
sizeof ( _ports ) , _ports ) ;
if ( pptr = = NULL )
return NF_ACCEPT ; /* Not for me */
if ( ip_vs_lookup_real_service ( iph - > protocol ,
iph - > saddr , pptr [ 0 ] ) ) {
/*
* Notify the real server : there is no
* existing entry if it is not RST
* packet or not TCP packet .
*/
if ( iph - > protocol ! = IPPROTO_TCP
| | ! is_tcp_reset ( skb ) ) {
icmp_send ( skb , ICMP_DEST_UNREACH ,
ICMP_PORT_UNREACH , 0 ) ;
return NF_DROP ;
}
}
}
IP_VS_DBG_PKT ( 12 , pp , skb , 0 ,
" packet continues traversal as normal " ) ;
return NF_ACCEPT ;
}
IP_VS_DBG_PKT ( 11 , pp , skb , 0 , " Outgoing packet " ) ;
if ( ! ip_vs_make_skb_writable ( pskb , ihl ) )
goto drop ;
/* mangle the packet */
if ( pp - > snat_handler & & ! pp - > snat_handler ( pskb , pp , cp ) )
goto drop ;
skb = * pskb ;
2007-04-20 22:47:35 -07:00
ip_hdr ( skb ) - > saddr = cp - > vaddr ;
ip_send_check ( ip_hdr ( skb ) ) ;
2005-04-16 15:20:36 -07:00
2007-02-09 23:24:47 +09:00
/* For policy routing, packets originating from this
* machine itself may be routed differently to packets
* passing through . We want this packet to be routed as
* if it came from this machine itself . So re - compute
* the routing information .
*/
if ( ip_route_me_harder ( pskb , RTN_LOCAL ) ! = 0 )
goto drop ;
2006-10-02 16:11:51 -07:00
skb = * pskb ;
2005-04-16 15:20:36 -07:00
IP_VS_DBG_PKT ( 10 , pp , skb , 0 , " After SNAT " ) ;
ip_vs_out_stats ( cp , skb ) ;
ip_vs_set_state ( cp , IP_VS_DIR_OUTPUT , skb , pp ) ;
ip_vs_conn_put ( cp ) ;
2005-08-09 19:24:19 -07:00
skb - > ipvs_property = 1 ;
2005-04-16 15:20:36 -07:00
LeaveFunction ( 11 ) ;
return NF_ACCEPT ;
drop :
ip_vs_conn_put ( cp ) ;
kfree_skb ( * pskb ) ;
return NF_STOLEN ;
}
/*
* Handle ICMP messages in the outside - to - inside direction ( incoming ) .
* Find any that might be relevant , check against existing connections ,
* forward to the right destination host if relevant .
* Currently handles error types - unreachable , quench , ttl exceeded .
*/
2007-02-09 23:24:47 +09:00
static int
2005-04-16 15:20:36 -07:00
ip_vs_in_icmp ( struct sk_buff * * pskb , int * related , unsigned int hooknum )
{
struct sk_buff * skb = * pskb ;
struct iphdr * iph ;
struct icmphdr _icmph , * ic ;
struct iphdr _ciph , * cih ; /* The ip header contained within the ICMP */
struct ip_vs_conn * cp ;
struct ip_vs_protocol * pp ;
unsigned int offset , ihl , verdict ;
* related = 1 ;
/* reassemble IP fragments */
2007-04-20 22:47:35 -07:00
if ( ip_hdr ( skb ) - > frag_off & htons ( IP_MF | IP_OFFSET ) ) {
2005-04-16 15:20:36 -07:00
skb = ip_vs_gather_frags ( skb ,
2007-02-09 23:24:47 +09:00
hooknum = = NF_IP_LOCAL_IN ?
2005-04-16 15:20:36 -07:00
IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD ) ;
if ( ! skb )
return NF_STOLEN ;
* pskb = skb ;
}
2007-04-20 22:47:35 -07:00
iph = ip_hdr ( skb ) ;
2005-04-16 15:20:36 -07:00
offset = ihl = iph - > ihl * 4 ;
ic = skb_header_pointer ( skb , offset , sizeof ( _icmph ) , & _icmph ) ;
if ( ic = = NULL )
return NF_DROP ;
IP_VS_DBG ( 12 , " Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u \n " ,
ic - > type , ntohs ( icmp_id ( ic ) ) ,
NIPQUAD ( iph - > saddr ) , NIPQUAD ( iph - > daddr ) ) ;
/*
* Work through seeing if this is for us .
* These checks are supposed to be in an order that means easy
* things are checked first to speed up processing . . . . however
* this means that some packets will manage to get a long way
* down this stack and then be rejected , but that ' s life .
*/
if ( ( ic - > type ! = ICMP_DEST_UNREACH ) & &
( ic - > type ! = ICMP_SOURCE_QUENCH ) & &
( ic - > type ! = ICMP_TIME_EXCEEDED ) ) {
* related = 0 ;
return NF_ACCEPT ;
}
/* Now find the contained IP header */
offset + = sizeof ( _icmph ) ;
cih = skb_header_pointer ( skb , offset , sizeof ( _ciph ) , & _ciph ) ;
if ( cih = = NULL )
return NF_ACCEPT ; /* The packet looks wrong, ignore */
pp = ip_vs_proto_get ( cih - > protocol ) ;
if ( ! pp )
return NF_ACCEPT ;
/* Is the embedded protocol header present? */
2007-03-07 14:19:10 +09:00
if ( unlikely ( cih - > frag_off & htons ( IP_OFFSET ) & &
2005-04-16 15:20:36 -07:00
pp - > dont_defrag ) )
return NF_ACCEPT ;
IP_VS_DBG_PKT ( 11 , pp , skb , offset , " Checking incoming ICMP for " ) ;
offset + = cih - > ihl * 4 ;
/* The embedded headers contain source and dest in reverse order */
cp = pp - > conn_in_get ( skb , pp , cih , offset , 1 ) ;
if ( ! cp )
return NF_ACCEPT ;
verdict = NF_DROP ;
/* Ensure the checksum is correct */
2007-04-09 11:59:39 -07:00
if ( ! skb_csum_unnecessary ( skb ) & & ip_vs_checksum_complete ( skb , ihl ) ) {
2005-04-16 15:20:36 -07:00
/* Failed checksum! */
IP_VS_DBG ( 1 , " Incoming ICMP: failed checksum from %d.%d.%d.%d! \n " ,
NIPQUAD ( iph - > saddr ) ) ;
goto out ;
}
/* do the statistics and put it back */
ip_vs_in_stats ( cp , skb ) ;
if ( IPPROTO_TCP = = cih - > protocol | | IPPROTO_UDP = = cih - > protocol )
offset + = 2 * sizeof ( __u16 ) ;
verdict = ip_vs_icmp_xmit ( skb , cp , pp , offset ) ;
/* do not touch skb anymore */
out :
__ip_vs_conn_put ( cp ) ;
return verdict ;
}
/*
* Check if it ' s for virtual services , look it up ,
* and send it on its way . . .
*/
static unsigned int
ip_vs_in ( unsigned int hooknum , struct sk_buff * * pskb ,
const struct net_device * in , const struct net_device * out ,
int ( * okfn ) ( struct sk_buff * ) )
{
struct sk_buff * skb = * pskb ;
struct iphdr * iph ;
struct ip_vs_protocol * pp ;
struct ip_vs_conn * cp ;
int ret , restart ;
int ihl ;
/*
* Big tappo : only PACKET_HOST ( neither loopback nor mcasts )
* . . . don ' t know why 1 st test DOES NOT include 2 nd ( ? )
*/
if ( unlikely ( skb - > pkt_type ! = PACKET_HOST
| | skb - > dev = = & loopback_dev | | skb - > sk ) ) {
IP_VS_DBG ( 12 , " packet type=%d proto=%d daddr=%d.%d.%d.%d ignored \n " ,
skb - > pkt_type ,
2007-04-20 22:47:35 -07:00
ip_hdr ( skb ) - > protocol ,
NIPQUAD ( ip_hdr ( skb ) - > daddr ) ) ;
2005-04-16 15:20:36 -07:00
return NF_ACCEPT ;
}
2007-04-20 22:47:35 -07:00
iph = ip_hdr ( skb ) ;
2005-04-16 15:20:36 -07:00
if ( unlikely ( iph - > protocol = = IPPROTO_ICMP ) ) {
int related , verdict = ip_vs_in_icmp ( pskb , & related , hooknum ) ;
if ( related )
return verdict ;
skb = * pskb ;
2007-04-20 22:47:35 -07:00
iph = ip_hdr ( skb ) ;
2005-04-16 15:20:36 -07:00
}
/* Protocol supported? */
pp = ip_vs_proto_get ( iph - > protocol ) ;
if ( unlikely ( ! pp ) )
return NF_ACCEPT ;
ihl = iph - > ihl < < 2 ;
/*
* Check if the packet belongs to an existing connection entry
*/
cp = pp - > conn_in_get ( skb , pp , iph , ihl , 0 ) ;
if ( unlikely ( ! cp ) ) {
int v ;
if ( ! pp - > conn_schedule ( skb , pp , & v , & cp ) )
return v ;
}
if ( unlikely ( ! cp ) ) {
/* sorry, all this trouble for a no-hit :) */
IP_VS_DBG_PKT ( 12 , pp , skb , 0 ,
" packet continues traversal as normal " ) ;
return NF_ACCEPT ;
}
IP_VS_DBG_PKT ( 11 , pp , skb , 0 , " Incoming packet " ) ;
/* Check the server status */
if ( cp - > dest & & ! ( cp - > dest - > flags & IP_VS_DEST_F_AVAILABLE ) ) {
/* the destination server is not available */
if ( sysctl_ip_vs_expire_nodest_conn ) {
/* try to expire the connection immediately */
ip_vs_conn_expire_now ( cp ) ;
}
2005-11-08 09:40:05 -08:00
/* don't restart its timer, and silently
drop the packet . */
__ip_vs_conn_put ( cp ) ;
2005-04-16 15:20:36 -07:00
return NF_DROP ;
}
ip_vs_in_stats ( cp , skb ) ;
restart = ip_vs_set_state ( cp , IP_VS_DIR_INPUT , skb , pp ) ;
if ( cp - > packet_xmit )
ret = cp - > packet_xmit ( skb , cp , pp ) ;
/* do not touch skb anymore */
else {
IP_VS_DBG_RL ( " warning: packet_xmit is null " ) ;
ret = NF_ACCEPT ;
}
/* increase its packet counter and check if it is needed
to be synchronized */
atomic_inc ( & cp - > in_pkts ) ;
if ( ( ip_vs_sync_state & IP_VS_STATE_MASTER ) & &
( cp - > protocol ! = IPPROTO_TCP | |
cp - > state = = IP_VS_TCP_S_ESTABLISHED ) & &
( atomic_read ( & cp - > in_pkts ) % sysctl_ip_vs_sync_threshold [ 1 ]
= = sysctl_ip_vs_sync_threshold [ 0 ] ) )
ip_vs_sync_conn ( cp ) ;
ip_vs_conn_put ( cp ) ;
return ret ;
}
/*
* It is hooked at the NF_IP_FORWARD chain , in order to catch ICMP
* related packets destined for 0.0 .0 .0 / 0.
* When fwmark - based virtual service is used , such as transparent
* cache cluster , TCP packets can be marked and routed to ip_vs_in ,
* but ICMP destined for 0.0 .0 .0 / 0 cannot not be easily marked and
* sent to ip_vs_in_icmp . So , catch them at the NF_IP_FORWARD chain
* and send them to ip_vs_in_icmp .
*/
static unsigned int
ip_vs_forward_icmp ( unsigned int hooknum , struct sk_buff * * pskb ,
const struct net_device * in , const struct net_device * out ,
int ( * okfn ) ( struct sk_buff * ) )
{
int r ;
2007-04-20 22:47:35 -07:00
if ( ip_hdr ( * pskb ) - > protocol ! = IPPROTO_ICMP )
2005-04-16 15:20:36 -07:00
return NF_ACCEPT ;
return ip_vs_in_icmp ( pskb , & r , hooknum ) ;
}
/*
 * LOCAL_IN hook for ip_vs_in.  It runs after packet filtering and
 * forwards the packet through VS/DR, VS/TUN or VS/NAT (change
 * destination), so that filtering rules can be applied to IPVS.
 */
static struct nf_hook_ops ip_vs_in_ops = {
	.owner		= THIS_MODULE,
	.pf		= PF_INET,
	.hooknum	= NF_IP_LOCAL_IN,
	.priority	= 100,
	.hook		= ip_vs_in,
};
/*
 * FORWARD hook for ip_vs_out.  It runs after packet filtering and
 * changes the source only for VS/NAT.
 */
static struct nf_hook_ops ip_vs_out_ops = {
	.owner		= THIS_MODULE,
	.pf		= PF_INET,
	.hooknum	= NF_IP_FORWARD,
	.priority	= 100,
	.hook		= ip_vs_out,
};
/*
 * FORWARD hook for ip_vs_forward_icmp.  It runs after packet filtering
 * (but before ip_vs_out_icmp) and catches icmp destined for 0.0.0.0/0,
 * which is for incoming IPVS connections.
 */
static struct nf_hook_ops ip_vs_forward_icmp_ops = {
	.owner		= THIS_MODULE,
	.pf		= PF_INET,
	.hooknum	= NF_IP_FORWARD,
	.priority	= 99,
	.hook		= ip_vs_forward_icmp,
};
/*
 * POST_ROUTING hook for ip_vs_post_routing, registered one priority
 * step ahead of NF_IP_PRI_NAT_SRC: exit from POST_ROUTING before the
 * netfilter connection tracking.
 */
static struct nf_hook_ops ip_vs_post_routing_ops = {
	.owner		= THIS_MODULE,
	.pf		= PF_INET,
	.hooknum	= NF_IP_POST_ROUTING,
	.priority	= NF_IP_PRI_NAT_SRC - 1,
	.hook		= ip_vs_post_routing,
};
/*
 *	Initialize IP Virtual Server
 *
 *	Sets up control, protocols, application helpers and the connection
 *	table, then registers the four netfilter hooks.  When a step fails,
 *	everything set up before it is torn down in reverse order and the
 *	negative error code of the failing step is returned.
 */
static int __init ip_vs_init(void)
{
	int ret;

	ret = ip_vs_control_init();
	if (ret < 0) {
		IP_VS_ERR(" can't setup control. \n ");
		goto err_none;
	}

	ip_vs_protocol_init();

	ret = ip_vs_app_init();
	if (ret < 0) {
		IP_VS_ERR(" can't setup application helper. \n ");
		goto err_protocol;
	}

	ret = ip_vs_conn_init();
	if (ret < 0) {
		IP_VS_ERR(" can't setup connection table. \n ");
		goto err_app;
	}

	ret = nf_register_hook(&ip_vs_in_ops);
	if (ret < 0) {
		IP_VS_ERR(" can't register in hook. \n ");
		goto err_conn;
	}

	ret = nf_register_hook(&ip_vs_out_ops);
	if (ret < 0) {
		IP_VS_ERR(" can't register out hook. \n ");
		goto err_in_hook;
	}

	ret = nf_register_hook(&ip_vs_post_routing_ops);
	if (ret < 0) {
		IP_VS_ERR(" can't register post_routing hook. \n ");
		goto err_out_hook;
	}

	ret = nf_register_hook(&ip_vs_forward_icmp_ops);
	if (ret < 0) {
		IP_VS_ERR(" can't register forward_icmp hook. \n ");
		goto err_post_routing_hook;
	}

	IP_VS_INFO(" ipvs loaded. \n ");
	return ret;

	/* unwind in the reverse order of the setup above */
err_post_routing_hook:
	nf_unregister_hook(&ip_vs_post_routing_ops);
err_out_hook:
	nf_unregister_hook(&ip_vs_out_ops);
err_in_hook:
	nf_unregister_hook(&ip_vs_in_ops);
err_conn:
	ip_vs_conn_cleanup();
err_app:
	ip_vs_app_cleanup();
err_protocol:
	ip_vs_protocol_cleanup();
	ip_vs_control_cleanup();
err_none:
	return ret;
}
/*
 *	Module exit: unregister the netfilter hooks, then release the
 *	connection table, application helpers, protocols and control,
 *	i.e. the reverse of the order used in ip_vs_init.
 */
static void __exit ip_vs_cleanup(void)
{
	/* netfilter hooks */
	nf_unregister_hook(&ip_vs_forward_icmp_ops);
	nf_unregister_hook(&ip_vs_post_routing_ops);
	nf_unregister_hook(&ip_vs_out_ops);
	nf_unregister_hook(&ip_vs_in_ops);

	/* core subsystems */
	ip_vs_conn_cleanup();
	ip_vs_app_cleanup();
	ip_vs_protocol_cleanup();
	ip_vs_control_cleanup();

	IP_VS_INFO(" ipvs unloaded. \n ");
}
/* Register ip_vs_init / ip_vs_cleanup as the module entry and exit points. */
module_init ( ip_vs_init ) ;
module_exit ( ip_vs_cleanup ) ;
MODULE_LICENSE ( " GPL " ) ;