2005-04-16 15:20:36 -07:00
/*
* INET An implementation of the TCP / IP protocol suite for the LINUX
* operating system . INET is implemented using the BSD Socket
* interface as the means of communication with the user level .
*
* The Internet Protocol ( IP ) output module .
*
* Version : $ Id : ip_output . c , v 1.100 2002 / 02 / 01 22 : 01 : 03 davem Exp $
*
2005-05-05 16:16:16 -07:00
* Authors : Ross Biro
2005-04-16 15:20:36 -07:00
* Fred N . van Kempen , < waltje @ uWalt . NL . Mugnet . ORG >
* Donald Becker , < becker @ super . org >
* Alan Cox , < Alan . Cox @ linux . org >
* Richard Underwood
* Stefan Becker , < stefanb @ yello . ping . de >
* Jorge Cwik , < jorge @ laser . satlink . net >
* Arnt Gulbrandsen , < agulbra @ nvg . unit . no >
* Hirokazu Takahashi , < taka @ valinux . co . jp >
*
* See ip_input . c for original log
*
* Fixes :
* Alan Cox : Missing nonblock feature in ip_build_xmit .
* Mike Kilburn : htons ( ) missing in ip_build_xmit .
* Bradford Johnson : Fix faulty handling of some frames when
* no route is found .
* Alexander Demenshin : Missing sk / skb free in ip_queue_xmit
* ( in case if packet not accepted by
* output firewall rules )
* Mike McLagan : Routing by source
* Alexey Kuznetsov : use new route cache
* Andi Kleen : Fix broken PMTU recovery and remove
* some redundant tests .
* Vitaly E . Lavrov : Transparent proxy revived after year coma .
* Andi Kleen : Replace ip_reply with ip_send_reply .
* Andi Kleen : Split fast and slow ip_build_xmit path
* for decreased register pressure on x86
* and more readability .
* Marc Boucher : When call_out_firewall returns FW_QUEUE ,
* silently drop skb instead of failing with - EPERM .
* Detlev Wengorz : Copy protocol for fragments .
* Hirokazu Takahashi : HW checksumming for outgoing UDP
* datagrams .
* Hirokazu Takahashi : sendfile ( ) on UDP works now .
*/
# include <asm/uaccess.h>
# include <asm/system.h>
# include <linux/module.h>
# include <linux/types.h>
# include <linux/kernel.h>
# include <linux/sched.h>
# include <linux/mm.h>
# include <linux/string.h>
# include <linux/errno.h>
# include <linux/config.h>
# include <linux/socket.h>
# include <linux/sockios.h>
# include <linux/in.h>
# include <linux/inet.h>
# include <linux/netdevice.h>
# include <linux/etherdevice.h>
# include <linux/proc_fs.h>
# include <linux/stat.h>
# include <linux/init.h>
# include <net/snmp.h>
# include <net/ip.h>
# include <net/protocol.h>
# include <net/route.h>
# include <net/tcp.h>
# include <net/udp.h>
# include <linux/skbuff.h>
# include <net/sock.h>
# include <net/arp.h>
# include <net/icmp.h>
# include <net/raw.h>
# include <net/checksum.h>
# include <net/inetpeer.h>
# include <net/checksum.h>
# include <linux/igmp.h>
# include <linux/netfilter_ipv4.h>
# include <linux/netfilter_bridge.h>
# include <linux/mroute.h>
# include <linux/netlink.h>
/*
* Shall we try to damage output packets if routing dev changes ?
* NOTE(review): sysctl_ip_dynaddr is not read anywhere in the visible part
* of this file; presumably it is consumed elsewhere -- confirm.
*/
int sysctl_ip_dynaddr ;
/* Default TTL value, initialised from IPDEFTTL. */
int sysctl_ip_default_ttl = IPDEFTTL ;
/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check ( struct iphdr * iph )
{
iph - > check = 0 ;
iph - > check = ip_fast_csum ( ( unsigned char * ) iph , iph - > ihl ) ;
}
/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit ( struct sk_buff * newskb )
{
newskb - > mac . raw = newskb - > data ;
__skb_pull ( newskb , newskb - > nh . raw - newskb - > data ) ;
newskb - > pkt_type = PACKET_LOOPBACK ;
newskb - > ip_summed = CHECKSUM_UNNECESSARY ;
BUG_TRAP ( newskb - > dst ) ;
# ifdef CONFIG_NETFILTER_DEBUG
nf_debug_ip_loopback_xmit ( newskb ) ;
# endif
2005-05-03 16:21:37 -07:00
nf_reset ( newskb ) ;
2005-04-16 15:20:36 -07:00
netif_rx ( newskb ) ;
return 0 ;
}
static inline int ip_select_ttl ( struct inet_sock * inet , struct dst_entry * dst )
{
int ttl = inet - > uc_ttl ;
if ( ttl < 0 )
ttl = dst_metric ( dst , RTAX_HOPLIMIT ) ;
return ttl ;
}
/*
 * Prepend an IPv4 header (and options, if supplied) to skb and pass the
 * packet to the NF_IP_LOCAL_OUT hook.
 *
 * The route is taken from skb->dst and the header addresses come from
 * that route (rt_src / rt_dst).  The saddr argument is not consulted
 * here; daddr is only forwarded to ip_options_build().
 *
 * Returns whatever NF_HOOK() returns.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  u32 saddr, u32 daddr, struct ip_options *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = (struct rtable *)skb->dst;
	struct dst_entry *dst = &rt->u.dst;
	unsigned int hdr_len = sizeof(struct iphdr);
	struct iphdr *iph;

	/* Make room for the fixed header plus any option bytes. */
	if (opt)
		hdr_len += opt->optlen;
	iph = (struct iphdr *)skb_push(skb, hdr_len);

	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	iph->frag_off = ip_dont_fragment(sk, dst) ? htons(IP_DF) : 0;
	iph->ttl      = ip_select_ttl(inet, dst);
	iph->daddr    = rt->rt_dst;
	iph->saddr    = rt->rt_src;
	iph->protocol = sk->sk_protocol;
	iph->tot_len  = htons(skb->len);
	ip_select_ident(iph, dst, sk);
	skb->nh.iph   = iph;

	/* Options extend the header length, counted in 32-bit words. */
	if (opt && opt->optlen) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, daddr, rt, 0);
	}
	ip_send_check(iph);

	skb->priority = sk->sk_priority;

	/* Send it out. */
	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, dst->dev,
		       dst_output);
}
static inline int ip_finish_output2 ( struct sk_buff * skb )
{
struct dst_entry * dst = skb - > dst ;
struct hh_cache * hh = dst - > hh ;
struct net_device * dev = dst - > dev ;
int hh_len = LL_RESERVED_SPACE ( dev ) ;
/* Be paranoid, rather than too clever. */
if ( unlikely ( skb_headroom ( skb ) < hh_len & & dev - > hard_header ) ) {
struct sk_buff * skb2 ;
skb2 = skb_realloc_headroom ( skb , LL_RESERVED_SPACE ( dev ) ) ;
if ( skb2 = = NULL ) {
kfree_skb ( skb ) ;
return - ENOMEM ;
}
if ( skb - > sk )
skb_set_owner_w ( skb2 , skb - > sk ) ;
kfree_skb ( skb ) ;
skb = skb2 ;
}
# ifdef CONFIG_NETFILTER_DEBUG
nf_debug_ip_finish_output2 ( skb ) ;
# endif /*CONFIG_NETFILTER_DEBUG*/
2005-04-25 12:01:07 -07:00
nf_reset ( skb ) ;
2005-04-16 15:20:36 -07:00
if ( hh ) {
int hh_alen ;
read_lock_bh ( & hh - > hh_lock ) ;
hh_alen = HH_DATA_ALIGN ( hh - > hh_len ) ;
memcpy ( skb - > data - hh_alen , hh - > hh_data , hh_alen ) ;
read_unlock_bh ( & hh - > hh_lock ) ;
skb_push ( skb , hh - > hh_len ) ;
return hh - > hh_output ( skb ) ;
} else if ( dst - > neighbour )
return dst - > neighbour - > output ( skb ) ;
if ( net_ratelimit ( ) )
printk ( KERN_DEBUG " ip_finish_output2: No header cache and no neighbour! \n " ) ;
kfree_skb ( skb ) ;
return - EINVAL ;
}
int ip_finish_output ( struct sk_buff * skb )
{
struct net_device * dev = skb - > dst - > dev ;
skb - > dev = dev ;
skb - > protocol = htons ( ETH_P_IP ) ;
return NF_HOOK ( PF_INET , NF_IP_POST_ROUTING , skb , NULL , dev ,
ip_finish_output2 ) ;
}
int ip_mc_output ( struct sk_buff * skb )
{
struct sock * sk = skb - > sk ;
struct rtable * rt = ( struct rtable * ) skb - > dst ;
struct net_device * dev = rt - > u . dst . dev ;
/*
* If the indicated interface is up and running , send the packet .
*/
IP_INC_STATS ( IPSTATS_MIB_OUTREQUESTS ) ;
skb - > dev = dev ;
skb - > protocol = htons ( ETH_P_IP ) ;
/*
* Multicasts are looped back for other local users
*/
if ( rt - > rt_flags & RTCF_MULTICAST ) {
if ( ( ! sk | | inet_sk ( sk ) - > mc_loop )
# ifdef CONFIG_IP_MROUTE
/* Small optimization: do not loopback not local frames,
which returned after forwarding ; they will be dropped
by ip_mr_input in any case .
Note , that local frames are looped back to be delivered
to local recipients .
This check is duplicated in ip_mr_input at the moment .
*/
& & ( ( rt - > rt_flags & RTCF_LOCAL ) | | ! ( IPCB ( skb ) - > flags & IPSKB_FORWARDED ) )
# endif
) {
struct sk_buff * newskb = skb_clone ( skb , GFP_ATOMIC ) ;
if ( newskb )
NF_HOOK ( PF_INET , NF_IP_POST_ROUTING , newskb , NULL ,
newskb - > dev ,
ip_dev_loopback_xmit ) ;
}
/* Multicasts with ttl 0 must not go beyond the host */
if ( skb - > nh . iph - > ttl = = 0 ) {
kfree_skb ( skb ) ;
return 0 ;
}
}
if ( rt - > rt_flags & RTCF_BROADCAST ) {
struct sk_buff * newskb = skb_clone ( skb , GFP_ATOMIC ) ;
if ( newskb )
NF_HOOK ( PF_INET , NF_IP_POST_ROUTING , newskb , NULL ,
newskb - > dev , ip_dev_loopback_xmit ) ;
}
if ( skb - > len > dst_mtu ( & rt - > u . dst ) )
return ip_fragment ( skb , ip_finish_output ) ;
else
return ip_finish_output ( skb ) ;
}
int ip_output ( struct sk_buff * skb )
{
IP_INC_STATS ( IPSTATS_MIB_OUTREQUESTS ) ;
if ( skb - > len > dst_mtu ( skb - > dst ) & & ! skb_shinfo ( skb ) - > tso_size )
return ip_fragment ( skb , ip_finish_output ) ;
else
return ip_finish_output ( skb ) ;
}
int ip_queue_xmit ( struct sk_buff * skb , int ipfragok )
{
struct sock * sk = skb - > sk ;
struct inet_sock * inet = inet_sk ( sk ) ;
struct ip_options * opt = inet - > opt ;
struct rtable * rt ;
struct iphdr * iph ;
/* Skip all of this if the packet is already routed,
* f . e . by something like SCTP .
*/
rt = ( struct rtable * ) skb - > dst ;
if ( rt ! = NULL )
goto packet_routed ;
/* Make sure we can route this packet. */
rt = ( struct rtable * ) __sk_dst_check ( sk , 0 ) ;
if ( rt = = NULL ) {
u32 daddr ;
/* Use correct destination address if we have options. */
daddr = inet - > daddr ;
if ( opt & & opt - > srr )
daddr = opt - > faddr ;
{
struct flowi fl = { . oif = sk - > sk_bound_dev_if ,
. nl_u = { . ip4_u =
{ . daddr = daddr ,
. saddr = inet - > saddr ,
. tos = RT_CONN_FLAGS ( sk ) } } ,
. proto = sk - > sk_protocol ,
. uli_u = { . ports =
{ . sport = inet - > sport ,
. dport = inet - > dport } } } ;
/* If this fails, retransmit mechanism of transport layer will
* keep trying until route appears or the connection times
* itself out .
*/
if ( ip_route_output_flow ( & rt , & fl , sk , 0 ) )
goto no_route ;
}
__sk_dst_set ( sk , & rt - > u . dst ) ;
tcp_v4_setup_caps ( sk , & rt - > u . dst ) ;
}
skb - > dst = dst_clone ( & rt - > u . dst ) ;
packet_routed :
if ( opt & & opt - > is_strictroute & & rt - > rt_dst ! = rt - > rt_gateway )
goto no_route ;
/* OK, we know where to send it, allocate and build IP header. */
iph = ( struct iphdr * ) skb_push ( skb , sizeof ( struct iphdr ) + ( opt ? opt - > optlen : 0 ) ) ;
* ( ( __u16 * ) iph ) = htons ( ( 4 < < 12 ) | ( 5 < < 8 ) | ( inet - > tos & 0xff ) ) ;
iph - > tot_len = htons ( skb - > len ) ;
if ( ip_dont_fragment ( sk , & rt - > u . dst ) & & ! ipfragok )
iph - > frag_off = htons ( IP_DF ) ;
else
iph - > frag_off = 0 ;
iph - > ttl = ip_select_ttl ( inet , & rt - > u . dst ) ;
iph - > protocol = sk - > sk_protocol ;
iph - > saddr = rt - > rt_src ;
iph - > daddr = rt - > rt_dst ;
skb - > nh . iph = iph ;
/* Transport layer set skb->h.foo itself. */
if ( opt & & opt - > optlen ) {
iph - > ihl + = opt - > optlen > > 2 ;
ip_options_build ( skb , opt , inet - > daddr , rt , 0 ) ;
}
ip_select_ident_more ( iph , & rt - > u . dst , sk , skb_shinfo ( skb ) - > tso_segs ) ;
/* Add an IP checksum. */
ip_send_check ( iph ) ;
skb - > priority = sk - > sk_priority ;
return NF_HOOK ( PF_INET , NF_IP_LOCAL_OUT , skb , NULL , rt - > u . dst . dev ,
dst_output ) ;
no_route :
IP_INC_STATS ( IPSTATS_MIB_OUTNOROUTES ) ;
kfree_skb ( skb ) ;
return - EHOSTUNREACH ;
}
static void ip_copy_metadata ( struct sk_buff * to , struct sk_buff * from )
{
to - > pkt_type = from - > pkt_type ;
to - > priority = from - > priority ;
to - > protocol = from - > protocol ;
to - > security = from - > security ;
dst_release ( to - > dst ) ;
to - > dst = dst_clone ( from - > dst ) ;
to - > dev = from - > dev ;
/* Copy the flags to each fragment. */
IPCB ( to ) - > flags = IPCB ( from ) - > flags ;
# ifdef CONFIG_NET_SCHED
to - > tc_index = from - > tc_index ;
# endif
# ifdef CONFIG_NETFILTER
to - > nfmark = from - > nfmark ;
to - > nfcache = from - > nfcache ;
/* Connection association is same as pre-frag packet */
nf_conntrack_put ( to - > nfct ) ;
to - > nfct = from - > nfct ;
nf_conntrack_get ( to - > nfct ) ;
to - > nfctinfo = from - > nfctinfo ;
# ifdef CONFIG_BRIDGE_NETFILTER
nf_bridge_put ( to - > nf_bridge ) ;
to - > nf_bridge = from - > nf_bridge ;
nf_bridge_get ( to - > nf_bridge ) ;
# endif
# ifdef CONFIG_NETFILTER_DEBUG
to - > nf_debug = from - > nf_debug ;
# endif
# endif
}
/*
 * This IP datagram is too large to be sent in one piece. Break it up into
 * smaller pieces (each of size equal to IP header plus a block of the data
 * of the original IP data part) that will each fit in a single device
 * frame, and hand every piece to output().
 *
 * skb:    the datagram to fragment. On every path it is either passed to
 *         output() or released with kfree_skb().
 * output: called once per fragment; a non-zero return stops the
 *         fragmentation and is returned to the caller.
 *
 * Returns 0 on success, -EMSGSIZE when DF is set and skb->local_df is
 * clear (an ICMP "fragmentation needed" is sent first), -ENOMEM when a
 * fragment buffer cannot be allocated, or the error returned by output().
 *
 * Two strategies are used: a fast path that reuses the buffers already
 * chained on frag_list when their geometry is suitable, and a slow path
 * that copies the payload into freshly allocated buffers.
 */
int ip_fragment ( struct sk_buff * skb , int ( * output ) ( struct sk_buff * ) )
{
struct iphdr * iph ;
/* raw is never changed from 0; it only serves as the base for ptr below. */
int raw = 0 ;
int ptr ;
struct net_device * dev ;
struct sk_buff * skb2 ;
unsigned int mtu , hlen , left , len , ll_rs ;
int offset ;
int not_last_frag ;
struct rtable * rt = ( struct rtable * ) skb - > dst ;
int err = 0 ;
/* NOTE(review): dev is assigned here but not read again in this function. */
dev = rt - > u . dst . dev ;
/*
* Point into the IP datagram header .
*/
iph = skb - > nh . iph ;
/* DF set and local fragmentation not permitted: report the MTU via ICMP and drop. */
if ( unlikely ( ( iph - > frag_off & htons ( IP_DF ) ) & & ! skb - > local_df ) ) {
icmp_send ( skb , ICMP_DEST_UNREACH , ICMP_FRAG_NEEDED ,
htonl ( dst_mtu ( & rt - > u . dst ) ) ) ;
kfree_skb ( skb ) ;
return - EMSGSIZE ;
}
/*
* Setup starting values .
*/
hlen = iph - > ihl * 4 ;
mtu = dst_mtu ( & rt - > u . dst ) - hlen ; /* Size of data space */
/* When frag_list is given, use it. First, check its validity:
* some transformers could create wrong frag_list or break existing
* one , it is not prohibited . In this case fall back to copying .
*
* LATER : this step can be merged to real generation of fragments ,
* we can switch to copy when see the first bad fragment .
*/
if ( skb_shinfo ( skb ) - > frag_list ) {
struct sk_buff * frag ;
int first_len = skb_pagelen ( skb ) ;
/* The head must fit the MTU, carry a multiple of 8 payload bytes,
 * not already be a fragment, and not be cloned. */
if ( first_len - hlen > mtu | |
( ( first_len - hlen ) & 7 ) | |
( iph - > frag_off & htons ( IP_MF | IP_OFFSET ) ) | |
skb_cloned ( skb ) )
goto slow_path ;
for ( frag = skb_shinfo ( skb ) - > frag_list ; frag ; frag = frag - > next ) {
/* Correct geometry. */
if ( frag - > len > mtu | |
( ( frag - > len & 7 ) & & frag - > next ) | |
skb_headroom ( frag ) < hlen )
goto slow_path ;
/* Partially cloned skb? */
if ( skb_shared ( frag ) )
goto slow_path ;
2005-05-18 22:52:33 -07:00
BUG_ON ( frag - > sk ) ;
/* Attach the fragment to the owning socket and move its truesize
 * out of the head skb. Note this happens while later fragments
 * are still being validated. */
if ( skb - > sk ) {
sock_hold ( skb - > sk ) ;
frag - > sk = skb - > sk ;
frag - > destructor = sock_wfree ;
skb - > truesize - = frag - > truesize ;
}
2005-04-16 15:20:36 -07:00
}
/* Everything is OK. Generate! */
err = 0 ;
offset = 0 ;
frag = skb_shinfo ( skb ) - > frag_list ;
skb_shinfo ( skb ) - > frag_list = NULL ;
/* Detach the list: the head now describes only the first fragment. */
skb - > data_len = first_len - skb_headlen ( skb ) ;
skb - > len = first_len ;
iph - > tot_len = htons ( first_len ) ;
iph - > frag_off = htons ( IP_MF ) ;
ip_send_check ( iph ) ;
for ( ; ; ) {
/* Prepare header of the next frame,
* before previous one went down . */
if ( frag ) {
frag - > ip_summed = CHECKSUM_NONE ;
frag - > h . raw = frag - > data ;
frag - > nh . raw = __skb_push ( frag , hlen ) ;
memcpy ( frag - > nh . raw , iph , hlen ) ;
iph = frag - > nh . iph ;
iph - > tot_len = htons ( frag - > len ) ;
ip_copy_metadata ( frag , skb ) ;
/* Options are adjusted once, on the first follow-up fragment; later
 * fragments copy the already adjusted header. */
if ( offset = = 0 )
ip_options_fragment ( frag ) ;
/* skb is the fragment about to be sent; advance by its payload length. */
offset + = skb - > len - hlen ;
iph - > frag_off = htons ( offset > > 3 ) ;
if ( frag - > next ! = NULL )
iph - > frag_off | = htons ( IP_MF ) ;
/* Ready, complete checksum */
ip_send_check ( iph ) ;
}
err = output ( skb ) ;
if ( err | | ! frag )
break ;
skb = frag ;
frag = skb - > next ;
skb - > next = NULL ;
}
if ( err = = 0 ) {
IP_INC_STATS ( IPSTATS_MIB_FRAGOKS ) ;
return 0 ;
}
/* output() failed: release the fragments that were not handed over. */
while ( frag ) {
skb = frag - > next ;
kfree_skb ( frag ) ;
frag = skb ;
}
IP_INC_STATS ( IPSTATS_MIB_FRAGFAILS ) ;
return err ;
}
slow_path :
left = skb - > len - hlen ; /* Space per frame */
ptr = raw + hlen ; /* Where to start from */
# ifdef CONFIG_BRIDGE_NETFILTER
/* for bridged IP traffic encapsulated inside f.e. a vlan header,
* we need to make room for the encapsulating header */
ll_rs = LL_RESERVED_SPACE_EXTRA ( rt - > u . dst . dev , nf_bridge_pad ( skb ) ) ;
mtu - = nf_bridge_pad ( skb ) ;
# else
ll_rs = LL_RESERVED_SPACE ( rt - > u . dst . dev ) ;
# endif
/*
* Fragment the datagram .
*/
/* Start from the offset and MF state the datagram already carries, so
 * that fragmenting a fragment keeps consistent offsets. */
offset = ( ntohs ( iph - > frag_off ) & IP_OFFSET ) < < 3 ;
not_last_frag = iph - > frag_off & htons ( IP_MF ) ;
/*
* Keep copying data until we run out .
*/
while ( left > 0 ) {
len = left ;
/* IF: it doesn't fit, use 'mtu' - the data space left */
if ( len > mtu )
len = mtu ;
/* IF: we are not sending upto and including the packet end
then align the next start on an eight byte boundary */
if ( len < left ) {
len & = ~ 7 ;
}
/*
* Allocate buffer .
*/
if ( ( skb2 = alloc_skb ( len + hlen + ll_rs , GFP_ATOMIC ) ) = = NULL ) {
NETDEBUG ( printk ( KERN_INFO " IP: frag: no memory for new fragment! \n " ) ) ;
err = - ENOMEM ;
goto fail ;
}
/*
* Set up data on packet
*/
ip_copy_metadata ( skb2 , skb ) ;
skb_reserve ( skb2 , ll_rs ) ;
skb_put ( skb2 , len + hlen ) ;
skb2 - > nh . raw = skb2 - > data ;
skb2 - > h . raw = skb2 - > data + hlen ;
/*
* Charge the memory for the fragment to any owner
* it might possess
*/
if ( skb - > sk )
skb_set_owner_w ( skb2 , skb - > sk ) ;
/*
* Copy the packet header into the new buffer .
*/
memcpy ( skb2 - > nh . raw , skb - > data , hlen ) ;
/*
* Copy a block of the IP datagram .
*/
if ( skb_copy_bits ( skb , ptr , skb2 - > h . raw , len ) )
BUG ( ) ;
left - = len ;
/*
* Fill in the new header fields .
*/
iph = skb2 - > nh . iph ;
iph - > frag_off = htons ( ( offset > > 3 ) ) ;
/* ANK: dirty, but effective trick. Upgrade options only if
* the segment to be fragmented was THE FIRST ( otherwise ,
* options are already fixed ) and make it ONCE
* on the initial skb , so that all the following fragments
* will inherit fixed options .
*/
if ( offset = = 0 )
ip_options_fragment ( skb ) ;
/*
* Added AC : If we are fragmenting a fragment that ' s not the
* last fragment then keep MF on each bit
*/
if ( left > 0 | | not_last_frag )
iph - > frag_off | = htons ( IP_MF ) ;
ptr + = len ;
offset + = len ;
/*
* Put this fragment into the sending queue .
*/
IP_INC_STATS ( IPSTATS_MIB_FRAGCREATES ) ;
iph - > tot_len = htons ( len + hlen ) ;
ip_send_check ( iph ) ;
err = output ( skb2 ) ;
if ( err )
goto fail ;
}
/* All payload has been copied out; the original is no longer needed. */
kfree_skb ( skb ) ;
IP_INC_STATS ( IPSTATS_MIB_FRAGOKS ) ;
return err ;
fail :
kfree_skb ( skb ) ;
IP_INC_STATS ( IPSTATS_MIB_FRAGFAILS ) ;
return err ;
}
int
ip_generic_getfrag ( void * from , char * to , int offset , int len , int odd , struct sk_buff * skb )
{
struct iovec * iov = from ;
if ( skb - > ip_summed = = CHECKSUM_HW ) {
if ( memcpy_fromiovecend ( to , iov , offset , len ) < 0 )
return - EFAULT ;
} else {
unsigned int csum = 0 ;
if ( csum_partial_copy_fromiovecend ( to , iov , offset , len , & csum ) < 0 )
return - EFAULT ;
skb - > csum = csum_block_add ( skb - > csum , csum , odd ) ;
}
return 0 ;
}
/*
 * Checksum 'copy' bytes of 'page' starting at 'offset'.  The page is
 * mapped with kmap() for the duration of the computation.
 */
static inline unsigned int
csum_page(struct page *page, int offset, int copy)
{
	char *mapped = kmap(page);
	unsigned int sum = csum_partial(mapped + offset, copy, 0);

	kunmap(page);
	return sum;
}
/*
 * ip_append_data() and ip_append_page() can make one large IP datagram
 * from many pieces of data. Each piece is held on the socket's write
 * queue until ip_push_pending_frames() is called. Each piece can be a
 * page or non-page data.
 *
 * Not only UDP, other transport protocols - e.g. raw sockets - can use
 * this interface potentially.
 *
 * sk:          socket whose sk_write_queue collects the pieces.
 * getfrag:     callback that copies 'len' bytes at 'offset' of 'from' into
 *              'to'; a negative return aborts with -EFAULT.
 * from:        opaque cookie handed to getfrag().
 * length:      number of bytes to append (grown by the route's header_len
 *              on the call that starts a datagram).
 * transhdrlen: transport header length; only meaningful on the call that
 *              starts a datagram, it is forced to 0 on continuations.
 * ipc:         supplies IP options and address when a datagram is started.
 * rt:          route for a new datagram; continuations use inet->cork.rt.
 * flags:       MSG_PROBE returns 0 at once; MSG_MORE and MSG_DONTWAIT
 *              influence buffer allocation.
 *
 * Returns 0 on success, or a negative error: -ENOBUFS, -EMSGSIZE,
 * -EFAULT, -ENOMEM, or the error reported by sock_alloc_send_skb().
 *
 * LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data ( struct sock * sk ,
int getfrag ( void * from , char * to , int offset , int len ,
int odd , struct sk_buff * skb ) ,
void * from , int length , int transhdrlen ,
struct ipcm_cookie * ipc , struct rtable * rt ,
unsigned int flags )
{
struct inet_sock * inet = inet_sk ( sk ) ;
struct sk_buff * skb ;
struct ip_options * opt = NULL ;
int hh_len ;
int exthdrlen ;
int mtu ;
int copy ;
int err ;
int offset = 0 ;
unsigned int maxfraglen , fragheaderlen ;
int csummode = CHECKSUM_NONE ;
/* Probe only: nothing is queued. */
if ( flags & MSG_PROBE )
return 0 ;
/* Empty queue: this call starts a new datagram, so record options,
 * route and MTU in inet->cork for the calls that follow. */
if ( skb_queue_empty ( & sk - > sk_write_queue ) ) {
/*
* setup for corking .
*/
opt = ipc - > opt ;
if ( opt ) {
if ( inet - > cork . opt = = NULL ) {
/* Room for the options structure plus up to 40 bytes of option data. */
inet - > cork . opt = kmalloc ( sizeof ( struct ip_options ) + 40 , sk - > sk_allocation ) ;
if ( unlikely ( inet - > cork . opt = = NULL ) )
return - ENOBUFS ;
}
memcpy ( inet - > cork . opt , opt , sizeof ( struct ip_options ) + opt - > optlen ) ;
inet - > cork . flags | = IPCORK_OPT ;
inet - > cork . addr = ipc - > addr ;
}
/* The cork keeps its own reference on the route. */
dst_hold ( & rt - > u . dst ) ;
inet - > cork . fragsize = mtu = dst_mtu ( rt - > u . dst . path ) ;
inet - > cork . rt = rt ;
inet - > cork . length = 0 ;
sk - > sk_sndmsg_page = NULL ;
sk - > sk_sndmsg_off = 0 ;
if ( ( exthdrlen = rt - > u . dst . header_len ) ! = 0 ) {
length + = exthdrlen ;
transhdrlen + = exthdrlen ;
}
} else {
/* Continuation of a corked datagram: reuse the saved state. */
rt = inet - > cork . rt ;
if ( inet - > cork . flags & IPCORK_OPT )
opt = inet - > cork . opt ;
transhdrlen = 0 ;
exthdrlen = 0 ;
mtu = inet - > cork . fragsize ;
}
hh_len = LL_RESERVED_SPACE ( rt - > u . dst . dev ) ;
fragheaderlen = sizeof ( struct iphdr ) + ( opt ? opt - > optlen : 0 ) ;
/* Largest fragment length whose payload part is a multiple of 8 bytes. */
maxfraglen = ( ( mtu - fragheaderlen ) & ~ 7 ) + fragheaderlen ;
/* Header plus accumulated payload must stay within 0xFFFF bytes. */
if ( inet - > cork . length + length > 0xFFFF - fragheaderlen ) {
ip_local_error ( sk , EMSGSIZE , rt - > rt_dst , inet - > dport , mtu - exthdrlen ) ;
return - EMSGSIZE ;
}
/*
* transhdrlen > 0 means that this is the first fragment and we wish
* it won ' t be fragmented in the future .
*/
if ( transhdrlen & &
length + fragheaderlen < = mtu & &
rt - > u . dst . dev - > features & ( NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM ) & &
! exthdrlen )
csummode = CHECKSUM_HW ;
inet - > cork . length + = length ;
/* So, what's going on in the loop below?
*
* We use calculated fragment length to generate chained skb ,
* each of segments is IP fragment ready for sending to network after
* adding appropriate IP header .
*/
/* No buffer queued yet: jump straight to the allocation code inside the loop. */
if ( ( skb = skb_peek_tail ( & sk - > sk_write_queue ) ) = = NULL )
goto alloc_new_skb ;
while ( length > 0 ) {
/* Check if the remaining data fits into current packet. */
copy = mtu - skb - > len ;
if ( copy < length )
copy = maxfraglen - skb - > len ;
if ( copy < = 0 ) {
char * data ;
unsigned int datalen ;
unsigned int fraglen ;
unsigned int fraggap ;
unsigned int alloclen ;
struct sk_buff * skb_prev ;
alloc_new_skb :
skb_prev = skb ;
/* fraggap: bytes of the previous buffer beyond maxfraglen, which are
 * moved into the new buffer below. */
if ( skb_prev )
fraggap = skb_prev - > len - maxfraglen ;
else
fraggap = 0 ;
/*
* If remaining data exceeds the mtu ,
* we know we need more fragment ( s ) .
*/
datalen = length + fraggap ;
if ( datalen > mtu - fragheaderlen )
datalen = maxfraglen - fragheaderlen ;
fraglen = datalen + fragheaderlen ;
if ( ( flags & MSG_MORE ) & &
! ( rt - > u . dst . dev - > features & NETIF_F_SG ) )
alloclen = mtu ;
else
alloclen = datalen + fragheaderlen ;
/* The last fragment gets additional space at tail.
* Note , with MSG_MORE we overallocate on fragments ,
* because we have no idea what fragment will be
* the last .
*/
if ( datalen = = length )
alloclen + = rt - > u . dst . trailer_len ;
/* The first buffer of a datagram uses sock_alloc_send_skb(); later ones
 * use sock_wmalloc(), and only while sk_wmem_alloc is at most twice
 * sk_sndbuf. */
if ( transhdrlen ) {
skb = sock_alloc_send_skb ( sk ,
alloclen + hh_len + 15 ,
( flags & MSG_DONTWAIT ) , & err ) ;
} else {
skb = NULL ;
if ( atomic_read ( & sk - > sk_wmem_alloc ) < =
2 * sk - > sk_sndbuf )
skb = sock_wmalloc ( sk ,
alloclen + hh_len + 15 , 1 ,
sk - > sk_allocation ) ;
if ( unlikely ( skb = = NULL ) )
err = - ENOBUFS ;
}
if ( skb = = NULL )
goto error ;
/*
* Fill in the control structures
*/
skb - > ip_summed = csummode ;
skb - > csum = 0 ;
skb_reserve ( skb , hh_len ) ;
/*
* Find where to start putting bytes .
*/
data = skb_put ( skb , fraglen ) ;
skb - > nh . raw = data + exthdrlen ;
data + = fragheaderlen ;
skb - > h . raw = data + exthdrlen ;
if ( fraggap ) {
/* Move the overhanging bytes and transfer their checksum share. */
skb - > csum = skb_copy_and_csum_bits (
skb_prev , maxfraglen ,
data + transhdrlen , fraggap , 0 ) ;
skb_prev - > csum = csum_sub ( skb_prev - > csum ,
skb - > csum ) ;
data + = fraggap ;
skb_trim ( skb_prev , maxfraglen ) ;
}
copy = datalen - transhdrlen - fraggap ;
if ( copy > 0 & & getfrag ( from , data + transhdrlen , offset , copy , fraggap , skb ) < 0 ) {
err = - EFAULT ;
kfree_skb ( skb ) ;
goto error ;
}
offset + = copy ;
length - = datalen - fraggap ;
/* Header lengths and hardware checksumming apply to the first buffer only. */
transhdrlen = 0 ;
exthdrlen = 0 ;
csummode = CHECKSUM_NONE ;
/*
* Put the packet on the pending queue .
*/
__skb_queue_tail ( & sk - > sk_write_queue , skb ) ;
continue ;
}
if ( copy > length )
copy = length ;
if ( ! ( rt - > u . dst . dev - > features & NETIF_F_SG ) ) {
/* No scatter-gather: append into the linear area of the buffer. */
unsigned int off ;
off = skb - > len ;
if ( getfrag ( from , skb_put ( skb , copy ) ,
offset , copy , off , skb ) < 0 ) {
__skb_trim ( skb , off ) ;
err = - EFAULT ;
goto error ;
}
} else {
/* Scatter-gather: append into page fragments of the buffer. */
int i = skb_shinfo ( skb ) - > nr_frags ;
/* NOTE(review): when i == 0 this is the address one element before
 * frags[]; it is only dereferenced in the page != frag->page test
 * below -- confirm this is benign. */
skb_frag_t * frag = & skb_shinfo ( skb ) - > frags [ i - 1 ] ;
struct page * page = sk - > sk_sndmsg_page ;
int off = sk - > sk_sndmsg_off ;
unsigned int left ;
if ( page & & ( left = PAGE_SIZE - off ) > 0 ) {
/* Space remains in the socket's current send page. */
if ( copy > = left )
copy = left ;
if ( page ! = frag - > page ) {
if ( i = = MAX_SKB_FRAGS ) {
err = - EMSGSIZE ;
goto error ;
}
get_page ( page ) ;
skb_fill_page_desc ( skb , i , page , sk - > sk_sndmsg_off , 0 ) ;
frag = & skb_shinfo ( skb ) - > frags [ i ] ;
}
} else if ( i < MAX_SKB_FRAGS ) {
/* Start a fresh page and account it to the buffer and the socket. */
if ( copy > PAGE_SIZE )
copy = PAGE_SIZE ;
page = alloc_pages ( sk - > sk_allocation , 0 ) ;
if ( page = = NULL ) {
err = - ENOMEM ;
goto error ;
}
sk - > sk_sndmsg_page = page ;
sk - > sk_sndmsg_off = 0 ;
skb_fill_page_desc ( skb , i , page , 0 , 0 ) ;
frag = & skb_shinfo ( skb ) - > frags [ i ] ;
skb - > truesize + = PAGE_SIZE ;
atomic_add ( PAGE_SIZE , & sk - > sk_wmem_alloc ) ;
} else {
err = - EMSGSIZE ;
goto error ;
}
if ( getfrag ( from , page_address ( frag - > page ) + frag - > page_offset + frag - > size , offset , copy , skb - > len , skb ) < 0 ) {
err = - EFAULT ;
goto error ;
}
sk - > sk_sndmsg_off + = copy ;
frag - > size + = copy ;
skb - > len + = copy ;
skb - > data_len + = copy ;
}
offset + = copy ;
length - = copy ;
}
return 0 ;
error :
/* Take back the bytes that were counted in cork.length but not queued. */
inet - > cork . length - = length ;
IP_INC_STATS ( IPSTATS_MIB_OUTDISCARDS ) ;
return err ;
}
/*
 * Append 'size' bytes of 'page', starting at 'offset', to the datagram
 * that is already pending on the socket's write queue. The page is
 * referenced as a buffer fragment rather than copied.
 *
 * Returns 0 on success (and immediately for MSG_PROBE), or:
 *   -EPERM       the socket has hdrincl set,
 *   -EINVAL      nothing is pending on the write queue,
 *   -EOPNOTSUPP  the output device lacks NETIF_F_SG,
 *   -EMSGSIZE    the datagram would exceed 0xFFFF bytes, or the buffer
 *                has no free fragment slot,
 *   -ENOBUFS     a new buffer could not be allocated.
 */
ssize_t ip_append_page ( struct sock * sk , struct page * page ,
int offset , size_t size , int flags )
{
struct inet_sock * inet = inet_sk ( sk ) ;
struct sk_buff * skb ;
struct rtable * rt ;
struct ip_options * opt = NULL ;
int hh_len ;
int mtu ;
int len ;
int err ;
unsigned int maxfraglen , fragheaderlen , fraggap ;
if ( inet - > hdrincl )
return - EPERM ;
if ( flags & MSG_PROBE )
return 0 ;
/* Pages can only extend a datagram that was already started. */
if ( skb_queue_empty ( & sk - > sk_write_queue ) )
return - EINVAL ;
rt = inet - > cork . rt ;
if ( inet - > cork . flags & IPCORK_OPT )
opt = inet - > cork . opt ;
if ( ! ( rt - > u . dst . dev - > features & NETIF_F_SG ) )
return - EOPNOTSUPP ;
hh_len = LL_RESERVED_SPACE ( rt - > u . dst . dev ) ;
mtu = inet - > cork . fragsize ;
fragheaderlen = sizeof ( struct iphdr ) + ( opt ? opt - > optlen : 0 ) ;
/* Largest fragment length whose payload part is a multiple of 8 bytes. */
maxfraglen = ( ( mtu - fragheaderlen ) & ~ 7 ) + fragheaderlen ;
if ( inet - > cork . length + size > 0xFFFF - fragheaderlen ) {
ip_local_error ( sk , EMSGSIZE , rt - > rt_dst , inet - > dport , mtu ) ;
return - EMSGSIZE ;
}
if ( ( skb = skb_peek_tail ( & sk - > sk_write_queue ) ) = = NULL )
return - EINVAL ;
inet - > cork . length + = size ;
while ( size > 0 ) {
int i ;
/* Check if the remaining data fits into current packet. */
len = mtu - skb - > len ;
if ( len < size )
len = maxfraglen - skb - > len ;
if ( len < = 0 ) {
/* Current buffer is full: start a new one holding only headers
 * (plus any bytes overhanging maxfraglen in the previous buffer). */
struct sk_buff * skb_prev ;
char * data ;
/* NOTE(review): iph is assigned below but not read afterwards. */
struct iphdr * iph ;
int alloclen ;
skb_prev = skb ;
if ( skb_prev )
fraggap = skb_prev - > len - maxfraglen ;
else
fraggap = 0 ;
alloclen = fragheaderlen + hh_len + fraggap + 15 ;
skb = sock_wmalloc ( sk , alloclen , 1 , sk - > sk_allocation ) ;
if ( unlikely ( ! skb ) ) {
err = - ENOBUFS ;
goto error ;
}
/*
* Fill in the control structures
*/
skb - > ip_summed = CHECKSUM_NONE ;
skb - > csum = 0 ;
skb_reserve ( skb , hh_len ) ;
/*
* Find where to start putting bytes .
*/
data = skb_put ( skb , fragheaderlen + fraggap ) ;
skb - > nh . iph = iph = ( struct iphdr * ) data ;
data + = fragheaderlen ;
skb - > h . raw = data ;
if ( fraggap ) {
/* Move the overhanging bytes and transfer their checksum share. */
skb - > csum = skb_copy_and_csum_bits (
skb_prev , maxfraglen ,
data , fraggap , 0 ) ;
skb_prev - > csum = csum_sub ( skb_prev - > csum ,
skb - > csum ) ;
skb_trim ( skb_prev , maxfraglen ) ;
}
/*
* Put the packet on the pending queue .
*/
__skb_queue_tail ( & sk - > sk_write_queue , skb ) ;
continue ;
}
i = skb_shinfo ( skb ) - > nr_frags ;
if ( len > size )
len = size ;
/* Extend the last fragment when possible, otherwise take a new slot
 * and a reference on the page. */
if ( skb_can_coalesce ( skb , i , page , offset ) ) {
skb_shinfo ( skb ) - > frags [ i - 1 ] . size + = len ;
} else if ( i < MAX_SKB_FRAGS ) {
get_page ( page ) ;
skb_fill_page_desc ( skb , i , page , offset , len ) ;
} else {
err = - EMSGSIZE ;
goto error ;
}
if ( skb - > ip_summed = = CHECKSUM_NONE ) {
/* Software checksum: fold the page data in at the current length. */
unsigned int csum ;
csum = csum_page ( page , offset , len ) ;
skb - > csum = csum_block_add ( skb - > csum , csum , skb - > len ) ;
}
skb - > len + = len ;
skb - > data_len + = len ;
offset + = len ;
size - = len ;
}
return 0 ;
error :
/* Take back the bytes that were counted in cork.length but not queued. */
inet - > cork . length - = size ;
IP_INC_STATS ( IPSTATS_MIB_OUTDISCARDS ) ;
return err ;
}
/*
* Combined all pending IP fragments on the socket as one IP datagram
* and push them out .
*/
int ip_push_pending_frames ( struct sock * sk )
{
struct sk_buff * skb , * tmp_skb ;
struct sk_buff * * tail_skb ;
struct inet_sock * inet = inet_sk ( sk ) ;
struct ip_options * opt = NULL ;
struct rtable * rt = inet - > cork . rt ;
struct iphdr * iph ;
int df = 0 ;
__u8 ttl ;
int err = 0 ;
if ( ( skb = __skb_dequeue ( & sk - > sk_write_queue ) ) = = NULL )
goto out ;
tail_skb = & ( skb_shinfo ( skb ) - > frag_list ) ;
/* move skb->data to ip header from ext header */
if ( skb - > data < skb - > nh . raw )
__skb_pull ( skb , skb - > nh . raw - skb - > data ) ;
while ( ( tmp_skb = __skb_dequeue ( & sk - > sk_write_queue ) ) ! = NULL ) {
__skb_pull ( tmp_skb , skb - > h . raw - skb - > nh . raw ) ;
* tail_skb = tmp_skb ;
tail_skb = & ( tmp_skb - > next ) ;
skb - > len + = tmp_skb - > len ;
skb - > data_len + = tmp_skb - > len ;
skb - > truesize + = tmp_skb - > truesize ;
__sock_put ( tmp_skb - > sk ) ;
tmp_skb - > destructor = NULL ;
tmp_skb - > sk = NULL ;
}
/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
* to fragment the frame generated here . No matter , what transforms
* how transforms change size of the packet , it will come out .
*/
if ( inet - > pmtudisc ! = IP_PMTUDISC_DO )
skb - > local_df = 1 ;
/* DF bit is set when we want to see DF on outgoing frames.
* If local_df is set too , we still allow to fragment this frame
* locally . */
if ( inet - > pmtudisc = = IP_PMTUDISC_DO | |
( skb - > len < = dst_mtu ( & rt - > u . dst ) & &
ip_dont_fragment ( sk , & rt - > u . dst ) ) )
df = htons ( IP_DF ) ;
if ( inet - > cork . flags & IPCORK_OPT )
opt = inet - > cork . opt ;
if ( rt - > rt_type = = RTN_MULTICAST )
ttl = inet - > mc_ttl ;
else
ttl = ip_select_ttl ( inet , & rt - > u . dst ) ;
iph = ( struct iphdr * ) skb - > data ;
iph - > version = 4 ;
iph - > ihl = 5 ;
if ( opt ) {
iph - > ihl + = opt - > optlen > > 2 ;
ip_options_build ( skb , opt , inet - > cork . addr , rt , 0 ) ;
}
iph - > tos = inet - > tos ;
iph - > tot_len = htons ( skb - > len ) ;
iph - > frag_off = df ;
if ( ! df ) {
__ip_select_ident ( iph , & rt - > u . dst , 0 ) ;
} else {
iph - > id = htons ( inet - > id + + ) ;
}
iph - > ttl = ttl ;
iph - > protocol = sk - > sk_protocol ;
iph - > saddr = rt - > rt_src ;
iph - > daddr = rt - > rt_dst ;
ip_send_check ( iph ) ;
skb - > priority = sk - > sk_priority ;
skb - > dst = dst_clone ( & rt - > u . dst ) ;
/* Netfilter gets whole the not fragmented skb. */
err = NF_HOOK ( PF_INET , NF_IP_LOCAL_OUT , skb , NULL ,
skb - > dst - > dev , dst_output ) ;
if ( err ) {
if ( err > 0 )
err = inet - > recverr ? net_xmit_errno ( err ) : 0 ;
if ( err )
goto error ;
}
out :
inet - > cork . flags & = ~ IPCORK_OPT ;
if ( inet - > cork . opt ) {
kfree ( inet - > cork . opt ) ;
inet - > cork . opt = NULL ;
}
if ( inet - > cork . rt ) {
ip_rt_put ( inet - > cork . rt ) ;
inet - > cork . rt = NULL ;
}
return err ;
error :
IP_INC_STATS ( IPSTATS_MIB_OUTDISCARDS ) ;
goto out ;
}
/*
* Throw away all pending data on the socket .
*/
void ip_flush_pending_frames ( struct sock * sk )
{
struct inet_sock * inet = inet_sk ( sk ) ;
struct sk_buff * skb ;
while ( ( skb = __skb_dequeue_tail ( & sk - > sk_write_queue ) ) ! = NULL )
kfree_skb ( skb ) ;
inet - > cork . flags & = ~ IPCORK_OPT ;
if ( inet - > cork . opt ) {
kfree ( inet - > cork . opt ) ;
inet - > cork . opt = NULL ;
}
if ( inet - > cork . rt ) {
ip_rt_put ( inet - > cork . rt ) ;
inet - > cork . rt = NULL ;
}
}
/*
* Fetch data from kernel space and fill in checksum if needed .
*/
static int ip_reply_glue_bits ( void * dptr , char * to , int offset ,
int len , int odd , struct sk_buff * skb )
{
unsigned int csum ;
csum = csum_partial_copy_nocheck ( dptr + offset , to , len , 0 ) ;
skb - > csum = csum_block_add ( skb - > csum , csum , odd ) ;
return 0 ;
}
/*
* Generic function to send a packet as reply to another packet .
* Used to send TCP resets so far . ICMP should use this function too .
*
* Should run single threaded per socket because it uses the sock
* structure to pass arguments .
*
* LATER : switch from ip_build_xmit to ip_append_ *
*/
void ip_send_reply ( struct sock * sk , struct sk_buff * skb , struct ip_reply_arg * arg ,
unsigned int len )
{
struct inet_sock * inet = inet_sk ( sk ) ;
struct {
struct ip_options opt ;
char data [ 40 ] ;
} replyopts ;
struct ipcm_cookie ipc ;
u32 daddr ;
struct rtable * rt = ( struct rtable * ) skb - > dst ;
if ( ip_options_echo ( & replyopts . opt , skb ) )
return ;
daddr = ipc . addr = rt - > rt_src ;
ipc . opt = NULL ;
if ( replyopts . opt . optlen ) {
ipc . opt = & replyopts . opt ;
if ( ipc . opt - > srr )
daddr = replyopts . opt . faddr ;
}
{
struct flowi fl = { . nl_u = { . ip4_u =
{ . daddr = daddr ,
. saddr = rt - > rt_spec_dst ,
. tos = RT_TOS ( skb - > nh . iph - > tos ) } } ,
/* Not quite clean, but right. */
. uli_u = { . ports =
{ . sport = skb - > h . th - > dest ,
. dport = skb - > h . th - > source } } ,
. proto = sk - > sk_protocol } ;
if ( ip_route_output_key ( & rt , & fl ) )
return ;
}
/* And let IP do all the hard work.
This chunk is not reenterable , hence spinlock .
Note that it uses the fact , that this function is called
with locally disabled BH and that sk cannot be already spinlocked .
*/
bh_lock_sock ( sk ) ;
inet - > tos = skb - > nh . iph - > tos ;
sk - > sk_priority = skb - > priority ;
sk - > sk_protocol = skb - > nh . iph - > protocol ;
ip_append_data ( sk , ip_reply_glue_bits , arg - > iov - > iov_base , len , 0 ,
& ipc , rt , MSG_DONTWAIT ) ;
if ( ( skb = skb_peek ( & sk - > sk_write_queue ) ) ! = NULL ) {
if ( arg - > csumoffset > = 0 )
* ( ( u16 * ) skb - > h . raw + arg - > csumoffset ) = csum_fold ( csum_add ( skb - > csum , arg - > csum ) ) ;
skb - > ip_summed = CHECKSUM_NONE ;
ip_push_pending_frames ( sk ) ;
}
bh_unlock_sock ( sk ) ;
ip_rt_put ( rt ) ;
}
/*
 *	IP protocol layer initialiser: packet-type descriptor that routes
 *	frames of type ETH_P_IP to ip_rcv().
 */
static struct packet_type ip_packet_type = {
	.type = __constant_htons(ETH_P_IP),
	.func = ip_rcv,
};
/*
 *	Boot-time initialisation: register the IP packet type, then call
 *	the routing and inet-peer initialisers and, when both multicast
 *	and procfs support are configured, the IGMP proc initialiser.
 */
void __init ip_init(void)
{
	dev_add_pack(&ip_packet_type);

	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}
/* Symbols from this file that are exported for use by other kernel code. */
EXPORT_SYMBOL ( ip_finish_output ) ;
EXPORT_SYMBOL ( ip_fragment ) ;
EXPORT_SYMBOL ( ip_generic_getfrag ) ;
EXPORT_SYMBOL ( ip_queue_xmit ) ;
EXPORT_SYMBOL ( ip_send_check ) ;
/* The default-TTL variable is exported only when sysctl support is configured. */
# ifdef CONFIG_SYSCTL
EXPORT_SYMBOL ( sysctl_ip_default_ttl ) ;
# endif