2005-04-16 15:20:36 -07:00
/*
* Routines having to do with the ' struct sk_buff ' memory handlers .
*
* Authors : Alan Cox < iiitac @ pyr . swan . ac . uk >
* Florian La Roche < rzsfl @ rz . uni - sb . de >
*
* Version : $ Id : skbuff . c , v 1.90 2001 / 11 / 07 05 : 56 : 19 davem Exp $
*
* Fixes :
* Alan Cox : Fixed the worst of the load
* balancer bugs .
* Dave Platt : Interrupt stacking fix .
* Richard Kooijman : Timestamp fixes .
* Alan Cox : Changed buffer format .
* Alan Cox : destructor hook for AF_UNIX etc .
* Linus Torvalds : Better skb_clone .
* Alan Cox : Added skb_copy .
* Alan Cox : Added all the changed routines Linus
* only put in the headers
* Ray VanTassle : Fixed - - skb - > lock in free
* Alan Cox : skb_copy copy arp field
* Andi Kleen : slabified it .
* Robert Olsson : Removed skb_head_pool
*
* NOTE:
* The __skb_ routines should be called with interrupts
* disabled, or you better be *real* sure that the operation is atomic
* with respect to whatever list is being frobbed (e.g. via lock_sock()
* or via disabling bottom half handlers, etc).
*
* This program is free software ; you can redistribute it and / or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation ; either version
* 2 of the License , or ( at your option ) any later version .
*/
/*
* The functions in this file will not compile correctly with gcc 2.4 . x
*/
# include <linux/module.h>
# include <linux/types.h>
# include <linux/kernel.h>
# include <linux/mm.h>
# include <linux/interrupt.h>
# include <linux/in.h>
# include <linux/inet.h>
# include <linux/slab.h>
# include <linux/netdevice.h>
# ifdef CONFIG_NET_CLS_ACT
# include <net/pkt_sched.h>
# endif
# include <linux/string.h>
# include <linux/skbuff.h>
2007-11-06 23:30:13 -08:00
# include <linux/splice.h>
2005-04-16 15:20:36 -07:00
# include <linux/cache.h>
# include <linux/rtnetlink.h>
# include <linux/init.h>
2007-04-02 20:19:53 -07:00
# include <linux/scatterlist.h>
2005-04-16 15:20:36 -07:00
# include <net/protocol.h>
# include <net/dst.h>
# include <net/sock.h>
# include <net/checksum.h>
# include <net/xfrm.h>
# include <asm/uaccess.h>
# include <asm/system.h>
2006-10-19 16:08:53 -04:00
# include "kmap_skb.h"
2006-12-06 20:33:20 -08:00
/*
 * Slab cache for plain sk_buff heads.  __alloc_skb() draws from it when
 * its fclone argument is zero, and skb_clone() uses it when it cannot
 * take the fast-clone path.
 */
static struct kmem_cache * skbuff_head_cache __read_mostly ;
/*
 * Slab cache used by __alloc_skb() when fclone is non-zero.  The code in
 * this file treats each object as an sk_buff, a companion sk_buff
 * directly after it (skb + 1) and an atomic_t counter after that
 * (skb + 2).
 */
static struct kmem_cache * skbuff_fclone_cache __read_mostly ;
2005-04-16 15:20:36 -07:00
2007-11-06 23:30:13 -08:00
static void sock_pipe_buf_release ( struct pipe_inode_info * pipe ,
struct pipe_buffer * buf )
{
struct sk_buff * skb = ( struct sk_buff * ) buf - > private ;
kfree_skb ( skb ) ;
}
static void sock_pipe_buf_get ( struct pipe_inode_info * pipe ,
struct pipe_buffer * buf )
{
struct sk_buff * skb = ( struct sk_buff * ) buf - > private ;
skb_get ( skb ) ;
}
/*
 * ->steal callback for socket pipe buffers.  Neither argument is looked
 * at; the function unconditionally returns 1.
 * NOTE(review): presumably a non-zero return tells the pipe layer the
 * buffer cannot be stolen -- confirm against the pipe_buf_operations
 * contract.
 */
static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
			       struct pipe_buffer *buf)
{
	return 1;
}
/*
 * Pipe buffer operations for a socket.
 *
 * map, unmap and confirm point at the generic_pipe_buf_* helpers.
 * release and get are the sock_pipe_buf_* callbacks in this file, which
 * treat buf->private as a struct sk_buff pointer; steal is
 * sock_pipe_buf_steal(), which always returns 1.  can_merge is
 * explicitly zero.
 */
static struct pipe_buf_operations sock_pipe_buf_ops = {
. can_merge = 0 ,
. map = generic_pipe_buf_map ,
. unmap = generic_pipe_buf_unmap ,
. confirm = generic_pipe_buf_confirm ,
. release = sock_pipe_buf_release ,
. steal = sock_pipe_buf_steal ,
. get = sock_pipe_buf_get ,
} ;
2005-04-16 15:20:36 -07:00
/*
* Keep out-of-line to prevent kernel bloat.
* __builtin_return_address is not used because it is not always
* reliable.
*/
/**
* skb_over_panic - private function
* @skb: buffer whose state is reported
* @sz: size value, printed in the "put:" field
* @here: address, printed in the "text:" field
*
* Out of line support code for skb_put(). Not user callable.
*
* Prints one KERN_EMERG line containing @here, skb->len, @sz, skb->head,
* skb->data, skb->tail and skb->end (both cast to unsigned long) and the
* device name, then calls BUG().
*/
void skb_over_panic ( struct sk_buff * skb , int sz , void * here )
{
2005-04-21 16:43:02 -07:00
printk ( KERN_EMERG " skb_over_panic: text:%p len:%d put:%d head:%p "
2007-04-19 20:43:29 -07:00
" data:%p tail:%#lx end:%#lx dev:%s \n " ,
2007-04-19 20:29:13 -07:00
here , skb - > len , sz , skb - > head , skb - > data ,
2007-04-19 20:43:29 -07:00
( unsigned long ) skb - > tail , ( unsigned long ) skb - > end ,
2005-04-21 16:43:02 -07:00
/* skb->dev may be NULL; a placeholder string is printed in that case. */
skb - > dev ? skb - > dev - > name : " <NULL> " ) ;
2005-04-16 15:20:36 -07:00
/* Reporting is done; this function always ends in BUG(). */
BUG ( ) ;
}
/**
* skb_under_panic - private function
* @skb: buffer whose state is reported
* @sz: size value, printed in the "put:" field
* @here: address, printed in the "text:" field
*
* Out of line support code for skb_push(). Not user callable.
*
* Prints one KERN_EMERG line containing @here, skb->len, @sz, skb->head,
* skb->data, skb->tail and skb->end (both cast to unsigned long) and the
* device name, then calls BUG().
*/
void skb_under_panic ( struct sk_buff * skb , int sz , void * here )
{
2005-04-21 16:43:02 -07:00
printk ( KERN_EMERG " skb_under_panic: text:%p len:%d put:%d head:%p "
2007-04-19 20:43:29 -07:00
" data:%p tail:%#lx end:%#lx dev:%s \n " ,
2007-04-19 20:29:13 -07:00
here , skb - > len , sz , skb - > head , skb - > data ,
2007-04-19 20:43:29 -07:00
( unsigned long ) skb - > tail , ( unsigned long ) skb - > end ,
2005-04-21 16:43:02 -07:00
/* skb->dev may be NULL; a placeholder string is printed in that case. */
skb - > dev ? skb - > dev - > name : " <NULL> " ) ;
2005-04-16 15:20:36 -07:00
/* Reporting is done; this function always ends in BUG(). */
BUG ( ) ;
}
2006-04-20 00:10:50 -07:00
/*
 * Report an sk_buff whose truesize is considered invalid.  Prints
 * skb->truesize, skb->len and sizeof(struct sk_buff) at KERN_ERR and
 * returns; it does not call BUG().
 * NOTE(review): the sizeof() argument is a size_t but is printed with
 * %Zd (signed) -- confirm whether %Zu was intended.
 */
void skb_truesize_bug ( struct sk_buff * skb )
{
printk ( KERN_ERR " SKB BUG: Invalid truesize (%u) "
" len=%u, sizeof(sk_buff)=%Zd \n " ,
skb - > truesize , skb - > len , sizeof ( struct sk_buff ) ) ;
}
EXPORT_SYMBOL ( skb_truesize_bug ) ;
2005-04-16 15:20:36 -07:00
/* Allocate a new skbuff. We do this ourselves so we can fill in a few
* ' private ' fields and also do memory statistics to find all the
* [ BEEP ] leaks .
*
*/
/**
2005-08-17 14:57:30 -07:00
* __alloc_skb - allocate a network buffer
2005-04-16 15:20:36 -07:00
* @ size : size to allocate
* @ gfp_mask : allocation mask
2005-10-18 22:07:41 -07:00
* @ fclone : allocate from fclone cache instead of head cache
* and allocate a cloned ( child ) skb
2006-12-06 20:32:36 -08:00
* @ node : numa node to allocate memory on
2005-04-16 15:20:36 -07:00
*
* Allocate a new & sk_buff . The returned buffer has no headroom and a
* tail room of size bytes . The object has a reference count of one .
* The return is the buffer . On a failure the return is % NULL .
*
* Buffers may only be allocated from interrupts using a @ gfp_mask of
* % GFP_ATOMIC .
*/
2005-10-07 07:46:04 +01:00
struct sk_buff * __alloc_skb ( unsigned int size , gfp_t gfp_mask ,
2006-12-06 20:32:36 -08:00
int fclone , int node )
2005-04-16 15:20:36 -07:00
{
2006-12-06 20:33:20 -08:00
struct kmem_cache * cache ;
2006-01-03 14:06:50 -08:00
struct skb_shared_info * shinfo ;
2005-04-16 15:20:36 -07:00
struct sk_buff * skb ;
u8 * data ;
2006-01-23 16:32:45 -08:00
cache = fclone ? skbuff_fclone_cache : skbuff_head_cache ;
2005-04-16 15:20:36 -07:00
/* Get the HEAD */
2006-12-06 20:32:36 -08:00
skb = kmem_cache_alloc_node ( cache , gfp_mask & ~ __GFP_DMA , node ) ;
2005-04-16 15:20:36 -07:00
if ( ! skb )
goto out ;
size = SKB_DATA_ALIGN ( size ) ;
2006-12-06 20:32:36 -08:00
data = kmalloc_node_track_caller ( size + sizeof ( struct skb_shared_info ) ,
gfp_mask , node ) ;
2005-04-16 15:20:36 -07:00
if ( ! data )
goto nodata ;
2007-03-19 10:48:59 -03:00
/*
* See comment in sk_buff definition , just before the ' tail ' member
*/
memset ( skb , 0 , offsetof ( struct sk_buff , tail ) ) ;
2005-04-16 15:20:36 -07:00
skb - > truesize = size + sizeof ( struct sk_buff ) ;
atomic_set ( & skb - > users , 1 ) ;
skb - > head = data ;
skb - > data = data ;
2007-04-19 20:29:13 -07:00
skb_reset_tail_pointer ( skb ) ;
2007-04-19 20:43:29 -07:00
skb - > end = skb - > tail + size ;
2006-01-03 14:06:50 -08:00
/* make sure we initialize shinfo sequentially */
shinfo = skb_shinfo ( skb ) ;
atomic_set ( & shinfo - > dataref , 1 ) ;
shinfo - > nr_frags = 0 ;
2006-06-22 02:40:14 -07:00
shinfo - > gso_size = 0 ;
shinfo - > gso_segs = 0 ;
shinfo - > gso_type = 0 ;
2006-01-03 14:06:50 -08:00
shinfo - > ip6_frag_id = 0 ;
shinfo - > frag_list = NULL ;
2005-08-17 14:57:30 -07:00
if ( fclone ) {
struct sk_buff * child = skb + 1 ;
atomic_t * fclone_ref = ( atomic_t * ) ( child + 1 ) ;
2005-04-16 15:20:36 -07:00
2005-08-17 14:57:30 -07:00
skb - > fclone = SKB_FCLONE_ORIG ;
atomic_set ( fclone_ref , 1 ) ;
child - > fclone = SKB_FCLONE_UNAVAILABLE ;
}
2005-04-16 15:20:36 -07:00
out :
return skb ;
nodata :
2006-01-23 16:32:45 -08:00
kmem_cache_free ( cache , skb ) ;
2005-04-16 15:20:36 -07:00
skb = NULL ;
goto out ;
}
2006-07-31 22:35:23 -07:00
/**
 *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
 *	@dev: network device to receive on
 *	@length: length to allocate
 *	@gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 *	Allocates @length + NET_SKB_PAD bytes through __alloc_skb() on the
 *	node of the device's parent (or node -1 when there is no parent),
 *	reserves NET_SKB_PAD bytes of headroom and sets skb->dev to @dev.
 *	Callers should request the headroom they need themselves without
 *	counting on the built-in pad, which is used for optimisations.
 *
 *	%NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
				   unsigned int length, gfp_t gfp_mask)
{
	struct sk_buff *skb;
	int node;

	if (dev->dev.parent)
		node = dev_to_node(dev->dev.parent);
	else
		node = -1;

	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node);
	if (unlikely(!skb))
		return NULL;

	skb_reserve(skb, NET_SKB_PAD);
	skb->dev = dev;
	return skb;
}
2005-04-16 15:20:36 -07:00
2006-07-13 19:26:39 -07:00
static void skb_drop_list ( struct sk_buff * * listp )
2005-04-16 15:20:36 -07:00
{
2006-07-13 19:26:39 -07:00
struct sk_buff * list = * listp ;
2005-04-16 15:20:36 -07:00
2006-07-13 19:26:39 -07:00
* listp = NULL ;
2005-04-16 15:20:36 -07:00
do {
struct sk_buff * this = list ;
list = list - > next ;
kfree_skb ( this ) ;
} while ( list ) ;
}
2006-07-13 19:26:39 -07:00
static inline void skb_drop_fraglist ( struct sk_buff * skb )
{
skb_drop_list ( & skb_shinfo ( skb ) - > frag_list ) ;
}
2005-04-16 15:20:36 -07:00
static void skb_clone_fraglist ( struct sk_buff * skb )
{
struct sk_buff * list ;
for ( list = skb_shinfo ( skb ) - > frag_list ; list ; list = list - > next )
skb_get ( list ) ;
}
2006-06-29 13:02:35 -07:00
static void skb_release_data ( struct sk_buff * skb )
2005-04-16 15:20:36 -07:00
{
if ( ! skb - > cloned | |
! atomic_sub_return ( skb - > nohdr ? ( 1 < < SKB_DATAREF_SHIFT ) + 1 : 1 ,
& skb_shinfo ( skb ) - > dataref ) ) {
if ( skb_shinfo ( skb ) - > nr_frags ) {
int i ;
for ( i = 0 ; i < skb_shinfo ( skb ) - > nr_frags ; i + + )
put_page ( skb_shinfo ( skb ) - > frags [ i ] . page ) ;
}
if ( skb_shinfo ( skb ) - > frag_list )
skb_drop_fraglist ( skb ) ;
kfree ( skb - > head ) ;
}
}
/*
* Free an skbuff by memory without cleaning the state .
*/
2007-11-26 23:11:19 +08:00
static void kfree_skbmem ( struct sk_buff * skb )
2005-04-16 15:20:36 -07:00
{
2005-08-17 14:57:30 -07:00
struct sk_buff * other ;
atomic_t * fclone_ref ;
switch ( skb - > fclone ) {
case SKB_FCLONE_UNAVAILABLE :
kmem_cache_free ( skbuff_head_cache , skb ) ;
break ;
case SKB_FCLONE_ORIG :
fclone_ref = ( atomic_t * ) ( skb + 2 ) ;
if ( atomic_dec_and_test ( fclone_ref ) )
kmem_cache_free ( skbuff_fclone_cache , skb ) ;
break ;
case SKB_FCLONE_CLONE :
fclone_ref = ( atomic_t * ) ( skb + 1 ) ;
other = skb - 1 ;
/* The clone portion is available for
* fast - cloning again .
*/
skb - > fclone = SKB_FCLONE_UNAVAILABLE ;
if ( atomic_dec_and_test ( fclone_ref ) )
kmem_cache_free ( skbuff_fclone_cache , other ) ;
break ;
2007-04-20 17:09:22 -07:00
}
2005-04-16 15:20:36 -07:00
}
2007-11-26 23:11:19 +08:00
/* Free everything but the sk_buff shell. */
static void skb_release_all ( struct sk_buff * skb )
2005-04-16 15:20:36 -07:00
{
dst_release ( skb - > dst ) ;
# ifdef CONFIG_XFRM
secpath_put ( skb - > sp ) ;
# endif
2005-04-19 22:39:42 -07:00
if ( skb - > destructor ) {
WARN_ON ( in_irq ( ) ) ;
2005-04-16 15:20:36 -07:00
skb - > destructor ( skb ) ;
}
[NETFILTER]: Add nf_conntrack subsystem.
The existing connection tracking subsystem in netfilter can only
handle ipv4. There were basically two choices present to add
connection tracking support for ipv6. We could either duplicate all
of the ipv4 connection tracking code into an ipv6 counterpart, or (the
choice taken by these patches) we could design a generic layer that
could handle both ipv4 and ipv6 and thus requiring only one sub-protocol
(TCP, UDP, etc.) connection tracking helper module to be written.
In fact nf_conntrack is capable of working with any layer 3
protocol.
The existing ipv4 specific conntrack code could also not deal
with the pecularities of doing connection tracking on ipv6,
which is also cured here. For example, these issues include:
1) ICMPv6 handling, which is used for neighbour discovery in
ipv6 thus some messages such as these should not participate
in connection tracking since effectively they are like ARP
messages
2) fragmentation must be handled differently in ipv6, because
the simplistic "defrag, connection track and NAT, refrag"
(which the existing ipv4 connection tracking does) approach simply
isn't feasible in ipv6
3) ipv6 extension header parsing must occur at the correct spots
before and after connection tracking decisions, and there were
no provisions for this in the existing connection tracking
design
4) ipv6 has no need for stateful NAT
The ipv4 specific conntrack layer is kept around, until all of
the ipv4 specific conntrack helpers are ported over to nf_conntrack
and it is feature complete. Once that occurs, the old conntrack
stuff will get placed into the feature-removal-schedule and we will
fully kill it off 6 months later.
Signed-off-by: Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
2005-11-09 16:38:16 -08:00
# if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
2007-03-23 11:17:07 -07:00
nf_conntrack_put ( skb - > nfct ) ;
[NETFILTER]: Add nf_conntrack subsystem.
The existing connection tracking subsystem in netfilter can only
handle ipv4. There were basically two choices present to add
connection tracking support for ipv6. We could either duplicate all
of the ipv4 connection tracking code into an ipv6 counterpart, or (the
choice taken by these patches) we could design a generic layer that
could handle both ipv4 and ipv6 and thus requiring only one sub-protocol
(TCP, UDP, etc.) connection tracking helper module to be written.
In fact nf_conntrack is capable of working with any layer 3
protocol.
The existing ipv4 specific conntrack code could also not deal
with the pecularities of doing connection tracking on ipv6,
which is also cured here. For example, these issues include:
1) ICMPv6 handling, which is used for neighbour discovery in
ipv6 thus some messages such as these should not participate
in connection tracking since effectively they are like ARP
messages
2) fragmentation must be handled differently in ipv6, because
the simplistic "defrag, connection track and NAT, refrag"
(which the existing ipv4 connection tracking does) approach simply
isn't feasible in ipv6
3) ipv6 extension header parsing must occur at the correct spots
before and after connection tracking decisions, and there were
no provisions for this in the existing connection tracking
design
4) ipv6 has no need for stateful NAT
The ipv4 specific conntrack layer is kept around, until all of
the ipv4 specific conntrack helpers are ported over to nf_conntrack
and it is feature complete. Once that occurs, the old conntrack
stuff will get placed into the feature-removal-schedule and we will
fully kill it off 6 months later.
Signed-off-by: Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
2005-11-09 16:38:16 -08:00
nf_conntrack_put_reasm ( skb - > nfct_reasm ) ;
# endif
2005-04-16 15:20:36 -07:00
# ifdef CONFIG_BRIDGE_NETFILTER
nf_bridge_put ( skb - > nf_bridge ) ;
# endif
/* XXX: IS this still necessary? - JHS */
# ifdef CONFIG_NET_SCHED
skb - > tc_index = 0 ;
# ifdef CONFIG_NET_CLS_ACT
skb - > tc_verd = 0 ;
# endif
# endif
2007-11-26 23:11:19 +08:00
skb_release_data ( skb ) ;
}
/**
 *	__kfree_skb - private function
 *	@skb: buffer
 *
 *	Unconditionally tear down @skb: skb_release_all() lets go of
 *	everything attached to it, then kfree_skbmem() returns the
 *	structure's memory.  No reference count is consulted here; this is
 *	an internal helper and users should always call kfree_skb().
 */
void __kfree_skb(struct sk_buff *skb)
{
	/* Attached state must be released before the shell is freed. */
	skb_release_all(skb);
	kfree_skbmem(skb);
}
2006-03-20 21:28:35 -08:00
/**
* kfree_skb - free an sk_buff
* @skb: buffer to free; NULL is accepted and ignored
*
* Drop a reference to the buffer and free it if the usage count has
* hit zero.
*/
void kfree_skb ( struct sk_buff * skb )
{
/* Nothing to do for a NULL buffer. */
if ( unlikely ( ! skb ) )
return ;
/*
* If the count reads exactly 1, the atomic decrement is skipped:
* only smp_rmb() is issued and the buffer is freed below.
*/
if ( likely ( atomic_read ( & skb - > users ) = = 1 ) )
smp_rmb ( ) ;
/* Otherwise decrement, and stop here unless the count reached zero. */
else if ( likely ( ! atomic_dec_and_test ( & skb - > users ) ) )
return ;
__kfree_skb ( skb ) ;
}
2007-10-14 00:37:30 -07:00
static void __copy_skb_header ( struct sk_buff * new , const struct sk_buff * old )
{
new - > tstamp = old - > tstamp ;
new - > dev = old - > dev ;
new - > transport_header = old - > transport_header ;
new - > network_header = old - > network_header ;
new - > mac_header = old - > mac_header ;
new - > dst = dst_clone ( old - > dst ) ;
# ifdef CONFIG_INET
new - > sp = secpath_get ( old - > sp ) ;
# endif
memcpy ( new - > cb , old - > cb , sizeof ( old - > cb ) ) ;
new - > csum_start = old - > csum_start ;
new - > csum_offset = old - > csum_offset ;
new - > local_df = old - > local_df ;
new - > pkt_type = old - > pkt_type ;
new - > ip_summed = old - > ip_summed ;
skb_copy_queue_mapping ( new , old ) ;
new - > priority = old - > priority ;
# if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
new - > ipvs_property = old - > ipvs_property ;
# endif
new - > protocol = old - > protocol ;
new - > mark = old - > mark ;
__nf_copy ( new , old ) ;
# if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
defined ( CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE )
new - > nf_trace = old - > nf_trace ;
# endif
# ifdef CONFIG_NET_SCHED
new - > tc_index = old - > tc_index ;
# ifdef CONFIG_NET_CLS_ACT
new - > tc_verd = old - > tc_verd ;
# endif
# endif
skb_copy_secmark ( new , old ) ;
}
2007-10-14 00:37:52 -07:00
static struct sk_buff * __skb_clone ( struct sk_buff * n , struct sk_buff * skb )
2005-04-16 15:20:36 -07:00
{
# define C(x) n->x = skb->x
n - > next = n - > prev = NULL ;
n - > sk = NULL ;
2007-10-14 00:37:30 -07:00
__copy_skb_header ( n , skb ) ;
2005-04-16 15:20:36 -07:00
C ( len ) ;
C ( data_len ) ;
2007-03-16 15:00:46 -07:00
C ( mac_len ) ;
[SKBUFF]: Keep track of writable header len of headerless clones
Currently NAT (and others) that want to modify cloned skbs copy them,
even if in the vast majority of cases its not necessary because the
skb is a clone made by TCP and the portion NAT wants to modify is
actually writable because TCP release the header reference before
cloning.
The problem is that there is no clean way for NAT to find out how
long the writable header area is, so this patch introduces skb->hdr_len
to hold this length. When a headerless skb is cloned skb->hdr_len
is set to the current headroom, for regular clones it is copied from
the original. A new function skb_clone_writable(skb, len) returns
whether the skb is writable up to len bytes from skb->data. To avoid
enlarging the skb the mac_len field is reduced to 16 bit and the
new hdr_len field is put in the remaining 16 bit.
I've done a few rough benchmarks of NAT (not with this exact patch,
but a very similar one). As expected it saves huge amounts of system
time in case of sendfile, bringing it down to basically the same
amount as without NAT, with sendmsg it only helps on loopback,
probably because of the large MTU.
Transmit a 1GB file using sendfile/sendmsg over eth0/lo with and
without NAT:
- sendfile eth0, no NAT: sys 0m0.388s
- sendfile eth0, NAT: sys 0m1.835s
- sendfile eth0: NAT + path: sys 0m0.370s (~ -80%)
- sendfile lo, no NAT: sys 0m0.258s
- sendfile lo, NAT: sys 0m2.609s
- sendfile lo, NAT + patch: sys 0m0.260s (~ -90%)
- sendmsg eth0, no NAT: sys 0m2.508s
- sendmsg eth0, NAT: sys 0m2.539s
- sendmsg eth0, NAT + patch: sys 0m2.445s (no change)
- sendmsg lo, no NAT: sys 0m2.151s
- sendmsg lo, NAT: sys 0m3.557s
- sendmsg lo, NAT + patch: sys 0m2.159s (~ -40%)
I expect other users can see a similar performance improvement,
packet mangling iptables targets, ipip and ip_gre come to mind ..
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2007-06-25 04:35:20 -07:00
n - > hdr_len = skb - > nohdr ? skb_headroom ( skb ) : skb - > hdr_len ;
2008-01-07 21:56:41 -08:00
n - > cloned = 1 ;
2005-04-16 15:20:36 -07:00
n - > nohdr = 0 ;
n - > destructor = NULL ;
2008-01-07 21:56:41 -08:00
C ( iif ) ;
2005-04-16 15:20:36 -07:00
C ( tail ) ;
C ( end ) ;
2008-01-07 21:56:41 -08:00
C ( head ) ;
C ( data ) ;
C ( truesize ) ;
atomic_set ( & n - > users , 1 ) ;
2005-04-16 15:20:36 -07:00
atomic_inc ( & ( skb_shinfo ( skb ) - > dataref ) ) ;
skb - > cloned = 1 ;
return n ;
2007-10-14 00:37:52 -07:00
# undef C
}
/**
 *	skb_morph	-	morph one skb into another
 *	@dst: the skb to receive the contents
 *	@src: the skb to supply the contents
 *
 *	Works like skb_clone() except that the caller supplies the target
 *	skb.  Whatever @dst currently holds is released with
 *	skb_release_all() before it is made a clone of @src.
 *
 *	The target skb is returned upon exit.
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
	/* Let go of dst's old contents first, then clone into it. */
	skb_release_all(dst);

	return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);
/**
 *	skb_clone	-	duplicate an sk_buff
 *	@skb: buffer to clone
 *	@gfp_mask: allocation priority
 *
 *	Returns a new &sk_buff that shares @skb's packet data but has its
 *	own structure, is not owned by a socket and carries a reference
 *	count of 1.  %NULL is returned if a head has to be allocated and
 *	that allocation fails.
 *
 *	If this function is called from an interrupt gfp_mask() must be
 *	%GFP_ATOMIC.
 */
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
	struct sk_buff *n = skb + 1;

	/*
	 * Fast path: @skb is the original of an fclone pair and its
	 * companion slot is free.  The companion is inspected only after
	 * skb->fclone has been checked.
	 */
	if (skb->fclone != SKB_FCLONE_ORIG ||
	    n->fclone != SKB_FCLONE_UNAVAILABLE) {
		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
		if (!n)
			return NULL;
		n->fclone = SKB_FCLONE_UNAVAILABLE;
	} else {
		atomic_t *pair_ref = (atomic_t *)(n + 1);

		n->fclone = SKB_FCLONE_CLONE;
		atomic_inc(pair_ref);
	}

	return __skb_clone(n, skb);
}
static void copy_skb_header ( struct sk_buff * new , const struct sk_buff * old )
{
[SK_BUFF]: Use offsets for skb->{mac,network,transport}_header on 64bit architectures
With this we save 8 bytes per network packet, leaving a 4 bytes hole to be used
in further shrinking work, likely with the offsetization of other pointers,
such as ->{data,tail,end}, at the cost of adds, that were minimized by the
usual practice of setting skb->{mac,nh,n}.raw to a local variable that is then
accessed multiple times in each function, it also is not more expensive than
before with regards to most of the handling of such headers, like setting one
of these headers to another (transport to network, etc), or subtracting, adding
to/from it, comparing them, etc.
Now we have this layout for sk_buff on a x86_64 machine:
[acme@mica net-2.6.22]$ pahole vmlinux sk_buff
struct sk_buff {
struct sk_buff * next; /* 0 8 */
struct sk_buff * prev; /* 8 8 */
struct rb_node rb; /* 16 24 */
struct sock * sk; /* 40 8 */
ktime_t tstamp; /* 48 8 */
struct net_device * dev; /* 56 8 */
/* --- cacheline 1 boundary (64 bytes) --- */
struct net_device * input_dev; /* 64 8 */
sk_buff_data_t transport_header; /* 72 4 */
sk_buff_data_t network_header; /* 76 4 */
sk_buff_data_t mac_header; /* 80 4 */
/* XXX 4 bytes hole, try to pack */
struct dst_entry * dst; /* 88 8 */
struct sec_path * sp; /* 96 8 */
char cb[48]; /* 104 48 */
/* cacheline 2 boundary (128 bytes) was 24 bytes ago*/
unsigned int len; /* 152 4 */
unsigned int data_len; /* 156 4 */
unsigned int mac_len; /* 160 4 */
union {
__wsum csum; /* 4 */
__u32 csum_offset; /* 4 */
}; /* 164 4 */
__u32 priority; /* 168 4 */
__u8 local_df:1; /* 172 1 */
__u8 cloned:1; /* 172 1 */
__u8 ip_summed:2; /* 172 1 */
__u8 nohdr:1; /* 172 1 */
__u8 nfctinfo:3; /* 172 1 */
__u8 pkt_type:3; /* 173 1 */
__u8 fclone:2; /* 173 1 */
__u8 ipvs_property:1; /* 173 1 */
/* XXX 2 bits hole, try to pack */
__be16 protocol; /* 174 2 */
void (*destructor)(struct sk_buff *); /* 176 8 */
struct nf_conntrack * nfct; /* 184 8 */
/* --- cacheline 3 boundary (192 bytes) --- */
struct sk_buff * nfct_reasm; /* 192 8 */
struct nf_bridge_info *nf_bridge; /* 200 8 */
__u16 tc_index; /* 208 2 */
__u16 tc_verd; /* 210 2 */
dma_cookie_t dma_cookie; /* 212 4 */
__u32 secmark; /* 216 4 */
__u32 mark; /* 220 4 */
unsigned int truesize; /* 224 4 */
atomic_t users; /* 228 4 */
unsigned char * head; /* 232 8 */
unsigned char * data; /* 240 8 */
unsigned char * tail; /* 248 8 */
/* --- cacheline 4 boundary (256 bytes) --- */
unsigned char * end; /* 256 8 */
}; /* size: 264, cachelines: 5 */
/* sum members: 260, holes: 1, sum holes: 4 */
/* bit holes: 1, sum bit holes: 2 bits */
/* last cacheline: 8 bytes */
On 32 bits nothing changes, and pointers continue to be used with the compiler
turning all this abstraction layer into dust. But there are some sk_buff
validation tricks that are now possible, humm... :-)
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2007-04-10 21:22:35 -07:00
# ifndef NET_SKBUFF_DATA_USES_OFFSET
2005-04-16 15:20:36 -07:00
/*
* Shift between the two data areas in bytes
*/
unsigned long offset = new - > data - old - > data ;
[SK_BUFF]: Use offsets for skb->{mac,network,transport}_header on 64bit architectures
With this we save 8 bytes per network packet, leaving a 4 bytes hole to be used
in further shrinking work, likely with the offsetization of other pointers,
such as ->{data,tail,end}, at the cost of adds, that were minimized by the
usual practice of setting skb->{mac,nh,n}.raw to a local variable that is then
accessed multiple times in each function, it also is not more expensive than
before with regards to most of the handling of such headers, like setting one
of these headers to another (transport to network, etc), or subtracting, adding
to/from it, comparing them, etc.
Now we have this layout for sk_buff on a x86_64 machine:
[acme@mica net-2.6.22]$ pahole vmlinux sk_buff
struct sk_buff {
struct sk_buff * next; /* 0 8 */
struct sk_buff * prev; /* 8 8 */
struct rb_node rb; /* 16 24 */
struct sock * sk; /* 40 8 */
ktime_t tstamp; /* 48 8 */
struct net_device * dev; /* 56 8 */
/* --- cacheline 1 boundary (64 bytes) --- */
struct net_device * input_dev; /* 64 8 */
sk_buff_data_t transport_header; /* 72 4 */
sk_buff_data_t network_header; /* 76 4 */
sk_buff_data_t mac_header; /* 80 4 */
/* XXX 4 bytes hole, try to pack */
struct dst_entry * dst; /* 88 8 */
struct sec_path * sp; /* 96 8 */
char cb[48]; /* 104 48 */
/* cacheline 2 boundary (128 bytes) was 24 bytes ago*/
unsigned int len; /* 152 4 */
unsigned int data_len; /* 156 4 */
unsigned int mac_len; /* 160 4 */
union {
__wsum csum; /* 4 */
__u32 csum_offset; /* 4 */
}; /* 164 4 */
__u32 priority; /* 168 4 */
__u8 local_df:1; /* 172 1 */
__u8 cloned:1; /* 172 1 */
__u8 ip_summed:2; /* 172 1 */
__u8 nohdr:1; /* 172 1 */
__u8 nfctinfo:3; /* 172 1 */
__u8 pkt_type:3; /* 173 1 */
__u8 fclone:2; /* 173 1 */
__u8 ipvs_property:1; /* 173 1 */
/* XXX 2 bits hole, try to pack */
__be16 protocol; /* 174 2 */
void (*destructor)(struct sk_buff *); /* 176 8 */
struct nf_conntrack * nfct; /* 184 8 */
/* --- cacheline 3 boundary (192 bytes) --- */
struct sk_buff * nfct_reasm; /* 192 8 */
struct nf_bridge_info *nf_bridge; /* 200 8 */
__u16 tc_index; /* 208 2 */
__u16 tc_verd; /* 210 2 */
dma_cookie_t dma_cookie; /* 212 4 */
__u32 secmark; /* 216 4 */
__u32 mark; /* 220 4 */
unsigned int truesize; /* 224 4 */
atomic_t users; /* 228 4 */
unsigned char * head; /* 232 8 */
unsigned char * data; /* 240 8 */
unsigned char * tail; /* 248 8 */
/* --- cacheline 4 boundary (256 bytes) --- */
unsigned char * end; /* 256 8 */
}; /* size: 264, cachelines: 5 */
/* sum members: 260, holes: 1, sum holes: 4 */
/* bit holes: 1, sum bit holes: 2 bits */
/* last cacheline: 8 bytes */
On 32 bits nothing changes, and pointers continue to be used with the compiler
turning all this abstraction layer into dust. But there are some sk_buff
validation tricks that are now possible, humm... :-)
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2007-04-10 21:22:35 -07:00
# endif
2007-10-14 00:37:30 -07:00
__copy_skb_header ( new , old ) ;
[SK_BUFF]: Use offsets for skb->{mac,network,transport}_header on 64bit architectures
With this we save 8 bytes per network packet, leaving a 4 bytes hole to be used
in further shrinking work, likely with the offsetization of other pointers,
such as ->{data,tail,end}, at the cost of adds, that were minimized by the
usual practice of setting skb->{mac,nh,n}.raw to a local variable that is then
accessed multiple times in each function, it also is not more expensive than
before with regards to most of the handling of such headers, like setting one
of these headers to another (transport to network, etc), or subtracting, adding
to/from it, comparing them, etc.
Now we have this layout for sk_buff on a x86_64 machine:
[acme@mica net-2.6.22]$ pahole vmlinux sk_buff
struct sk_buff {
struct sk_buff * next; /* 0 8 */
struct sk_buff * prev; /* 8 8 */
struct rb_node rb; /* 16 24 */
struct sock * sk; /* 40 8 */
ktime_t tstamp; /* 48 8 */
struct net_device * dev; /* 56 8 */
/* --- cacheline 1 boundary (64 bytes) --- */
struct net_device * input_dev; /* 64 8 */
sk_buff_data_t transport_header; /* 72 4 */
sk_buff_data_t network_header; /* 76 4 */
sk_buff_data_t mac_header; /* 80 4 */
/* XXX 4 bytes hole, try to pack */
struct dst_entry * dst; /* 88 8 */
struct sec_path * sp; /* 96 8 */
char cb[48]; /* 104 48 */
/* cacheline 2 boundary (128 bytes) was 24 bytes ago*/
unsigned int len; /* 152 4 */
unsigned int data_len; /* 156 4 */
unsigned int mac_len; /* 160 4 */
union {
__wsum csum; /* 4 */
__u32 csum_offset; /* 4 */
}; /* 164 4 */
__u32 priority; /* 168 4 */
__u8 local_df:1; /* 172 1 */
__u8 cloned:1; /* 172 1 */
__u8 ip_summed:2; /* 172 1 */
__u8 nohdr:1; /* 172 1 */
__u8 nfctinfo:3; /* 172 1 */
__u8 pkt_type:3; /* 173 1 */
__u8 fclone:2; /* 173 1 */
__u8 ipvs_property:1; /* 173 1 */
/* XXX 2 bits hole, try to pack */
__be16 protocol; /* 174 2 */
void (*destructor)(struct sk_buff *); /* 176 8 */
struct nf_conntrack * nfct; /* 184 8 */
/* --- cacheline 3 boundary (192 bytes) --- */
struct sk_buff * nfct_reasm; /* 192 8 */
struct nf_bridge_info *nf_bridge; /* 200 8 */
__u16 tc_index; /* 208 2 */
__u16 tc_verd; /* 210 2 */
dma_cookie_t dma_cookie; /* 212 4 */
__u32 secmark; /* 216 4 */
__u32 mark; /* 220 4 */
unsigned int truesize; /* 224 4 */
atomic_t users; /* 228 4 */
unsigned char * head; /* 232 8 */
unsigned char * data; /* 240 8 */
unsigned char * tail; /* 248 8 */
/* --- cacheline 4 boundary (256 bytes) --- */
unsigned char * end; /* 256 8 */
}; /* size: 264, cachelines: 5 */
/* sum members: 260, holes: 1, sum holes: 4 */
/* bit holes: 1, sum bit holes: 2 bits */
/* last cacheline: 8 bytes */
On 32 bits nothing changes, and pointers continue to be used with the compiler
turning all this abstraction layer into dust. But there are some sk_buff
validation tricks that are now possible, humm... :-)
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2007-04-10 21:22:35 -07:00
# ifndef NET_SKBUFF_DATA_USES_OFFSET
/* {transport,network,mac}_header are relative to skb->head */
new - > transport_header + = offset ;
new - > network_header + = offset ;
new - > mac_header + = offset ;
# endif
2006-06-22 02:40:14 -07:00
skb_shinfo ( new ) - > gso_size = skb_shinfo ( old ) - > gso_size ;
skb_shinfo ( new ) - > gso_segs = skb_shinfo ( old ) - > gso_segs ;
skb_shinfo ( new ) - > gso_type = skb_shinfo ( old ) - > gso_type ;
2005-04-16 15:20:36 -07:00
}
/**
* skb_copy - create private copy of an sk_buff
* @ skb : buffer to copy
* @ gfp_mask : allocation priority
*
* Make a copy of both an & sk_buff and its data . This is used when the
* caller wishes to modify the data and needs a private copy of the
* data to alter . Returns % NULL on failure or the pointer to the buffer
* on success . The returned buffer has a reference count of 1.
*
* As by - product this function converts non - linear & sk_buff to linear
* one , so that & sk_buff becomes completely private and caller is allowed
* to modify all the data of returned buffer . This means that this
* function is not recommended for use in circumstances when only
* header is going to be modified . Use pskb_copy ( ) instead .
*/
2005-10-07 07:46:04 +01:00
struct sk_buff * skb_copy ( const struct sk_buff * skb , gfp_t gfp_mask )
2005-04-16 15:20:36 -07:00
{
int headerlen = skb - > data - skb - > head ;
/*
* Allocate the copy buffer
*/
2007-04-19 20:43:29 -07:00
struct sk_buff * n ;
# ifdef NET_SKBUFF_DATA_USES_OFFSET
n = alloc_skb ( skb - > end + skb - > data_len , gfp_mask ) ;
# else
n = alloc_skb ( skb - > end - skb - > head + skb - > data_len , gfp_mask ) ;
# endif
2005-04-16 15:20:36 -07:00
if ( ! n )
return NULL ;
/* Set the data pointer */
skb_reserve ( n , headerlen ) ;
/* Set the tail pointer and length */
skb_put ( n , skb - > len ) ;
if ( skb_copy_bits ( skb , - headerlen , n - > head , headerlen + skb - > len ) )
BUG ( ) ;
copy_skb_header ( n , skb ) ;
return n ;
}
/**
* pskb_copy - create copy of an sk_buff with private head .
* @ skb : buffer to copy
* @ gfp_mask : allocation priority
*
* Make a copy of both an & sk_buff and part of its data , located
* in header . Fragmented data remain shared . This is used when
* the caller wishes to modify only header of & sk_buff and needs
* private copy of the header to alter . Returns % NULL on failure
* or the pointer to the buffer on success .
* The returned buffer has a reference count of 1.
*/
2005-10-07 07:46:04 +01:00
struct sk_buff * pskb_copy ( struct sk_buff * skb , gfp_t gfp_mask )
2005-04-16 15:20:36 -07:00
{
/*
* Allocate the copy buffer
*/
2007-04-19 20:43:29 -07:00
struct sk_buff * n ;
# ifdef NET_SKBUFF_DATA_USES_OFFSET
n = alloc_skb ( skb - > end , gfp_mask ) ;
# else
n = alloc_skb ( skb - > end - skb - > head , gfp_mask ) ;
# endif
2005-04-16 15:20:36 -07:00
if ( ! n )
goto out ;
/* Set the data pointer */
skb_reserve ( n , skb - > data - skb - > head ) ;
/* Set the tail pointer and length */
skb_put ( n , skb_headlen ( skb ) ) ;
/* Copy the bytes */
2007-03-27 18:55:52 -03:00
skb_copy_from_linear_data ( skb , n - > data , n - > len ) ;
2005-04-16 15:20:36 -07:00
2006-11-07 14:57:15 -08:00
n - > truesize + = skb - > data_len ;
2005-04-16 15:20:36 -07:00
n - > data_len = skb - > data_len ;
n - > len = skb - > len ;
if ( skb_shinfo ( skb ) - > nr_frags ) {
int i ;
for ( i = 0 ; i < skb_shinfo ( skb ) - > nr_frags ; i + + ) {
skb_shinfo ( n ) - > frags [ i ] = skb_shinfo ( skb ) - > frags [ i ] ;
get_page ( skb_shinfo ( n ) - > frags [ i ] . page ) ;
}
skb_shinfo ( n ) - > nr_frags = i ;
}
if ( skb_shinfo ( skb ) - > frag_list ) {
skb_shinfo ( n ) - > frag_list = skb_shinfo ( skb ) - > frag_list ;
skb_clone_fraglist ( n ) ;
}
copy_skb_header ( n , skb ) ;
out :
return n ;
}
/**
 * pskb_expand_head - reallocate header of &sk_buff
 * @skb: buffer to reallocate
 * @nhead: room to add at head
 * @ntail: room to add at tail
 * @gfp_mask: allocation priority
 *
 * Expands (or creates an identical copy, if @nhead and @ntail are zero)
 * the header of @skb. The &sk_buff itself is not reallocated, only its
 * data area. The &sk_buff MUST have a reference count of 1; a shared
 * skb triggers BUG().
 *
 * A new data area of SKB_DATA_ALIGN(@nhead + old size + @ntail) bytes,
 * followed by room for a struct skb_shared_info, is allocated with
 * kmalloc(). The bytes between head and tail are copied to offset @nhead
 * of the new area and the shared info is copied behind it.
 *
 * Returns zero on success, or -ENOMEM if the allocation failed. In the
 * failure case the &sk_buff is not changed.
 *
 * All the pointers pointing into the skb header may change and must be
 * reloaded after a call to this function.
 */
2005-07-08 14:57:47 -07:00
int pskb_expand_head ( struct sk_buff * skb , int nhead , int ntail ,
2005-10-07 07:46:04 +01:00
gfp_t gfp_mask )
2005-04-16 15:20:36 -07:00
{
int i ;
u8 * data ;
2007-04-19 20:43:29 -07:00
/* With offset-based fields skb->end already is the old size. */
# ifdef NET_SKBUFF_DATA_USES_OFFSET
int size = nhead + skb - > end + ntail ;
# else
2005-04-16 15:20:36 -07:00
int size = nhead + ( skb - > end - skb - > head ) + ntail ;
2007-04-19 20:43:29 -07:00
# endif
2005-04-16 15:20:36 -07:00
long off ;
if ( skb_shared ( skb ) )
BUG ( ) ;
size = SKB_DATA_ALIGN ( size ) ;
data = kmalloc ( size + sizeof ( struct skb_shared_info ) , gfp_mask ) ;
if ( ! data )
goto nodata ;
/* Copy only real data... and, alas, header. This should be
 * optimized for the cases when header is void. */
2007-04-19 20:43:29 -07:00
# ifdef NET_SKBUFF_DATA_USES_OFFSET
2007-05-19 13:55:25 -07:00
memcpy ( data + nhead , skb - > head , skb - > tail ) ;
2007-04-19 20:43:29 -07:00
# else
2007-05-19 13:55:25 -07:00
memcpy ( data + nhead , skb - > head , skb - > tail - skb - > head ) ;
2007-04-19 20:29:13 -07:00
# endif
2007-04-19 20:43:29 -07:00
/* The shared info sits right behind the (aligned) data area. */
memcpy ( data + size , skb_end_pointer ( skb ) ,
sizeof ( struct skb_shared_info ) ) ;
2005-04-16 15:20:36 -07:00
/* The copied shared info names the same pages and frag list as the old
 * one: take the extra references before skb_release_data() is called
 * on the old data area. */
for ( i = 0 ; i < skb_shinfo ( skb ) - > nr_frags ; i + + )
get_page ( skb_shinfo ( skb ) - > frags [ i ] . page ) ;
if ( skb_shinfo ( skb ) - > frag_list )
skb_clone_fraglist ( skb ) ;
skb_release_data ( skb ) ;
/* Distance by which every pointer into the old data area has to move. */
off = ( data + nhead ) - skb - > head ;
skb - > head = data ;
skb - > data + = off ;
2007-04-19 20:43:29 -07:00
/* In the offset-based layout the fields adjusted below are offsets, so
 * they only move by nhead rather than by the pointer distance. */
# ifdef NET_SKBUFF_DATA_USES_OFFSET
skb - > end = size ;
2007-04-09 11:45:04 -07:00
off = nhead ;
2007-04-19 20:43:29 -07:00
# else
skb - > end = skb - > head + size ;
2007-04-09 11:45:04 -07:00
# endif
2007-04-19 20:29:13 -07:00
/* {transport,network,mac}_header and tail are relative to skb->head */
skb - > tail + = off ;
2007-04-10 21:21:55 -07:00
skb - > transport_header + = off ;
skb - > network_header + = off ;
skb - > mac_header + = off ;
2007-10-15 01:46:08 -07:00
skb - > csum_start + = nhead ;
2005-04-16 15:20:36 -07:00
skb - > cloned = 0 ;
[SKBUFF]: Keep track of writable header len of headerless clones
Currently NAT (and others) that want to modify cloned skbs copy them,
even if in the vast majority of cases its not necessary because the
skb is a clone made by TCP and the portion NAT wants to modify is
actually writable because TCP release the header reference before
cloning.
The problem is that there is no clean way for NAT to find out how
long the writable header area is, so this patch introduces skb->hdr_len
to hold this length. When a headerless skb is cloned skb->hdr_len
is set to the current headroom, for regular clones it is copied from
the original. A new function skb_clone_writable(skb, len) returns
whether the skb is writable up to len bytes from skb->data. To avoid
enlarging the skb the mac_len field is reduced to 16 bit and the
new hdr_len field is put in the remaining 16 bit.
I've done a few rough benchmarks of NAT (not with this exact patch,
but a very similar one). As expected it saves huge amounts of system
time in case of sendfile, bringing it down to basically the same
amount as without NAT, with sendmsg it only helps on loopback,
probably because of the large MTU.
Transmit a 1GB file using sendfile/sendmsg over eth0/lo with and
without NAT:
- sendfile eth0, no NAT: sys 0m0.388s
- sendfile eth0, NAT: sys 0m1.835s
- sendfile eth0: NAT + path: sys 0m0.370s (~ -80%)
- sendfile lo, no NAT: sys 0m0.258s
- sendfile lo, NAT: sys 0m2.609s
- sendfile lo, NAT + patch: sys 0m0.260s (~ -90%)
- sendmsg eth0, no NAT: sys 0m2.508s
- sendmsg eth0, NAT: sys 0m2.539s
- sendmsg eth0, NAT + patch: sys 0m2.445s (no change)
- sendmsg lo, no NAT: sys 0m2.151s
- sendmsg lo, NAT: sys 0m3.557s
- sendmsg lo, NAT + patch: sys 0m2.159s (~ -40%)
I expect other users can see a similar performance improvement,
packet mangling iptables targets, ipip and ip_gre come to mind ..
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2007-06-25 04:35:20 -07:00
skb - > hdr_len = 0 ;
2005-04-16 15:20:36 -07:00
skb - > nohdr = 0 ;
atomic_set ( & skb_shinfo ( skb ) - > dataref , 1 ) ;
return 0 ;
nodata :
return - ENOMEM ;
}
/*
 * Make a private copy of skb with a writable head and some headroom.
 *
 * When the skb already has the requested headroom a plain pskb_copy()
 * is returned.  Otherwise the skb is cloned and the head of the clone
 * is expanded by the (aligned) shortfall.  All allocations use
 * GFP_ATOMIC.  Returns NULL on failure.
 */
struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
{
	int shortfall = headroom - skb_headroom(skb);
	struct sk_buff *copy;

	if (shortfall <= 0)
		return pskb_copy(skb, GFP_ATOMIC);

	copy = skb_clone(skb, GFP_ATOMIC);
	if (copy == NULL)
		return NULL;

	if (pskb_expand_head(copy, SKB_DATA_ALIGN(shortfall), 0,
			     GFP_ATOMIC) != 0) {
		kfree_skb(copy);
		return NULL;
	}
	return copy;
}
/**
* skb_copy_expand - copy and expand sk_buff
* @ skb : buffer to copy
* @ newheadroom : new free bytes at head
* @ newtailroom : new free bytes at tail
* @ gfp_mask : allocation priority
*
* Make a copy of both an & sk_buff and its data and while doing so
* allocate additional space .
*
* This is used when the caller wishes to modify the data and needs a
* private copy of the data to alter as well as more space for new fields .
* Returns % NULL on failure or the pointer to the buffer
* on success . The returned buffer has a reference count of 1.
*
* You must pass % GFP_ATOMIC as the allocation priority if this function
* is called from an interrupt .
*/
struct sk_buff * skb_copy_expand ( const struct sk_buff * skb ,
2005-07-08 14:57:47 -07:00
int newheadroom , int newtailroom ,
2005-10-07 07:46:04 +01:00
gfp_t gfp_mask )
2005-04-16 15:20:36 -07:00
{
/*
* Allocate the copy buffer
*/
struct sk_buff * n = alloc_skb ( newheadroom + skb - > len + newtailroom ,
gfp_mask ) ;
2007-04-10 18:30:09 -07:00
int oldheadroom = skb_headroom ( skb ) ;
2005-04-16 15:20:36 -07:00
int head_copy_len , head_copy_off ;
2007-09-16 16:32:11 -07:00
int off ;
2005-04-16 15:20:36 -07:00
if ( ! n )
return NULL ;
skb_reserve ( n , newheadroom ) ;
/* Set the tail pointer and length */
skb_put ( n , skb - > len ) ;
2007-04-10 18:30:09 -07:00
head_copy_len = oldheadroom ;
2005-04-16 15:20:36 -07:00
head_copy_off = 0 ;
if ( newheadroom < = head_copy_len )
head_copy_len = newheadroom ;
else
head_copy_off = newheadroom - head_copy_len ;
/* Copy the linear header and data. */
if ( skb_copy_bits ( skb , - head_copy_len , n - > head + head_copy_off ,
skb - > len + head_copy_len ) )
BUG ( ) ;
copy_skb_header ( n , skb ) ;
2007-04-10 18:30:09 -07:00
off = newheadroom - oldheadroom ;
2007-09-16 16:32:11 -07:00
n - > csum_start + = off ;
# ifdef NET_SKBUFF_DATA_USES_OFFSET
2007-04-10 18:30:09 -07:00
n - > transport_header + = off ;
n - > network_header + = off ;
n - > mac_header + = off ;
2007-09-16 16:32:11 -07:00
# endif
2007-04-10 18:30:09 -07:00
2005-04-16 15:20:36 -07:00
return n ;
}
/**
* skb_pad - zero pad the tail of an skb
* @ skb : buffer to pad
* @ pad : space to pad
*
* Ensure that a buffer is followed by a padding area that is zero
* filled . Used by network drivers which may DMA or transfer data
* beyond the buffer end onto the wire .
*
2006-06-23 02:06:41 -07:00
* May return error in out of memory cases . The skb is freed on error .
2005-04-16 15:20:36 -07:00
*/
2007-02-09 23:24:36 +09:00
2006-06-23 02:06:41 -07:00
int skb_pad ( struct sk_buff * skb , int pad )
2005-04-16 15:20:36 -07:00
{
2006-06-23 02:06:41 -07:00
int err ;
int ntail ;
2007-02-09 23:24:36 +09:00
2005-04-16 15:20:36 -07:00
/* If the skbuff is non linear tailroom is always zero.. */
2006-06-23 02:06:41 -07:00
if ( ! skb_cloned ( skb ) & & skb_tailroom ( skb ) > = pad ) {
2005-04-16 15:20:36 -07:00
memset ( skb - > data + skb - > len , 0 , pad ) ;
2006-06-23 02:06:41 -07:00
return 0 ;
2005-04-16 15:20:36 -07:00
}
2006-06-23 02:06:41 -07:00
2007-04-19 20:43:29 -07:00
ntail = skb - > data_len + pad - ( skb - > end - skb - > tail ) ;
2006-06-23 02:06:41 -07:00
if ( likely ( skb_cloned ( skb ) | | ntail > 0 ) ) {
err = pskb_expand_head ( skb , 0 , ntail , GFP_ATOMIC ) ;
if ( unlikely ( err ) )
goto free_skb ;
}
/* FIXME: The use of this function with non-linear skb's really needs
* to be audited .
*/
err = skb_linearize ( skb ) ;
if ( unlikely ( err ) )
goto free_skb ;
memset ( skb - > data + skb - > len , 0 , pad ) ;
return 0 ;
free_skb :
2005-04-16 15:20:36 -07:00
kfree_skb ( skb ) ;
2006-06-23 02:06:41 -07:00
return err ;
2007-02-09 23:24:36 +09:00
}
2006-06-09 16:13:38 -07:00
/* Trims skb to length len. It can change skb pointers.
 *
 * Walks the page fragments and then the fragment list, keeping
 * everything that lies below len, shortening the fragment in which len
 * falls and dropping everything behind it. Returns 0 on success, the
 * error from pskb_expand_head() or pskb_trim() if one of them fails, or
 * -ENOMEM if a shared fragment cannot be cloned.
2005-04-16 15:20:36 -07:00
*/
2006-06-09 16:13:38 -07:00
int ___pskb_trim ( struct sk_buff * skb , unsigned int len )
2005-04-16 15:20:36 -07:00
{
2006-07-13 19:26:39 -07:00
struct sk_buff * * fragp ;
struct sk_buff * frag ;
2005-04-16 15:20:36 -07:00
/* offset: number of skb bytes in front of the fragment being examined;
 * it starts out as the length of the linear part. */
int offset = skb_headlen ( skb ) ;
int nfrags = skb_shinfo ( skb ) - > nr_frags ;
int i ;
2006-07-13 19:26:39 -07:00
int err ;
/* A cloned skb gets its own copy of the data area first. */
if ( skb_cloned ( skb ) & &
unlikely ( ( err = pskb_expand_head ( skb , 0 , 0 , GFP_ATOMIC ) ) ) )
return err ;
2005-04-16 15:20:36 -07:00
2006-07-30 20:20:28 -07:00
i = 0 ;
/* len ends inside the linear part: every page fragment goes. */
if ( offset > = len )
goto drop_pages ;
for ( ; i < nfrags ; i + + ) {
2005-04-16 15:20:36 -07:00
int end = offset + skb_shinfo ( skb ) - > frags [ i ] . size ;
2006-07-13 19:26:39 -07:00
/* Fragment lies entirely below len: keep it untouched. */
if ( end < len ) {
offset = end ;
continue ;
}
2006-07-30 20:20:28 -07:00
/* len falls into this fragment: shorten it, it becomes the last one. */
skb_shinfo ( skb ) - > frags [ i + + ] . size = len - offset ;
2006-07-13 19:26:39 -07:00
2006-07-30 20:20:28 -07:00
/* Entered from the loop above or by the goto before it: release the
 * pages of fragments i..nfrags-1 and the whole fragment list. */
drop_pages :
2006-07-13 19:26:39 -07:00
skb_shinfo ( skb ) - > nr_frags = i ;
for ( ; i < nfrags ; i + + )
put_page ( skb_shinfo ( skb ) - > frags [ i ] . page ) ;
if ( skb_shinfo ( skb ) - > frag_list )
skb_drop_fraglist ( skb ) ;
2006-07-30 20:20:28 -07:00
goto done ;
2006-07-13 19:26:39 -07:00
}
/* Reached only when all page fragments lie below len: continue the same
 * walk over the fragment list. */
for ( fragp = & skb_shinfo ( skb ) - > frag_list ; ( frag = * fragp ) ;
fragp = & frag - > next ) {
int end = offset + frag - > len ;
/* A shared list member is replaced in the list by a clone, and the
 * reference held on the original is dropped. */
if ( skb_shared ( frag ) ) {
struct sk_buff * nfrag ;
nfrag = skb_clone ( frag , GFP_ATOMIC ) ;
if ( unlikely ( ! nfrag ) )
return - ENOMEM ;
nfrag - > next = frag - > next ;
2006-07-30 20:20:28 -07:00
kfree_skb ( frag ) ;
2006-07-13 19:26:39 -07:00
frag = nfrag ;
* fragp = frag ;
2005-04-16 15:20:36 -07:00
}
2006-07-13 19:26:39 -07:00
if ( end < len ) {
offset = end ;
continue ;
}
/* len falls into this member: trim it if needed and cut the list. */
if ( end > len & &
unlikely ( ( err = pskb_trim ( frag , len - offset ) ) ) )
return err ;
if ( frag - > next )
skb_drop_list ( & frag - > next ) ;
break ;
2005-04-16 15:20:36 -07:00
}
2006-07-30 20:20:28 -07:00
done :
2006-07-13 19:26:39 -07:00
/* Fix up the length fields; when len ends inside the linear part there
 * is no non-linear data left and the tail pointer moves to len. */
if ( len > skb_headlen ( skb ) ) {
2005-04-16 15:20:36 -07:00
skb - > data_len - = skb - > len - len ;
skb - > len = len ;
} else {
2006-07-13 19:26:39 -07:00
skb - > len = len ;
skb - > data_len = 0 ;
2007-04-19 20:29:13 -07:00
skb_set_tail_pointer ( skb , len ) ;
2005-04-16 15:20:36 -07:00
}
return 0 ;
}
/**
 * __pskb_pull_tail - advance tail of skb header
 * @skb: buffer to reallocate
 * @delta: number of bytes to advance tail
 *
 * The function makes sense only on a fragmented &sk_buff;
 * it expands the header, moving its tail forward and copying the
 * necessary data from the fragmented part.
 *
 * &sk_buff MUST have reference count of 1.
 *
 * Returns %NULL (and &sk_buff does not change) if pull failed
 * or value of new tail of skb in the case of success.
 *
 * All the pointers pointing into skb header may change and must be
 * reloaded after call to this function.
 */
/* Moves tail of skb head forward, copying data from fragmented part,
 * when it is necessary.
 * 1. It may fail due to malloc failure.
 * 2. It may change skb pointers.
 *
 * It is pretty complicated. Luckily, it is called only in exceptional cases.
 */
unsigned char * __pskb_pull_tail ( struct sk_buff * skb , int delta )
{
/* If skb has not enough free space at tail, get new one
 * plus 128 bytes for future expansions. If we have enough
 * room at tail, reallocate without expansion only if skb is cloned.
 */
2007-04-19 20:43:29 -07:00
/* eat > 0 here means: bytes by which tail + delta would run past end. */
int i , k , eat = ( skb - > tail + delta ) - skb - > end ;
2005-04-16 15:20:36 -07:00
if ( eat > 0 | | skb_cloned ( skb ) ) {
if ( pskb_expand_head ( skb , 0 , eat > 0 ? eat + 128 : 0 ,
GFP_ATOMIC ) )
return NULL ;
}
2007-04-19 20:29:13 -07:00
/* Copy the delta bytes that follow the linear part to the current tail. */
if ( skb_copy_bits ( skb , skb_headlen ( skb ) , skb_tail_pointer ( skb ) , delta ) )
2005-04-16 15:20:36 -07:00
BUG ( ) ;
/* Optimization: no fragments, no reasons to preestimate
 * size of pulled pages. Superb.
 */
if ( ! skb_shinfo ( skb ) - > frag_list )
goto pull_pages ;
/* Estimate size of pulled pages. */
eat = delta ;
for ( i = 0 ; i < skb_shinfo ( skb ) - > nr_frags ; i + + ) {
if ( skb_shinfo ( skb ) - > frags [ i ] . size > = eat )
goto pull_pages ;
eat - = skb_shinfo ( skb ) - > frags [ i ] . size ;
}
/* If we need update frag list, we are in troubles.
 * Certainly, it possible to add an offset to skb data,
 * but taking into account that pulling is expected to
 * be very rare operation, it is worth to fight against
 * further bloating skb head and crucify ourselves here instead.
 * Pure masochism, indeed. 8)8)
 */
/* From here on eat is the number of bytes that have to come out of the
 * fragment list, i.e. what the page fragments could not cover. */
if ( eat ) {
struct sk_buff * list = skb_shinfo ( skb ) - > frag_list ;
struct sk_buff * clone = NULL ;
/* insp: first list member that stays in the list after the pull. */
struct sk_buff * insp = NULL ;
do {
2006-01-08 22:24:28 -08:00
BUG_ON ( ! list ) ;
2005-04-16 15:20:36 -07:00
if ( list - > len < = eat ) {
/* Eaten as whole. */
eat - = list - > len ;
list = list - > next ;
insp = list ;
} else {
/* Eaten partially. */
if ( skb_shared ( list ) ) {
/* Sucks! We need to fork list. :-( */
clone = skb_clone ( list , GFP_ATOMIC ) ;
if ( ! clone )
return NULL ;
insp = list - > next ;
list = clone ;
} else {
/* This may be pulled without
 * problems. */
insp = list ;
}
if ( ! pskb_pull ( list , eat ) ) {
if ( clone )
kfree_skb ( clone ) ;
return NULL ;
}
break ;
}
} while ( eat ) ;
/* Free pulled out fragments. */
while ( ( list = skb_shinfo ( skb ) - > frag_list ) ! = insp ) {
skb_shinfo ( skb ) - > frag_list = list - > next ;
kfree_skb ( list ) ;
}
/* And insert new clone at head. */
if ( clone ) {
clone - > next = list ;
skb_shinfo ( skb ) - > frag_list = clone ;
}
}
/* Success! Now we may commit changes to skb data. */
pull_pages :
eat = delta ;
/* k: write index used to compact the surviving page fragments. */
k = 0 ;
for ( i = 0 ; i < skb_shinfo ( skb ) - > nr_frags ; i + + ) {
if ( skb_shinfo ( skb ) - > frags [ i ] . size < = eat ) {
/* Fully consumed fragment: drop its page reference. */
put_page ( skb_shinfo ( skb ) - > frags [ i ] . page ) ;
eat - = skb_shinfo ( skb ) - > frags [ i ] . size ;
} else {
skb_shinfo ( skb ) - > frags [ k ] = skb_shinfo ( skb ) - > frags [ i ] ;
/* Partially consumed fragment: skip the bytes already pulled. */
if ( eat ) {
skb_shinfo ( skb ) - > frags [ k ] . page_offset + = eat ;
skb_shinfo ( skb ) - > frags [ k ] . size - = eat ;
eat = 0 ;
}
k + + ;
}
}
skb_shinfo ( skb ) - > nr_frags = k ;
skb - > tail + = delta ;
skb - > data_len - = delta ;
2007-04-19 20:29:13 -07:00
return skb_tail_pointer ( skb ) ;
2005-04-16 15:20:36 -07:00
}
/* Copy some data bits from skb to kernel buffer.
 *
 * Copies len bytes, starting at offset, from skb into the buffer at to.
 * The bytes are taken first from the linear part (offsets below
 * skb_headlen()), then from the page fragments, then - recursively -
 * from the members of the fragment list.
 *
 * Returns 0 once len bytes have been copied. Returns -EFAULT if
 * offset > skb->len - len, if a recursive call fails, or if the skb
 * runs out of data while bytes are still outstanding.
 */
int skb_copy_bits ( const struct sk_buff * skb , int offset , void * to , int len )
{
int i , copy ;
2007-04-27 15:21:23 -07:00
/* start (and end, below) bound the byte range of the skb that the piece
 * currently being examined covers; initially the end of the linear part. */
int start = skb_headlen ( skb ) ;
2005-04-16 15:20:36 -07:00
if ( offset > ( int ) skb - > len - len )
goto fault ;
/* Copy header. */
2007-04-27 15:21:23 -07:00
if ( ( copy = start - offset ) > 0 ) {
2005-04-16 15:20:36 -07:00
if ( copy > len )
copy = len ;
2007-03-27 18:55:52 -03:00
skb_copy_from_linear_data_offset ( skb , offset , to , copy ) ;
2005-04-16 15:20:36 -07:00
if ( ( len - = copy ) = = 0 )
return 0 ;
offset + = copy ;
to + = copy ;
}
/* Page fragments: map each one that overlaps the requested range. */
for ( i = 0 ; i < skb_shinfo ( skb ) - > nr_frags ; i + + ) {
2007-04-27 15:21:23 -07:00
int end ;
2005-04-16 15:20:36 -07:00
2007-04-27 15:21:23 -07:00
BUG_TRAP ( start < = offset + len ) ;
end = start + skb_shinfo ( skb ) - > frags [ i ] . size ;
2005-04-16 15:20:36 -07:00
if ( ( copy = end - offset ) > 0 ) {
u8 * vaddr ;
if ( copy > len )
copy = len ;
vaddr = kmap_skb_frag ( & skb_shinfo ( skb ) - > frags [ i ] ) ;
/* offset - start is the position inside this fragment. */
memcpy ( to ,
2007-04-27 15:21:23 -07:00
vaddr + skb_shinfo ( skb ) - > frags [ i ] . page_offset +
offset - start , copy ) ;
2005-04-16 15:20:36 -07:00
kunmap_skb_frag ( vaddr ) ;
if ( ( len - = copy ) = = 0 )
return 0 ;
offset + = copy ;
to + = copy ;
}
2007-04-27 15:21:23 -07:00
start = end ;
2005-04-16 15:20:36 -07:00
}
/* Fragment list: each member is itself an skb, so recurse into it with
 * the offset rebased to the start of that member. */
if ( skb_shinfo ( skb ) - > frag_list ) {
struct sk_buff * list = skb_shinfo ( skb ) - > frag_list ;
for ( ; list ; list = list - > next ) {
2007-04-27 15:21:23 -07:00
int end ;
BUG_TRAP ( start < = offset + len ) ;
2005-04-16 15:20:36 -07:00
2007-04-27 15:21:23 -07:00
end = start + list - > len ;
2005-04-16 15:20:36 -07:00
if ( ( copy = end - offset ) > 0 ) {
if ( copy > len )
copy = len ;
2007-04-27 15:21:23 -07:00
if ( skb_copy_bits ( list , offset - start ,
to , copy ) )
2005-04-16 15:20:36 -07:00
goto fault ;
if ( ( len - = copy ) = = 0 )
return 0 ;
offset + = copy ;
to + = copy ;
}
2007-04-27 15:21:23 -07:00
start = end ;
2005-04-16 15:20:36 -07:00
}
}
if ( ! len )
return 0 ;
fault :
return - EFAULT ;
}
2007-11-06 23:30:13 -08:00
/*
* Callback from splice_to_pipe ( ) , if we need to release some pages
* at the end of the spd in case we error ' ed out in filling the pipe .
*/
static void sock_spd_release ( struct splice_pipe_desc * spd , unsigned int i )
{
struct sk_buff * skb = ( struct sk_buff * ) spd - > partial [ i ] . private ;
kfree_skb ( skb ) ;
}
/*
* Fill page / offset / length into spd , if it can hold more pages .
*/
static inline int spd_fill_page ( struct splice_pipe_desc * spd , struct page * page ,
unsigned int len , unsigned int offset ,
struct sk_buff * skb )
{
if ( unlikely ( spd - > nr_pages = = PIPE_BUFFERS ) )
return 1 ;
spd - > pages [ spd - > nr_pages ] = page ;
spd - > partial [ spd - > nr_pages ] . len = len ;
spd - > partial [ spd - > nr_pages ] . offset = offset ;
spd - > partial [ spd - > nr_pages ] . private = ( unsigned long ) skb_get ( skb ) ;
spd - > nr_pages + + ;
return 0 ;
}
/*
 * Map linear and fragment data from the skb to spd.
 *
 * *offset is the number of bytes of the skb to skip, *total_len the
 * number of bytes still wanted. Returns 0 if at least one page was
 * added to spd by this call; *offset is then set to 0 and *total_len to
 * the number of bytes still outstanding. Returns 1 if nothing was added
 * (including the case *total_len == 0); *offset and *total_len are then
 * left untouched.
 */
static int __skb_splice_bits ( struct sk_buff * skb , unsigned int * offset ,
unsigned int * total_len ,
struct splice_pipe_desc * spd )
{
/* Page count on entry, used at the end to see whether we added any. */
unsigned int nr_pages = spd - > nr_pages ;
unsigned int poff , plen , len , toff , tlen ;
int headlen , seg ;
toff = * offset ;
tlen = * total_len ;
if ( ! tlen )
goto err ;
/*
 * if the offset is greater than the linear part, go directly to
 * the fragments.
 */
headlen = skb_headlen ( skb ) ;
if ( toff > = headlen ) {
toff - = headlen ;
goto map_frag ;
}
/*
 * first map the linear region into the pages/partial map, skipping
 * any potential initial offset.
 */
len = 0 ;
while ( len < headlen ) {
void * p = skb - > data + len ;
/* poff: offset of p within its page; plen: bytes of the linear part
 * left on that page. */
poff = ( unsigned long ) p & ( PAGE_SIZE - 1 ) ;
plen = min_t ( unsigned int , headlen - len , PAGE_SIZE - poff ) ;
len + = plen ;
/* Consume the initial offset: skip this piece entirely, or start
 * part-way into it. */
if ( toff ) {
if ( plen < = toff ) {
toff - = plen ;
continue ;
}
plen - = toff ;
poff + = toff ;
toff = 0 ;
}
plen = min ( plen , tlen ) ;
if ( ! plen )
break ;
/*
 * just jump directly to update and return, no point
 * in going over fragments when the output is full.
 */
if ( spd_fill_page ( spd , virt_to_page ( p ) , plen , poff , skb ) )
goto done ;
tlen - = plen ;
}
/*
 * then map the fragments
 */
map_frag :
for ( seg = 0 ; seg < skb_shinfo ( skb ) - > nr_frags ; seg + + ) {
const skb_frag_t * f = & skb_shinfo ( skb ) - > frags [ seg ] ;
plen = f - > size ;
poff = f - > page_offset ;
/* Same offset handling as for the linear part. */
if ( toff ) {
if ( plen < = toff ) {
toff - = plen ;
continue ;
}
plen - = toff ;
poff + = toff ;
toff = 0 ;
}
plen = min ( plen , tlen ) ;
if ( ! plen )
break ;
if ( spd_fill_page ( spd , f - > page , plen , poff , skb ) )
break ;
tlen - = plen ;
}
done :
/* Report progress only if this call added pages to spd. */
if ( spd - > nr_pages - nr_pages ) {
* offset = 0 ;
* total_len = tlen ;
return 0 ;
}
err :
return 1 ;
}
/*
* Map data from the skb to a pipe . Should handle both the linear part ,
* the fragments , and the frag list . It does NOT handle frag lists within
* the frag list , if such a thing exists . We ' d probably need to recurse to
* handle that cleanly .
*/
int skb_splice_bits ( struct sk_buff * __skb , unsigned int offset ,
struct pipe_inode_info * pipe , unsigned int tlen ,
unsigned int flags )
{
struct partial_page partial [ PIPE_BUFFERS ] ;
struct page * pages [ PIPE_BUFFERS ] ;
struct splice_pipe_desc spd = {
. pages = pages ,
. partial = partial ,
. flags = flags ,
. ops = & sock_pipe_buf_ops ,
. spd_release = sock_spd_release ,
} ;
struct sk_buff * skb ;
/*
* I ' d love to avoid the clone here , but tcp_read_sock ( )
* ignores reference counts and unconditonally kills the sk_buff
* on return from the actor .
*/
skb = skb_clone ( __skb , GFP_KERNEL ) ;
if ( unlikely ( ! skb ) )
return - ENOMEM ;
/*
* __skb_splice_bits ( ) only fails if the output has no room left ,
* so no point in going over the frag_list for the error case .
*/
if ( __skb_splice_bits ( skb , & offset , & tlen , & spd ) )
goto done ;
else if ( ! tlen )
goto done ;
/*
* now see if we have a frag_list to map
*/
if ( skb_shinfo ( skb ) - > frag_list ) {
struct sk_buff * list = skb_shinfo ( skb ) - > frag_list ;
for ( ; list & & tlen ; list = list - > next ) {
if ( __skb_splice_bits ( list , & offset , & tlen , & spd ) )
break ;
}
}
done :
/*
* drop our reference to the clone , the pipe consumption will
* drop the rest .
*/
kfree_skb ( skb ) ;
if ( spd . nr_pages ) {
int ret ;
/*
* Drop the socket lock , otherwise we have reverse
* locking dependencies between sk_lock and i_mutex
* here as compared to sendfile ( ) . We enter here
* with the socket lock held , and splice_to_pipe ( ) will
* grab the pipe inode lock . For sendfile ( ) emulation ,
* we call into - > sendpage ( ) with the i_mutex lock held
* and networking will grab the socket lock .
*/
release_sock ( __skb - > sk ) ;
ret = splice_to_pipe ( pipe , & spd ) ;
lock_sock ( __skb - > sk ) ;
return ret ;
}
return 0 ;
}
2005-04-19 22:30:14 -07:00
/**
 * skb_store_bits - store bits from kernel buffer to skb
 * @skb: destination buffer
 * @offset: offset in destination
 * @from: source buffer
 * @len: number of bytes to copy
 *
 * Copy the specified number of bytes from the source buffer to the
 * destination skb. This function handles all the messy bits of
 * traversing fragment lists and such.
 *
 * Returns 0 once @len bytes have been stored. Returns -EFAULT if
 * @offset > skb->len - @len, if a recursive call on a fragment list
 * member fails, or if the skb ends while bytes are still outstanding.
 */
2007-04-20 16:40:01 -07:00
int skb_store_bits ( struct sk_buff * skb , int offset , const void * from , int len )
2005-04-19 22:30:14 -07:00
{
int i , copy ;
2007-04-27 15:21:23 -07:00
/* start (and end, below) bound the byte range of the skb that the piece
 * currently being examined covers; initially the end of the linear part. */
int start = skb_headlen ( skb ) ;
2005-04-19 22:30:14 -07:00
if ( offset > ( int ) skb - > len - len )
goto fault ;
2007-04-27 15:21:23 -07:00
/* Linear part. */
if ( ( copy = start - offset ) > 0 ) {
2005-04-19 22:30:14 -07:00
if ( copy > len )
copy = len ;
2007-03-31 11:55:19 -03:00
skb_copy_to_linear_data_offset ( skb , offset , from , copy ) ;
2005-04-19 22:30:14 -07:00
if ( ( len - = copy ) = = 0 )
return 0 ;
offset + = copy ;
from + = copy ;
}
/* Page fragments: map each one that overlaps the target range. */
for ( i = 0 ; i < skb_shinfo ( skb ) - > nr_frags ; i + + ) {
skb_frag_t * frag = & skb_shinfo ( skb ) - > frags [ i ] ;
2007-04-27 15:21:23 -07:00
int end ;
BUG_TRAP ( start < = offset + len ) ;
2005-04-19 22:30:14 -07:00
2007-04-27 15:21:23 -07:00
end = start + frag - > size ;
2005-04-19 22:30:14 -07:00
if ( ( copy = end - offset ) > 0 ) {
u8 * vaddr ;
if ( copy > len )
copy = len ;
vaddr = kmap_skb_frag ( frag ) ;
2007-04-27 15:21:23 -07:00
/* offset - start is the position inside this fragment. */
memcpy ( vaddr + frag - > page_offset + offset - start ,
from , copy ) ;
2005-04-19 22:30:14 -07:00
kunmap_skb_frag ( vaddr ) ;
if ( ( len - = copy ) = = 0 )
return 0 ;
offset + = copy ;
from + = copy ;
}
2007-04-27 15:21:23 -07:00
start = end ;
2005-04-19 22:30:14 -07:00
}
/* Fragment list: recurse into each member with the offset rebased to
 * the start of that member. */
if ( skb_shinfo ( skb ) - > frag_list ) {
struct sk_buff * list = skb_shinfo ( skb ) - > frag_list ;
for ( ; list ; list = list - > next ) {
2007-04-27 15:21:23 -07:00
int end ;
2005-04-19 22:30:14 -07:00
2007-04-27 15:21:23 -07:00
BUG_TRAP ( start < = offset + len ) ;
end = start + list - > len ;
2005-04-19 22:30:14 -07:00
if ( ( copy = end - offset ) > 0 ) {
if ( copy > len )
copy = len ;
2007-04-27 15:21:23 -07:00
if ( skb_store_bits ( list , offset - start ,
from , copy ) )
2005-04-19 22:30:14 -07:00
goto fault ;
if ( ( len - = copy ) = = 0 )
return 0 ;
offset + = copy ;
from + = copy ;
}
2007-04-27 15:21:23 -07:00
start = end ;
2005-04-19 22:30:14 -07:00
}
}
if ( ! len )
return 0 ;
fault :
return - EFAULT ;
}
EXPORT_SYMBOL ( skb_store_bits ) ;
2005-04-16 15:20:36 -07:00
/* Checksum skb data.
 *
 * Computes the checksum of len bytes of skb starting at offset, with
 * csum as the initial value, and returns the result. The linear part is
 * fed to csum_partial() with the running sum as seed; every page
 * fragment and fragment list member is summed on its own (seed 0) and
 * merged with csum_block_add() at position pos.
 *
 * Hits BUG_ON() if the skb ends before len bytes have been summed.
 */
2006-11-14 21:37:14 -08:00
__wsum skb_checksum ( const struct sk_buff * skb , int offset ,
int len , __wsum csum )
2005-04-16 15:20:36 -07:00
{
2007-04-27 15:21:23 -07:00
/* start (and end, below) bound the byte range of the skb that the piece
 * currently being examined covers; initially the end of the linear part. */
int start = skb_headlen ( skb ) ;
int i , copy = start - offset ;
2005-04-16 15:20:36 -07:00
/* pos: number of bytes summed so far, handed to csum_block_add(). */
int pos = 0 ;
/* Checksum header. */
if ( copy > 0 ) {
if ( copy > len )
copy = len ;
csum = csum_partial ( skb - > data + offset , copy , csum ) ;
if ( ( len - = copy ) = = 0 )
return csum ;
offset + = copy ;
pos = copy ;
}
/* Page fragments. */
for ( i = 0 ; i < skb_shinfo ( skb ) - > nr_frags ; i + + ) {
2007-04-27 15:21:23 -07:00
int end ;
BUG_TRAP ( start < = offset + len ) ;
2005-04-16 15:20:36 -07:00
2007-04-27 15:21:23 -07:00
end = start + skb_shinfo ( skb ) - > frags [ i ] . size ;
2005-04-16 15:20:36 -07:00
if ( ( copy = end - offset ) > 0 ) {
2006-11-14 21:36:14 -08:00
__wsum csum2 ;
2005-04-16 15:20:36 -07:00
u8 * vaddr ;
skb_frag_t * frag = & skb_shinfo ( skb ) - > frags [ i ] ;
if ( copy > len )
copy = len ;
vaddr = kmap_skb_frag ( frag ) ;
2007-04-27 15:21:23 -07:00
/* offset - start is the position inside this fragment. */
csum2 = csum_partial ( vaddr + frag - > page_offset +
offset - start , copy , 0 ) ;
2005-04-16 15:20:36 -07:00
kunmap_skb_frag ( vaddr ) ;
csum = csum_block_add ( csum , csum2 , pos ) ;
if ( ! ( len - = copy ) )
return csum ;
offset + = copy ;
pos + = copy ;
}
2007-04-27 15:21:23 -07:00
start = end ;
2005-04-16 15:20:36 -07:00
}
/* Fragment list: recurse into each member with the offset rebased to
 * the start of that member. */
if ( skb_shinfo ( skb ) - > frag_list ) {
struct sk_buff * list = skb_shinfo ( skb ) - > frag_list ;
for ( ; list ; list = list - > next ) {
2007-04-27 15:21:23 -07:00
int end ;
2005-04-16 15:20:36 -07:00
2007-04-27 15:21:23 -07:00
BUG_TRAP ( start < = offset + len ) ;
end = start + list - > len ;
2005-04-16 15:20:36 -07:00
if ( ( copy = end - offset ) > 0 ) {
2006-11-14 21:36:54 -08:00
__wsum csum2 ;
2005-04-16 15:20:36 -07:00
if ( copy > len )
copy = len ;
2007-04-27 15:21:23 -07:00
csum2 = skb_checksum ( list , offset - start ,
copy , 0 ) ;
2005-04-16 15:20:36 -07:00
csum = csum_block_add ( csum , csum2 , pos ) ;
if ( ( len - = copy ) = = 0 )
return csum ;
offset + = copy ;
pos + = copy ;
}
2007-04-27 15:21:23 -07:00
start = end ;
2005-04-16 15:20:36 -07:00
}
}
2006-01-08 22:24:28 -08:00
/* Running out of skb with bytes still to sum is treated as a bug. */
BUG_ON ( len ) ;
2005-04-16 15:20:36 -07:00
return csum ;
}
/* Both of above in one bottle.
 *
 * Copies len bytes of skb starting at offset into the buffer at to and
 * checksums them on the way, with csum as the initial value. Returns
 * the resulting checksum. The walk is the same as in skb_checksum():
 * linear part, page fragments, then the fragment list (recursively);
 * the per-piece sums are merged with csum_block_add() at position pos.
 *
 * Hits BUG_ON() if the skb ends before len bytes have been processed.
 */
2006-11-14 21:37:33 -08:00
__wsum skb_copy_and_csum_bits ( const struct sk_buff * skb , int offset ,
u8 * to , int len , __wsum csum )
2005-04-16 15:20:36 -07:00
{
2007-04-27 15:21:23 -07:00
/* start (and end, below) bound the byte range of the skb that the piece
 * currently being examined covers; initially the end of the linear part. */
int start = skb_headlen ( skb ) ;
int i , copy = start - offset ;
2005-04-16 15:20:36 -07:00
/* pos: number of bytes processed so far, handed to csum_block_add(). */
int pos = 0 ;
/* Copy header. */
if ( copy > 0 ) {
if ( copy > len )
copy = len ;
csum = csum_partial_copy_nocheck ( skb - > data + offset , to ,
copy , csum ) ;
if ( ( len - = copy ) = = 0 )
return csum ;
offset + = copy ;
to + = copy ;
pos = copy ;
}
/* Page fragments. */
for ( i = 0 ; i < skb_shinfo ( skb ) - > nr_frags ; i + + ) {
2007-04-27 15:21:23 -07:00
int end ;
BUG_TRAP ( start < = offset + len ) ;
2005-04-16 15:20:36 -07:00
2007-04-27 15:21:23 -07:00
end = start + skb_shinfo ( skb ) - > frags [ i ] . size ;
2005-04-16 15:20:36 -07:00
if ( ( copy = end - offset ) > 0 ) {
2006-11-14 21:36:34 -08:00
__wsum csum2 ;
2005-04-16 15:20:36 -07:00
u8 * vaddr ;
skb_frag_t * frag = & skb_shinfo ( skb ) - > frags [ i ] ;
if ( copy > len )
copy = len ;
vaddr = kmap_skb_frag ( frag ) ;
/* offset - start is the position inside this fragment. */
csum2 = csum_partial_copy_nocheck ( vaddr +
2007-04-27 15:21:23 -07:00
frag - > page_offset +
offset - start , to ,
copy , 0 ) ;
2005-04-16 15:20:36 -07:00
kunmap_skb_frag ( vaddr ) ;
csum = csum_block_add ( csum , csum2 , pos ) ;
if ( ! ( len - = copy ) )
return csum ;
offset + = copy ;
to + = copy ;
pos + = copy ;
}
2007-04-27 15:21:23 -07:00
start = end ;
2005-04-16 15:20:36 -07:00
}
/* Fragment list: recurse into each member with the offset rebased to
 * the start of that member. */
if ( skb_shinfo ( skb ) - > frag_list ) {
struct sk_buff * list = skb_shinfo ( skb ) - > frag_list ;
for ( ; list ; list = list - > next ) {
2006-11-14 21:37:33 -08:00
__wsum csum2 ;
2007-04-27 15:21:23 -07:00
int end ;
BUG_TRAP ( start < = offset + len ) ;
2005-04-16 15:20:36 -07:00
2007-04-27 15:21:23 -07:00
end = start + list - > len ;
2005-04-16 15:20:36 -07:00
if ( ( copy = end - offset ) > 0 ) {
if ( copy > len )
copy = len ;
2007-04-27 15:21:23 -07:00
csum2 = skb_copy_and_csum_bits ( list ,
offset - start ,
2005-04-16 15:20:36 -07:00
to , copy , 0 ) ;
csum = csum_block_add ( csum , csum2 , pos ) ;
if ( ( len - = copy ) = = 0 )
return csum ;
offset + = copy ;
to + = copy ;
pos + = copy ;
}
2007-04-27 15:21:23 -07:00
start = end ;
2005-04-16 15:20:36 -07:00
}
}
2006-01-08 22:24:28 -08:00
/* Running out of skb with bytes still to process is treated as a bug. */
BUG_ON ( len ) ;
2005-04-16 15:20:36 -07:00
return csum ;
}
/*
 * skb_copy_and_csum_dev - copy a whole packet into @to, finishing its
 * checksum on the way.
 *
 * The bytes in front of the checksum start are copied verbatim; the rest
 * is copied and summed in one pass.  For CHECKSUM_PARTIAL packets the
 * folded sum is then stored in the copy at csum_offset past the checksum
 * start.
 */
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
{
	__wsum csum = 0;
	long csstart;

	/* Without a pending checksum the whole linear head is plain data. */
	if (skb->ip_summed != CHECKSUM_PARTIAL)
		csstart = skb_headlen(skb);
	else
		csstart = skb->csum_start - skb_headroom(skb);

	BUG_ON(csstart > skb_headlen(skb));

	/* Unsummed prefix: straight copy. */
	skb_copy_from_linear_data(skb, to, csstart);

	/* Remainder, if any: copy and checksum together. */
	if (csstart != skb->len)
		csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
					      skb->len - csstart, 0);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		long csstuff = csstart + skb->csum_offset;

		*((__sum16 *)(to + csstuff)) = csum_fold(csum);
	}
}
/**
* skb_dequeue - remove from the head of the queue
* @ list : list to dequeue from
*
* Remove the head of the list . The list lock is taken so the function
* may be used safely with other locking list functions . The head item is
* returned or % NULL if the list is empty .
*/
struct sk_buff * skb_dequeue ( struct sk_buff_head * list )
{
unsigned long flags ;
struct sk_buff * result ;
spin_lock_irqsave ( & list - > lock , flags ) ;
result = __skb_dequeue ( list ) ;
spin_unlock_irqrestore ( & list - > lock , flags ) ;
return result ;
}
/**
* skb_dequeue_tail - remove from the tail of the queue
* @ list : list to dequeue from
*
* Remove the tail of the list . The list lock is taken so the function
* may be used safely with other locking list functions . The tail item is
* returned or % NULL if the list is empty .
*/
struct sk_buff * skb_dequeue_tail ( struct sk_buff_head * list )
{
unsigned long flags ;
struct sk_buff * result ;
spin_lock_irqsave ( & list - > lock , flags ) ;
result = __skb_dequeue_tail ( list ) ;
spin_unlock_irqrestore ( & list - > lock , flags ) ;
return result ;
}
/**
* skb_queue_purge - empty a list
* @ list : list to empty
*
* Delete all buffers on an & sk_buff list . Each buffer is removed from
* the list and one reference dropped . This function takes the list
* lock and is atomic with respect to other list locking functions .
*/
void skb_queue_purge ( struct sk_buff_head * list )
{
struct sk_buff * skb ;
while ( ( skb = skb_dequeue ( list ) ) ! = NULL )
kfree_skb ( skb ) ;
}
/**
* skb_queue_head - queue a buffer at the list head
* @ list : list to use
* @ newsk : buffer to queue
*
* Queue a buffer at the start of the list . This function takes the
* list lock and can be used safely with other locking & sk_buff functions
* safely .
*
* A buffer cannot be placed on two lists at the same time .
*/
void skb_queue_head ( struct sk_buff_head * list , struct sk_buff * newsk )
{
unsigned long flags ;
spin_lock_irqsave ( & list - > lock , flags ) ;
__skb_queue_head ( list , newsk ) ;
spin_unlock_irqrestore ( & list - > lock , flags ) ;
}
/**
* skb_queue_tail - queue a buffer at the list tail
* @ list : list to use
* @ newsk : buffer to queue
*
* Queue a buffer at the tail of the list . This function takes the
* list lock and can be used safely with other locking & sk_buff functions
* safely .
*
* A buffer cannot be placed on two lists at the same time .
*/
void skb_queue_tail ( struct sk_buff_head * list , struct sk_buff * newsk )
{
unsigned long flags ;
spin_lock_irqsave ( & list - > lock , flags ) ;
__skb_queue_tail ( list , newsk ) ;
spin_unlock_irqrestore ( & list - > lock , flags ) ;
}
2005-08-09 19:25:21 -07:00
2005-04-16 15:20:36 -07:00
/**
* skb_unlink - remove a buffer from a list
* @ skb : buffer to remove
2005-08-09 19:25:21 -07:00
* @ list : list to use
2005-04-16 15:20:36 -07:00
*
2005-08-09 19:25:21 -07:00
* Remove a packet from a list . The list locks are taken and this
* function is atomic with respect to other list locked calls
2005-04-16 15:20:36 -07:00
*
2005-08-09 19:25:21 -07:00
* You must know what list the SKB is on .
2005-04-16 15:20:36 -07:00
*/
2005-08-09 19:25:21 -07:00
void skb_unlink ( struct sk_buff * skb , struct sk_buff_head * list )
2005-04-16 15:20:36 -07:00
{
2005-08-09 19:25:21 -07:00
unsigned long flags ;
2005-04-16 15:20:36 -07:00
2005-08-09 19:25:21 -07:00
spin_lock_irqsave ( & list - > lock , flags ) ;
__skb_unlink ( skb , list ) ;
spin_unlock_irqrestore ( & list - > lock , flags ) ;
2005-04-16 15:20:36 -07:00
}
/**
* skb_append - append a buffer
* @ old : buffer to insert after
* @ newsk : buffer to insert
2005-08-09 19:25:21 -07:00
* @ list : list to use
2005-04-16 15:20:36 -07:00
*
* Place a packet after a given packet in a list . The list locks are taken
* and this function is atomic with respect to other list locked calls .
* A buffer cannot be placed on two lists at the same time .
*/
2005-08-09 19:25:21 -07:00
void skb_append ( struct sk_buff * old , struct sk_buff * newsk , struct sk_buff_head * list )
2005-04-16 15:20:36 -07:00
{
unsigned long flags ;
2005-08-09 19:25:21 -07:00
spin_lock_irqsave ( & list - > lock , flags ) ;
__skb_append ( old , newsk , list ) ;
spin_unlock_irqrestore ( & list - > lock , flags ) ;
2005-04-16 15:20:36 -07:00
}
/**
* skb_insert - insert a buffer
* @ old : buffer to insert before
* @ newsk : buffer to insert
2005-08-09 19:25:21 -07:00
* @ list : list to use
*
* Place a packet before a given packet in a list . The list locks are
* taken and this function is atomic with respect to other list locked
* calls .
2005-04-16 15:20:36 -07:00
*
* A buffer cannot be placed on two lists at the same time .
*/
2005-08-09 19:25:21 -07:00
void skb_insert ( struct sk_buff * old , struct sk_buff * newsk , struct sk_buff_head * list )
2005-04-16 15:20:36 -07:00
{
unsigned long flags ;
2005-08-09 19:25:21 -07:00
spin_lock_irqsave ( & list - > lock , flags ) ;
__skb_insert ( newsk , old - > prev , old , list ) ;
spin_unlock_irqrestore ( & list - > lock , flags ) ;
2005-04-16 15:20:36 -07:00
}
static inline void skb_split_inside_header ( struct sk_buff * skb ,
struct sk_buff * skb1 ,
const u32 len , const int pos )
{
int i ;
2007-03-27 18:55:52 -03:00
skb_copy_from_linear_data_offset ( skb , len , skb_put ( skb1 , pos - len ) ,
pos - len ) ;
2005-04-16 15:20:36 -07:00
/* And move data appendix as is. */
for ( i = 0 ; i < skb_shinfo ( skb ) - > nr_frags ; i + + )
skb_shinfo ( skb1 ) - > frags [ i ] = skb_shinfo ( skb ) - > frags [ i ] ;
skb_shinfo ( skb1 ) - > nr_frags = skb_shinfo ( skb ) - > nr_frags ;
skb_shinfo ( skb ) - > nr_frags = 0 ;
skb1 - > data_len = skb - > data_len ;
skb1 - > len + = skb1 - > data_len ;
skb - > data_len = 0 ;
skb - > len = len ;
2007-04-19 20:29:13 -07:00
skb_set_tail_pointer ( skb , len ) ;
2005-04-16 15:20:36 -07:00
}
static inline void skb_split_no_header ( struct sk_buff * skb ,
struct sk_buff * skb1 ,
const u32 len , int pos )
{
int i , k = 0 ;
const int nfrags = skb_shinfo ( skb ) - > nr_frags ;
skb_shinfo ( skb ) - > nr_frags = 0 ;
skb1 - > len = skb1 - > data_len = skb - > len - len ;
skb - > len = len ;
skb - > data_len = len - pos ;
for ( i = 0 ; i < nfrags ; i + + ) {
int size = skb_shinfo ( skb ) - > frags [ i ] . size ;
if ( pos + size > len ) {
skb_shinfo ( skb1 ) - > frags [ k ] = skb_shinfo ( skb ) - > frags [ i ] ;
if ( pos < len ) {
/* Split frag.
* We have two variants in this case :
* 1. Move all the frag to the second
* part , if it is possible . F . e .
* this approach is mandatory for TUX ,
* where splitting is expensive .
* 2. Split is accurately . We make this .
*/
get_page ( skb_shinfo ( skb ) - > frags [ i ] . page ) ;
skb_shinfo ( skb1 ) - > frags [ 0 ] . page_offset + = len - pos ;
skb_shinfo ( skb1 ) - > frags [ 0 ] . size - = len - pos ;
skb_shinfo ( skb ) - > frags [ i ] . size = len - pos ;
skb_shinfo ( skb ) - > nr_frags + + ;
}
k + + ;
} else
skb_shinfo ( skb ) - > nr_frags + + ;
pos + = size ;
}
skb_shinfo ( skb1 ) - > nr_frags = k ;
}
/**
 * skb_split - split a fragmented skb into two parts at length @len
 * @skb: the buffer to split; keeps the first @len bytes
 * @skb1: the buffer that receives everything after @len
 * @len: new length for @skb
 */
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
{
	int pos = skb_headlen(skb);

	if (len < pos) {
		/* Split point is inside the linear header. */
		skb_split_inside_header(skb, skb1, len, pos);
		return;
	}

	/* Second part has no linear data, nothing needs copying. */
	skb_split_no_header(skb, skb1, len, pos);
}
2005-06-23 20:59:51 -07:00
/**
* skb_prepare_seq_read - Prepare a sequential read of skb data
* @ skb : the buffer to read
* @ from : lower offset of data to be read
* @ to : upper offset of data to be read
* @ st : state variable
*
* Initializes the specified state variable . Must be called before
* invoking skb_seq_read ( ) for the first time .
*/
void skb_prepare_seq_read ( struct sk_buff * skb , unsigned int from ,
unsigned int to , struct skb_seq_state * st )
{
st - > lower_offset = from ;
st - > upper_offset = to ;
st - > root_skb = st - > cur_skb = skb ;
st - > frag_idx = st - > stepped_offset = 0 ;
st - > frag_data = NULL ;
}
/**
* skb_seq_read - Sequentially read skb data
* @ consumed : number of bytes consumed by the caller so far
* @ data : destination pointer for data to be returned
* @ st : state variable
*
* Reads a block of skb data at & consumed relative to the
* lower offset specified to skb_prepare_seq_read ( ) . Assigns
* the head of the data block to & data and returns the length
* of the block or 0 if the end of the skb data or the upper
* offset has been reached .
*
* The caller is not required to consume all of the data
* returned , i . e . & consumed is typically set to the number
* of bytes already consumed and the next call to
* skb_seq_read ( ) will return the remaining part of the block .
*
* Note : The size of each block of data returned can be arbitary ,
* this limitation is the cost for zerocopy seqeuental
* reads of potentially non linear data .
*
* Note : Fragment lists within fragments are not implemented
* at the moment , state - > root_skb could be replaced with
* a stack for this purpose .
*/
unsigned int skb_seq_read ( unsigned int consumed , const u8 * * data ,
struct skb_seq_state * st )
{
unsigned int block_limit , abs_offset = consumed + st - > lower_offset ;
skb_frag_t * frag ;
if ( unlikely ( abs_offset > = st - > upper_offset ) )
return 0 ;
next_skb :
block_limit = skb_headlen ( st - > cur_skb ) ;
if ( abs_offset < block_limit ) {
* data = st - > cur_skb - > data + abs_offset ;
return block_limit - abs_offset ;
}
if ( st - > frag_idx = = 0 & & ! st - > frag_data )
st - > stepped_offset + = skb_headlen ( st - > cur_skb ) ;
while ( st - > frag_idx < skb_shinfo ( st - > cur_skb ) - > nr_frags ) {
frag = & skb_shinfo ( st - > cur_skb ) - > frags [ st - > frag_idx ] ;
block_limit = frag - > size + st - > stepped_offset ;
if ( abs_offset < block_limit ) {
if ( ! st - > frag_data )
st - > frag_data = kmap_skb_frag ( frag ) ;
* data = ( u8 * ) st - > frag_data + frag - > page_offset +
( abs_offset - st - > stepped_offset ) ;
return block_limit - abs_offset ;
}
if ( st - > frag_data ) {
kunmap_skb_frag ( st - > frag_data ) ;
st - > frag_data = NULL ;
}
st - > frag_idx + + ;
st - > stepped_offset + = frag - > size ;
}
2007-06-23 23:11:52 -07:00
if ( st - > frag_data ) {
kunmap_skb_frag ( st - > frag_data ) ;
st - > frag_data = NULL ;
}
2005-06-23 20:59:51 -07:00
if ( st - > cur_skb - > next ) {
st - > cur_skb = st - > cur_skb - > next ;
st - > frag_idx = 0 ;
goto next_skb ;
} else if ( st - > root_skb = = st - > cur_skb & &
skb_shinfo ( st - > root_skb ) - > frag_list ) {
st - > cur_skb = skb_shinfo ( st - > root_skb ) - > frag_list ;
goto next_skb ;
}
return 0 ;
}
/**
* skb_abort_seq_read - Abort a sequential read of skb data
* @ st : state variable
*
* Must be called if skb_seq_read ( ) was not called until it
* returned 0.
*/
void skb_abort_seq_read ( struct skb_seq_state * st )
{
if ( st - > frag_data )
kunmap_skb_frag ( st - > frag_data ) ;
}
2005-06-23 21:00:17 -07:00
/* View the control buffer ( cb ) of a textsearch state as a struct skb_seq_state. */
# define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
/*
 * get_next_block callback installed by skb_find_text(): hands out the
 * next block of skb data via skb_seq_read(), using the sequential-read
 * state kept in the textsearch state's control buffer.  @conf is unused.
 */
static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
					  struct ts_config *conf,
					  struct ts_state *state)
{
	struct skb_seq_state *st = TS_SKB_CB(state);

	return skb_seq_read(offset, text, st);
}
/*
 * finish callback installed by skb_find_text(): aborts the sequential
 * read kept in the textsearch state's control buffer.  @conf is unused.
 */
static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
{
	struct skb_seq_state *st = TS_SKB_CB(state);

	skb_abort_seq_read(st);
}
/**
* skb_find_text - Find a text pattern in skb data
* @ skb : the buffer to look in
* @ from : search offset
* @ to : search limit
* @ config : textsearch configuration
* @ state : uninitialized textsearch state variable
*
* Finds a pattern in the skb data according to the specified
* textsearch configuration . Use textsearch_next ( ) to retrieve
* subsequent occurrences of the pattern . Returns the offset
* to the first occurrence or UINT_MAX if no match was found .
*/
unsigned int skb_find_text ( struct sk_buff * skb , unsigned int from ,
unsigned int to , struct ts_config * config ,
struct ts_state * state )
{
2006-06-26 00:00:57 -07:00
unsigned int ret ;
2005-06-23 21:00:17 -07:00
config - > get_next_block = skb_ts_get_next_block ;
config - > finish = skb_ts_finish ;
skb_prepare_seq_read ( skb , from , to , TS_SKB_CB ( state ) ) ;
2006-06-26 00:00:57 -07:00
ret = textsearch_find ( config , state ) ;
return ( ret < = to - from ? ret : UINT_MAX ) ;
2005-06-23 21:00:17 -07:00
}
2005-10-18 15:46:41 -07:00
/**
* skb_append_datato_frags : - append the user data to a skb
* @ sk : sock structure
* @ skb : skb structure to be appened with user data .
* @ getfrag : call back function to be used for getting the user data
* @ from : pointer to user message iov
* @ length : length of the iov message
*
* Description : This procedure append the user data in the fragment part
* of the skb if any page alloc fails user this procedure returns - ENOMEM
*/
int skb_append_datato_frags ( struct sock * sk , struct sk_buff * skb ,
2005-12-05 13:40:12 -08:00
int ( * getfrag ) ( void * from , char * to , int offset ,
2005-10-18 15:46:41 -07:00
int len , int odd , struct sk_buff * skb ) ,
void * from , int length )
{
int frg_cnt = 0 ;
skb_frag_t * frag = NULL ;
struct page * page = NULL ;
int copy , left ;
int offset = 0 ;
int ret ;
do {
/* Return error if we don't have space for new frag */
frg_cnt = skb_shinfo ( skb ) - > nr_frags ;
if ( frg_cnt > = MAX_SKB_FRAGS )
return - EFAULT ;
/* allocate a new page for next frag */
page = alloc_pages ( sk - > sk_allocation , 0 ) ;
/* If alloc_page fails just return failure and caller will
* free previous allocated pages by doing kfree_skb ( )
*/
if ( page = = NULL )
return - ENOMEM ;
/* initialize the next frag */
sk - > sk_sndmsg_page = page ;
sk - > sk_sndmsg_off = 0 ;
skb_fill_page_desc ( skb , frg_cnt , page , 0 , 0 ) ;
skb - > truesize + = PAGE_SIZE ;
atomic_add ( PAGE_SIZE , & sk - > sk_wmem_alloc ) ;
/* get the new initialized frag */
frg_cnt = skb_shinfo ( skb ) - > nr_frags ;
frag = & skb_shinfo ( skb ) - > frags [ frg_cnt - 1 ] ;
/* copy the user data to page */
left = PAGE_SIZE - frag - > page_offset ;
copy = ( length > left ) ? left : length ;
ret = getfrag ( from , ( page_address ( frag - > page ) +
frag - > page_offset + frag - > size ) ,
offset , copy , 0 , skb ) ;
if ( ret < 0 )
return - EFAULT ;
/* copy was successful so update the size parameters */
sk - > sk_sndmsg_off + = copy ;
frag - > size + = copy ;
skb - > len + = copy ;
skb - > data_len + = copy ;
offset + = copy ;
length - = copy ;
} while ( length > 0 ) ;
return 0 ;
}
2006-03-20 22:43:56 -08:00
/**
* skb_pull_rcsum - pull skb and update receive checksum
* @ skb : buffer to update
* @ start : start of data before pull
* @ len : length of data pulled
*
* This function performs an skb_pull on the packet and updates
2006-08-29 16:44:56 -07:00
* update the CHECKSUM_COMPLETE checksum . It should be used on
* receive path processing instead of skb_pull unless you know
* that the checksum difference is zero ( e . g . , a valid IP header )
* or you are setting ip_summed to CHECKSUM_NONE .
2006-03-20 22:43:56 -08:00
*/
unsigned char * skb_pull_rcsum ( struct sk_buff * skb , unsigned int len )
{
BUG_ON ( len > skb - > len ) ;
skb - > len - = len ;
BUG_ON ( skb - > len < skb - > data_len ) ;
skb_postpull_rcsum ( skb , skb - > data , len ) ;
return skb - > data + = len ;
}
2006-03-20 22:47:55 -08:00
EXPORT_SYMBOL_GPL ( skb_pull_rcsum ) ;
2006-06-22 03:02:40 -07:00
/**
* skb_segment - Perform protocol segmentation on skb .
* @ skb : buffer to segment
2006-06-27 13:22:38 -07:00
* @ features : features for the output path ( see dev - > features )
2006-06-22 03:02:40 -07:00
*
* This function performs segmentation on the given skb . Each segment
* carries at most gso_size bytes of payload behind a copy of the
* headers . The segments are chained through their next pointers and
* the first one is returned . If a segment cannot be allocated , the
* segments built so far are freed and ERR_PTR ( - ENOMEM ) is returned .
*/
2006-06-27 13:22:38 -07:00
struct sk_buff * skb_segment ( struct sk_buff * skb , int features )
2006-06-22 03:02:40 -07:00
{
struct sk_buff * segs = NULL ;
struct sk_buff * tail = NULL ;
unsigned int mss = skb_shinfo ( skb ) - > gso_size ;
2007-03-19 15:33:04 -07:00
/* Number of bytes between the mac header and skb - > data ; these bytes
 * are copied to the front of every segment . */
unsigned int doffset = skb - > data - skb_mac_header ( skb ) ;
2006-06-22 03:02:40 -07:00
unsigned int offset = doffset ;
unsigned int headroom ;
unsigned int len ;
2006-06-27 13:22:38 -07:00
/* Non-zero when the output path advertises NETIF_F_SG . */
int sg = features & NETIF_F_SG ;
2006-06-22 03:02:40 -07:00
int nfrags = skb_shinfo ( skb ) - > nr_frags ;
int err = - ENOMEM ;
/* i indexes the next source fragment , pos is the packet offset at
 * which that fragment starts . */
int i = 0 ;
int pos ;
/* Expose the headers again so that offsets count from the mac header . */
__skb_push ( skb , doffset ) ;
headroom = skb_headroom ( skb ) ;
pos = skb_headlen ( skb ) ;
do {
struct sk_buff * nskb ;
skb_frag_t * frag ;
2006-10-29 15:59:41 -08:00
int hsize ;
2006-06-22 03:02:40 -07:00
int k ;
int size ;
/* Payload of this segment : what is left , capped at mss . */
len = skb - > len - offset ;
if ( len > mss )
len = mss ;
/* hsize : payload bytes that are copied into the linear area of the
 * new segment . Without SG the whole payload is copied . */
hsize = skb_headlen ( skb ) - offset ;
if ( hsize < 0 )
hsize = 0 ;
2006-10-29 15:59:41 -08:00
if ( hsize > len | | ! sg )
hsize = len ;
2006-06-22 03:02:40 -07:00
2006-10-29 15:59:41 -08:00
nskb = alloc_skb ( hsize + doffset + headroom , GFP_ATOMIC ) ;
2006-06-22 03:02:40 -07:00
if ( unlikely ( ! nskb ) )
goto err ;
/* Append the new segment to the result list . */
if ( segs )
tail - > next = nskb ;
else
segs = nskb ;
tail = nskb ;
/* Inherit per-packet metadata from the original . */
nskb - > dev = skb - > dev ;
2007-07-06 13:36:20 -07:00
skb_copy_queue_mapping ( nskb , skb ) ;
2006-06-22 03:02:40 -07:00
nskb - > priority = skb - > priority ;
nskb - > protocol = skb - > protocol ;
nskb - > dst = dst_clone ( skb - > dst ) ;
memcpy ( nskb - > cb , skb - > cb , sizeof ( skb - > cb ) ) ;
nskb - > pkt_type = skb - > pkt_type ;
nskb - > mac_len = skb - > mac_len ;
skb_reserve ( nskb , headroom ) ;
2007-03-19 15:30:44 -07:00
skb_reset_mac_header ( nskb ) ;
2007-03-15 21:42:27 -03:00
skb_set_network_header ( nskb , skb - > mac_len ) ;
2007-04-10 21:21:55 -07:00
nskb - > transport_header = ( nskb - > network_header +
skb_network_header_len ( skb ) ) ;
2007-03-27 18:55:52 -03:00
/* Every segment starts with a copy of the first doffset bytes . */
skb_copy_from_linear_data ( skb , skb_put ( nskb , doffset ) ,
doffset ) ;
2006-06-22 03:02:40 -07:00
if ( ! sg ) {
/* No scatter-gather : copy the payload and record its checksum .
 * continue jumps to the loop condition , which advances offset . */
nskb - > csum = skb_copy_and_csum_bits ( skb , offset ,
skb_put ( nskb , len ) ,
len , 0 ) ;
continue ;
}
frag = skb_shinfo ( nskb ) - > frags ;
k = 0 ;
2006-08-29 16:44:56 -07:00
nskb - > ip_summed = CHECKSUM_PARTIAL ;
2006-06-22 03:02:40 -07:00
nskb - > csum = skb - > csum ;
2007-03-27 18:55:52 -03:00
skb_copy_from_linear_data_offset ( skb , offset ,
skb_put ( nskb , hsize ) , hsize ) ;
2006-06-22 03:02:40 -07:00
/* Reference ( rather than copy ) the source fragments that overlap
 * the range [ offset , offset + len ) . */
while ( pos < offset + len ) {
BUG_ON ( i > = nfrags ) ;
* frag = skb_shinfo ( skb ) - > frags [ i ] ;
get_page ( frag - > page ) ;
size = frag - > size ;
/* Segment starts part-way into this fragment : trim its front . */
if ( pos < offset ) {
frag - > page_offset + = offset - pos ;
frag - > size - = offset - pos ;
}
k + + ;
if ( pos + size < = offset + len ) {
/* Fragment fully consumed , move on to the next one . */
i + + ;
pos + = size ;
} else {
/* Segment ends inside this fragment : trim its tail . i and pos are
 * left alone so the next segment starts with the same fragment . */
frag - > size - = pos + size - ( offset + len ) ;
break ;
}
frag + + ;
}
skb_shinfo ( nskb ) - > nr_frags = k ;
nskb - > data_len = len - hsize ;
nskb - > len + = nskb - > data_len ;
nskb - > truesize + = nskb - > data_len ;
} while ( ( offset + = len ) < skb - > len ) ;
return segs ;
err :
/* Allocation failed : free every segment built so far . */
while ( ( skb = segs ) ) {
segs = skb - > next ;
2007-02-27 09:57:37 -08:00
kfree_skb ( skb ) ;
2006-06-22 03:02:40 -07:00
}
return ERR_PTR ( err ) ;
}
EXPORT_SYMBOL_GPL ( skb_segment ) ;
2005-04-16 15:20:36 -07:00
/*
 * Create the two slab caches used for sk_buff allocation : one sized for a
 * single struct sk_buff , and one sized for two struct sk_buff plus an
 * atomic_t . Both are created with SLAB_HWCACHE_ALIGN | SLAB_PANIC .
 */
void __init skb_init ( void )
{
skbuff_head_cache = kmem_cache_create ( " skbuff_head_cache " ,
sizeof ( struct sk_buff ) ,
0 ,
2006-08-26 19:25:52 -07:00
SLAB_HWCACHE_ALIGN | SLAB_PANIC ,
2007-07-20 10:11:58 +09:00
NULL ) ;
2005-08-17 14:57:30 -07:00
skbuff_fclone_cache = kmem_cache_create ( " skbuff_fclone_cache " ,
( 2 * sizeof ( struct sk_buff ) ) +
sizeof ( atomic_t ) ,
0 ,
2006-08-26 19:25:52 -07:00
SLAB_HWCACHE_ALIGN | SLAB_PANIC ,
2007-07-20 10:11:58 +09:00
NULL ) ;
2005-04-16 15:20:36 -07:00
}
2007-04-02 20:19:53 -07:00
/**
* skb_to_sgvec - Fill a scatter - gather list from a socket buffer
* @ skb : Socket buffer containing the buffers to be mapped
* @ sg : The scatter - gather list to map into
* @ offset : The offset into the buffer ' s contents to start mapping
* @ len : Length of buffer space to be mapped
*
* Fill the specified scatter - gather list with mappings / pointers into a
* region of the buffer space attached to a socket buffer .
*/
2007-10-30 21:29:29 -07:00
/*
 * Worker for skb_to_sgvec ( ) . Fills @sg with one entry per piece of @skb
 * that overlaps [ @offset , @offset + @len ) : the linear head , each page
 * fragment , and ( by recursion ) each skb on the frag_list . Returns the
 * number of entries written ; the end of the list is not marked here .
 */
static int
__skb_to_sgvec ( struct sk_buff * skb , struct scatterlist * sg , int offset , int len )
2007-04-02 20:19:53 -07:00
{
2007-04-27 15:21:23 -07:00
/* start / end bracket the byte range of the piece currently examined. */
int start = skb_headlen ( skb ) ;
int i , copy = start - offset ;
2007-04-02 20:19:53 -07:00
/* Number of scatterlist entries filled in so far. */
int elt = 0 ;
/* Linear head. */
if ( copy > 0 ) {
if ( copy > len )
copy = len ;
2007-10-24 11:20:47 +02:00
sg_set_buf ( sg , skb - > data + offset , copy ) ;
2007-04-02 20:19:53 -07:00
elt + + ;
if ( ( len - = copy ) = = 0 )
return elt ;
offset + = copy ;
}
/* Page fragments. */
for ( i = 0 ; i < skb_shinfo ( skb ) - > nr_frags ; i + + ) {
2007-04-27 15:21:23 -07:00
int end ;
2007-04-02 20:19:53 -07:00
2007-04-27 15:21:23 -07:00
BUG_TRAP ( start < = offset + len ) ;
end = start + skb_shinfo ( skb ) - > frags [ i ] . size ;
2007-04-02 20:19:53 -07:00
if ( ( copy = end - offset ) > 0 ) {
skb_frag_t * frag = & skb_shinfo ( skb ) - > frags [ i ] ;
if ( copy > len )
copy = len ;
2007-10-24 11:20:47 +02:00
sg_set_page ( & sg [ elt ] , frag - > page , copy ,
frag - > page_offset + offset - start ) ;
2007-04-02 20:19:53 -07:00
elt + + ;
if ( ! ( len - = copy ) )
return elt ;
offset + = copy ;
}
2007-04-27 15:21:23 -07:00
start = end ;
2007-04-02 20:19:53 -07:00
}
/* Chained skbs on the frag_list. */
if ( skb_shinfo ( skb ) - > frag_list ) {
struct sk_buff * list = skb_shinfo ( skb ) - > frag_list ;
for ( ; list ; list = list - > next ) {
2007-04-27 15:21:23 -07:00
int end ;
BUG_TRAP ( start < = offset + len ) ;
2007-04-02 20:19:53 -07:00
2007-04-27 15:21:23 -07:00
end = start + list - > len ;
2007-04-02 20:19:53 -07:00
if ( ( copy = end - offset ) > 0 ) {
if ( copy > len )
copy = len ;
2007-10-30 21:29:29 -07:00
/* The child fills entries from sg + elt on ; the offset is made
 * relative to the start of the child skb. */
elt + = __skb_to_sgvec ( list , sg + elt , offset - start ,
copy ) ;
2007-04-02 20:19:53 -07:00
if ( ( len - = copy ) = = 0 )
return elt ;
offset + = copy ;
}
2007-04-27 15:21:23 -07:00
start = end ;
2007-04-02 20:19:53 -07:00
}
}
/* Every piece was visited but bytes are still outstanding. */
BUG_ON ( len ) ;
return elt ;
}
2007-10-30 21:29:29 -07:00
int skb_to_sgvec ( struct sk_buff * skb , struct scatterlist * sg , int offset , int len )
{
int nsg = __skb_to_sgvec ( skb , sg , offset , len ) ;
2007-10-31 12:06:37 +01:00
sg_mark_end ( & sg [ nsg - 1 ] ) ;
2007-10-30 21:29:29 -07:00
return nsg ;
}
2007-04-02 20:19:53 -07:00
/**
* skb_cow_data - Check that a socket buffer ' s data buffers are writable
* @ skb : The socket buffer to check .
* @ tailbits : Amount of trailing space to be added
* @ trailer : Returned pointer to the skb where the @ tailbits space begins
*
* Make sure that the data buffers attached to a socket buffer are
* writable . If they are not , private copies are made of the data buffers
* and the socket buffer is set to use these instead .
*
* If @ tailbits is given , make sure that there is space to write @ tailbits
* bytes of data beyond current end of socket buffer . @ trailer will be
* set to point to the skb in which this space begins .
*
* The number of scatterlist elements required to completely map the
* COW ' d and extended socket buffer will be returned , or - ENOMEM if a
* required reallocation or copy fails .
*/
int skb_cow_data ( struct sk_buff * skb , int tailbits , struct sk_buff * * trailer )
{
int copyflag ;
int elt ;
struct sk_buff * skb1 , * * skb_p ;
/* If skb is cloned or its head is paged, reallocate
* head pulling out all the pages ( pages are considered not writable
* at the moment even if they are anonymous ) .
*/
if ( ( skb_cloned ( skb ) | | skb_shinfo ( skb ) - > nr_frags ) & &
__pskb_pull_tail ( skb , skb_pagelen ( skb ) - skb_headlen ( skb ) ) = = NULL )
return - ENOMEM ;
/* Easy case. Most of packets will go this way. */
if ( ! skb_shinfo ( skb ) - > frag_list ) {
/* Not enough tail room for the trailer. This should not
* happen when the stack is tuned to generate good frames .
* On a miss the head is reallocated with the missing space
* plus 128 spare bytes . */
if ( skb_tailroom ( skb ) < tailbits & &
pskb_expand_head ( skb , 0 , tailbits - skb_tailroom ( skb ) + 128 , GFP_ATOMIC ) )
return - ENOMEM ;
/* Single linear buffer: one element, trailer lives in skb itself. */
* trailer = skb ;
return 1 ;
}
/* Hard case: walk the frag_list and privatise fragments as needed. */
/* elt starts at 1 to count the head skb itself. */
elt = 1 ;
skb_p = & skb_shinfo ( skb ) - > frag_list ;
/* Once set , this fragment and every following one is copied. */
copyflag = 0 ;
while ( ( skb1 = * skb_p ) ! = NULL ) {
/* Extra tail room to request when copying ; 0 means a plain copy. */
int ntail = 0 ;
/* The fragment is partially pulled by someone,
* this can happen on input . Copy it and everything
* after it . */
if ( skb_shared ( skb1 ) )
copyflag = 1 ;
/* If the skb is the last, worry about trailer. */
if ( skb1 - > next = = NULL & & tailbits ) {
if ( skb_shinfo ( skb1 ) - > nr_frags | |
skb_shinfo ( skb1 ) - > frag_list | |
skb_tailroom ( skb1 ) < tailbits )
ntail = tailbits + 128 ;
}
if ( copyflag | |
skb_cloned ( skb1 ) | |
ntail | |
skb_shinfo ( skb1 ) - > nr_frags | |
skb_shinfo ( skb1 ) - > frag_list ) {
struct sk_buff * skb2 ;
/* This fragment cannot be used as it is : replace it with a copy. */
if ( ntail = = 0 )
skb2 = skb_copy ( skb1 , GFP_ATOMIC ) ;
else
skb2 = skb_copy_expand ( skb1 ,
skb_headroom ( skb1 ) ,
ntail ,
GFP_ATOMIC ) ;
if ( unlikely ( skb2 = = NULL ) )
return - ENOMEM ;
/* Keep the copy charged to the same socket as the original. */
if ( skb1 - > sk )
skb_set_owner_w ( skb2 , skb1 - > sk ) ;
/* Copy succeeded : link the new skb into the list in place of
* the old one and drop the old one . */
skb2 - > next = skb1 - > next ;
* skb_p = skb2 ;
kfree_skb ( skb1 ) ;
skb1 = skb2 ;
}
elt + + ;
/* Updated on every pass , so it ends up naming the last fragment. */
* trailer = skb1 ;
skb_p = & skb1 - > next ;
}
return elt ;
}
2005-04-16 15:20:36 -07:00
/*
 * Symbols exported from this file . Most use EXPORT_SYMBOL ; the last two
 * ( skb_to_sgvec and skb_cow_data ) use EXPORT_SYMBOL_GPL .
 */
2005-04-16 15:20:36 -07:00
EXPORT_SYMBOL ( ___pskb_trim ) ;
EXPORT_SYMBOL ( __kfree_skb ) ;
2006-03-20 21:28:35 -08:00
EXPORT_SYMBOL ( kfree_skb ) ;
2005-04-16 15:20:36 -07:00
EXPORT_SYMBOL ( __pskb_pull_tail ) ;
2005-08-17 14:57:30 -07:00
EXPORT_SYMBOL ( __alloc_skb ) ;
2006-07-31 22:35:23 -07:00
EXPORT_SYMBOL ( __netdev_alloc_skb ) ;
2005-04-16 15:20:36 -07:00
EXPORT_SYMBOL ( pskb_copy ) ;
EXPORT_SYMBOL ( pskb_expand_head ) ;
EXPORT_SYMBOL ( skb_checksum ) ;
EXPORT_SYMBOL ( skb_clone ) ;
EXPORT_SYMBOL ( skb_copy ) ;
EXPORT_SYMBOL ( skb_copy_and_csum_bits ) ;
EXPORT_SYMBOL ( skb_copy_and_csum_dev ) ;
EXPORT_SYMBOL ( skb_copy_bits ) ;
EXPORT_SYMBOL ( skb_copy_expand ) ;
EXPORT_SYMBOL ( skb_over_panic ) ;
EXPORT_SYMBOL ( skb_pad ) ;
EXPORT_SYMBOL ( skb_realloc_headroom ) ;
EXPORT_SYMBOL ( skb_under_panic ) ;
EXPORT_SYMBOL ( skb_dequeue ) ;
EXPORT_SYMBOL ( skb_dequeue_tail ) ;
EXPORT_SYMBOL ( skb_insert ) ;
EXPORT_SYMBOL ( skb_queue_purge ) ;
EXPORT_SYMBOL ( skb_queue_head ) ;
EXPORT_SYMBOL ( skb_queue_tail ) ;
EXPORT_SYMBOL ( skb_unlink ) ;
EXPORT_SYMBOL ( skb_append ) ;
EXPORT_SYMBOL ( skb_split ) ;
2005-06-23 20:59:51 -07:00
EXPORT_SYMBOL ( skb_prepare_seq_read ) ;
EXPORT_SYMBOL ( skb_seq_read ) ;
EXPORT_SYMBOL ( skb_abort_seq_read ) ;
2005-06-23 21:00:17 -07:00
EXPORT_SYMBOL ( skb_find_text ) ;
2005-10-18 15:46:41 -07:00
EXPORT_SYMBOL ( skb_append_datato_frags ) ;
2007-04-02 20:19:53 -07:00
EXPORT_SYMBOL_GPL ( skb_to_sgvec ) ;
EXPORT_SYMBOL_GPL ( skb_cow_data ) ;