2005-04-17 02:20:36 +04:00
/*
* INET An implementation of the TCP / IP protocol suite for the LINUX
* operating system . INET is implemented using the BSD Socket
* interface as the means of communication with the user level .
*
* Definitions for the AF_INET socket handler .
*
* Version : @ ( # ) sock . h 1.0 .4 05 / 13 / 93
*
2005-05-06 03:16:16 +04:00
* Authors : Ross Biro
2005-04-17 02:20:36 +04:00
* Fred N . van Kempen , < waltje @ uWalt . NL . Mugnet . ORG >
* Corey Minyard < wf - rch ! minyard @ relay . EU . net >
* Florian La Roche < flla @ stud . uni - sb . de >
*
* Fixes :
* Alan Cox : Volatiles in skbuff pointers . See
* skbuff comments . May be overdone ,
* better to prove they can be removed
* than the reverse .
* Alan Cox : Added a zapped field for tcp to note
* a socket is reset and must stay shut up
* Alan Cox : New fields for options
* Pauline Middelink : identd support
* Alan Cox : Eliminate low level recv / recvfrom
* David S . Miller : New socket lookup architecture .
* Steve Whitehouse : Default routines for sock_ops
* Arnaldo C . Melo : removed net_pinfo , tp_pinfo and made
* protinfo be just a void pointer , as the
* protocol specific parts were moved to
* respective headers and ipv4 / v6 , etc now
* use private slabcaches for its socks
* Pedro Hortas : New flags field for socket options
*
*
* This program is free software ; you can redistribute it and / or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation ; either version
* 2 of the License , or ( at your option ) any later version .
*/
# ifndef _SOCK_H
# define _SOCK_H
# include <linux/list.h>
# include <linux/timer.h>
# include <linux/cache.h>
# include <linux/module.h>
2006-07-03 11:25:35 +04:00
# include <linux/lockdep.h>
2005-04-17 02:20:36 +04:00
# include <linux/netdevice.h>
# include <linux/skbuff.h> /* struct sk_buff */
2006-12-04 07:15:30 +03:00
# include <linux/mm.h>
2005-04-17 02:20:36 +04:00
# include <linux/security.h>
# include <linux/filter.h>
# include <asm/atomic.h>
# include <net/dst.h>
# include <net/checksum.h>
/*
* This structure really needs to be cleaned up .
* Most of it is for TCP , and not used by any of
* the other protocols .
*/
/*
 * SOCK_DEBUG(sk, fmt, ...) - per-socket debug message.
 *
 * With SOCK_DEBUGGING defined, the message is handed to printk() at
 * KERN_DEBUG level, but only when @sk is non-NULL and has the SOCK_DBG
 * flag set.  Without SOCK_DEBUGGING the macro expands to an empty
 * statement and its arguments are not evaluated.
 */
#define SOCK_DEBUGGING
#ifdef SOCK_DEBUGGING
#define SOCK_DEBUG(sk, msg...)					\
	do {							\
		if ((sk) && sock_flag((sk), SOCK_DBG))		\
			printk(KERN_DEBUG msg);			\
	} while (0)
#else
#define SOCK_DEBUG(sk, msg...)	do { } while (0)
#endif
/*
 * This is the per-socket lock.  The spinlock provides a synchronization
 * between user contexts and software interrupt processing, whereas the
 * mini-semaphore synchronizes multiple users amongst themselves.
 */
struct sock_iocb;
typedef struct {
	spinlock_t		slock;	/* the spinlock part of the lock */
	struct sock_iocb	*owner;	/* current owner, if any */
	wait_queue_head_t	wq;	/* wait queue associated with the lock */
	/*
	 * We express the mutex-alike socket_lock semantics
	 * to the lock validator by explicitly managing
	 * the slock as a lock variant (in addition to
	 * the slock itself):
	 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map	dep_map;
#endif
} socket_lock_t;
struct sock ;
2005-08-10 07:09:30 +04:00
struct proto ;
2005-04-17 02:20:36 +04:00
/**
2005-05-01 19:59:25 +04:00
* struct sock_common - minimal network layer representation of sockets
* @ skc_family : network address family
* @ skc_state : Connection state
* @ skc_reuse : % SO_REUSEADDR setting
* @ skc_bound_dev_if : bound device index if ! = 0
* @ skc_node : main hash linkage for various protocol lookup tables
* @ skc_bind_node : bind hash linkage for various protocol lookup tables
* @ skc_refcnt : reference count
[INET]: speedup inet (tcp/dccp) lookups
Arnaldo and I agreed it could be applied now, because I have other
pending patches depending on this one (Thank you Arnaldo)
(The other important patch moves skc_refcnt in a separate cache line,
so that the SMP/NUMA performance doesnt suffer from cache line ping pongs)
1) First some performance data :
--------------------------------
tcp_v4_rcv() wastes a *lot* of time in __inet_lookup_established()
The most time critical code is :
sk_for_each(sk, node, &head->chain) {
if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif))
goto hit; /* You sunk my battleship! */
}
The sk_for_each() does use prefetch() hints but only the beginning of
"struct sock" is prefetched.
As the first comparison in INET_MATCH uses inet_sk(__sk)->daddr, which is far
away from the beginning of "struct sock", it has to bring a cold cache line
into the CPU cache. Each iteration has to use at least 2 cache
lines.
This can be problematic if some chains are very long.
2) The goal
-----------
The idea I had is to change things so that INET_MATCH() may return
FALSE in 99% of cases only using the data already in the CPU cache,
using one cache line per iteration.
3) Description of the patch
---------------------------
Adds a new 'unsigned int skc_hash' field in 'struct sock_common',
filling a 32 bits hole on 64 bits platform.
struct sock_common {
unsigned short skc_family;
volatile unsigned char skc_state;
unsigned char skc_reuse;
int skc_bound_dev_if;
struct hlist_node skc_node;
struct hlist_node skc_bind_node;
atomic_t skc_refcnt;
+ unsigned int skc_hash;
struct proto *skc_prot;
};
Store in this 32 bits field the full hash, not masked by (ehash_size -
1). Using this full hash as the first comparison done in INET_MATCH
permits us to immediately skip the element without touching a second cache
line in case of a miss.
Suppress the sk_hashent/tw_hashent fields since skc_hash (aliased to
sk_hash and tw_hash) already contains the slot number if we mask with
(ehash_size - 1)
File include/net/inet_hashtables.h
64 bits platforms :
#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
(((__sk)->sk_hash == (__hash))
((*((__u64 *)&(inet_sk(__sk)->daddr)))== (__cookie)) && \
((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
32bits platforms:
#define TCP_IPV4_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
(((__sk)->sk_hash == (__hash)) && \
(inet_sk(__sk)->daddr == (__saddr)) && \
(inet_sk(__sk)->rcv_saddr == (__daddr)) && \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
- Adds a prefetch(head->chain.first) in
__inet_lookup_established()/__tcp_v4_check_established() and
__inet6_lookup_established()/__tcp_v6_check_established() and
__dccp_v4_check_established() to bring into cache the first element of the
list, before the {read|write}_lock(&head->lock);
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-10-04 01:13:38 +04:00
* @ skc_hash : hash value used with various protocol lookup tables
2005-08-10 07:09:30 +04:00
* @ skc_prot : protocol handlers inside a network family
2005-05-01 19:59:25 +04:00
*
* This is the minimal network layer representation of sockets , the header
2005-08-10 07:09:30 +04:00
* for struct sock and struct inet_timewait_sock .
*/
2005-04-17 02:20:36 +04:00
/*
 * Minimal socket header.  Member meanings are listed in the kernel-doc
 * comment that precedes this definition; the short notes below repeat them
 * next to each member for convenience.
 */
struct sock_common {
	unsigned short		skc_family;	/* network address family */
	volatile unsigned char	skc_state;	/* connection state */
	unsigned char		skc_reuse;	/* SO_REUSEADDR setting */
	int			skc_bound_dev_if; /* bound device index if != 0 */
	struct hlist_node	skc_node;	/* main hash linkage */
	struct hlist_node	skc_bind_node;	/* bind hash linkage */
	atomic_t		skc_refcnt;	/* reference count */
	unsigned int		skc_hash;	/* hash value used by lookup tables */
	struct proto		*skc_prot;	/* protocol handlers */
};
/**
* struct sock - network layer representation of sockets
2005-08-10 07:09:30 +04:00
* @ __sk_common : shared layout with inet_timewait_sock
2005-05-01 19:59:25 +04:00
* @ sk_shutdown : mask of % SEND_SHUTDOWN and / or % RCV_SHUTDOWN
* @ sk_userlocks : % SO_SNDBUF and % SO_RCVBUF settings
* @ sk_lock : synchronizer
* @ sk_rcvbuf : size of receive buffer in bytes
* @ sk_sleep : sock wait queue
* @ sk_dst_cache : destination cache
* @ sk_dst_lock : destination cache lock
* @ sk_policy : flow policy
* @ sk_rmem_alloc : receive queue bytes committed
* @ sk_receive_queue : incoming packets
* @ sk_wmem_alloc : transmit queue bytes committed
* @ sk_write_queue : Packet sending queue
2006-05-24 04:55:33 +04:00
* @ sk_async_wait_queue : DMA copied packets
2005-05-01 19:59:25 +04:00
* @ sk_omem_alloc : " o " is " option " or " other "
* @ sk_wmem_queued : persistent queue size
* @ sk_forward_alloc : space allocated forward
* @ sk_allocation : allocation mode
* @ sk_sndbuf : size of send buffer in bytes
* @ sk_flags : % SO_LINGER ( l_onoff ) , % SO_BROADCAST , % SO_KEEPALIVE , % SO_OOBINLINE settings
 * @sk_no_check: %SO_NO_CHECK setting, whether or not packets are checksummed
* @ sk_route_caps : route capabilities ( e . g . % NETIF_F_TSO )
2006-07-01 00:36:35 +04:00
* @ sk_gso_type : GSO type ( e . g . % SKB_GSO_TCPV4 )
2005-05-01 19:59:25 +04:00
* @ sk_lingertime : % SO_LINGER l_linger setting
* @ sk_backlog : always used with the per - socket spinlock held
* @ sk_callback_lock : used with the callbacks in the end of this struct
* @ sk_error_queue : rarely used
2005-05-06 00:35:15 +04:00
* @ sk_prot_creator : sk_prot of original sock creator ( see ipv6_setsockopt , IPV6_ADDRFORM for instance )
2005-05-01 19:59:25 +04:00
* @ sk_err : last error
* @ sk_err_soft : errors that don ' t cause failure but are the cause of a persistent failure not just ' timed out '
* @ sk_ack_backlog : current listen backlog
* @ sk_max_ack_backlog : listen backlog set in listen ( )
* @ sk_priority : % SO_PRIORITY setting
* @ sk_type : socket type ( % SOCK_STREAM , etc )
* @ sk_protocol : which protocol this socket belongs in this network family
* @ sk_peercred : % SO_PEERCRED setting
* @ sk_rcvlowat : % SO_RCVLOWAT setting
* @ sk_rcvtimeo : % SO_RCVTIMEO setting
* @ sk_sndtimeo : % SO_SNDTIMEO setting
* @ sk_filter : socket filtering instructions
* @ sk_protinfo : private area , net family specific , when not using slab
* @ sk_timer : sock cleanup timer
* @ sk_stamp : time stamp of last packet received
* @ sk_socket : Identd and reporting IO signals
* @ sk_user_data : RPC layer private data
* @ sk_sndmsg_page : cached page for sendmsg
* @ sk_sndmsg_off : cached offset for sendmsg
* @ sk_send_head : front of stuff to transmit
2005-05-01 19:59:26 +04:00
* @ sk_security : used by security modules
2005-05-01 19:59:25 +04:00
* @ sk_write_pending : a write to stream socket waits to start
* @ sk_state_change : callback to indicate change in the state of the sock
* @ sk_data_ready : callback to indicate there is data to be processed
* @ sk_write_space : callback to indicate there is bf sending space available
* @ sk_error_report : callback to indicate errors ( e . g . % MSG_ERRQUEUE )
* @ sk_backlog_rcv : callback to process the backlog
* @ sk_destruct : called at sock freeing time , i . e . when all refcnt = = 0
2005-04-17 02:20:36 +04:00
*/
struct sock {
/*
2005-08-10 07:09:30 +04:00
* Now struct inet_timewait_sock also uses sock_common , so please just
2005-04-17 02:20:36 +04:00
* don ' t add nothing before this first member ( __sk_common ) - - acme
*/
struct sock_common __sk_common ;
# define sk_family __sk_common.skc_family
# define sk_state __sk_common.skc_state
# define sk_reuse __sk_common.skc_reuse
# define sk_bound_dev_if __sk_common.skc_bound_dev_if
# define sk_node __sk_common.skc_node
# define sk_bind_node __sk_common.skc_bind_node
# define sk_refcnt __sk_common.skc_refcnt
[INET]: speedup inet (tcp/dccp) lookups
Arnaldo and I agreed it could be applied now, because I have other
pending patches depending on this one (Thank you Arnaldo)
(The other important patch moves skc_refcnt in a separate cache line,
so that the SMP/NUMA performance doesnt suffer from cache line ping pongs)
1) First some performance data :
--------------------------------
tcp_v4_rcv() wastes a *lot* of time in __inet_lookup_established()
The most time critical code is :
sk_for_each(sk, node, &head->chain) {
if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif))
goto hit; /* You sunk my battleship! */
}
The sk_for_each() does use prefetch() hints but only the begining of
"struct sock" is prefetched.
As INET_MATCH first comparison uses inet_sk(__sk)->daddr, wich is far
away from the begining of "struct sock", it has to bring into CPU
cache cold cache line. Each iteration has to use at least 2 cache
lines.
This can be problematic if some chains are very long.
2) The goal
-----------
The idea I had is to change things so that INET_MATCH() may return
FALSE in 99% of cases only using the data already in the CPU cache,
using one cache line per iteration.
3) Description of the patch
---------------------------
Adds a new 'unsigned int skc_hash' field in 'struct sock_common',
filling a 32 bits hole on 64 bits platform.
struct sock_common {
unsigned short skc_family;
volatile unsigned char skc_state;
unsigned char skc_reuse;
int skc_bound_dev_if;
struct hlist_node skc_node;
struct hlist_node skc_bind_node;
atomic_t skc_refcnt;
+ unsigned int skc_hash;
struct proto *skc_prot;
};
Store in this 32 bits field the full hash, not masked by (ehash_size -
1) Using this full hash as the first comparison done in INET_MATCH
permits us immediatly skip the element without touching a second cache
line in case of a miss.
Suppress the sk_hashent/tw_hashent fields since skc_hash (aliased to
sk_hash and tw_hash) already contains the slot number if we mask with
(ehash_size - 1)
File include/net/inet_hashtables.h
64 bits platforms :
#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
(((__sk)->sk_hash == (__hash))
((*((__u64 *)&(inet_sk(__sk)->daddr)))== (__cookie)) && \
((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
32bits platforms:
#define TCP_IPV4_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
(((__sk)->sk_hash == (__hash)) && \
(inet_sk(__sk)->daddr == (__saddr)) && \
(inet_sk(__sk)->rcv_saddr == (__daddr)) && \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
- Adds a prefetch(head->chain.first) in
__inet_lookup_established()/__tcp_v4_check_established() and
__inet6_lookup_established()/__tcp_v6_check_established() and
__dccp_v4_check_established() to bring into cache the first element of the
list, before the {read|write}_lock(&head->lock);
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-10-04 01:13:38 +04:00
# define sk_hash __sk_common.skc_hash
2005-08-10 07:09:30 +04:00
# define sk_prot __sk_common.skc_prot
2005-04-17 02:20:36 +04:00
unsigned char sk_shutdown : 2 ,
sk_no_check : 2 ,
sk_userlocks : 4 ;
unsigned char sk_protocol ;
unsigned short sk_type ;
int sk_rcvbuf ;
socket_lock_t sk_lock ;
2007-03-05 03:05:44 +03:00
/*
* The backlog queue is special , it is always used with
* the per - socket spinlock held and requires low latency
* access . Therefore we special case it ' s implementation .
*/
struct {
struct sk_buff * head ;
struct sk_buff * tail ;
} sk_backlog ;
2005-04-17 02:20:36 +04:00
wait_queue_head_t * sk_sleep ;
struct dst_entry * sk_dst_cache ;
struct xfrm_policy * sk_policy [ 2 ] ;
rwlock_t sk_dst_lock ;
atomic_t sk_rmem_alloc ;
atomic_t sk_wmem_alloc ;
atomic_t sk_omem_alloc ;
struct sk_buff_head sk_receive_queue ;
struct sk_buff_head sk_write_queue ;
2006-05-24 04:55:33 +04:00
struct sk_buff_head sk_async_wait_queue ;
2005-04-17 02:20:36 +04:00
int sk_wmem_queued ;
int sk_forward_alloc ;
2005-10-21 11:20:43 +04:00
gfp_t sk_allocation ;
2005-04-17 02:20:36 +04:00
int sk_sndbuf ;
int sk_route_caps ;
2006-07-01 00:36:35 +04:00
int sk_gso_type ;
2006-03-25 02:12:37 +03:00
int sk_rcvlowat ;
2005-04-17 02:20:36 +04:00
unsigned long sk_flags ;
unsigned long sk_lingertime ;
struct sk_buff_head sk_error_queue ;
2005-05-06 00:35:15 +04:00
struct proto * sk_prot_creator ;
2005-04-17 02:20:36 +04:00
rwlock_t sk_callback_lock ;
int sk_err ,
sk_err_soft ;
unsigned short sk_ack_backlog ;
unsigned short sk_max_ack_backlog ;
__u32 sk_priority ;
struct ucred sk_peercred ;
long sk_rcvtimeo ;
long sk_sndtimeo ;
struct sk_filter * sk_filter ;
void * sk_protinfo ;
struct timer_list sk_timer ;
2007-04-20 03:16:32 +04:00
ktime_t sk_stamp ;
2005-04-17 02:20:36 +04:00
struct socket * sk_socket ;
void * sk_user_data ;
struct page * sk_sndmsg_page ;
struct sk_buff * sk_send_head ;
__u32 sk_sndmsg_off ;
int sk_write_pending ;
void * sk_security ;
void ( * sk_state_change ) ( struct sock * sk ) ;
void ( * sk_data_ready ) ( struct sock * sk , int bytes ) ;
void ( * sk_write_space ) ( struct sock * sk ) ;
void ( * sk_error_report ) ( struct sock * sk ) ;
int ( * sk_backlog_rcv ) ( struct sock * sk ,
struct sk_buff * skb ) ;
void ( * sk_destruct ) ( struct sock * sk ) ;
} ;
/*
* Hashed lists helper routines
*/
2005-08-10 07:09:46 +04:00
static inline struct sock * __sk_head ( const struct hlist_head * head )
2005-04-17 02:20:36 +04:00
{
return hlist_entry ( head - > first , struct sock , sk_node ) ;
}
2005-08-10 07:09:46 +04:00
static inline struct sock * sk_head ( const struct hlist_head * head )
2005-04-17 02:20:36 +04:00
{
return hlist_empty ( head ) ? NULL : __sk_head ( head ) ;
}
2005-08-10 07:09:46 +04:00
static inline struct sock * sk_next ( const struct sock * sk )
2005-04-17 02:20:36 +04:00
{
return sk - > sk_node . next ?
hlist_entry ( sk - > sk_node . next , struct sock , sk_node ) : NULL ;
}
2005-08-10 07:09:46 +04:00
static inline int sk_unhashed ( const struct sock * sk )
2005-04-17 02:20:36 +04:00
{
return hlist_unhashed ( & sk - > sk_node ) ;
}
2005-08-10 07:09:46 +04:00
/*
 * sk_hashed - logical negation of sk_unhashed().
 *
 * Returns 1 when sk_unhashed() returns 0, and 0 otherwise.
 */
static inline int sk_hashed(const struct sock *sk)
{
	return !sk_unhashed(sk);
}
static __inline__ void sk_node_init ( struct hlist_node * node )
{
node - > pprev = NULL ;
}
static __inline__ void __sk_del_node ( struct sock * sk )
{
__hlist_del ( & sk - > sk_node ) ;
}
static __inline__ int __sk_del_node_init ( struct sock * sk )
{
if ( sk_hashed ( sk ) ) {
__sk_del_node ( sk ) ;
sk_node_init ( & sk - > sk_node ) ;
return 1 ;
}
return 0 ;
}
/* Grab socket reference count. This operation is valid only
when sk is ALREADY grabbed f . e . it is found in hash table
or a list and the lookup is made under lock preventing hash table
modifications .
*/
static inline void sock_hold ( struct sock * sk )
{
atomic_inc ( & sk - > sk_refcnt ) ;
}
/* Ungrab socket in the context, which assumes that socket refcnt
cannot hit zero , f . e . it is true in context of any socketcall .
*/
static inline void __sock_put ( struct sock * sk )
{
atomic_dec ( & sk - > sk_refcnt ) ;
}
static __inline__ int sk_del_node_init ( struct sock * sk )
{
int rc = __sk_del_node_init ( sk ) ;
if ( rc ) {
/* paranoid for a while -acme */
WARN_ON ( atomic_read ( & sk - > sk_refcnt ) = = 1 ) ;
__sock_put ( sk ) ;
}
return rc ;
}
static __inline__ void __sk_add_node ( struct sock * sk , struct hlist_head * list )
{
hlist_add_head ( & sk - > sk_node , list ) ;
}
/*
 * sk_add_node - take a reference on @sk, then insert it at the head of
 * @list.
 *
 * The reference is acquired with sock_hold() before the socket becomes
 * reachable through the list.
 */
static inline void sk_add_node(struct sock *sk, struct hlist_head *list)
{
	sock_hold(sk);
	__sk_add_node(sk, list);
}
static __inline__ void __sk_del_bind_node ( struct sock * sk )
{
__hlist_del ( & sk - > sk_bind_node ) ;
}
static __inline__ void sk_add_bind_node ( struct sock * sk ,
struct hlist_head * list )
{
hlist_add_head ( & sk - > sk_bind_node , list ) ;
}
/*
 * Iterators over chains of sockets.  Each one forwards to the matching
 * hlist_for_each_entry*() helper, using sk_node (or sk_bind_node for
 * sk_for_each_bound()) as the linkage member.
 *
 * sk_for_each_from() and sk_for_each_continue() do nothing when @__sk is
 * NULL; otherwise they first point @node at @__sk's sk_node.  They are
 * written as "if (...) { } else <loop>" so that an "else" placed after the
 * loop by the caller attaches to the caller's own "if" and not to the one
 * hidden inside the macro.
 */
#define sk_for_each(__sk, node, list) \
	hlist_for_each_entry(__sk, node, list, sk_node)
#define sk_for_each_from(__sk, node) \
	if (!((__sk) && ({ node = &(__sk)->sk_node; 1; }))) { \
	} else \
		hlist_for_each_entry_from(__sk, node, sk_node)
#define sk_for_each_continue(__sk, node) \
	if (!((__sk) && ({ node = &(__sk)->sk_node; 1; }))) { \
	} else \
		hlist_for_each_entry_continue(__sk, node, sk_node)
#define sk_for_each_safe(__sk, node, tmp, list) \
	hlist_for_each_entry_safe(__sk, node, tmp, list, sk_node)
#define sk_for_each_bound(__sk, node, list) \
	hlist_for_each_entry(__sk, node, list, sk_bind_node)
/*
 * Sock flags.  Each enumerator is a bit number within sk->sk_flags, as
 * used by sock_set_flag(), sock_reset_flag() and sock_flag().
 */
enum sock_flags {
	SOCK_DEAD,
	SOCK_DONE,
	SOCK_URGINLINE,
	SOCK_KEEPOPEN,
	SOCK_LINGER,
	SOCK_DESTROY,
	SOCK_BROADCAST,
	SOCK_TIMESTAMP,
	SOCK_ZAPPED,
	SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */
	SOCK_DBG, /* %SO_DEBUG setting */
	SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */
	SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */
	SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
	SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
};
2005-08-23 21:11:30 +04:00
static inline void sock_copy_flags ( struct sock * nsk , struct sock * osk )
{
nsk - > sk_flags = osk - > sk_flags ;
}
2005-04-17 02:20:36 +04:00
static inline void sock_set_flag ( struct sock * sk , enum sock_flags flag )
{
__set_bit ( flag , & sk - > sk_flags ) ;
}
static inline void sock_reset_flag ( struct sock * sk , enum sock_flags flag )
{
__clear_bit ( flag , & sk - > sk_flags ) ;
}
static inline int sock_flag ( struct sock * sk , enum sock_flags flag )
{
return test_bit ( flag , & sk - > sk_flags ) ;
}
static inline void sk_acceptq_removed ( struct sock * sk )
{
sk - > sk_ack_backlog - - ;
}
static inline void sk_acceptq_added ( struct sock * sk )
{
sk - > sk_ack_backlog + + ;
}
static inline int sk_acceptq_is_full ( struct sock * sk )
{
2007-03-06 22:21:05 +03:00
return sk - > sk_ack_backlog > sk - > sk_max_ack_backlog ;
2005-04-17 02:20:36 +04:00
}
/*
* Compute minimal free write space needed to queue new packets .
*/
static inline int sk_stream_min_wspace ( struct sock * sk )
{
return sk - > sk_wmem_queued / 2 ;
}
static inline int sk_stream_wspace ( struct sock * sk )
{
return sk - > sk_sndbuf - sk - > sk_wmem_queued ;
}
extern void sk_stream_write_space ( struct sock * sk ) ;
static inline int sk_stream_memory_free ( struct sock * sk )
{
return sk - > sk_wmem_queued < sk - > sk_sndbuf ;
}
extern void sk_stream_rfree ( struct sk_buff * skb ) ;
static inline void sk_stream_set_owner_r ( struct sk_buff * skb , struct sock * sk )
{
skb - > sk = sk ;
skb - > destructor = sk_stream_rfree ;
atomic_add ( skb - > truesize , & sk - > sk_rmem_alloc ) ;
sk - > sk_forward_alloc - = skb - > truesize ;
}
static inline void sk_stream_free_skb ( struct sock * sk , struct sk_buff * skb )
{
2006-04-20 11:10:50 +04:00
skb_truesize_check ( skb ) ;
2005-04-17 02:20:36 +04:00
sock_set_flag ( sk , SOCK_QUEUE_SHRUNK ) ;
sk - > sk_wmem_queued - = skb - > truesize ;
sk - > sk_forward_alloc + = skb - > truesize ;
__kfree_skb ( skb ) ;
}
/* The per-socket spinlock must be held here. */
2005-11-08 20:39:42 +03:00
static inline void sk_add_backlog ( struct sock * sk , struct sk_buff * skb )
{
if ( ! sk - > sk_backlog . tail ) {
sk - > sk_backlog . head = sk - > sk_backlog . tail = skb ;
} else {
sk - > sk_backlog . tail - > next = skb ;
sk - > sk_backlog . tail = skb ;
}
skb - > next = NULL ;
}
2005-04-17 02:20:36 +04:00
/*
 * sk_wait_event - release the socket, wait for a condition, lock it again.
 * @__sk:        socket passed to release_sock() and lock_sock()
 * @__timeo:     pointer to the timeout; when the first test of the
 *               condition fails it is replaced by the value returned from
 *               schedule_timeout(*@__timeo)
 * @__condition: expression to wait for; evaluated once after
 *               release_sock() and once more after lock_sock()
 *
 * The macro evaluates to the second result, i.e. the condition as seen
 * with the socket locked again.
 *
 * The local is called __rc rather than rc so that a condition mentioning
 * a caller variable named "rc" is not silently captured by the macro's
 * own variable, and the condition is parenthesised so that any expression
 * is assigned as a whole.
 */
#define sk_wait_event(__sk, __timeo, __condition)			\
	({	int __rc;						\
		release_sock(__sk);					\
		__rc = (__condition);					\
		if (!__rc) {						\
			*(__timeo) = schedule_timeout(*(__timeo));	\
		}							\
		lock_sock(__sk);					\
		__rc = (__condition);					\
		__rc;							\
	})
/*
 * Stream-socket wait and teardown helpers.  Only the prototypes are
 * visible here; the definitions live outside this header.
 * NOTE(review): the timeo/timeo_p arguments are presumably timeouts in the
 * same unit sk_wait_event() passes to schedule_timeout() -- confirm
 * against the definitions.
 */
extern int sk_stream_wait_connect ( struct sock * sk , long * timeo_p ) ;
extern int sk_stream_wait_memory ( struct sock * sk , long * timeo_p ) ;
extern void sk_stream_wait_close ( struct sock * sk , long timeo_p ) ;
extern int sk_stream_error ( struct sock * sk , int flags , int err ) ;
extern void sk_stream_kill_queues ( struct sock * sk ) ;
extern int sk_wait_data ( struct sock * sk , long * timeo ) ;
2005-06-19 09:47:21 +04:00
struct request_sock_ops ;
2005-12-14 10:25:19 +03:00
struct timewait_sock_ops ;
[NET] Generalise TCP's struct open_request minisock infrastructure
Kept this first changeset minimal, without changing existing names to
ease peer review.
Basically tcp_openreq_alloc now receives the or_calltable, that in turn
has two new members:
->slab, that replaces tcp_openreq_cachep
->obj_size, to inform the size of the openreq descendant for
a specific protocol
The protocol specific fields in struct open_request were moved to a
class hierarchy, with the things that are common to all connection
oriented PF_INET protocols in struct inet_request_sock, the TCP ones
in tcp_request_sock, that is an inet_request_sock, that is an
open_request.
I.e. this uses the same approach used for the struct sock class
hierarchy, with sk_prot indicating if the protocol wants to use the
open_request infrastructure by filling in sk_prot->rsk_prot with an
or_calltable.
Results? Performance is improved and TCP v4 now uses only 64 bytes per
open request minisock, down from 96 without this patch :-)
Next changeset will rename some of the structs, fields and functions
mentioned above, struct or_calltable is way unclear, better name it
struct request_sock_ops, s/struct open_request/struct request_sock/g,
etc.
Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-06-19 09:46:52 +04:00
2005-04-17 02:20:36 +04:00
/* Networking protocol blocks we attach to sockets.
* socket layer - > transport layer interface
* transport - > network interface is defined by struct inet_proto
*/
struct proto {
void ( * close ) ( struct sock * sk ,
long timeout ) ;
int ( * connect ) ( struct sock * sk ,
struct sockaddr * uaddr ,
int addr_len ) ;
int ( * disconnect ) ( struct sock * sk , int flags ) ;
struct sock * ( * accept ) ( struct sock * sk , int flags , int * err ) ;
int ( * ioctl ) ( struct sock * sk , int cmd ,
unsigned long arg ) ;
int ( * init ) ( struct sock * sk ) ;
int ( * destroy ) ( struct sock * sk ) ;
void ( * shutdown ) ( struct sock * sk , int how ) ;
int ( * setsockopt ) ( struct sock * sk , int level ,
int optname , char __user * optval ,
int optlen ) ;
int ( * getsockopt ) ( struct sock * sk , int level ,
int optname , char __user * optval ,
int __user * option ) ;
2006-03-21 09:45:21 +03:00
int ( * compat_setsockopt ) ( struct sock * sk ,
int level ,
int optname , char __user * optval ,
int optlen ) ;
int ( * compat_getsockopt ) ( struct sock * sk ,
int level ,
int optname , char __user * optval ,
int __user * option ) ;
2005-04-17 02:20:36 +04:00
int ( * sendmsg ) ( struct kiocb * iocb , struct sock * sk ,
struct msghdr * msg , size_t len ) ;
int ( * recvmsg ) ( struct kiocb * iocb , struct sock * sk ,
struct msghdr * msg ,
size_t len , int noblock , int flags ,
int * addr_len ) ;
int ( * sendpage ) ( struct sock * sk , struct page * page ,
int offset , size_t size , int flags ) ;
int ( * bind ) ( struct sock * sk ,
struct sockaddr * uaddr , int addr_len ) ;
int ( * backlog_rcv ) ( struct sock * sk ,
struct sk_buff * skb ) ;
/* Keeping track of sk's, looking them up, and port selection methods. */
void ( * hash ) ( struct sock * sk ) ;
void ( * unhash ) ( struct sock * sk ) ;
int ( * get_port ) ( struct sock * sk , unsigned short snum ) ;
/* Memory pressure */
void ( * enter_memory_pressure ) ( void ) ;
atomic_t * memory_allocated ; /* Current allocated memory. */
atomic_t * sockets_allocated ; /* Current number of sockets. */
/*
* Pressure flag : try to collapse .
* Technical note : it is used by multiple contexts non atomically .
* All the sk_stream_mem_schedule ( ) is of this nature : accounting
* is strict , actions are advisory and have some latency .
*/
int * memory_pressure ;
int * sysctl_mem ;
int * sysctl_wmem ;
int * sysctl_rmem ;
int max_header ;
2006-12-07 07:33:20 +03:00
struct kmem_cache * slab ;
2005-04-17 02:20:36 +04:00
unsigned int obj_size ;
2005-08-10 07:11:41 +04:00
atomic_t * orphan_count ;
2005-08-10 07:09:30 +04:00
2005-06-19 09:47:21 +04:00
struct request_sock_ops * rsk_prot ;
2005-12-14 10:25:19 +03:00
struct timewait_sock_ops * twsk_prot ;
[NET] Generalise TCP's struct open_request minisock infrastructure
Kept this first changeset minimal, without changing existing names to
ease peer review.
Basicaly tcp_openreq_alloc now receives the or_calltable, that in turn
has two new members:
->slab, that replaces tcp_openreq_cachep
->obj_size, to inform the size of the openreq descendant for
a specific protocol
The protocol specific fields in struct open_request were moved to a
class hierarchy, with the things that are common to all connection
oriented PF_INET protocols in struct inet_request_sock, the TCP ones
in tcp_request_sock, that is an inet_request_sock, that is an
open_request.
I.e. this uses the same approach used for the struct sock class
hierarchy, with sk_prot indicating if the protocol wants to use the
open_request infrastructure by filling in sk_prot->rsk_prot with an
or_calltable.
Results? Performance is improved and TCP v4 now uses only 64 bytes per
open request minisock, down from 96 without this patch :-)
Next changeset will rename some of the structs, fields and functions
mentioned above, struct or_calltable is way unclear, better name it
struct request_sock_ops, s/struct open_request/struct request_sock/g,
etc.
Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-06-19 09:46:52 +04:00
2005-04-17 02:20:36 +04:00
struct module * owner ;
char name [ 32 ] ;
struct list_head node ;
2005-08-10 06:45:38 +04:00
# ifdef SOCK_REFCNT_DEBUG
atomic_t socks ;
# endif
2005-04-17 02:20:36 +04:00
struct {
int inuse ;
u8 __pad [ SMP_CACHE_BYTES - sizeof ( int ) ] ;
} stats [ NR_CPUS ] ;
} ;
/*
 * Protocol registration entry points; both are defined outside this header.
 * NOTE(review): alloc_slab presumably asks proto_register() to create the
 * protocol's slab cache - confirm against the implementation.
 */
extern int proto_register ( struct proto * prot , int alloc_slab ) ;
extern void proto_unregister ( struct proto * prot ) ;
2005-08-10 06:45:38 +04:00
# ifdef SOCK_REFCNT_DEBUG
static inline void sk_refcnt_debug_inc ( struct sock * sk )
{
atomic_inc ( & sk - > sk_prot - > socks ) ;
}
/*
 * Debug accounting: drop one socket from the per-protocol counter and log
 * the protocol name, the socket pointer and the remaining count.
 */
static inline void sk_refcnt_debug_dec ( struct sock * sk )
{
atomic_dec ( & sk - > sk_prot - > socks ) ;
/* The counter is re-read after the decrement, so the logged value is a snapshot. */
printk ( KERN_DEBUG " %s socket %p released, %d are still alive \n " ,
sk - > sk_prot - > name , sk , atomic_read ( & sk - > sk_prot - > socks ) ) ;
}
/*
 * Debug check: log a message when the socket's reference count is anything
 * other than 1, reporting the protocol name, the socket and the count.
 */
static inline void sk_refcnt_debug_release ( const struct sock * sk )
{
if ( atomic_read ( & sk - > sk_refcnt ) ! = 1 )
printk ( KERN_DEBUG " Destruction of the %s socket %p delayed, refcnt=%d \n " ,
sk - > sk_prot - > name , sk , atomic_read ( & sk - > sk_refcnt ) ) ;
}
# else /* SOCK_REFCNT_DEBUG */
/* Without SOCK_REFCNT_DEBUG the three debug hooks expand to empty statements. */
# define sk_refcnt_debug_inc(sk) do { } while (0)
# define sk_refcnt_debug_dec(sk) do { } while (0)
# define sk_refcnt_debug_release(sk) do { } while (0)
# endif /* SOCK_REFCNT_DEBUG */
2005-04-17 02:20:36 +04:00
/* Called with local bh disabled */
static __inline__ void sock_prot_inc_use ( struct proto * prot )
{
prot - > stats [ smp_processor_id ( ) ] . inuse + + ;
}
static __inline__ void sock_prot_dec_use ( struct proto * prot )
{
prot - > stats [ smp_processor_id ( ) ] . inuse - - ;
}
2005-08-10 06:47:37 +04:00
/*
 * Re-hash a socket by calling its protocol's unhash() hook and then its
 * hash() hook. The two calls are separate steps, so the rehash as a whole is
 * not atomic; as the original note puts it, with per-bucket locks the
 * operation is not atomic anyway, so this version is no worse.
 */
static inline void __sk_prot_rehash ( struct sock * sk )
{
sk - > sk_prot - > unhash ( sk ) ;
sk - > sk_prot - > hash ( sk ) ;
}
2005-04-17 02:20:36 +04:00
/* About 10 seconds, expressed in jiffies (10 * HZ). */
# define SOCK_DESTROY_TIME (10*HZ)
/* Ports 0-1023 can't be bound to unless you are superuser */
# define PROT_SOCK 1024
/* Shutdown bits tested against sk->sk_shutdown; SHUTDOWN_MASK is RCV_SHUTDOWN | SEND_SHUTDOWN. */
# define SHUTDOWN_MASK 3
# define RCV_SHUTDOWN 1
# define SEND_SHUTDOWN 2
/*
 * Lock bits; SOCK_SNDBUF_LOCK is tested against sk->sk_userlocks in
 * sk_stream_moderate_sndbuf(). NOTE(review): the other three are presumably
 * sk_userlocks bits as well - confirm.
 */
# define SOCK_SNDBUF_LOCK 1
# define SOCK_RCVBUF_LOCK 2
# define SOCK_BINDADDR_LOCK 4
# define SOCK_BINDPORT_LOCK 8
/* sock_iocb: per-request state used to kick off async processing of socket I/Os */
struct sock_iocb {
	struct list_head	list;
	int			flags;
	int			size;
	struct socket		*sock;
	struct sock		*sk;
	struct scm_cookie	*scm;
	struct msghdr		*msg;
	struct msghdr		async_msg;
	struct kiocb		*kiocb;	/* handed back by siocb_to_kiocb() */
};
static inline struct sock_iocb * kiocb_to_siocb ( struct kiocb * iocb )
{
return ( struct sock_iocb * ) iocb - > private ;
}
static inline struct kiocb * siocb_to_kiocb ( struct sock_iocb * si )
{
return si - > kiocb ;
}
/*
 * A socket and a VFS inode held together in one object; SOCKET_I() and
 * SOCK_INODE() convert between the two members with container_of().
 */
struct socket_alloc {
struct socket socket ;
struct inode vfs_inode ;
} ;
static inline struct socket * SOCKET_I ( struct inode * inode )
{
return & container_of ( inode , struct socket_alloc , vfs_inode ) - > socket ;
}
static inline struct inode * SOCK_INODE ( struct socket * socket )
{
return & container_of ( socket , struct socket_alloc , socket ) - > vfs_inode ;
}
/*
 * Stream memory accounting helpers, defined outside this header. Within this
 * header sk_stream_mem_schedule() is called with kind 1 from the receive
 * path and kind 0 from the send path.
 */
extern void __sk_stream_mem_reclaim ( struct sock * sk ) ;
extern int sk_stream_mem_schedule ( struct sock * sk , int size , int kind ) ;
/* Accounting granularity: PAGE_SIZE, cast to int. */
# define SK_STREAM_MEM_QUANTUM ((int)PAGE_SIZE)
/*
 * Convert @amt into a count of SK_STREAM_MEM_QUANTUM units using the usual
 * add-then-divide form (rounds up for positive @amt).
 */
static inline int sk_stream_pages(int amt)
{
	const int quantum = SK_STREAM_MEM_QUANTUM;

	return (amt + quantum - 1) / quantum;
}
static inline void sk_stream_mem_reclaim ( struct sock * sk )
{
if ( sk - > sk_forward_alloc > = SK_STREAM_MEM_QUANTUM )
__sk_stream_mem_reclaim ( sk ) ;
}
static inline int sk_stream_rmem_schedule ( struct sock * sk , struct sk_buff * skb )
{
return ( int ) skb - > truesize < = sk - > sk_forward_alloc | |
sk_stream_mem_schedule ( sk , skb - > truesize , 1 ) ;
}
2005-09-02 04:48:23 +04:00
static inline int sk_stream_wmem_schedule ( struct sock * sk , int size )
{
return size < = sk - > sk_forward_alloc | |
sk_stream_mem_schedule ( sk , size , 0 ) ;
}
2005-04-17 02:20:36 +04:00
/*
 * Used by processes to "lock" a socket state, so that interrupts and bottom
 * half handlers won't change it from under us. It essentially blocks any
 * incoming packets, so that we won't get any new data or any packets that
 * change the state of the socket.
 *
 * While locked, BH processing will add new packets to the backlog queue.
 * This queue is processed by the owner of the socket lock right before it
 * is released.
 *
 * Since ~2.3.5 it is also an exclusive sleep lock serializing accesses from
 * user process context.
 *
 * sock_owned_by_user() evaluates to the owner field of the socket lock.
 */
# define sock_owned_by_user(sk) ((sk)->sk_lock.owner)
2006-12-07 07:35:24 +03:00
/*
 * Initialise sk->sk_lock: clear the owner, set up the wait queue and the
 * spinlock, then register the lockdep classes.
 *
 * Macro so as to not evaluate some arguments when lockdep is not enabled.
 *
 * Mark both the sk_lock and the sk_lock.slock as a per-address-family lock
 * class: (skey, sname) name the spinlock class, (key, name) the sk_lock
 * dep_map.
 *
 * Every use of the @sk argument is parenthesized so that an expression
 * argument (e.g. a cast or pointer arithmetic) expands correctly.
 */
#define sock_lock_init_class_and_name(sk, sname, skey, name, key)	\
do {									\
	(sk)->sk_lock.owner = NULL;					\
	init_waitqueue_head(&(sk)->sk_lock.wq);				\
	spin_lock_init(&(sk)->sk_lock.slock);				\
	debug_check_no_locks_freed((void *)&(sk)->sk_lock,		\
				   sizeof((sk)->sk_lock));		\
	lockdep_set_class_and_name(&(sk)->sk_lock.slock,		\
				   (skey), (sname));			\
	lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0);	\
} while (0)
2006-11-09 09:44:35 +03:00
/*
 * lock_sock_nested() is defined outside this header; lock_sock() is the
 * plain form, which passes subclass 0.
 */
extern void FASTCALL ( lock_sock_nested ( struct sock * sk , int subclass ) ) ;
static inline void lock_sock ( struct sock * sk )
{
lock_sock_nested ( sk , 0 ) ;
}
2005-04-17 02:20:36 +04:00
/* Counterpart of lock_sock(); defined outside this header. */
extern void FASTCALL ( release_sock ( struct sock * sk ) ) ;
/*
 * BH context may only use the following locking interface.
 * All three macros operate on the sk_lock.slock spinlock.
 */
# define bh_lock_sock(__sk) spin_lock(&((__sk)->sk_lock.slock))
2006-07-03 11:25:13 +04:00
/* Same lock, taken through spin_lock_nested() with SINGLE_DEPTH_NESTING. */
# define bh_lock_sock_nested(__sk) \
spin_lock_nested ( & ( ( __sk ) - > sk_lock . slock ) , \
SINGLE_DEPTH_NESTING )
2005-04-17 02:20:36 +04:00
# define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock))
2005-07-09 01:57:47 +04:00
/*
 * Core socket allocation, buffer and option helpers. All of these are only
 * declared here; the definitions live outside this header.
 */
extern struct sock * sk_alloc ( int family ,
2005-10-07 10:46:04 +04:00
gfp_t priority ,
2005-04-17 02:20:36 +04:00
struct proto * prot , int zero_it ) ;
/* Called by sock_put() when the last reference is dropped. */
extern void sk_free ( struct sock * sk ) ;
2005-08-10 07:10:12 +04:00
extern struct sock * sk_clone ( const struct sock * sk ,
2005-10-07 10:46:04 +04:00
const gfp_t priority ) ;
2005-04-17 02:20:36 +04:00
extern struct sk_buff * sock_wmalloc ( struct sock * sk ,
unsigned long size , int force ,
2005-10-07 10:46:04 +04:00
gfp_t priority ) ;
2005-04-17 02:20:36 +04:00
extern struct sk_buff * sock_rmalloc ( struct sock * sk ,
unsigned long size , int force ,
2005-10-07 10:46:04 +04:00
gfp_t priority ) ;
2005-04-17 02:20:36 +04:00
/* Installed as skb destructors by skb_set_owner_w() and skb_set_owner_r(). */
extern void sock_wfree ( struct sk_buff * skb ) ;
extern void sock_rfree ( struct sk_buff * skb ) ;
extern int sock_setsockopt ( struct socket * sock , int level ,
int op , char __user * optval ,
int optlen ) ;
extern int sock_getsockopt ( struct socket * sock , int level ,
int op , char __user * optval ,
int __user * optlen ) ;
extern struct sk_buff * sock_alloc_send_skb ( struct sock * sk ,
unsigned long size ,
int noblock ,
int * errcode ) ;
2005-07-09 01:57:47 +04:00
extern void * sock_kmalloc ( struct sock * sk , int size ,
2005-10-07 10:46:04 +04:00
gfp_t priority ) ;
2005-04-17 02:20:36 +04:00
extern void sock_kfree_s ( struct sock * sk , void * mem , int size ) ;
extern void sk_send_sigurg ( struct sock * sk ) ;
/*
 * Functions to fill in entries in struct proto_ops when a protocol
 * does not implement a particular function. Each sock_no_<op>() has the
 * signature of the corresponding operation; the definitions live outside
 * this header.
 */
extern int sock_no_bind ( struct socket * ,
struct sockaddr * , int ) ;
extern int sock_no_connect ( struct socket * ,
struct sockaddr * , int , int ) ;
extern int sock_no_socketpair ( struct socket * ,
struct socket * ) ;
extern int sock_no_accept ( struct socket * ,
struct socket * , int ) ;
extern int sock_no_getname ( struct socket * ,
struct sockaddr * , int * , int ) ;
extern unsigned int sock_no_poll ( struct file * , struct socket * ,
struct poll_table_struct * ) ;
extern int sock_no_ioctl ( struct socket * , unsigned int ,
unsigned long ) ;
extern int sock_no_listen ( struct socket * , int ) ;
extern int sock_no_shutdown ( struct socket * , int ) ;
extern int sock_no_getsockopt ( struct socket * , int , int ,
char __user * , int __user * ) ;
extern int sock_no_setsockopt ( struct socket * , int , int ,
char __user * , int ) ;
extern int sock_no_sendmsg ( struct kiocb * , struct socket * ,
struct msghdr * , size_t ) ;
extern int sock_no_recvmsg ( struct kiocb * , struct socket * ,
struct msghdr * , size_t , int ) ;
extern int sock_no_mmap ( struct file * file ,
struct socket * sock ,
struct vm_area_struct * vma ) ;
extern ssize_t sock_no_sendpage ( struct socket * sock ,
struct page * page ,
int offset , size_t size ,
int flags ) ;
/*
 * Functions to fill in entries in struct proto_ops when a protocol
 * uses the inet style. Declarations only; the definitions live outside
 * this header.
 */
extern int sock_common_getsockopt ( struct socket * sock , int level , int optname ,
char __user * optval , int __user * optlen ) ;
extern int sock_common_recvmsg ( struct kiocb * iocb , struct socket * sock ,
struct msghdr * msg , size_t size , int flags ) ;
extern int sock_common_setsockopt ( struct socket * sock , int level , int optname ,
char __user * optval , int optlen ) ;
2006-03-21 09:45:21 +03:00
/* compat_ variants with the same parameter lists as the two option calls above. */
extern int compat_sock_common_getsockopt ( struct socket * sock , int level ,
int optname , char __user * optval , int __user * optlen ) ;
extern int compat_sock_common_setsockopt ( struct socket * sock , int level ,
int optname , char __user * optval , int optlen ) ;
2005-04-17 02:20:36 +04:00
extern void sk_common_release ( struct sock * sk ) ;
/*
 * Default socket callbacks and setup code
 */
/* Initialise core socket variables; defined outside this header. */
extern void sock_init_data ( struct socket * sock , struct sock * sk ) ;
/**
* sk_filter - run a packet through a socket filter
* @ sk : sock associated with & sk_buff
* @ skb : buffer to filter
* @ needlock : set to 1 if the sock is not locked by caller .
*
* Run the filter code and then cut skb - > data to correct size returned by
* sk_run_filter . If pkt_len is 0 we toss packet . If skb - > len is smaller
* than pkt_len we keep whole skb - > data . This is the socket level
* wrapper to sk_run_filter . It returns 0 if the packet should
* be accepted or - EPERM if the packet should be tossed .
*
*/
2006-09-01 02:28:39 +04:00
static inline int sk_filter ( struct sock * sk , struct sk_buff * skb )
2005-04-17 02:20:36 +04:00
{
int err ;
2006-09-01 02:28:39 +04:00
struct sk_filter * filter ;
2005-04-17 02:20:36 +04:00
err = security_sock_rcv_skb ( sk , skb ) ;
if ( err )
return err ;
2006-09-01 02:28:39 +04:00
rcu_read_lock_bh ( ) ;
filter = sk - > sk_filter ;
if ( filter ) {
unsigned int pkt_len = sk_run_filter ( skb , filter - > insns ,
filter - > len ) ;
err = pkt_len ? pskb_trim ( skb , pkt_len ) : - EPERM ;
2005-04-17 02:20:36 +04:00
}
2006-09-01 02:28:39 +04:00
rcu_read_unlock_bh ( ) ;
2005-04-17 02:20:36 +04:00
return err ;
}
/**
2006-11-24 04:56:13 +03:00
* sk_filter_rcu_free : Free a socket filter
* @ rcu : rcu_head that contains the sk_filter to free
2005-04-17 02:20:36 +04:00
*/
2006-09-01 02:28:39 +04:00
static inline void sk_filter_rcu_free ( struct rcu_head * rcu )
{
struct sk_filter * fp = container_of ( rcu , struct sk_filter , rcu ) ;
kfree ( fp ) ;
}
2006-11-24 04:56:13 +03:00
/**
* sk_filter_release : Release a socket filter
* @ sk : socket
* @ fp : filter to remove
*
* Remove a filter from a socket and release its resources .
*/
2005-04-17 02:20:36 +04:00
static inline void sk_filter_release ( struct sock * sk , struct sk_filter * fp )
{
unsigned int size = sk_filter_len ( fp ) ;
atomic_sub ( size , & sk - > sk_omem_alloc ) ;
if ( atomic_dec_and_test ( & fp - > refcnt ) )
2006-09-01 02:28:39 +04:00
call_rcu_bh ( & fp - > rcu , sk_filter_rcu_free ) ;
2005-04-17 02:20:36 +04:00
}
/*
 * Take a reference on filter @fp and charge its size (sk_filter_len()) to
 * the socket's sk_omem_alloc counter. sk_filter_release() undoes both.
 */
static inline void sk_filter_charge ( struct sock * sk , struct sk_filter * fp )
{
atomic_inc ( & fp - > refcnt ) ;
atomic_add ( sk_filter_len ( fp ) , & sk - > sk_omem_alloc ) ;
}
/*
* Socket reference counting postulates .
*
* * Each user of socket SHOULD hold a reference count .
 * * Each access point to socket (a hash table bucket, reference from a list,
 *   running timer, skb in flight) MUST hold a reference count.
* * When reference count hits 0 , it means it will never increase back .
* * When reference count hits 0 , it means that no references from
* outside exist to this socket and current process on current CPU
* is last user and may / should destroy this socket .
* * sk_free is called from any context : process , BH , IRQ . When
* it is called , socket has no references from outside - > sk_free
* may release descendant resources allocated by the socket , but
* to the time when it is called , socket is NOT referenced by any
* hash tables , lists etc .
* * Packets , delivered from outside ( from network or from another process )
* and enqueued on receive / error queues SHOULD NOT grab reference count ,
 *   when they sit in queue. Otherwise, packets will leak into a hole when
 *   the socket is looked up by one CPU and unhashing is done by another CPU.
* It is true for udp / raw , netlink ( leak to receive and error queues ) , tcp
* ( leak to backlog ) . Packet socket does all the processing inside
* BR_NETPROTO_LOCK , so that it has not this race condition . UNIX sockets
* use separate SMP lock , so that they are prone too .
*/
/* Ungrab socket and destroy it, if it was the last reference. */
static inline void sock_put ( struct sock * sk )
{
if ( atomic_dec_and_test ( & sk - > sk_refcnt ) )
sk_free ( sk ) ;
}
2006-11-16 19:06:06 +03:00
/* Defined outside this header. */
extern int sk_receive_skb ( struct sock * sk , struct sk_buff * skb ,
const int nested ) ;
2005-12-27 07:42:22 +03:00
2005-04-17 02:20:36 +04:00
/*
 * Detach socket from process context.
 * Announce socket dead, detach it from wait queue and inode.
 * Note that parent inode held reference count on this struct sock,
 * we do not release it in this function, because protocol
 * probably wants some additional cleanups or even continuing
 * to work with this socket (TCP).
 */
static inline void sock_orphan ( struct sock * sk )
{
/* All three updates happen under the callback lock, with BHs disabled. */
write_lock_bh ( & sk - > sk_callback_lock ) ;
sock_set_flag ( sk , SOCK_DEAD ) ;
sk - > sk_socket = NULL ;
sk - > sk_sleep = NULL ;
write_unlock_bh ( & sk - > sk_callback_lock ) ;
}
/*
 * Attach @sk to the struct socket @parent: point sk_sleep at the parent's
 * wait queue, link the two objects to each other and notify the security
 * hook, all under the callback lock with BHs disabled.
 */
static inline void sock_graft ( struct sock * sk , struct socket * parent )
{
write_lock_bh ( & sk - > sk_callback_lock ) ;
sk - > sk_sleep = & parent - > wait ;
parent - > sk = sk ;
sk - > sk_socket = parent ;
2006-07-25 10:32:50 +04:00
security_sock_graft ( sk , parent ) ;
2005-04-17 02:20:36 +04:00
write_unlock_bh ( & sk - > sk_callback_lock ) ;
}
2006-08-05 10:08:56 +04:00
/*
 * Copy @osk into @nsk; the number of bytes copied is the obj_size of
 * osk's protocol. With CONFIG_SECURITY_NETWORK, nsk's own sk_security
 * pointer is saved before the memcpy() and restored afterwards so the copy
 * does not overwrite it, then security_sk_clone() is called.
 */
static inline void sock_copy ( struct sock * nsk , const struct sock * osk )
{
# ifdef CONFIG_SECURITY_NETWORK
void * sptr = nsk - > sk_security ;
# endif
memcpy ( nsk , osk , osk - > sk_prot - > obj_size ) ;
# ifdef CONFIG_SECURITY_NETWORK
nsk - > sk_security = sptr ;
security_sk_clone ( osk , nsk ) ;
# endif
}
2005-04-17 02:20:36 +04:00
/*
 * Defined outside this header.
 * NOTE(review): presumably return the uid and inode number of the socket's
 * inode - confirm against the implementation.
 */
extern int sock_i_uid ( struct sock * sk ) ;
extern unsigned long sock_i_ino ( struct sock * sk ) ;
static inline struct dst_entry *
__sk_dst_get ( struct sock * sk )
{
return sk - > sk_dst_cache ;
}
static inline struct dst_entry *
sk_dst_get ( struct sock * sk )
{
struct dst_entry * dst ;
read_lock ( & sk - > sk_dst_lock ) ;
dst = sk - > sk_dst_cache ;
if ( dst )
dst_hold ( dst ) ;
read_unlock ( & sk - > sk_dst_lock ) ;
return dst ;
}
static inline void
__sk_dst_set ( struct sock * sk , struct dst_entry * dst )
{
struct dst_entry * old_dst ;
old_dst = sk - > sk_dst_cache ;
sk - > sk_dst_cache = dst ;
dst_release ( old_dst ) ;
}
/* Locked form of __sk_dst_set(): runs it under the sk_dst_lock write lock. */
static inline void
sk_dst_set ( struct sock * sk , struct dst_entry * dst )
{
write_lock ( & sk - > sk_dst_lock ) ;
__sk_dst_set ( sk , dst ) ;
write_unlock ( & sk - > sk_dst_lock ) ;
}
static inline void
__sk_dst_reset ( struct sock * sk )
{
struct dst_entry * old_dst ;
old_dst = sk - > sk_dst_cache ;
sk - > sk_dst_cache = NULL ;
dst_release ( old_dst ) ;
}
/* Locked form of __sk_dst_reset(): runs it under the sk_dst_lock write lock. */
static inline void
sk_dst_reset ( struct sock * sk )
{
write_lock ( & sk - > sk_dst_lock ) ;
__sk_dst_reset ( sk ) ;
write_unlock ( & sk - > sk_dst_lock ) ;
}
2006-03-28 13:08:21 +04:00
/*
 * Defined outside this header.
 * NOTE(review): by analogy with the other __sk_dst_*()/sk_dst_*() pairs in
 * this header, the __ form is presumably the variant without locking - confirm.
 */
extern struct dst_entry * __sk_dst_check ( struct sock * sk , u32 cookie ) ;
2005-04-17 02:20:36 +04:00
2006-03-28 13:08:21 +04:00
extern struct dst_entry * sk_dst_check ( struct sock * sk , u32 cookie ) ;
2005-04-17 02:20:36 +04:00
2006-07-01 00:36:35 +04:00
/* Result of net_gso_ok() for the socket's route capabilities and GSO type. */
static inline int sk_can_gso ( const struct sock * sk )
{
return net_gso_ok ( sk - > sk_route_caps , sk - > sk_gso_type ) ;
}
2005-08-10 06:49:02 +04:00
static inline void sk_setup_caps ( struct sock * sk , struct dst_entry * dst )
{
__sk_dst_set ( sk , dst ) ;
sk - > sk_route_caps = dst - > dev - > features ;
2006-06-22 14:07:29 +04:00
if ( sk - > sk_route_caps & NETIF_F_GSO )
2006-07-01 00:36:35 +04:00
sk - > sk_route_caps | = NETIF_F_GSO_MASK ;
if ( sk_can_gso ( sk ) ) {
2006-06-29 23:30:00 +04:00
if ( dst - > header_len )
2006-07-01 00:36:35 +04:00
sk - > sk_route_caps & = ~ NETIF_F_GSO_MASK ;
2006-06-22 14:07:29 +04:00
else
sk - > sk_route_caps | = NETIF_F_SG | NETIF_F_HW_CSUM ;
2005-08-10 06:49:02 +04:00
}
}
2005-04-17 02:20:36 +04:00
/*
 * Account @skb to the socket: add its truesize to sk_wmem_queued and take
 * the same amount out of sk_forward_alloc.
 */
static inline void sk_charge_skb ( struct sock * sk , struct sk_buff * skb )
{
sk - > sk_wmem_queued + = skb - > truesize ;
sk - > sk_forward_alloc - = skb - > truesize ;
}
/*
 * Copy @copy bytes from the user buffer @from into @page at offset @off and
 * account them to @skb and @sk.
 *
 * When skb->ip_summed is CHECKSUM_NONE the data is checksummed while it is
 * copied and the result is folded into skb->csum at offset skb->len.
 * Otherwise a plain copy_from_user() is used.
 *
 * Returns 0 on success, the error reported by csum_and_copy_from_user(), or
 * -EFAULT when copy_from_user() fails. Nothing is accounted on failure.
 */
static inline int skb_copy_to_page ( struct sock * sk , char __user * from ,
struct sk_buff * skb , struct page * page ,
int off , int copy )
{
if ( skb - > ip_summed = = CHECKSUM_NONE ) {
int err = 0 ;
2006-11-15 08:36:34 +03:00
__wsum csum = csum_and_copy_from_user ( from ,
2005-04-17 02:20:36 +04:00
page_address ( page ) + off ,
copy , 0 , & err ) ;
if ( err )
return err ;
skb - > csum = csum_block_add ( skb - > csum , csum , skb - > len ) ;
} else if ( copy_from_user ( page_address ( page ) + off , from , copy ) )
return - EFAULT ;
/* Grow the skb's length counters and charge the bytes to the socket. */
skb - > len + = copy ;
skb - > data_len + = copy ;
skb - > truesize + = copy ;
sk - > sk_wmem_queued + = copy ;
sk - > sk_forward_alloc - = copy ;
return 0 ;
}
/*
* Queue a received datagram if it will fit . Stream and sequenced
* protocols can ' t normally use this as they need to fit buffers in
* and play with them .
*
* Inlined as it ' s very short and called for pretty much every
* packet ever received .
*/
/*
 * Make @sk the owner of @skb on the write side: take a socket reference,
 * install sock_wfree() as the skb destructor and add the skb's truesize to
 * sk_wmem_alloc.
 */
static inline void skb_set_owner_w ( struct sk_buff * skb , struct sock * sk )
{
sock_hold ( sk ) ;
skb - > sk = sk ;
skb - > destructor = sock_wfree ;
atomic_add ( skb - > truesize , & sk - > sk_wmem_alloc ) ;
}
/*
 * Make @sk the owner of @skb on the read side: install sock_rfree() as the
 * skb destructor and add the skb's truesize to sk_rmem_alloc. Unlike
 * skb_set_owner_w(), no socket reference is taken here.
 */
static inline void skb_set_owner_r ( struct sk_buff * skb , struct sock * sk )
{
skb - > sk = sk ;
skb - > destructor = sock_rfree ;
atomic_add ( skb - > truesize , & sk - > sk_rmem_alloc ) ;
}
/* Socket timer helpers and receive queueing; defined outside this header. */
extern void sk_reset_timer ( struct sock * sk , struct timer_list * timer ,
unsigned long expires ) ;
extern void sk_stop_timer ( struct sock * sk , struct timer_list * timer ) ;
2006-03-28 13:08:21 +04:00
extern int sock_queue_rcv_skb ( struct sock * sk , struct sk_buff * skb ) ;
2005-04-17 02:20:36 +04:00
/*
 * Queue @skb on the socket's error queue.
 *
 * Returns -ENOMEM when the receive memory already allocated plus the skb's
 * truesize would reach sk_rcvbuf. Otherwise the skb is charged to the
 * socket, appended to sk_error_queue and, unless the socket is flagged
 * SOCK_DEAD, sk_data_ready() is called; the return value is then 0.
 */
static inline int sock_queue_err_skb ( struct sock * sk , struct sk_buff * skb )
{
/* Cast sk->sk_rcvbuf to unsigned... It's pointless, but reduces the
number of warnings when compiling with -W --ANK
*/
if ( atomic_read ( & sk - > sk_rmem_alloc ) + skb - > truesize > =
( unsigned ) sk - > sk_rcvbuf )
return - ENOMEM ;
skb_set_owner_r ( skb , sk ) ;
skb_queue_tail ( & sk - > sk_error_queue , skb ) ;
if ( ! sock_flag ( sk , SOCK_DEAD ) )
sk - > sk_data_ready ( sk , skb - > len ) ;
return 0 ;
}
/*
* Recover an error report and clear atomically
*/
static inline int sock_error ( struct sock * sk )
{
2005-12-14 10:22:19 +03:00
int err ;
if ( likely ( ! sk - > sk_err ) )
return 0 ;
err = xchg ( & sk - > sk_err , 0 ) ;
2005-04-17 02:20:36 +04:00
return - err ;
}
static inline unsigned long sock_wspace ( struct sock * sk )
{
int amt = 0 ;
if ( ! ( sk - > sk_shutdown & SEND_SHUTDOWN ) ) {
amt = sk - > sk_sndbuf - atomic_read ( & sk - > sk_wmem_alloc ) ;
if ( amt < 0 )
amt = 0 ;
}
return amt ;
}
static inline void sk_wake_async ( struct sock * sk , int how , int band )
{
if ( sk - > sk_socket & & sk - > sk_socket - > fasync_list )
sock_wake_async ( sk - > sk_socket , how , band ) ;
}
/*
 * Buffer size minimums. SOCK_MIN_SNDBUF is the floor applied to sk_sndbuf
 * by sk_stream_moderate_sndbuf().
 */
# define SOCK_MIN_SNDBUF 2048
# define SOCK_MIN_RCVBUF 256
static inline void sk_stream_moderate_sndbuf ( struct sock * sk )
{
if ( ! ( sk - > sk_userlocks & SOCK_SNDBUF_LOCK ) ) {
sk - > sk_sndbuf = min ( sk - > sk_sndbuf , sk - > sk_wmem_queued / 2 ) ;
sk - > sk_sndbuf = max ( sk - > sk_sndbuf , SOCK_MIN_SNDBUF ) ;
}
}
static inline struct sk_buff * sk_stream_alloc_pskb ( struct sock * sk ,
2005-07-09 01:57:47 +04:00
int size , int mem ,
2005-10-07 10:46:04 +04:00
gfp_t gfp )
2005-04-17 02:20:36 +04:00
{
2005-07-06 02:17:25 +04:00
struct sk_buff * skb ;
int hdr_len ;
2005-04-17 02:20:36 +04:00
2005-07-06 02:17:25 +04:00
hdr_len = SKB_DATA_ALIGN ( sk - > sk_prot - > max_header ) ;
2005-08-18 01:57:30 +04:00
skb = alloc_skb_fclone ( size + hdr_len , gfp ) ;
2005-04-17 02:20:36 +04:00
if ( skb ) {
skb - > truesize + = mem ;
2005-09-02 04:48:23 +04:00
if ( sk_stream_wmem_schedule ( sk , skb - > truesize ) ) {
2005-07-06 02:17:25 +04:00
skb_reserve ( skb , hdr_len ) ;
2005-04-17 02:20:36 +04:00
return skb ;
}
__kfree_skb ( skb ) ;
} else {
sk - > sk_prot - > enter_memory_pressure ( ) ;
sk_stream_moderate_sndbuf ( sk ) ;
}
return NULL ;
}
static inline struct sk_buff * sk_stream_alloc_skb ( struct sock * sk ,
2005-07-09 01:57:47 +04:00
int size ,
2005-10-07 10:46:04 +04:00
gfp_t gfp )
2005-04-17 02:20:36 +04:00
{
return sk_stream_alloc_pskb ( sk , size , 0 , gfp ) ;
}
static inline struct page * sk_stream_alloc_page ( struct sock * sk )
{
struct page * page = NULL ;
2005-09-02 04:48:59 +04:00
page = alloc_pages ( sk - > sk_allocation , 0 ) ;
if ( ! page ) {
2005-04-17 02:20:36 +04:00
sk - > sk_prot - > enter_memory_pressure ( ) ;
sk_stream_moderate_sndbuf ( sk ) ;
}
return page ;
}
/*
 * Default write policy as shown to user space via poll/select/SIGIO:
 * the socket counts as writeable while the allocated write memory is below
 * half of sk_sndbuf.
 */
static inline int sock_writeable ( const struct sock * sk )
{
return atomic_read ( & sk - > sk_wmem_alloc ) < ( sk - > sk_sndbuf / 2 ) ;
}
2005-10-07 10:46:04 +04:00
/*
 * Pick an allocation mask for the current context: GFP_ATOMIC when
 * in_atomic() reports true, GFP_KERNEL otherwise.
 * NOTE(review): whether in_atomic() reliably detects every non-sleepable
 * context (e.g. spinlocks held on non-preemptible kernels) is not visible
 * from this header - confirm before relying on the GFP_KERNEL branch.
 */
static inline gfp_t gfp_any ( void )
2005-04-17 02:20:36 +04:00
{
2007-02-27 20:56:42 +03:00
return in_atomic ( ) ? GFP_ATOMIC : GFP_KERNEL ;
2005-04-17 02:20:36 +04:00
}
static inline long sock_rcvtimeo ( const struct sock * sk , int noblock )
{
return noblock ? 0 : sk - > sk_rcvtimeo ;
}
static inline long sock_sndtimeo ( const struct sock * sk , int noblock )
{
return noblock ? 0 : sk - > sk_sndtimeo ;
}
/*
 * Receive low-water mark for a read of @len: @len itself when @waitall is
 * set, otherwise the smaller of sk_rcvlowat and @len. A result of 0 is
 * replaced by 1.
 */
static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len)
{
	int target = waitall ? len : min_t(int, sk->sk_rcvlowat, len);

	return target ? target : 1;
}
/*
 * Alas, with timeout socket operations are not restartable.
 * Compare this to poll().
 *
 * Returns -ERESTARTSYS only for an unbounded wait (MAX_SCHEDULE_TIMEOUT),
 * and -EINTR for any other timeout value.
 */
static inline int sock_intr_errno(long timeo)
{
	if (timeo == MAX_SCHEDULE_TIMEOUT)
		return -ERESTARTSYS;

	return -EINTR;
}
2007-03-26 09:14:49 +04:00
/* Called by sock_recv_timestamp() when SOCK_RCVTSTAMP is set; defined outside this header. */
extern void __sock_recv_timestamp ( struct msghdr * msg , struct sock * sk ,
struct sk_buff * skb ) ;
2005-04-17 02:20:36 +04:00
static __inline__ void
sock_recv_timestamp ( struct msghdr * msg , struct sock * sk , struct sk_buff * skb )
{
2007-04-20 03:16:32 +04:00
ktime_t kt = skb - > tstamp ;
2005-08-15 04:24:31 +04:00
2007-03-26 09:14:49 +04:00
if ( sock_flag ( sk , SOCK_RCVTSTAMP ) )
__sock_recv_timestamp ( msg , sk , skb ) ;
else
2007-04-20 03:16:32 +04:00
sk - > sk_stamp = kt ;
2005-04-17 02:20:36 +04:00
}
/**
* sk_eat_skb - Release a skb if it is no longer needed
2005-05-01 19:59:25 +04:00
* @ sk : socket to eat this skb from
* @ skb : socket buffer to eat
2006-06-23 03:00:11 +04:00
* @ copied_early : flag indicating whether DMA operations copied this data early
2005-04-17 02:20:36 +04:00
*
* This routine must be called with interrupts disabled or with the socket
* locked so that the sk_buff queue operation is ok .
*/
2006-05-24 05:01:28 +04:00
# ifdef CONFIG_NET_DMA
static inline void sk_eat_skb ( struct sock * sk , struct sk_buff * skb , int copied_early )
{
__skb_unlink ( skb , & sk - > sk_receive_queue ) ;
if ( ! copied_early )
__kfree_skb ( skb ) ;
else
__skb_queue_tail ( & sk - > sk_async_wait_queue , skb ) ;
}
# else
static inline void sk_eat_skb ( struct sock * sk , struct sk_buff * skb , int copied_early )
2005-04-17 02:20:36 +04:00
{
__skb_unlink ( skb , & sk - > sk_receive_queue ) ;
__kfree_skb ( skb ) ;
}
2006-05-24 05:01:28 +04:00
# endif
2005-04-17 02:20:36 +04:00
/*
 * Timestamp helpers defined outside this header. The two getters write to a
 * user-space struct timeval and struct timespec respectively.
 */
extern void sock_enable_timestamp ( struct sock * sk ) ;
extern int sock_get_timestamp ( struct sock * , struct timeval __user * ) ;
2007-03-19 03:33:16 +03:00
extern int sock_get_timestampns ( struct sock * , struct timespec __user * ) ;
2005-04-17 02:20:36 +04:00
/*
 * Enable debug/info messages.
 *
 * Both macros print through printk() only while net_msg_warn is non-zero;
 * LIMIT_NETDEBUG additionally requires net_ratelimit() to allow the message.
 */
2007-03-09 07:41:08 +03:00
extern int net_msg_warn ;
# define NETDEBUG(fmt, args...) \
do { if ( net_msg_warn ) printk ( fmt , # # args ) ; } while ( 0 )
2005-04-17 02:20:36 +04:00
2007-03-09 07:41:08 +03:00
# define LIMIT_NETDEBUG(fmt, args...) \
do { if ( net_msg_warn & & net_ratelimit ( ) ) printk ( fmt , # # args ) ; } while ( 0 )
2005-04-17 02:20:36 +04:00
/*
 * Macros for sleeping on a socket. Use them like this:
 *
 * SOCK_SLEEP_PRE(sk)
 * if (condition)
 *	schedule();
 * SOCK_SLEEP_POST(sk)
 *
 * SOCK_SLEEP_PRE opens a brace that only SOCK_SLEEP_POST closes, and POST
 * uses the tsk and wait variables declared by PRE, so the two must always
 * be used as a pair within one function.
 *
 * N.B. These are now obsolete and were, afaik, only ever used in DECnet
 * and when the last use of them in DECnet has gone, I'm intending to
 * remove them.
 */
# define SOCK_SLEEP_PRE(sk) { struct task_struct *tsk = current; \
DECLARE_WAITQUEUE ( wait , tsk ) ; \
tsk - > state = TASK_INTERRUPTIBLE ; \
add_wait_queue ( ( sk ) - > sk_sleep , & wait ) ; \
release_sock ( sk ) ;
# define SOCK_SLEEP_POST(sk) tsk->state = TASK_RUNNING; \
remove_wait_queue ( ( sk ) - > sk_sleep , & wait ) ; \
lock_sock ( sk ) ; \
}
/* Set socket flag @bit when @valbool is non-zero, clear it otherwise. */
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (!valbool) {
		sock_reset_flag(sk, bit);
		return;
	}

	sock_set_flag(sk, bit);
}
/* Declared here, defined outside this header. */
extern __u32 sysctl_wmem_max ;
extern __u32 sysctl_rmem_max ;
/*
 * With CONFIG_NET the real siocdevprivate_ioctl() is provided elsewhere;
 * without it this stub always fails with -ENODEV.
 */
# ifdef CONFIG_NET
int siocdevprivate_ioctl ( unsigned int fd , unsigned int cmd , unsigned long arg ) ;
# else
static inline int siocdevprivate_ioctl ( unsigned int fd , unsigned int cmd , unsigned long arg )
{
return - ENODEV ;
}
# endif
2005-08-16 09:18:02 +04:00
/* Initialisation entry point and sysctl variables; all defined outside this header. */
extern void sk_init ( void ) ;
/* core_table is only declared when CONFIG_SYSCTL is enabled. */
# ifdef CONFIG_SYSCTL
extern struct ctl_table core_table [ ] ;
# endif
2005-09-06 05:14:11 +04:00
extern int sysctl_optmem_max ;
2005-08-16 09:18:02 +04:00
extern __u32 sysctl_wmem_default ;
extern __u32 sysctl_rmem_default ;
2005-04-17 02:20:36 +04:00
# endif /* _SOCK_H */