2005-08-10 07:09:46 +04:00
/*
* INET An implementation of the TCP / IP protocol suite for the LINUX
* operating system . INET is implemented using the BSD Socket
* interface as the means of communication with the user level .
*
* Generic TIME_WAIT sockets functions
*
* From code originally in TCP
*/
2007-08-29 02:50:33 +04:00
# include <linux/kernel.h>
2008-10-18 19:37:51 +04:00
# include <linux/kmemcheck.h>
2005-08-10 07:09:46 +04:00
# include <net/inet_hashtables.h>
# include <net/inet_timewait_sock.h>
2005-08-10 07:45:03 +04:00
# include <net/ip.h>
2005-08-10 07:09:46 +04:00
/* Must be called with locally disabled BHs. */
2007-07-15 06:00:59 +04:00
static void __inet_twsk_kill ( struct inet_timewait_sock * tw ,
struct inet_hashinfo * hashinfo )
2005-08-10 07:09:46 +04:00
{
struct inet_bind_hashbucket * bhead ;
struct inet_bind_bucket * tb ;
/* Unlink from established hashes. */
2008-11-21 07:39:09 +03:00
spinlock_t * lock = inet_ehash_lockp ( hashinfo , tw - > tw_hash ) ;
2005-08-10 07:09:46 +04:00
2008-11-21 07:39:09 +03:00
spin_lock ( lock ) ;
2008-11-17 06:40:17 +03:00
if ( hlist_nulls_unhashed ( & tw - > tw_node ) ) {
2008-11-21 07:39:09 +03:00
spin_unlock ( lock ) ;
2005-08-10 07:09:46 +04:00
return ;
}
2008-11-17 06:40:17 +03:00
hlist_nulls_del_rcu ( & tw - > tw_node ) ;
sk_nulls_node_init ( & tw - > tw_node ) ;
2008-11-21 07:39:09 +03:00
spin_unlock ( lock ) ;
2005-08-10 07:09:46 +04:00
/* Disassociate with bind bucket. */
2008-06-17 04:12:49 +04:00
bhead = & hashinfo - > bhash [ inet_bhashfn ( twsk_net ( tw ) , tw - > tw_num ,
hashinfo - > bhash_size ) ] ;
2005-08-10 07:09:46 +04:00
spin_lock ( & bhead - > lock ) ;
tb = tw - > tw_tb ;
__hlist_del ( & tw - > tw_bind_node ) ;
tw - > tw_tb = NULL ;
inet_bind_bucket_destroy ( hashinfo - > bind_bucket_cachep , tb ) ;
spin_unlock ( & bhead - > lock ) ;
# ifdef SOCK_REFCNT_DEBUG
if ( atomic_read ( & tw - > tw_refcnt ) ! = 1 ) {
printk ( KERN_DEBUG " %s timewait_sock %p refcnt=%d \n " ,
tw - > tw_prot - > name , tw , atomic_read ( & tw - > tw_refcnt ) ) ;
}
# endif
inet_twsk_put ( tw ) ;
}
2009-05-07 03:50:52 +04:00
static noinline void inet_twsk_free ( struct inet_timewait_sock * tw )
2007-12-21 02:32:54 +03:00
{
2009-05-07 03:50:52 +04:00
struct module * owner = tw - > tw_prot - > owner ;
twsk_destructor ( ( struct sock * ) tw ) ;
2007-12-21 02:32:54 +03:00
# ifdef SOCK_REFCNT_DEBUG
2009-05-07 03:50:52 +04:00
pr_debug ( " %s timewait_sock %p released \n " , tw - > tw_prot - > name , tw ) ;
2007-12-21 02:32:54 +03:00
# endif
2009-05-07 03:50:52 +04:00
release_net ( twsk_net ( tw ) ) ;
kmem_cache_free ( tw - > tw_prot - > twsk_prot - > twsk_slab , tw ) ;
module_put ( owner ) ;
}
void inet_twsk_put ( struct inet_timewait_sock * tw )
{
if ( atomic_dec_and_test ( & tw - > tw_refcnt ) )
inet_twsk_free ( tw ) ;
2007-12-21 02:32:54 +03:00
}
EXPORT_SYMBOL_GPL ( inet_twsk_put ) ;
2005-08-10 07:09:46 +04:00
/*
* Enter the time wait state . This is called with locally disabled BH .
* Essentially we whip up a timewait bucket , copy the relevant info into it
* from the SK , and mess with hash chains and list linkage .
*/
void __inet_twsk_hashdance ( struct inet_timewait_sock * tw , struct sock * sk ,
struct inet_hashinfo * hashinfo )
{
const struct inet_sock * inet = inet_sk ( sk ) ;
2005-08-10 07:10:42 +04:00
const struct inet_connection_sock * icsk = inet_csk ( sk ) ;
[INET]: speedup inet (tcp/dccp) lookups
Arnaldo and I agreed it could be applied now, because I have other
pending patches depending on this one (Thank you Arnaldo)
(The other important patch moves skc_refcnt in a separate cache line,
so that the SMP/NUMA performance doesnt suffer from cache line ping pongs)
1) First some performance data :
--------------------------------
tcp_v4_rcv() wastes a *lot* of time in __inet_lookup_established()
The most time critical code is :
sk_for_each(sk, node, &head->chain) {
if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif))
goto hit; /* You sunk my battleship! */
}
The sk_for_each() does use prefetch() hints but only the begining of
"struct sock" is prefetched.
As INET_MATCH first comparison uses inet_sk(__sk)->daddr, wich is far
away from the begining of "struct sock", it has to bring into CPU
cache cold cache line. Each iteration has to use at least 2 cache
lines.
This can be problematic if some chains are very long.
2) The goal
-----------
The idea I had is to change things so that INET_MATCH() may return
FALSE in 99% of cases only using the data already in the CPU cache,
using one cache line per iteration.
3) Description of the patch
---------------------------
Adds a new 'unsigned int skc_hash' field in 'struct sock_common',
filling a 32 bits hole on 64 bits platform.
struct sock_common {
unsigned short skc_family;
volatile unsigned char skc_state;
unsigned char skc_reuse;
int skc_bound_dev_if;
struct hlist_node skc_node;
struct hlist_node skc_bind_node;
atomic_t skc_refcnt;
+ unsigned int skc_hash;
struct proto *skc_prot;
};
Store in this 32 bits field the full hash, not masked by (ehash_size -
1) Using this full hash as the first comparison done in INET_MATCH
permits us immediatly skip the element without touching a second cache
line in case of a miss.
Suppress the sk_hashent/tw_hashent fields since skc_hash (aliased to
sk_hash and tw_hash) already contains the slot number if we mask with
(ehash_size - 1)
File include/net/inet_hashtables.h
64 bits platforms :
#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
(((__sk)->sk_hash == (__hash))
((*((__u64 *)&(inet_sk(__sk)->daddr)))== (__cookie)) && \
((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
32bits platforms:
#define TCP_IPV4_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
(((__sk)->sk_hash == (__hash)) && \
(inet_sk(__sk)->daddr == (__saddr)) && \
(inet_sk(__sk)->rcv_saddr == (__daddr)) && \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
- Adds a prefetch(head->chain.first) in
__inet_lookup_established()/__tcp_v4_check_established() and
__inet6_lookup_established()/__tcp_v6_check_established() and
__dccp_v4_check_established() to bring into cache the first element of the
list, before the {read|write}_lock(&head->lock);
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-10-04 01:13:38 +04:00
struct inet_ehash_bucket * ehead = inet_ehash_bucket ( hashinfo , sk - > sk_hash ) ;
2008-11-21 07:39:09 +03:00
spinlock_t * lock = inet_ehash_lockp ( hashinfo , sk - > sk_hash ) ;
2005-08-10 07:09:46 +04:00
struct inet_bind_hashbucket * bhead ;
/* Step 1: Put TW into bind hash. Original socket stays there too.
Note , that any socket with inet - > num ! = 0 MUST be bound in
binding cache , even if it is closed .
*/
2008-06-17 04:12:49 +04:00
bhead = & hashinfo - > bhash [ inet_bhashfn ( twsk_net ( tw ) , inet - > num ,
hashinfo - > bhash_size ) ] ;
2005-08-10 07:09:46 +04:00
spin_lock ( & bhead - > lock ) ;
2005-08-10 07:10:42 +04:00
tw - > tw_tb = icsk - > icsk_bind_hash ;
2008-07-26 08:43:18 +04:00
WARN_ON ( ! icsk - > icsk_bind_hash ) ;
2005-08-10 07:09:46 +04:00
inet_twsk_add_bind_node ( tw , & tw - > tw_tb - > owners ) ;
spin_unlock ( & bhead - > lock ) ;
2008-11-21 07:39:09 +03:00
spin_lock ( lock ) ;
2005-08-10 07:09:46 +04:00
2008-11-17 06:40:17 +03:00
/*
* Step 2 : Hash TW into TIMEWAIT chain .
* Should be done before removing sk from established chain
* because readers are lockless and search established first .
*/
2005-08-10 07:09:46 +04:00
atomic_inc ( & tw - > tw_refcnt ) ;
2008-11-17 06:40:17 +03:00
inet_twsk_add_node_rcu ( tw , & ehead - > twchain ) ;
/* Step 3: Remove SK from established hash. */
if ( __sk_nulls_del_node_init_rcu ( sk ) )
sock_prot_inuse_add ( sock_net ( sk ) , sk - > sk_prot , - 1 ) ;
2005-08-10 07:09:46 +04:00
2008-11-21 07:39:09 +03:00
spin_unlock ( lock ) ;
2005-08-10 07:09:46 +04:00
}
2005-08-10 07:09:59 +04:00
2005-08-10 07:45:03 +04:00
EXPORT_SYMBOL_GPL ( __inet_twsk_hashdance ) ;
2005-08-10 07:09:59 +04:00
struct inet_timewait_sock * inet_twsk_alloc ( const struct sock * sk , const int state )
{
2005-12-14 10:25:19 +03:00
struct inet_timewait_sock * tw =
kmem_cache_alloc ( sk - > sk_prot_creator - > twsk_prot - > twsk_slab ,
2006-12-07 07:33:16 +03:00
GFP_ATOMIC ) ;
2005-08-10 07:09:59 +04:00
if ( tw ! = NULL ) {
const struct inet_sock * inet = inet_sk ( sk ) ;
2008-10-18 19:37:51 +04:00
kmemcheck_annotate_bitfield ( tw , flags ) ;
2005-08-10 07:09:59 +04:00
/* Give us an identity. */
tw - > tw_daddr = inet - > daddr ;
tw - > tw_rcv_saddr = inet - > rcv_saddr ;
tw - > tw_bound_dev_if = sk - > sk_bound_dev_if ;
tw - > tw_num = inet - > num ;
tw - > tw_state = TCP_TIME_WAIT ;
tw - > tw_substate = state ;
tw - > tw_sport = inet - > sport ;
tw - > tw_dport = inet - > dport ;
tw - > tw_family = sk - > sk_family ;
tw - > tw_reuse = sk - > sk_reuse ;
[INET]: speedup inet (tcp/dccp) lookups
Arnaldo and I agreed it could be applied now, because I have other
pending patches depending on this one (Thank you Arnaldo)
(The other important patch moves skc_refcnt in a separate cache line,
so that the SMP/NUMA performance doesnt suffer from cache line ping pongs)
1) First some performance data :
--------------------------------
tcp_v4_rcv() wastes a *lot* of time in __inet_lookup_established()
The most time critical code is :
sk_for_each(sk, node, &head->chain) {
if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif))
goto hit; /* You sunk my battleship! */
}
The sk_for_each() does use prefetch() hints but only the begining of
"struct sock" is prefetched.
As INET_MATCH first comparison uses inet_sk(__sk)->daddr, wich is far
away from the begining of "struct sock", it has to bring into CPU
cache cold cache line. Each iteration has to use at least 2 cache
lines.
This can be problematic if some chains are very long.
2) The goal
-----------
The idea I had is to change things so that INET_MATCH() may return
FALSE in 99% of cases only using the data already in the CPU cache,
using one cache line per iteration.
3) Description of the patch
---------------------------
Adds a new 'unsigned int skc_hash' field in 'struct sock_common',
filling a 32 bits hole on 64 bits platform.
struct sock_common {
unsigned short skc_family;
volatile unsigned char skc_state;
unsigned char skc_reuse;
int skc_bound_dev_if;
struct hlist_node skc_node;
struct hlist_node skc_bind_node;
atomic_t skc_refcnt;
+ unsigned int skc_hash;
struct proto *skc_prot;
};
Store in this 32 bits field the full hash, not masked by (ehash_size -
1) Using this full hash as the first comparison done in INET_MATCH
permits us immediatly skip the element without touching a second cache
line in case of a miss.
Suppress the sk_hashent/tw_hashent fields since skc_hash (aliased to
sk_hash and tw_hash) already contains the slot number if we mask with
(ehash_size - 1)
File include/net/inet_hashtables.h
64 bits platforms :
#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
(((__sk)->sk_hash == (__hash))
((*((__u64 *)&(inet_sk(__sk)->daddr)))== (__cookie)) && \
((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
32bits platforms:
#define TCP_IPV4_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
(((__sk)->sk_hash == (__hash)) && \
(inet_sk(__sk)->daddr == (__saddr)) && \
(inet_sk(__sk)->rcv_saddr == (__daddr)) && \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
- Adds a prefetch(head->chain.first) in
__inet_lookup_established()/__tcp_v4_check_established() and
__inet6_lookup_established()/__tcp_v6_check_established() and
__dccp_v4_check_established() to bring into cache the first element of the
list, before the {read|write}_lock(&head->lock);
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-10-04 01:13:38 +04:00
tw - > tw_hash = sk - > sk_hash ;
2005-08-10 07:09:59 +04:00
tw - > tw_ipv6only = 0 ;
2008-10-01 18:30:02 +04:00
tw - > tw_transparent = inet - > transparent ;
2005-08-10 07:09:59 +04:00
tw - > tw_prot = sk - > sk_prot_creator ;
2008-04-16 13:00:28 +04:00
twsk_net_set ( tw , hold_net ( sock_net ( sk ) ) ) ;
2005-08-10 07:09:59 +04:00
atomic_set ( & tw - > tw_refcnt , 1 ) ;
inet_twsk_dead_node_init ( tw ) ;
2005-10-11 08:25:23 +04:00
__module_get ( tw - > tw_prot - > owner ) ;
2005-08-10 07:09:59 +04:00
}
return tw ;
}
2005-08-10 07:45:03 +04:00
EXPORT_SYMBOL_GPL ( inet_twsk_alloc ) ;
/* Returns non-zero if quota exceeded. */
static int inet_twdr_do_twkill_work ( struct inet_timewait_death_row * twdr ,
const int slot )
{
struct inet_timewait_sock * tw ;
struct hlist_node * node ;
unsigned int killed ;
int ret ;
/* NOTE: compare this to previous version where lock
* was released after detaching chain . It was racy ,
* because tw buckets are scheduled in not serialized context
* in 2.3 ( with netfilter ) , and with softnet it is common , because
* soft irqs are not sequenced .
*/
killed = 0 ;
ret = 0 ;
rescan :
inet_twsk_for_each_inmate ( tw , node , & twdr - > cells [ slot ] ) {
__inet_twsk_del_dead_node ( tw ) ;
spin_unlock ( & twdr - > death_lock ) ;
__inet_twsk_kill ( tw , twdr - > hashinfo ) ;
2008-07-17 07:32:25 +04:00
# ifdef CONFIG_NET_NS
NET_INC_STATS_BH ( twsk_net ( tw ) , LINUX_MIB_TIMEWAITED ) ;
# endif
2005-08-10 07:45:03 +04:00
inet_twsk_put ( tw ) ;
killed + + ;
spin_lock ( & twdr - > death_lock ) ;
if ( killed > INET_TWDR_TWKILL_QUOTA ) {
ret = 1 ;
break ;
}
/* While we dropped twdr->death_lock, another cpu may have
* killed off the next TW bucket in the list , therefore
* do a fresh re - read of the hlist head node with the
* lock reacquired . We still use the hlist traversal
* macro in order to get the prefetches .
*/
goto rescan ;
}
twdr - > tw_count - = killed ;
2008-07-17 07:32:25 +04:00
# ifndef CONFIG_NET_NS
NET_ADD_STATS_BH ( & init_net , LINUX_MIB_TIMEWAITED , killed ) ;
# endif
2005-08-10 07:45:03 +04:00
return ret ;
}
void inet_twdr_hangman ( unsigned long data )
{
struct inet_timewait_death_row * twdr ;
int unsigned need_timer ;
twdr = ( struct inet_timewait_death_row * ) data ;
spin_lock ( & twdr - > death_lock ) ;
if ( twdr - > tw_count = = 0 )
goto out ;
need_timer = 0 ;
if ( inet_twdr_do_twkill_work ( twdr , twdr - > slot ) ) {
twdr - > thread_slots | = ( 1 < < twdr - > slot ) ;
schedule_work ( & twdr - > twkill_work ) ;
need_timer = 1 ;
} else {
/* We purged the entire slot, anything left? */
if ( twdr - > tw_count )
need_timer = 1 ;
tcp: fix premature termination of FIN_WAIT2 time-wait sockets
There is a race condition in the time-wait sockets code that can lead
to premature termination of FIN_WAIT2 and, subsequently, to RST
generation when the FIN,ACK from the peer finally arrives:
Time TCP header
0.000000 30755 > http [SYN] Seq=0 Win=2920 Len=0 MSS=1460 TSV=282912 TSER=0
0.000008 http > 30755 aSYN, ACK] Seq=0 Ack=1 Win=2896 Len=0 MSS=1460 TSV=...
0.136899 HEAD /1b.html?n1Lg=v1 HTTP/1.0 [Packet size limited during capture]
0.136934 HTTP/1.0 200 OK [Packet size limited during capture]
0.136945 http > 30755 [FIN, ACK] Seq=187 Ack=207 Win=2690 Len=0 TSV=270521...
0.136974 30755 > http [ACK] Seq=207 Ack=187 Win=2734 Len=0 TSV=283049 TSER=...
0.177983 30755 > http [ACK] Seq=207 Ack=188 Win=2733 Len=0 TSV=283089 TSER=...
0.238618 30755 > http [FIN, ACK] Seq=207 Ack=188 Win=2733 Len=0 TSV=283151...
0.238625 http > 30755 [RST] Seq=188 Win=0 Len=0
Say twdr->slot = 1 and we are running inet_twdr_hangman and in this
instance inet_twdr_do_twkill_work returns 1. At that point we will
mark slot 1 and schedule inet_twdr_twkill_work. We will also make
twdr->slot = 2.
Next, a connection is closed and tcp_time_wait(TCP_FIN_WAIT2, timeo)
is called which will create a new FIN_WAIT2 time-wait socket and will
place it in the last to be reached slot, i.e. twdr->slot = 1.
At this point say inet_twdr_twkill_work will run which will start
destroying the time-wait sockets in slot 1, including the just added
TCP_FIN_WAIT2 one.
To avoid this issue we increment the slot only if all entries in the
slot have been purged.
This change may delay the slots cleanup by a time-wait death row
period but only if the worker thread didn't had the time to run/purge
the current slot in the next period (6 seconds with default sysctl
settings). However, on such a busy system even without this change we
would probably see delays...
Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-08-29 11:00:35 +04:00
twdr - > slot = ( ( twdr - > slot + 1 ) & ( INET_TWDR_TWKILL_SLOTS - 1 ) ) ;
2005-08-10 07:45:03 +04:00
}
if ( need_timer )
mod_timer ( & twdr - > tw_timer , jiffies + twdr - > period ) ;
out :
spin_unlock ( & twdr - > death_lock ) ;
}
EXPORT_SYMBOL_GPL ( inet_twdr_hangman ) ;
2006-11-22 17:55:48 +03:00
void inet_twdr_twkill_work ( struct work_struct * work )
2005-08-10 07:45:03 +04:00
{
2006-11-22 17:55:48 +03:00
struct inet_timewait_death_row * twdr =
container_of ( work , struct inet_timewait_death_row , twkill_work ) ;
2005-08-10 07:45:03 +04:00
int i ;
2007-12-11 13:12:36 +03:00
BUILD_BUG_ON ( ( INET_TWDR_TWKILL_SLOTS - 1 ) >
( sizeof ( twdr - > thread_slots ) * 8 ) ) ;
2005-08-10 07:45:03 +04:00
while ( twdr - > thread_slots ) {
spin_lock_bh ( & twdr - > death_lock ) ;
for ( i = 0 ; i < INET_TWDR_TWKILL_SLOTS ; i + + ) {
if ( ! ( twdr - > thread_slots & ( 1 < < i ) ) )
continue ;
while ( inet_twdr_do_twkill_work ( twdr , i ) ! = 0 ) {
if ( need_resched ( ) ) {
spin_unlock_bh ( & twdr - > death_lock ) ;
schedule ( ) ;
spin_lock_bh ( & twdr - > death_lock ) ;
}
}
twdr - > thread_slots & = ~ ( 1 < < i ) ;
}
spin_unlock_bh ( & twdr - > death_lock ) ;
}
}
EXPORT_SYMBOL_GPL ( inet_twdr_twkill_work ) ;
/* These are always called from BH context. See callers in
 * tcp_input.c to verify this.
 */
/* This is for handling early-kills of TIME_WAIT sockets. */
void inet_twsk_deschedule ( struct inet_timewait_sock * tw ,
struct inet_timewait_death_row * twdr )
{
spin_lock ( & twdr - > death_lock ) ;
if ( inet_twsk_del_dead_node ( tw ) ) {
inet_twsk_put ( tw ) ;
if ( - - twdr - > tw_count = = 0 )
del_timer ( & twdr - > tw_timer ) ;
}
spin_unlock ( & twdr - > death_lock ) ;
__inet_twsk_kill ( tw , twdr - > hashinfo ) ;
}
EXPORT_SYMBOL ( inet_twsk_deschedule ) ;
void inet_twsk_schedule ( struct inet_timewait_sock * tw ,
struct inet_timewait_death_row * twdr ,
const int timeo , const int timewait_len )
{
struct hlist_head * list ;
int slot ;
/* timeout := RTO * 3.5
*
* 3.5 = 1 + 2 + 0.5 to wait for two retransmits .
*
* RATIONALE : if FIN arrived and we entered TIME - WAIT state ,
* our ACK acking that FIN can be lost . If N subsequent retransmitted
* FINs ( or previous seqments ) are lost ( probability of such event
* is p ^ ( N + 1 ) , where p is probability to lose single packet and
* time to detect the loss is about RTO * ( 2 ^ N - 1 ) with exponential
* backoff ) . Normal timewait length is calculated so , that we
* waited at least for one retransmitted FIN ( maximal RTO is 120 sec ) .
* [ BTW Linux . following BSD , violates this requirement waiting
* only for 60 sec , we should wait at least for 240 secs .
* Well , 240 consumes too much of resources 8 )
* ]
* This interval is not reduced to catch old duplicate and
* responces to our wandering segments living for two MSLs .
* However , if we use PAWS to detect
* old duplicates , we can reduce the interval to bounds required
* by RTO , rather than MSL . So , if peer understands PAWS , we
* kill tw bucket after 3.5 * RTO ( it is important that this number
* is greater than TS tick ! ) and detect old duplicates with help
* of PAWS .
*/
slot = ( timeo + ( 1 < < INET_TWDR_RECYCLE_TICK ) - 1 ) > > INET_TWDR_RECYCLE_TICK ;
spin_lock ( & twdr - > death_lock ) ;
/* Unlink it, if it was scheduled */
if ( inet_twsk_del_dead_node ( tw ) )
twdr - > tw_count - - ;
else
atomic_inc ( & tw - > tw_refcnt ) ;
if ( slot > = INET_TWDR_RECYCLE_SLOTS ) {
/* Schedule to slow timer */
if ( timeo > = timewait_len ) {
slot = INET_TWDR_TWKILL_SLOTS - 1 ;
} else {
2007-08-29 02:50:33 +04:00
slot = DIV_ROUND_UP ( timeo , twdr - > period ) ;
2005-08-10 07:45:03 +04:00
if ( slot > = INET_TWDR_TWKILL_SLOTS )
slot = INET_TWDR_TWKILL_SLOTS - 1 ;
}
tw - > tw_ttd = jiffies + timeo ;
slot = ( twdr - > slot + slot ) & ( INET_TWDR_TWKILL_SLOTS - 1 ) ;
list = & twdr - > cells [ slot ] ;
} else {
tw - > tw_ttd = jiffies + ( slot < < INET_TWDR_RECYCLE_TICK ) ;
if ( twdr - > twcal_hand < 0 ) {
twdr - > twcal_hand = 0 ;
twdr - > twcal_jiffie = jiffies ;
twdr - > twcal_timer . expires = twdr - > twcal_jiffie +
( slot < < INET_TWDR_RECYCLE_TICK ) ;
add_timer ( & twdr - > twcal_timer ) ;
} else {
if ( time_after ( twdr - > twcal_timer . expires ,
jiffies + ( slot < < INET_TWDR_RECYCLE_TICK ) ) )
mod_timer ( & twdr - > twcal_timer ,
jiffies + ( slot < < INET_TWDR_RECYCLE_TICK ) ) ;
slot = ( twdr - > twcal_hand + slot ) & ( INET_TWDR_RECYCLE_SLOTS - 1 ) ;
}
list = & twdr - > twcal_row [ slot ] ;
}
hlist_add_head ( & tw - > tw_death_node , list ) ;
if ( twdr - > tw_count + + = = 0 )
mod_timer ( & twdr - > tw_timer , jiffies + twdr - > period ) ;
spin_unlock ( & twdr - > death_lock ) ;
}
EXPORT_SYMBOL_GPL ( inet_twsk_schedule ) ;
void inet_twdr_twcal_tick ( unsigned long data )
{
struct inet_timewait_death_row * twdr ;
int n , slot ;
unsigned long j ;
unsigned long now = jiffies ;
int killed = 0 ;
int adv = 0 ;
twdr = ( struct inet_timewait_death_row * ) data ;
spin_lock ( & twdr - > death_lock ) ;
if ( twdr - > twcal_hand < 0 )
goto out ;
slot = twdr - > twcal_hand ;
j = twdr - > twcal_jiffie ;
for ( n = 0 ; n < INET_TWDR_RECYCLE_SLOTS ; n + + ) {
if ( time_before_eq ( j , now ) ) {
struct hlist_node * node , * safe ;
struct inet_timewait_sock * tw ;
inet_twsk_for_each_inmate_safe ( tw , node , safe ,
& twdr - > twcal_row [ slot ] ) {
__inet_twsk_del_dead_node ( tw ) ;
__inet_twsk_kill ( tw , twdr - > hashinfo ) ;
2008-07-17 07:32:25 +04:00
# ifdef CONFIG_NET_NS
NET_INC_STATS_BH ( twsk_net ( tw ) , LINUX_MIB_TIMEWAITKILLED ) ;
# endif
2005-08-10 07:45:03 +04:00
inet_twsk_put ( tw ) ;
killed + + ;
}
} else {
if ( ! adv ) {
adv = 1 ;
twdr - > twcal_jiffie = j ;
twdr - > twcal_hand = slot ;
}
if ( ! hlist_empty ( & twdr - > twcal_row [ slot ] ) ) {
mod_timer ( & twdr - > twcal_timer , j ) ;
goto out ;
}
}
j + = 1 < < INET_TWDR_RECYCLE_TICK ;
slot = ( slot + 1 ) & ( INET_TWDR_RECYCLE_SLOTS - 1 ) ;
}
twdr - > twcal_hand = - 1 ;
out :
if ( ( twdr - > tw_count - = killed ) = = 0 )
del_timer ( & twdr - > tw_timer ) ;
2008-07-17 07:32:25 +04:00
# ifndef CONFIG_NET_NS
NET_ADD_STATS_BH ( & init_net , LINUX_MIB_TIMEWAITKILLED , killed ) ;
# endif
2005-08-10 07:45:03 +04:00
spin_unlock ( & twdr - > death_lock ) ;
}
EXPORT_SYMBOL_GPL ( inet_twdr_twcal_tick ) ;
netns : fix kernel panic in timewait socket destruction
How to reproduce ?
- create a network namespace
- use tcp protocol and get timewait socket
- exit the network namespace
- after a moment (when the timewait socket is destroyed), the kernel
panics.
# BUG: unable to handle kernel NULL pointer dereference at
0000000000000007
IP: [<ffffffff821e394d>] inet_twdr_do_twkill_work+0x6e/0xb8
PGD 119985067 PUD 11c5c0067 PMD 0
Oops: 0000 [1] SMP
CPU 1
Modules linked in: ipv6 button battery ac loop dm_mod tg3 libphy ext3 jbd
edd fan thermal processor thermal_sys sg sata_svw libata dock serverworks
sd_mod scsi_mod ide_disk ide_core [last unloaded: freq_table]
Pid: 0, comm: swapper Not tainted 2.6.27-rc2 #3
RIP: 0010:[<ffffffff821e394d>] [<ffffffff821e394d>]
inet_twdr_do_twkill_work+0x6e/0xb8
RSP: 0018:ffff88011ff7fed0 EFLAGS: 00010246
RAX: ffffffffffffffff RBX: ffffffff82339420 RCX: ffff88011ff7ff30
RDX: 0000000000000001 RSI: ffff88011a4d03c0 RDI: ffff88011ac2fc00
RBP: ffffffff823392e0 R08: 0000000000000000 R09: ffff88002802a200
R10: ffff8800a5c4b000 R11: ffffffff823e4080 R12: ffff88011ac2fc00
R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000000
FS: 0000000041cbd940(0000) GS:ffff8800bff839c0(0000)
knlGS:0000000000000000
CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 0000000000000007 CR3: 00000000bd87c000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 0, threadinfo ffff8800bff9e000, task
ffff88011ff76690)
Stack: ffffffff823392e0 0000000000000100 ffffffff821e3a3a
0000000000000008
0000000000000000 ffffffff821e3a61 ffff8800bff7c000 ffffffff8203c7e7
ffff88011ff7ff10 ffff88011ff7ff10 0000000000000021 ffffffff82351108
Call Trace:
<IRQ> [<ffffffff821e3a3a>] ? inet_twdr_hangman+0x0/0x9e
[<ffffffff821e3a61>] ? inet_twdr_hangman+0x27/0x9e
[<ffffffff8203c7e7>] ? run_timer_softirq+0x12c/0x193
[<ffffffff820390d1>] ? __do_softirq+0x5e/0xcd
[<ffffffff8200d08c>] ? call_softirq+0x1c/0x28
[<ffffffff8200e611>] ? do_softirq+0x2c/0x68
[<ffffffff8201a055>] ? smp_apic_timer_interrupt+0x8e/0xa9
[<ffffffff8200cad6>] ? apic_timer_interrupt+0x66/0x70
<EOI> [<ffffffff82011f4c>] ? default_idle+0x27/0x3b
[<ffffffff8200abbd>] ? cpu_idle+0x5f/0x7d
Code: e8 01 00 00 4c 89 e7 41 ff c5 e8 8d fd ff ff 49 8b 44 24 38 4c 89 e7
65 8b 14 25 24 00 00 00 89 d2 48 8b 80 e8 00 00 00 48 f7 d0 <48> 8b 04 d0
48 ff 40 58 e8 fc fc ff ff 48 89 df e8 c0 5f 04 00
RIP [<ffffffff821e394d>] inet_twdr_do_twkill_work+0x6e/0xb8
RSP <ffff88011ff7fed0>
CR2: 0000000000000007
This patch provides a function to purge all timewait sockets related
to a network namespace. The timewait sockets life cycle is not tied with
the network namespace, that means the timewait sockets stay alive while
the network namespace dies. The timewait sockets are for avoiding to
receive a duplicate packet from the network, if the network namespace is
freed, the network stack is removed, so no chance to receive any packets
from the outside world. Furthermore, having a pending destruction timer
on these sockets with a network namespace freed is not safe and will lead
to an oops if the timer callback which try to access data belonging to
the namespace like for example in:
inet_twdr_do_twkill_work
-> NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
Purging the timewait sockets at the network namespace destruction will:
1) speed up memory freeing for the namespace
2) fix kernel panic on asynchronous timewait destruction
Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com>
Acked-by: Denis V. Lunev <den@openvz.org>
Acked-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-09-09 00:17:27 +04:00
void inet_twsk_purge ( struct net * net , struct inet_hashinfo * hashinfo ,
struct inet_timewait_death_row * twdr , int family )
{
struct inet_timewait_sock * tw ;
struct sock * sk ;
2008-11-17 06:40:17 +03:00
struct hlist_nulls_node * node ;
netns : fix kernel panic in timewait socket destruction
How to reproduce ?
- create a network namespace
- use tcp protocol and get timewait socket
- exit the network namespace
- after a moment (when the timewait socket is destroyed), the kernel
panics.
# BUG: unable to handle kernel NULL pointer dereference at
0000000000000007
IP: [<ffffffff821e394d>] inet_twdr_do_twkill_work+0x6e/0xb8
PGD 119985067 PUD 11c5c0067 PMD 0
Oops: 0000 [1] SMP
CPU 1
Modules linked in: ipv6 button battery ac loop dm_mod tg3 libphy ext3 jbd
edd fan thermal processor thermal_sys sg sata_svw libata dock serverworks
sd_mod scsi_mod ide_disk ide_core [last unloaded: freq_table]
Pid: 0, comm: swapper Not tainted 2.6.27-rc2 #3
RIP: 0010:[<ffffffff821e394d>] [<ffffffff821e394d>]
inet_twdr_do_twkill_work+0x6e/0xb8
RSP: 0018:ffff88011ff7fed0 EFLAGS: 00010246
RAX: ffffffffffffffff RBX: ffffffff82339420 RCX: ffff88011ff7ff30
RDX: 0000000000000001 RSI: ffff88011a4d03c0 RDI: ffff88011ac2fc00
RBP: ffffffff823392e0 R08: 0000000000000000 R09: ffff88002802a200
R10: ffff8800a5c4b000 R11: ffffffff823e4080 R12: ffff88011ac2fc00
R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000000
FS: 0000000041cbd940(0000) GS:ffff8800bff839c0(0000)
knlGS:0000000000000000
CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 0000000000000007 CR3: 00000000bd87c000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 0, threadinfo ffff8800bff9e000, task
ffff88011ff76690)
Stack: ffffffff823392e0 0000000000000100 ffffffff821e3a3a
0000000000000008
0000000000000000 ffffffff821e3a61 ffff8800bff7c000 ffffffff8203c7e7
ffff88011ff7ff10 ffff88011ff7ff10 0000000000000021 ffffffff82351108
Call Trace:
<IRQ> [<ffffffff821e3a3a>] ? inet_twdr_hangman+0x0/0x9e
[<ffffffff821e3a61>] ? inet_twdr_hangman+0x27/0x9e
[<ffffffff8203c7e7>] ? run_timer_softirq+0x12c/0x193
[<ffffffff820390d1>] ? __do_softirq+0x5e/0xcd
[<ffffffff8200d08c>] ? call_softirq+0x1c/0x28
[<ffffffff8200e611>] ? do_softirq+0x2c/0x68
[<ffffffff8201a055>] ? smp_apic_timer_interrupt+0x8e/0xa9
[<ffffffff8200cad6>] ? apic_timer_interrupt+0x66/0x70
<EOI> [<ffffffff82011f4c>] ? default_idle+0x27/0x3b
[<ffffffff8200abbd>] ? cpu_idle+0x5f/0x7d
Code: e8 01 00 00 4c 89 e7 41 ff c5 e8 8d fd ff ff 49 8b 44 24 38 4c 89 e7
65 8b 14 25 24 00 00 00 89 d2 48 8b 80 e8 00 00 00 48 f7 d0 <48> 8b 04 d0
48 ff 40 58 e8 fc fc ff ff 48 89 df e8 c0 5f 04 00
RIP [<ffffffff821e394d>] inet_twdr_do_twkill_work+0x6e/0xb8
RSP <ffff88011ff7fed0>
CR2: 0000000000000007
This patch provides a function to purge all timewait sockets related
to a network namespace. The timewait sockets life cycle is not tied with
the network namespace, that means the timewait sockets stay alive while
the network namespace dies. The timewait sockets are for avoiding to
receive a duplicate packet from the network, if the network namespace is
freed, the network stack is removed, so no chance to receive any packets
from the outside world. Furthermore, having a pending destruction timer
on these sockets with a network namespace freed is not safe and will lead
to an oops if the timer callback which try to access data belonging to
the namespace like for example in:
inet_twdr_do_twkill_work
-> NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
Purging the timewait sockets at the network namespace destruction will:
1) speed up memory freeing for the namespace
2) fix kernel panic on asynchronous timewait destruction
Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com>
Acked-by: Denis V. Lunev <den@openvz.org>
Acked-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-09-09 00:17:27 +04:00
int h ;
local_bh_disable ( ) ;
for ( h = 0 ; h < ( hashinfo - > ehash_size ) ; h + + ) {
struct inet_ehash_bucket * head =
inet_ehash_bucket ( hashinfo , h ) ;
2008-11-21 07:39:09 +03:00
spinlock_t * lock = inet_ehash_lockp ( hashinfo , h ) ;
netns : fix kernel panic in timewait socket destruction
How to reproduce ?
- create a network namespace
- use tcp protocol and get timewait socket
- exit the network namespace
- after a moment (when the timewait socket is destroyed), the kernel
panics.
# BUG: unable to handle kernel NULL pointer dereference at
0000000000000007
IP: [<ffffffff821e394d>] inet_twdr_do_twkill_work+0x6e/0xb8
PGD 119985067 PUD 11c5c0067 PMD 0
Oops: 0000 [1] SMP
CPU 1
Modules linked in: ipv6 button battery ac loop dm_mod tg3 libphy ext3 jbd
edd fan thermal processor thermal_sys sg sata_svw libata dock serverworks
sd_mod scsi_mod ide_disk ide_core [last unloaded: freq_table]
Pid: 0, comm: swapper Not tainted 2.6.27-rc2 #3
RIP: 0010:[<ffffffff821e394d>] [<ffffffff821e394d>]
inet_twdr_do_twkill_work+0x6e/0xb8
RSP: 0018:ffff88011ff7fed0 EFLAGS: 00010246
RAX: ffffffffffffffff RBX: ffffffff82339420 RCX: ffff88011ff7ff30
RDX: 0000000000000001 RSI: ffff88011a4d03c0 RDI: ffff88011ac2fc00
RBP: ffffffff823392e0 R08: 0000000000000000 R09: ffff88002802a200
R10: ffff8800a5c4b000 R11: ffffffff823e4080 R12: ffff88011ac2fc00
R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000000
FS: 0000000041cbd940(0000) GS:ffff8800bff839c0(0000)
knlGS:0000000000000000
CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 0000000000000007 CR3: 00000000bd87c000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 0, threadinfo ffff8800bff9e000, task
ffff88011ff76690)
Stack: ffffffff823392e0 0000000000000100 ffffffff821e3a3a
0000000000000008
0000000000000000 ffffffff821e3a61 ffff8800bff7c000 ffffffff8203c7e7
ffff88011ff7ff10 ffff88011ff7ff10 0000000000000021 ffffffff82351108
Call Trace:
<IRQ> [<ffffffff821e3a3a>] ? inet_twdr_hangman+0x0/0x9e
[<ffffffff821e3a61>] ? inet_twdr_hangman+0x27/0x9e
[<ffffffff8203c7e7>] ? run_timer_softirq+0x12c/0x193
[<ffffffff820390d1>] ? __do_softirq+0x5e/0xcd
[<ffffffff8200d08c>] ? call_softirq+0x1c/0x28
[<ffffffff8200e611>] ? do_softirq+0x2c/0x68
[<ffffffff8201a055>] ? smp_apic_timer_interrupt+0x8e/0xa9
[<ffffffff8200cad6>] ? apic_timer_interrupt+0x66/0x70
<EOI> [<ffffffff82011f4c>] ? default_idle+0x27/0x3b
[<ffffffff8200abbd>] ? cpu_idle+0x5f/0x7d
Code: e8 01 00 00 4c 89 e7 41 ff c5 e8 8d fd ff ff 49 8b 44 24 38 4c 89 e7
65 8b 14 25 24 00 00 00 89 d2 48 8b 80 e8 00 00 00 48 f7 d0 <48> 8b 04 d0
48 ff 40 58 e8 fc fc ff ff 48 89 df e8 c0 5f 04 00
RIP [<ffffffff821e394d>] inet_twdr_do_twkill_work+0x6e/0xb8
RSP <ffff88011ff7fed0>
CR2: 0000000000000007
This patch provides a function to purge all timewait sockets related
to a network namespace. The life cycle of a timewait socket is not tied
to the network namespace, which means the timewait sockets stay alive
even after the network namespace dies. Timewait sockets exist to avoid
receiving duplicate packets from the network; once the network namespace
is freed, its network stack is removed, so there is no chance of receiving
any packets from the outside world. Furthermore, having a pending
destruction timer on these sockets after the network namespace has been
freed is not safe and will lead to an oops when the timer callback tries
to access data belonging to the namespace, as for example in:
inet_twdr_do_twkill_work
-> NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
Purging the timewait sockets at the network namespace destruction will:
1) speed up memory freeing for the namespace
2) fix kernel panic on asynchronous timewait destruction
Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com>
Acked-by: Denis V. Lunev <den@openvz.org>
Acked-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-09-09 00:17:27 +04:00
restart :
2008-11-21 07:39:09 +03:00
spin_lock ( lock ) ;
2008-11-17 06:40:17 +03:00
sk_nulls_for_each ( sk , node , & head - > twchain ) {
netns : fix kernel panic in timewait socket destruction
How to reproduce ?
- create a network namespace
- use tcp protocol and get timewait socket
- exit the network namespace
- after a moment (when the timewait socket is destroyed), the kernel
panics.
# BUG: unable to handle kernel NULL pointer dereference at
0000000000000007
IP: [<ffffffff821e394d>] inet_twdr_do_twkill_work+0x6e/0xb8
PGD 119985067 PUD 11c5c0067 PMD 0
Oops: 0000 [1] SMP
CPU 1
Modules linked in: ipv6 button battery ac loop dm_mod tg3 libphy ext3 jbd
edd fan thermal processor thermal_sys sg sata_svw libata dock serverworks
sd_mod scsi_mod ide_disk ide_core [last unloaded: freq_table]
Pid: 0, comm: swapper Not tainted 2.6.27-rc2 #3
RIP: 0010:[<ffffffff821e394d>] [<ffffffff821e394d>]
inet_twdr_do_twkill_work+0x6e/0xb8
RSP: 0018:ffff88011ff7fed0 EFLAGS: 00010246
RAX: ffffffffffffffff RBX: ffffffff82339420 RCX: ffff88011ff7ff30
RDX: 0000000000000001 RSI: ffff88011a4d03c0 RDI: ffff88011ac2fc00
RBP: ffffffff823392e0 R08: 0000000000000000 R09: ffff88002802a200
R10: ffff8800a5c4b000 R11: ffffffff823e4080 R12: ffff88011ac2fc00
R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000000
FS: 0000000041cbd940(0000) GS:ffff8800bff839c0(0000)
knlGS:0000000000000000
CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 0000000000000007 CR3: 00000000bd87c000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 0, threadinfo ffff8800bff9e000, task
ffff88011ff76690)
Stack: ffffffff823392e0 0000000000000100 ffffffff821e3a3a
0000000000000008
0000000000000000 ffffffff821e3a61 ffff8800bff7c000 ffffffff8203c7e7
ffff88011ff7ff10 ffff88011ff7ff10 0000000000000021 ffffffff82351108
Call Trace:
<IRQ> [<ffffffff821e3a3a>] ? inet_twdr_hangman+0x0/0x9e
[<ffffffff821e3a61>] ? inet_twdr_hangman+0x27/0x9e
[<ffffffff8203c7e7>] ? run_timer_softirq+0x12c/0x193
[<ffffffff820390d1>] ? __do_softirq+0x5e/0xcd
[<ffffffff8200d08c>] ? call_softirq+0x1c/0x28
[<ffffffff8200e611>] ? do_softirq+0x2c/0x68
[<ffffffff8201a055>] ? smp_apic_timer_interrupt+0x8e/0xa9
[<ffffffff8200cad6>] ? apic_timer_interrupt+0x66/0x70
<EOI> [<ffffffff82011f4c>] ? default_idle+0x27/0x3b
[<ffffffff8200abbd>] ? cpu_idle+0x5f/0x7d
Code: e8 01 00 00 4c 89 e7 41 ff c5 e8 8d fd ff ff 49 8b 44 24 38 4c 89 e7
65 8b 14 25 24 00 00 00 89 d2 48 8b 80 e8 00 00 00 48 f7 d0 <48> 8b 04 d0
48 ff 40 58 e8 fc fc ff ff 48 89 df e8 c0 5f 04 00
RIP [<ffffffff821e394d>] inet_twdr_do_twkill_work+0x6e/0xb8
RSP <ffff88011ff7fed0>
CR2: 0000000000000007
This patch provides a function to purge all timewait sockets related
to a network namespace. The life cycle of a timewait socket is not tied
to the network namespace, which means the timewait sockets stay alive
even after the network namespace dies. Timewait sockets exist to avoid
receiving duplicate packets from the network; once the network namespace
is freed, its network stack is removed, so there is no chance of receiving
any packets from the outside world. Furthermore, having a pending
destruction timer on these sockets after the network namespace has been
freed is not safe and will lead to an oops when the timer callback tries
to access data belonging to the namespace, as for example in:
inet_twdr_do_twkill_work
-> NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
Purging the timewait sockets at the network namespace destruction will:
1) speed up memory freeing for the namespace
2) fix kernel panic on asynchronous timewait destruction
Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com>
Acked-by: Denis V. Lunev <den@openvz.org>
Acked-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-09-09 00:17:27 +04:00
tw = inet_twsk ( sk ) ;
if ( ! net_eq ( twsk_net ( tw ) , net ) | |
tw - > tw_family ! = family )
continue ;
atomic_inc ( & tw - > tw_refcnt ) ;
2008-11-21 07:39:09 +03:00
spin_unlock ( lock ) ;
netns : fix kernel panic in timewait socket destruction
How to reproduce ?
- create a network namespace
- use tcp protocol and get timewait socket
- exit the network namespace
- after a moment (when the timewait socket is destroyed), the kernel
panics.
# BUG: unable to handle kernel NULL pointer dereference at
0000000000000007
IP: [<ffffffff821e394d>] inet_twdr_do_twkill_work+0x6e/0xb8
PGD 119985067 PUD 11c5c0067 PMD 0
Oops: 0000 [1] SMP
CPU 1
Modules linked in: ipv6 button battery ac loop dm_mod tg3 libphy ext3 jbd
edd fan thermal processor thermal_sys sg sata_svw libata dock serverworks
sd_mod scsi_mod ide_disk ide_core [last unloaded: freq_table]
Pid: 0, comm: swapper Not tainted 2.6.27-rc2 #3
RIP: 0010:[<ffffffff821e394d>] [<ffffffff821e394d>]
inet_twdr_do_twkill_work+0x6e/0xb8
RSP: 0018:ffff88011ff7fed0 EFLAGS: 00010246
RAX: ffffffffffffffff RBX: ffffffff82339420 RCX: ffff88011ff7ff30
RDX: 0000000000000001 RSI: ffff88011a4d03c0 RDI: ffff88011ac2fc00
RBP: ffffffff823392e0 R08: 0000000000000000 R09: ffff88002802a200
R10: ffff8800a5c4b000 R11: ffffffff823e4080 R12: ffff88011ac2fc00
R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000000
FS: 0000000041cbd940(0000) GS:ffff8800bff839c0(0000)
knlGS:0000000000000000
CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 0000000000000007 CR3: 00000000bd87c000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 0, threadinfo ffff8800bff9e000, task
ffff88011ff76690)
Stack: ffffffff823392e0 0000000000000100 ffffffff821e3a3a
0000000000000008
0000000000000000 ffffffff821e3a61 ffff8800bff7c000 ffffffff8203c7e7
ffff88011ff7ff10 ffff88011ff7ff10 0000000000000021 ffffffff82351108
Call Trace:
<IRQ> [<ffffffff821e3a3a>] ? inet_twdr_hangman+0x0/0x9e
[<ffffffff821e3a61>] ? inet_twdr_hangman+0x27/0x9e
[<ffffffff8203c7e7>] ? run_timer_softirq+0x12c/0x193
[<ffffffff820390d1>] ? __do_softirq+0x5e/0xcd
[<ffffffff8200d08c>] ? call_softirq+0x1c/0x28
[<ffffffff8200e611>] ? do_softirq+0x2c/0x68
[<ffffffff8201a055>] ? smp_apic_timer_interrupt+0x8e/0xa9
[<ffffffff8200cad6>] ? apic_timer_interrupt+0x66/0x70
<EOI> [<ffffffff82011f4c>] ? default_idle+0x27/0x3b
[<ffffffff8200abbd>] ? cpu_idle+0x5f/0x7d
Code: e8 01 00 00 4c 89 e7 41 ff c5 e8 8d fd ff ff 49 8b 44 24 38 4c 89 e7
65 8b 14 25 24 00 00 00 89 d2 48 8b 80 e8 00 00 00 48 f7 d0 <48> 8b 04 d0
48 ff 40 58 e8 fc fc ff ff 48 89 df e8 c0 5f 04 00
RIP [<ffffffff821e394d>] inet_twdr_do_twkill_work+0x6e/0xb8
RSP <ffff88011ff7fed0>
CR2: 0000000000000007
This patch provides a function to purge all timewait sockets related
to a network namespace. The life cycle of a timewait socket is not tied
to the network namespace, which means the timewait sockets stay alive
even after the network namespace dies. Timewait sockets exist to avoid
receiving duplicate packets from the network; once the network namespace
is freed, its network stack is removed, so there is no chance of receiving
any packets from the outside world. Furthermore, having a pending
destruction timer on these sockets after the network namespace has been
freed is not safe and will lead to an oops when the timer callback tries
to access data belonging to the namespace, as for example in:
inet_twdr_do_twkill_work
-> NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
Purging the timewait sockets at the network namespace destruction will:
1) speed up memory freeing for the namespace
2) fix kernel panic on asynchronous timewait destruction
Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com>
Acked-by: Denis V. Lunev <den@openvz.org>
Acked-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-09-09 00:17:27 +04:00
inet_twsk_deschedule ( tw , twdr ) ;
inet_twsk_put ( tw ) ;
goto restart ;
}
2008-11-21 07:39:09 +03:00
spin_unlock ( lock ) ;
netns : fix kernel panic in timewait socket destruction
How to reproduce ?
- create a network namespace
- use tcp protocol and get timewait socket
- exit the network namespace
- after a moment (when the timewait socket is destroyed), the kernel
panics.
# BUG: unable to handle kernel NULL pointer dereference at
0000000000000007
IP: [<ffffffff821e394d>] inet_twdr_do_twkill_work+0x6e/0xb8
PGD 119985067 PUD 11c5c0067 PMD 0
Oops: 0000 [1] SMP
CPU 1
Modules linked in: ipv6 button battery ac loop dm_mod tg3 libphy ext3 jbd
edd fan thermal processor thermal_sys sg sata_svw libata dock serverworks
sd_mod scsi_mod ide_disk ide_core [last unloaded: freq_table]
Pid: 0, comm: swapper Not tainted 2.6.27-rc2 #3
RIP: 0010:[<ffffffff821e394d>] [<ffffffff821e394d>]
inet_twdr_do_twkill_work+0x6e/0xb8
RSP: 0018:ffff88011ff7fed0 EFLAGS: 00010246
RAX: ffffffffffffffff RBX: ffffffff82339420 RCX: ffff88011ff7ff30
RDX: 0000000000000001 RSI: ffff88011a4d03c0 RDI: ffff88011ac2fc00
RBP: ffffffff823392e0 R08: 0000000000000000 R09: ffff88002802a200
R10: ffff8800a5c4b000 R11: ffffffff823e4080 R12: ffff88011ac2fc00
R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000000
FS: 0000000041cbd940(0000) GS:ffff8800bff839c0(0000)
knlGS:0000000000000000
CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 0000000000000007 CR3: 00000000bd87c000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 0, threadinfo ffff8800bff9e000, task
ffff88011ff76690)
Stack: ffffffff823392e0 0000000000000100 ffffffff821e3a3a
0000000000000008
0000000000000000 ffffffff821e3a61 ffff8800bff7c000 ffffffff8203c7e7
ffff88011ff7ff10 ffff88011ff7ff10 0000000000000021 ffffffff82351108
Call Trace:
<IRQ> [<ffffffff821e3a3a>] ? inet_twdr_hangman+0x0/0x9e
[<ffffffff821e3a61>] ? inet_twdr_hangman+0x27/0x9e
[<ffffffff8203c7e7>] ? run_timer_softirq+0x12c/0x193
[<ffffffff820390d1>] ? __do_softirq+0x5e/0xcd
[<ffffffff8200d08c>] ? call_softirq+0x1c/0x28
[<ffffffff8200e611>] ? do_softirq+0x2c/0x68
[<ffffffff8201a055>] ? smp_apic_timer_interrupt+0x8e/0xa9
[<ffffffff8200cad6>] ? apic_timer_interrupt+0x66/0x70
<EOI> [<ffffffff82011f4c>] ? default_idle+0x27/0x3b
[<ffffffff8200abbd>] ? cpu_idle+0x5f/0x7d
Code: e8 01 00 00 4c 89 e7 41 ff c5 e8 8d fd ff ff 49 8b 44 24 38 4c 89 e7
65 8b 14 25 24 00 00 00 89 d2 48 8b 80 e8 00 00 00 48 f7 d0 <48> 8b 04 d0
48 ff 40 58 e8 fc fc ff ff 48 89 df e8 c0 5f 04 00
RIP [<ffffffff821e394d>] inet_twdr_do_twkill_work+0x6e/0xb8
RSP <ffff88011ff7fed0>
CR2: 0000000000000007
This patch provides a function to purge all timewait sockets related
to a network namespace. The life cycle of a timewait socket is not tied
to the network namespace, which means the timewait sockets stay alive
even after the network namespace dies. Timewait sockets exist to avoid
receiving duplicate packets from the network; once the network namespace
is freed, its network stack is removed, so there is no chance of receiving
any packets from the outside world. Furthermore, having a pending
destruction timer on these sockets after the network namespace has been
freed is not safe and will lead to an oops when the timer callback tries
to access data belonging to the namespace, as for example in:
inet_twdr_do_twkill_work
-> NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
Purging the timewait sockets at the network namespace destruction will:
1) speed up memory freeing for the namespace
2) fix kernel panic on asynchronous timewait destruction
Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com>
Acked-by: Denis V. Lunev <den@openvz.org>
Acked-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-09-09 00:17:27 +04:00
}
local_bh_enable ( ) ;
}
EXPORT_SYMBOL_GPL ( inet_twsk_purge ) ;