2005-08-09 20:09:30 -07:00
/*
* INET An implementation of the TCP / IP protocol suite for the LINUX
* operating system . INET is implemented using the BSD Socket
* interface as the means of communication with the user level .
*
* Definitions for a generic INET TIMEWAIT sock
*
* From code originally in net / tcp . h
*
* This program is free software ; you can redistribute it and / or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation ; either version
* 2 of the License , or ( at your option ) any later version .
*/
# ifndef _INET_TIMEWAIT_SOCK_
# define _INET_TIMEWAIT_SOCK_
2008-10-18 17:37:51 +02:00
# include <linux/kmemcheck.h>
2005-08-09 20:09:30 -07:00
# include <linux/list.h>
2005-10-10 21:25:23 -07:00
# include <linux/module.h>
2005-08-09 20:44:40 -07:00
# include <linux/timer.h>
2005-08-09 20:09:30 -07:00
# include <linux/types.h>
2005-08-09 20:44:40 -07:00
# include <linux/workqueue.h>
2005-08-09 20:09:30 -07:00
2005-12-27 02:43:12 -02:00
# include <net/inet_sock.h>
2005-08-09 20:09:30 -07:00
# include <net/sock.h>
# include <net/tcp_states.h>
2005-12-13 23:25:19 -08:00
# include <net/timewait_sock.h>
2005-08-09 20:09:30 -07:00
# include <asm/atomic.h>
2005-08-09 20:44:40 -07:00
struct inet_hashinfo ;
/*
 * Geometry of the short-time timewait calendar: INET_TWDR_RECYCLE_SLOTS is
 * 2^INET_TWDR_RECYCLE_SLOTS_LOG, i.e. 32 slots.
 */
# define INET_TWDR_RECYCLE_SLOTS_LOG 5
# define INET_TWDR_RECYCLE_SLOTS (1 << INET_TWDR_RECYCLE_SLOTS_LOG)
/*
 * If time > 4 sec, it is the "slow" path and no recycling is required,
 * so the tick is selected to give a range of about 4 seconds.
 *
 * For HZ in (2^(k-1), 2^k] the ladder below yields
 * INET_TWDR_RECYCLE_TICK = k + 2 - INET_TWDR_RECYCLE_SLOTS_LOG.
 * HZ values of 16 or less, or above 4096, are rejected at compile time.
 *
 * NOTE(review): assuming the tick is used as a shift count on jiffies, the
 * whole calendar spans INET_TWDR_RECYCLE_SLOTS << INET_TWDR_RECYCLE_TICK
 * = 2^(k+2) jiffies, i.e. at least 4 and less than 8 seconds - confirm
 * against the users of this macro.
 */
# if HZ <= 16 || HZ > 4096
# error Unsupported: HZ <= 16 or HZ > 4096
# elif HZ <= 32
# define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
# elif HZ <= 64
# define INET_TWDR_RECYCLE_TICK (6 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
# elif HZ <= 128
# define INET_TWDR_RECYCLE_TICK (7 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
# elif HZ <= 256
# define INET_TWDR_RECYCLE_TICK (8 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
# elif HZ <= 512
# define INET_TWDR_RECYCLE_TICK (9 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
# elif HZ <= 1024
# define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
# elif HZ <= 2048
# define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
# else
# define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
# endif
/*
 * TIME_WAIT reaping mechanism.
 *
 * INET_TWDR_TWKILL_SLOTS sizes the cells[] array of
 * struct inet_timewait_death_row.
 * NOTE(review): INET_TWDR_TWKILL_QUOTA is not used in this header; it is
 * presumably a per-pass limit for the reaper - confirm in the .c file.
 */
# define INET_TWDR_TWKILL_SLOTS 8 /* Please keep this a power of 2. */
# define INET_TWDR_TWKILL_QUOTA 100
/*
 * Bookkeeping shared by the timewait sockets of one hash table (see the
 * hashinfo back-pointer). It holds two sets of list heads: twcal_row[] for
 * the short-time calendar and cells[] for the reaper, each with its own
 * timer_list.
 * NOTE(review): death_lock presumably serialises access to both sets of
 * lists and to the counters - confirm against the users.
 */
struct inet_timewait_death_row {
/* Short-time timewait calendar */
int twcal_hand ;
2007-03-05 13:32:48 -08:00
unsigned long twcal_jiffie ;
2005-08-09 20:44:40 -07:00
struct timer_list twcal_timer ;
/* One list head per calendar slot (INET_TWDR_RECYCLE_SLOTS of them). */
struct hlist_head twcal_row [ INET_TWDR_RECYCLE_SLOTS ] ;
spinlock_t death_lock ;
int tw_count ;
int period ;
u32 thread_slots ;
struct work_struct twkill_work ;
struct timer_list tw_timer ;
int slot ;
/* One list head per reaper slot (INET_TWDR_TWKILL_SLOTS of them). */
struct hlist_head cells [ INET_TWDR_TWKILL_SLOTS ] ;
struct inet_hashinfo * hashinfo ;
/* NOTE(review): the sysctl_ prefix suggests administrator tunables - confirm. */
int sysctl_tw_recycle ;
int sysctl_max_tw_buckets ;
} ;
2005-08-09 20:45:03 -07:00
extern void inet_twdr_hangman ( unsigned long data ) ;
2006-11-22 14:55:48 +00:00
extern void inet_twdr_twkill_work ( struct work_struct * work ) ;
2005-08-09 20:45:03 -07:00
extern void inet_twdr_twcal_tick ( unsigned long data ) ;
2005-08-09 20:09:30 -07:00
/*
 * Alignment, in bytes, applied to tw_daddr in struct inet_timewait_sock:
 * 8 when BITS_PER_LONG is 64, otherwise 4.
 * NOTE(review): presumably this lets 64-bit lookups compare the adjacent
 * tw_daddr/tw_rcv_saddr pair as a single aligned 64-bit word - confirm
 * against the lookup macros.
 */
# if (BITS_PER_LONG == 64)
# define INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES 8
# else
# define INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES 4
# endif
struct inet_bind_bucket ;
/*
* This is a TIME_WAIT sock . It works around the memory consumption
* problems of sockets in such a state on heavily loaded servers , but
* without violating the protocol specification .
*/
struct inet_timewait_sock {
/*
* Now struct sock also uses sock_common, so please do not
* add anything before this first member (__tw_common) --acme
*/
struct sock_common __tw_common ;
# define tw_family __tw_common.skc_family
# define tw_state __tw_common.skc_state
# define tw_reuse __tw_common.skc_reuse
# define tw_bound_dev_if __tw_common.skc_bound_dev_if
2008-11-16 19:40:17 -08:00
# define tw_node __tw_common.skc_nulls_node
2005-08-09 20:09:30 -07:00
# define tw_bind_node __tw_common.skc_bind_node
# define tw_refcnt __tw_common.skc_refcnt
[INET]: speedup inet (tcp/dccp) lookups
Arnaldo and I agreed it could be applied now, because I have other
pending patches depending on this one (Thank you Arnaldo)
(The other important patch moves skc_refcnt into a separate cache line,
so that SMP/NUMA performance doesn't suffer from cache line ping-pong)
1) First some performance data :
--------------------------------
tcp_v4_rcv() wastes a *lot* of time in __inet_lookup_established()
The most time critical code is :
sk_for_each(sk, node, &head->chain) {
if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif))
goto hit; /* You sunk my battleship! */
}
The sk_for_each() does use prefetch() hints, but only the beginning of
"struct sock" is prefetched.
As INET_MATCH's first comparison uses inet_sk(__sk)->daddr, which is far
away from the beginning of "struct sock", it has to bring a cold cache
line into the CPU cache. Each iteration has to use at least 2 cache
lines.
This can be problematic if some chains are very long.
2) The goal
-----------
The idea I had is to change things so that INET_MATCH() may return
FALSE in 99% of cases only using the data already in the CPU cache,
using one cache line per iteration.
3) Description of the patch
---------------------------
Adds a new 'unsigned int skc_hash' field in 'struct sock_common',
filling a 32 bits hole on 64 bits platform.
struct sock_common {
unsigned short skc_family;
volatile unsigned char skc_state;
unsigned char skc_reuse;
int skc_bound_dev_if;
struct hlist_node skc_node;
struct hlist_node skc_bind_node;
atomic_t skc_refcnt;
+ unsigned int skc_hash;
struct proto *skc_prot;
};
Store in this 32-bit field the full hash, not masked by (ehash_size -
1). Using this full hash as the first comparison done in INET_MATCH
lets us immediately skip the element without touching a second cache
line in case of a miss.
Suppress the sk_hashent/tw_hashent fields since skc_hash (aliased to
sk_hash and tw_hash) already contains the slot number if we mask with
(ehash_size - 1)
File include/net/inet_hashtables.h
64 bits platforms :
#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
(((__sk)->sk_hash == (__hash)) && \
((*((__u64 *)&(inet_sk(__sk)->daddr)))== (__cookie)) && \
((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
32bits platforms:
#define TCP_IPV4_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
(((__sk)->sk_hash == (__hash)) && \
(inet_sk(__sk)->daddr == (__saddr)) && \
(inet_sk(__sk)->rcv_saddr == (__daddr)) && \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
- Adds a prefetch(head->chain.first) in
__inet_lookup_established()/__tcp_v4_check_established() and
__inet6_lookup_established()/__tcp_v6_check_established() and
__dccp_v4_check_established() to bring into cache the first element of the
list, before the {read|write}_lock(&head->lock);
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-10-03 14:13:38 -07:00
# define tw_hash __tw_common.skc_hash
2005-08-09 20:09:30 -07:00
# define tw_prot __tw_common.skc_prot
2007-09-12 11:58:02 +02:00
# define tw_net __tw_common.skc_net
2008-02-03 04:08:26 -08:00
int tw_timeout ;
2005-08-09 20:09:30 -07:00
volatile unsigned char tw_substate ;
/* 3 bits hole, try to pack */
unsigned char tw_rcv_wscale ;
/* Socket demultiplex comparisons on incoming packets. */
/* these five are in inet_sock */
2006-09-27 18:43:50 -07:00
__be16 tw_sport ;
__be32 tw_daddr __attribute__ ( ( aligned ( INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES ) ) ) ;
__be32 tw_rcv_saddr ;
__be16 tw_dport ;
2005-08-09 20:09:30 -07:00
__u16 tw_num ;
2008-10-18 17:37:51 +02:00
kmemcheck_bitfield_begin ( flags ) ;
2005-08-09 20:09:30 -07:00
/* And these are ours. */
2009-10-18 22:48:51 +00:00
unsigned int tw_ipv6only : 1 ,
tw_transparent : 1 ,
tw_pad : 14 , /* 14 bits hole */
tw_ipv6_offset : 16 ;
2008-10-18 17:37:51 +02:00
kmemcheck_bitfield_end ( flags ) ;
2005-08-09 20:09:30 -07:00
unsigned long tw_ttd ;
struct inet_bind_bucket * tw_tb ;
struct hlist_node tw_death_node ;
} ;
2008-11-16 19:40:17 -08:00
/*
 * inet_twsk_add_node_rcu - link a timewait socket into a nulls list.
 * @tw: timewait socket; its tw_node (skc_nulls_node) is the link that is added
 * @list: nulls-list head the node is inserted at
 *
 * Thin wrapper around hlist_nulls_add_head_rcu(); no lock is taken here.
 * NOTE(review): locking/RCU requirements are those of that helper - confirm
 * at the call sites.
 */
static inline void inet_twsk_add_node_rcu ( struct inet_timewait_sock * tw ,
struct hlist_nulls_head * list )
2005-08-09 20:09:30 -07:00
{
2008-11-16 19:40:17 -08:00
hlist_nulls_add_head_rcu ( & tw - > tw_node , list ) ;
2005-08-09 20:09:30 -07:00
}
static inline void inet_twsk_add_bind_node ( struct inet_timewait_sock * tw ,
struct hlist_head * list )
{
hlist_add_head ( & tw - > tw_bind_node , list ) ;
}
/*
 * inet_twsk_dead_hashed - test whether tw's death-row node is on a list.
 * @tw: timewait socket to inspect (not modified)
 *
 * Returns 1 when hlist_unhashed() is false for tw_death_node, 0 otherwise.
 */
static inline int inet_twsk_dead_hashed ( const struct inet_timewait_sock * tw )
{
2006-04-28 15:21:23 -07:00
return ! hlist_unhashed ( & tw - > tw_death_node ) ;
2005-08-09 20:09:30 -07:00
}
static inline void inet_twsk_dead_node_init ( struct inet_timewait_sock * tw )
{
tw - > tw_death_node . pprev = NULL ;
}
static inline void __inet_twsk_del_dead_node ( struct inet_timewait_sock * tw )
{
__hlist_del ( & tw - > tw_death_node ) ;
inet_twsk_dead_node_init ( tw ) ;
}
/*
 * inet_twsk_del_dead_node - take tw off its death-row list if it is queued.
 * @tw: timewait socket to unlink
 *
 * Returns 1 when inet_twsk_dead_hashed() reported the node as queued and it
 * was removed with __inet_twsk_del_dead_node(), 0 when nothing was done.
 */
static inline int inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
{
	/* Not queued: nothing to unlink. */
	if (!inet_twsk_dead_hashed(tw))
		return 0;

	__inet_twsk_del_dead_node(tw);
	return 1;
}
/*
 * Iteration helpers over timewait sockets.
 *
 * inet_twsk_for_each() walks a nulls list whose entries are linked through
 * tw_node, using hlist_nulls_for_each_entry().
 * inet_twsk_for_each_inmate() walks an hlist ("jail") whose entries are
 * linked through tw_death_node, using hlist_for_each_entry().
 * inet_twsk_for_each_inmate_safe() does the same with
 * hlist_for_each_entry_safe(), which takes an extra cursor ("safe").
 * NOTE(review): the _safe form presumably tolerates removal of the current
 * entry during the walk - confirm in list.h.
 */
# define inet_twsk_for_each(tw, node, head) \
2008-11-16 19:40:17 -08:00
hlist_nulls_for_each_entry ( tw , node , head , tw_node )
2005-08-09 20:09:30 -07:00
# define inet_twsk_for_each_inmate(tw, node, jail) \
hlist_for_each_entry ( tw , node , jail , tw_death_node )
# define inet_twsk_for_each_inmate_safe(tw, node, safe, jail) \
hlist_for_each_entry_safe ( tw , node , safe , jail , tw_death_node )
/*
 * inet_twsk - view a struct sock pointer as a timewait socket.
 * @sk: socket pointer to reinterpret
 *
 * Pure pointer cast (the const qualifier is dropped); nothing is
 * dereferenced or checked here.
 */
static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
{
	struct inet_timewait_sock *tw = (struct inet_timewait_sock *)sk;

	return tw;
}
2006-09-27 18:44:10 -07:00
/*
 * inet_rcv_saddr - fetch the receive source address of a socket.
 * @sk: socket, either a regular inet socket or a timewait socket
 *
 * When sk_state is not TCP_TIME_WAIT (the case marked likely()), returns
 * inet_sk(sk)->inet_rcv_saddr; otherwise the socket is viewed through
 * inet_twsk() and tw_rcv_saddr is returned.
 */
static inline __be32 inet_rcv_saddr ( const struct sock * sk )
2005-08-09 20:09:30 -07:00
{
return likely ( sk - > sk_state ! = TCP_TIME_WAIT ) ?
2009-10-15 06:30:45 +00:00
inet_sk ( sk ) - > inet_rcv_saddr : inet_twsk ( sk ) - > tw_rcv_saddr ;
2005-08-09 20:09:30 -07:00
}
2007-12-20 15:32:54 -08:00
extern void inet_twsk_put ( struct inet_timewait_sock * tw ) ;
2005-08-09 20:09:46 -07:00
2009-12-02 22:31:19 +00:00
extern int inet_twsk_unhash ( struct inet_timewait_sock * tw ) ;
2009-12-04 03:47:42 +00:00
extern int inet_twsk_bind_unhash ( struct inet_timewait_sock * tw ,
struct inet_hashinfo * hashinfo ) ;
2005-08-09 20:09:59 -07:00
extern struct inet_timewait_sock * inet_twsk_alloc ( const struct sock * sk ,
const int state ) ;
2005-08-09 20:09:46 -07:00
extern void __inet_twsk_hashdance ( struct inet_timewait_sock * tw ,
struct sock * sk ,
struct inet_hashinfo * hashinfo ) ;
2005-08-09 20:45:03 -07:00
extern void inet_twsk_schedule ( struct inet_timewait_sock * tw ,
struct inet_timewait_death_row * twdr ,
const int timeo , const int timewait_len ) ;
extern void inet_twsk_deschedule ( struct inet_timewait_sock * tw ,
struct inet_timewait_death_row * twdr ) ;
2008-03-26 02:26:21 +09:00
2009-12-03 02:29:09 +00:00
extern void inet_twsk_purge ( struct inet_hashinfo * hashinfo ,
netns : fix kernel panic in timewait socket destruction
How to reproduce ?
- create a network namespace
- use tcp protocol and get timewait socket
- exit the network namespace
- after a moment (when the timewait socket is destroyed), the kernel
panics.
# BUG: unable to handle kernel NULL pointer dereference at
0000000000000007
IP: [<ffffffff821e394d>] inet_twdr_do_twkill_work+0x6e/0xb8
PGD 119985067 PUD 11c5c0067 PMD 0
Oops: 0000 [1] SMP
CPU 1
Modules linked in: ipv6 button battery ac loop dm_mod tg3 libphy ext3 jbd
edd fan thermal processor thermal_sys sg sata_svw libata dock serverworks
sd_mod scsi_mod ide_disk ide_core [last unloaded: freq_table]
Pid: 0, comm: swapper Not tainted 2.6.27-rc2 #3
RIP: 0010:[<ffffffff821e394d>] [<ffffffff821e394d>]
inet_twdr_do_twkill_work+0x6e/0xb8
RSP: 0018:ffff88011ff7fed0 EFLAGS: 00010246
RAX: ffffffffffffffff RBX: ffffffff82339420 RCX: ffff88011ff7ff30
RDX: 0000000000000001 RSI: ffff88011a4d03c0 RDI: ffff88011ac2fc00
RBP: ffffffff823392e0 R08: 0000000000000000 R09: ffff88002802a200
R10: ffff8800a5c4b000 R11: ffffffff823e4080 R12: ffff88011ac2fc00
R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000000
FS: 0000000041cbd940(0000) GS:ffff8800bff839c0(0000)
knlGS:0000000000000000
CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 0000000000000007 CR3: 00000000bd87c000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 0, threadinfo ffff8800bff9e000, task
ffff88011ff76690)
Stack: ffffffff823392e0 0000000000000100 ffffffff821e3a3a
0000000000000008
0000000000000000 ffffffff821e3a61 ffff8800bff7c000 ffffffff8203c7e7
ffff88011ff7ff10 ffff88011ff7ff10 0000000000000021 ffffffff82351108
Call Trace:
<IRQ> [<ffffffff821e3a3a>] ? inet_twdr_hangman+0x0/0x9e
[<ffffffff821e3a61>] ? inet_twdr_hangman+0x27/0x9e
[<ffffffff8203c7e7>] ? run_timer_softirq+0x12c/0x193
[<ffffffff820390d1>] ? __do_softirq+0x5e/0xcd
[<ffffffff8200d08c>] ? call_softirq+0x1c/0x28
[<ffffffff8200e611>] ? do_softirq+0x2c/0x68
[<ffffffff8201a055>] ? smp_apic_timer_interrupt+0x8e/0xa9
[<ffffffff8200cad6>] ? apic_timer_interrupt+0x66/0x70
<EOI> [<ffffffff82011f4c>] ? default_idle+0x27/0x3b
[<ffffffff8200abbd>] ? cpu_idle+0x5f/0x7d
Code: e8 01 00 00 4c 89 e7 41 ff c5 e8 8d fd ff ff 49 8b 44 24 38 4c 89 e7
65 8b 14 25 24 00 00 00 89 d2 48 8b 80 e8 00 00 00 48 f7 d0 <48> 8b 04 d0
48 ff 40 58 e8 fc fc ff ff 48 89 df e8 c0 5f 04 00
RIP [<ffffffff821e394d>] inet_twdr_do_twkill_work+0x6e/0xb8
RSP <ffff88011ff7fed0>
CR2: 0000000000000007
This patch provides a function to purge all timewait sockets related
to a network namespace. The timewait sockets' life cycle is not tied to
the network namespace, which means the timewait sockets stay alive while
the network namespace dies. The timewait sockets exist to avoid
receiving a duplicate packet from the network; if the network namespace is
freed, the network stack is removed, so there is no chance of receiving any
packets from the outside world. Furthermore, having a pending destruction timer
on these sockets with a network namespace freed is not safe and will lead
to an oops if the timer callback tries to access data belonging to
the namespace, as for example in:
inet_twdr_do_twkill_work
-> NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
Purging the timewait sockets at the network namespace destruction will:
1) speed up memory freeing for the namespace
2) fix kernel panic on asynchronous timewait destruction
Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com>
Acked-by: Denis V. Lunev <den@openvz.org>
Acked-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-09-08 13:17:27 -07:00
struct inet_timewait_death_row * twdr , int family ) ;
2008-03-26 02:26:21 +09:00
/*
 * twsk_net - network namespace of a timewait socket.
 * @twsk: timewait socket to query
 *
 * With CONFIG_NET_NS the tw_net pointer is read through rcu_dereference();
 * without it the address of init_net is returned and @twsk is not examined.
 */
static inline
struct net * twsk_net ( const struct inet_timewait_sock * twsk )
{
# ifdef CONFIG_NET_NS
2009-12-03 02:29:09 +00:00
return rcu_dereference ( twsk - > tw_net ) ;
2008-03-26 02:26:21 +09:00
# else
return & init_net ;
# endif
}
/*
 * twsk_net_set - record the network namespace of a timewait socket.
 * @twsk: timewait socket to update
 * @net: namespace pointer to store
 *
 * With CONFIG_NET_NS the pointer is published into tw_net with
 * rcu_assign_pointer(); without it the function body is empty.
 */
static inline
2008-03-26 00:48:17 -07:00
void twsk_net_set ( struct inet_timewait_sock * twsk , struct net * net )
2008-03-26 02:26:21 +09:00
{
# ifdef CONFIG_NET_NS
2009-12-03 02:29:09 +00:00
rcu_assign_pointer ( twsk - > tw_net , net ) ;
2008-03-26 02:26:21 +09:00
# endif
}
2005-08-09 20:09:30 -07:00
# endif /* _INET_TIMEWAIT_SOCK_ */