2005-04-17 02:20:36 +04:00
/*
2005-08-12 19:51:49 +04:00
* inet_diag . c Module for monitoring INET transport protocols sockets .
2005-04-17 02:20:36 +04:00
*
* Authors : Alexey Kuznetsov , < kuznet @ ms2 . inr . ac . ru >
*
* This program is free software ; you can redistribute it and / or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation ; either version
* 2 of the License , or ( at your option ) any later version .
*/
2007-08-29 02:50:33 +04:00
# include <linux/kernel.h>
2005-04-17 02:20:36 +04:00
# include <linux/module.h>
# include <linux/types.h>
# include <linux/fcntl.h>
# include <linux/random.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and tries to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 11:04:11 +03:00
# include <linux/slab.h>
2005-04-17 02:20:36 +04:00
# include <linux/cache.h>
# include <linux/init.h>
# include <linux/time.h>
# include <net/icmp.h>
# include <net/tcp.h>
# include <net/ipv6.h>
# include <net/inet_common.h>
2005-08-12 16:19:38 +04:00
# include <net/inet_connection_sock.h>
# include <net/inet_hashtables.h>
# include <net/inet_timewait_sock.h>
# include <net/inet6_hashtables.h>
2007-03-26 10:06:12 +04:00
# include <net/netlink.h>
2005-04-17 02:20:36 +04:00
# include <linux/inet.h>
# include <linux/stddef.h>
2005-08-12 19:56:38 +04:00
# include <linux/inet_diag.h>
2011-12-06 11:58:03 +04:00
# include <linux/sock_diag.h>
2005-04-17 02:20:36 +04:00
2005-08-12 16:27:49 +04:00
/*
 * Table of diag handlers, indexed by protocol number (it is looked up as
 * inet_diag_table[proto] and inet_diag_table[req->sdiag_protocol]).
 * An empty slot is NULL.
 */
static const struct inet_diag_handler * * inet_diag_table ;
2005-08-12 19:51:49 +04:00
/*
 * Address/port/family summary of a single socket.
 *
 * NOTE(review): the code that fills and consumes this struct is outside
 * this chunk; presumably it is the tuple that dump filters are evaluated
 * against - confirm before relying on that.
 */
struct inet_diag_entry {
2015-03-10 17:15:53 +03:00
/* Pointers to the source and destination address words (big endian). */
const __be32 * saddr ;
const __be32 * daddr ;
2005-04-17 02:20:36 +04:00
/* NOTE(review): byte order of the ports is not visible here - confirm. */
u16 sport ;
u16 dport ;
u16 family ;
u16 userlocks ;
} ;
2007-12-03 07:51:25 +03:00
/*
 * Taken by inet_diag_lock_handler() and released by
 * inet_diag_unlock_handler(); held while a handler from inet_diag_table
 * is in use.
 */
static DEFINE_MUTEX ( inet_diag_table_mutex ) ;
2011-12-06 12:05:24 +04:00
/*
 * Look up the diag handler for @proto and take inet_diag_table_mutex.
 *
 * If the table slot is empty, a module load is requested first (without
 * the mutex held) and the slot is checked again once the mutex is taken.
 *
 * Returns the handler, or ERR_PTR(-ENOENT) if the slot is still empty.
 * The mutex is held on return in BOTH cases, so every call must be paired
 * with inet_diag_unlock_handler(), including on the error path.
 *
 * NOTE(review): @proto indexes inet_diag_table with no range check in this
 * function - confirm that all callers stay within the table's bounds.
 * NOTE(review): the request_module() format string appears here with
 * padding spaces just inside the quotes; that looks like a formatting
 * artefact of this listing rather than the real alias - confirm.
 */
static const struct inet_diag_handler * inet_diag_lock_handler ( int proto )
2007-12-03 07:51:25 +03:00
{
2011-12-06 12:05:24 +04:00
/* Unlocked peek: only decides whether to try loading a module. */
if ( ! inet_diag_table [ proto ] )
2011-12-15 06:43:27 +04:00
request_module ( " net-pf-%d-proto-%d-type-%d-%d " , PF_NETLINK ,
NETLINK_SOCK_DIAG , AF_INET , proto ) ;
2007-12-03 07:51:25 +03:00
mutex_lock ( & inet_diag_table_mutex ) ;
2011-12-06 12:05:24 +04:00
/* Authoritative check, now under the mutex; note: no unlock on failure. */
if ( ! inet_diag_table [ proto ] )
2007-12-03 07:51:25 +03:00
return ERR_PTR ( - ENOENT ) ;
2011-12-06 12:05:24 +04:00
return inet_diag_table [ proto ] ;
2007-12-03 07:51:25 +03:00
}
2015-03-10 17:15:53 +03:00
/*
 * Release inet_diag_table_mutex, undoing inet_diag_lock_handler().
 *
 * @handler is not used by the body, so it is safe to pass the ERR_PTR
 * value that inet_diag_lock_handler() returns on failure.
 */
static void inet_diag_unlock_handler ( const struct inet_diag_handler * handler )
2007-12-03 07:51:25 +03:00
{
mutex_unlock ( & inet_diag_table_mutex ) ;
}
2015-03-14 01:51:12 +03:00
static void inet_diag_msg_common_fill ( struct inet_diag_msg * r , struct sock * sk )
{
r - > idiag_family = sk - > sk_family ;
r - > id . idiag_sport = htons ( sk - > sk_num ) ;
r - > id . idiag_dport = sk - > sk_dport ;
r - > id . idiag_if = sk - > sk_bound_dev_if ;
sock_diag_save_cookie ( sk , r - > id . idiag_cookie ) ;
# if IS_ENABLED(CONFIG_IPV6)
if ( sk - > sk_family = = AF_INET6 ) {
* ( struct in6_addr * ) r - > id . idiag_src = sk - > sk_v6_rcv_saddr ;
* ( struct in6_addr * ) r - > id . idiag_dst = sk - > sk_v6_daddr ;
} else
# endif
{
memset ( & r - > id . idiag_src , 0 , sizeof ( r - > id . idiag_src ) ) ;
memset ( & r - > id . idiag_dst , 0 , sizeof ( r - > id . idiag_dst ) ) ;
r - > id . idiag_src [ 0 ] = sk - > sk_rcv_saddr ;
r - > id . idiag_dst [ 0 ] = sk - > sk_daddr ;
}
}
2015-03-13 19:49:59 +03:00
/*
 * Size budget for a single-socket reply: the sum of the netlink attribute
 * sizes listed below plus 64 spare bytes (the source does not say what the
 * spare bytes are for).  inet_diag_dump_one_icsk() passes this value to
 * nlmsg_new() when allocating the reply skb.
 */
static size_t inet_sk_attr_size(void)
{
	size_t total = 0;

	total += nla_total_size(sizeof(struct tcp_info));
	total += nla_total_size(1);	/* INET_DIAG_SHUTDOWN */
	total += nla_total_size(1);	/* INET_DIAG_TOS */
	total += nla_total_size(1);	/* INET_DIAG_TCLASS */
	total += nla_total_size(sizeof(struct inet_diag_meminfo));
	total += nla_total_size(sizeof(struct inet_diag_msg));
	total += nla_total_size(SK_MEMINFO_VARS * sizeof(u32));
	total += nla_total_size(TCP_CA_NAME_MAX);
	total += nla_total_size(sizeof(struct tcpvegas_info));
	total += 64;

	return total;
}
2011-12-09 10:23:00 +04:00
/*
 * Append one inet_diag_msg reply for a full socket to @skb.
 *
 * @sk:          socket to describe; must be a full socket (BUG otherwise)
 * @icsk:        connection-socket view of @sk, or NULL.  When NULL the
 *               timer, INET_DIAG_INFO, INET_DIAG_CONG and congestion-control
 *               sections are skipped and the handler's idiag_get_info() is
 *               called with a NULL info buffer.
 * @skb:         reply buffer the message is built in
 * @req:         request; idiag_ext selects the optional attributes and
 *               sdiag_protocol selects the handler from inet_diag_table
 * @user_ns:     namespace used to translate the socket owner's uid
 * @portid, @seq, @nlmsg_flags: passed through to nlmsg_put()
 * @unlh:        request header; its nlmsg_type is reused for the reply
 *
 * Returns 0 on success.  Returns -EMSGSIZE if nlmsg_put() fails or if any
 * attribute cannot be added; in the latter case the partially built
 * message is cancelled so @skb is left as it was on entry.
 *
 * The handler for req->sdiag_protocol must already be present in
 * inet_diag_table (BUG otherwise).
 */
int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
		      struct sk_buff *skb, const struct inet_diag_req_v2 *req,
		      struct user_namespace *user_ns,
		      u32 portid, u32 seq, u16 nlmsg_flags,
		      const struct nlmsghdr *unlh)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct tcp_congestion_ops *ca_ops;
	const struct inet_diag_handler *handler;
	int ext = req->idiag_ext;
	struct inet_diag_msg *r;
	struct nlmsghdr *nlh;
	struct nlattr *attr;
	void *info = NULL;

	handler = inet_diag_table[req->sdiag_protocol];
	BUG_ON(!handler);

	nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
			nlmsg_flags);
	if (!nlh)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	BUG_ON(!sk_fullsock(sk));

	inet_diag_msg_common_fill(r, sk);
	r->idiag_state = sk->sk_state;
	r->idiag_timer = 0;
	r->idiag_retrans = 0;
	/*
	 * The message payload is not zeroed by this function, and on the
	 * !icsk path below nothing here writes idiag_expires.  Clear it up
	 * front so it can never carry uninitialised skb memory to user space,
	 * whatever the handler's idiag_get_info() does or does not set.
	 */
	r->idiag_expires = 0;

	if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
		goto errout;

	/* IPv6 dual-stack sockets use inet->tos for IPv4 connections,
	 * hence this needs to be included regardless of socket family.
	 */
	if (ext & (1 << (INET_DIAG_TOS - 1)))
		if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
			goto errout;

#if IS_ENABLED(CONFIG_IPV6)
	if (r->idiag_family == AF_INET6) {
		if (ext & (1 << (INET_DIAG_TCLASS - 1)))
			if (nla_put_u8(skb, INET_DIAG_TCLASS,
				       inet6_sk(sk)->tclass) < 0)
				goto errout;

		/* The v6only flag is only reported for LISTEN/CLOSE sockets. */
		if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) &&
		    nla_put_u8(skb, INET_DIAG_SKV6ONLY, ipv6_only_sock(sk)))
			goto errout;
	}
#endif

	r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
	r->idiag_inode = sock_i_ino(sk);

	if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
		struct inet_diag_meminfo minfo = {
			.idiag_rmem = sk_rmem_alloc_get(sk),
			.idiag_wmem = sk->sk_wmem_queued,
			.idiag_fmem = sk->sk_forward_alloc,
			.idiag_tmem = sk_wmem_alloc_get(sk),
		};

		if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0)
			goto errout;
	}

	if (ext & (1 << (INET_DIAG_SKMEMINFO - 1)))
		if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
			goto errout;

	/* No connection-socket state: let the handler fill what it can. */
	if (!icsk) {
		handler->idiag_get_info(sk, r, NULL);
		goto out;
	}

#define EXPIRES_IN_MS(tmo)  DIV_ROUND_UP(((tmo) - jiffies) * 1000, HZ)

	/*
	 * idiag_timer encodes which timer is pending:
	 *   1 - retransmit / early retransmit / loss probe
	 *   4 - zero-window probe
	 *   2 - sk_timer
	 *   0 - none
	 */
	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		r->idiag_timer = 1;
		r->idiag_retrans = icsk->icsk_retransmits;
		r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		r->idiag_timer = 4;
		r->idiag_retrans = icsk->icsk_probes_out;
		r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
	} else if (timer_pending(&sk->sk_timer)) {
		r->idiag_timer = 2;
		r->idiag_retrans = icsk->icsk_probes_out;
		r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
	} else {
		r->idiag_timer = 0;
		r->idiag_expires = 0;
	}
#undef EXPIRES_IN_MS

	/*
	 * Reserve room for the protocol info attribute now; the handler
	 * writes into it through @info in idiag_get_info() below.
	 */
	if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) {
		attr = nla_reserve(skb, INET_DIAG_INFO,
				   handler->idiag_info_size);
		if (!attr)
			goto errout;

		info = nla_data(attr);
	}

	if (ext & (1 << (INET_DIAG_CONG - 1))) {
		int err = 0;

		/* icsk_ca_ops is sampled once inside an RCU read section. */
		rcu_read_lock();
		ca_ops = READ_ONCE(icsk->icsk_ca_ops);
		if (ca_ops)
			err = nla_put_string(skb, INET_DIAG_CONG, ca_ops->name);
		rcu_read_unlock();
		if (err < 0)
			goto errout;
	}

	handler->idiag_get_info(sk, r, info);

	if (sk->sk_state < TCP_TIME_WAIT) {
		union tcp_cc_info cc_info;
		size_t sz = 0;
		int cc_attr;

		/* The congestion module picks the attribute type and size. */
		rcu_read_lock();
		ca_ops = READ_ONCE(icsk->icsk_ca_ops);
		if (ca_ops && ca_ops->get_info)
			sz = ca_ops->get_info(sk, ext, &cc_attr, &cc_info);
		rcu_read_unlock();
		if (sz && nla_put(skb, cc_attr, sz, &cc_info) < 0)
			goto errout;
	}

out:
	nlmsg_end(skb, nlh);
	return 0;

errout:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2011-12-09 10:23:00 +04:00
EXPORT_SYMBOL_GPL ( inet_sk_diag_fill ) ;
/*
 * Convenience wrapper: fill a reply for @sk through inet_sk_diag_fill(),
 * passing inet_csk(sk) as the connection-socket argument.  All other
 * arguments and the return value are passed through unchanged.
 */
static int inet_csk_diag_fill(struct sock *sk,
			      struct sk_buff *skb,
			      const struct inet_diag_req_v2 *req,
			      struct user_namespace *user_ns,
			      u32 portid, u32 seq, u16 nlmsg_flags,
			      const struct nlmsghdr *unlh)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	return inet_sk_diag_fill(sk, icsk, skb, req, user_ns,
				 portid, seq, nlmsg_flags, unlh);
}
2005-04-17 02:20:36 +04:00
2015-03-12 04:53:14 +03:00
/*
 * Append an inet_diag_msg reply for a timewait socket to @skb.
 *
 * @sk must be in TCP_TIME_WAIT (BUG otherwise).  The reported state is the
 * timewait substate, the timer type is 3, and idiag_expires is the time
 * left on tw_timer in milliseconds (0 if it has already passed).  Queue
 * sizes, uid and inode are reported as 0.
 *
 * Returns 0 on success or -EMSGSIZE if nlmsg_put() fails.
 */
static int inet_twsk_diag_fill(struct sock *sk,
			       struct sk_buff *skb,
			       u32 portid, u32 seq, u16 nlmsg_flags,
			       const struct nlmsghdr *unlh)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct inet_diag_msg *msg;
	struct nlmsghdr *hdr;
	long remaining;

	hdr = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*msg),
			nlmsg_flags);
	if (!hdr)
		return -EMSGSIZE;

	msg = nlmsg_data(hdr);
	BUG_ON(tw->tw_state != TCP_TIME_WAIT);

	/* Clamp an already-expired timer to zero instead of going negative. */
	remaining = tw->tw_timer.expires - jiffies;
	if (remaining < 0)
		remaining = 0;

	inet_diag_msg_common_fill(msg, sk);

	msg->idiag_state = tw->tw_substate;
	msg->idiag_timer = 3;
	msg->idiag_retrans = 0;
	msg->idiag_expires = jiffies_to_msecs(remaining);

	/* Fields that are reported as zero for timewait sockets. */
	msg->idiag_rqueue = 0;
	msg->idiag_wqueue = 0;
	msg->idiag_uid = 0;
	msg->idiag_inode = 0;

	nlmsg_end(skb, hdr);
	return 0;
}
2015-03-16 07:12:14 +03:00
/*
 * Append an inet_diag_msg reply for a request socket to @skb.
 *
 * The state is reported as TCP_SYN_RECV with timer type 1; idiag_retrans
 * is the request's num_retrans and idiag_expires is the time left on
 * rsk_timer in milliseconds (0 if it has already passed).  Queue sizes,
 * uid and inode are reported as 0.
 *
 * Returns 0 on success or -EMSGSIZE if nlmsg_put() fails.
 */
static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
			      u32 portid, u32 seq, u16 nlmsg_flags,
			      const struct nlmsghdr *unlh)
{
	struct inet_diag_msg *msg;
	struct nlmsghdr *hdr;
	long remaining;

	hdr = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*msg),
			nlmsg_flags);
	if (!hdr)
		return -EMSGSIZE;

	msg = nlmsg_data(hdr);
	inet_diag_msg_common_fill(msg, sk);

	msg->idiag_state = TCP_SYN_RECV;
	msg->idiag_timer = 1;
	msg->idiag_retrans = inet_reqsk(sk)->num_retrans;

	/*
	 * Compile-time guarantee that the request sock's cookie sits at the
	 * same offset as struct sock's - presumably because the common fill
	 * above accesses the cookie through a struct sock pointer.
	 */
	BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
		     offsetof(struct sock, sk_cookie));

	remaining = inet_reqsk(sk)->rsk_timer.expires - jiffies;
	if (remaining >= 0)
		msg->idiag_expires = jiffies_to_msecs(remaining);
	else
		msg->idiag_expires = 0;

	/* Fields that are reported as zero for request sockets. */
	msg->idiag_rqueue = 0;
	msg->idiag_wqueue = 0;
	msg->idiag_uid = 0;
	msg->idiag_inode = 0;

	nlmsg_end(skb, hdr);
	return 0;
}
2006-01-10 01:56:56 +03:00
/*
 * Dispatch to the fill routine matching the socket's state:
 *   TCP_TIME_WAIT     -> inet_twsk_diag_fill()
 *   TCP_NEW_SYN_RECV  -> inet_req_diag_fill()
 *   anything else     -> inet_csk_diag_fill()
 *
 * @r and @user_ns are only used by the last case.  The chosen routine's
 * return value is returned unchanged.
 */
static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
			const struct inet_diag_req_v2 *r,
			struct user_namespace *user_ns,
			u32 portid, u32 seq, u16 nlmsg_flags,
			const struct nlmsghdr *unlh)
{
	switch (sk->sk_state) {
	case TCP_TIME_WAIT:
		return inet_twsk_diag_fill(sk, skb, portid, seq,
					   nlmsg_flags, unlh);
	case TCP_NEW_SYN_RECV:
		return inet_req_diag_fill(sk, skb, portid, seq,
					  nlmsg_flags, unlh);
	default:
		return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
					  nlmsg_flags, unlh);
	}
}
2015-12-16 06:30:02 +03:00
struct sock * inet_diag_find_one_icsk ( struct net * net ,
struct inet_hashinfo * hashinfo ,
const struct inet_diag_req_v2 * req )
2005-04-17 02:20:36 +04:00
{
2015-03-10 17:15:53 +03:00
struct sock * sk ;
2007-12-03 07:51:25 +03:00
2015-03-10 17:15:53 +03:00
if ( req - > sdiag_family = = AF_INET )
2016-02-10 19:50:38 +03:00
sk = inet_lookup ( net , hashinfo , NULL , 0 , req - > id . idiag_dst [ 0 ] ,
2005-08-12 19:51:49 +04:00
req - > id . idiag_dport , req - > id . idiag_src [ 0 ] ,
req - > id . idiag_sport , req - > id . idiag_if ) ;
2011-12-10 13:48:31 +04:00
# if IS_ENABLED(CONFIG_IPV6)
2016-01-21 03:25:01 +03:00
else if ( req - > sdiag_family = = AF_INET6 ) {
if ( ipv6_addr_v4mapped ( ( struct in6_addr * ) req - > id . idiag_dst ) & &
ipv6_addr_v4mapped ( ( struct in6_addr * ) req - > id . idiag_src ) )
2016-02-10 19:50:38 +03:00
sk = inet_lookup ( net , hashinfo , NULL , 0 , req - > id . idiag_dst [ 3 ] ,
2016-01-21 03:25:01 +03:00
req - > id . idiag_dport , req - > id . idiag_src [ 3 ] ,
req - > id . idiag_sport , req - > id . idiag_if ) ;
else
2016-02-10 19:50:38 +03:00
sk = inet6_lookup ( net , hashinfo , NULL , 0 ,
2016-01-21 03:25:01 +03:00
( struct in6_addr * ) req - > id . idiag_dst ,
req - > id . idiag_dport ,
( struct in6_addr * ) req - > id . idiag_src ,
req - > id . idiag_sport ,
req - > id . idiag_if ) ;
}
2005-04-17 02:20:36 +04:00
# endif
2015-03-10 17:15:53 +03:00
else
2015-12-16 06:30:02 +03:00
return ERR_PTR ( - EINVAL ) ;
2005-04-17 02:20:36 +04:00
2015-03-10 17:15:53 +03:00
if ( ! sk )
2015-12-16 06:30:02 +03:00
return ERR_PTR ( - ENOENT ) ;
2005-04-17 02:20:36 +04:00
2015-12-16 06:30:02 +03:00
if ( sock_diag_check_cookie ( sk , req - > id . idiag_cookie ) ) {
sock_gen_put ( sk ) ;
return ERR_PTR ( - ENOENT ) ;
}
return sk ;
}
EXPORT_SYMBOL_GPL ( inet_diag_find_one_icsk ) ;
int inet_diag_dump_one_icsk ( struct inet_hashinfo * hashinfo ,
struct sk_buff * in_skb ,
const struct nlmsghdr * nlh ,
const struct inet_diag_req_v2 * req )
{
struct net * net = sock_net ( in_skb - > sk ) ;
struct sk_buff * rep ;
struct sock * sk ;
int err ;
sk = inet_diag_find_one_icsk ( net , hashinfo , req ) ;
if ( IS_ERR ( sk ) )
return PTR_ERR ( sk ) ;
2005-04-17 02:20:36 +04:00
2015-03-13 19:49:59 +03:00
rep = nlmsg_new ( inet_sk_attr_size ( ) , GFP_KERNEL ) ;
2012-06-27 03:36:12 +04:00
if ( ! rep ) {
err = - ENOMEM ;
2005-04-17 02:20:36 +04:00
goto out ;
2012-06-27 03:36:12 +04:00
}
2005-04-17 02:20:36 +04:00
2011-12-06 11:59:32 +04:00
err = sk_diag_fill ( sk , rep , req ,
2013-04-17 10:46:57 +04:00
sk_user_ns ( NETLINK_CB ( in_skb ) . sk ) ,
2012-09-08 00:12:54 +04:00
NETLINK_CB ( in_skb ) . portid ,
2007-02-01 10:16:40 +03:00
nlh - > nlmsg_seq , 0 , nlh ) ;
if ( err < 0 ) {
WARN_ON ( err = = - EMSGSIZE ) ;
2012-06-27 03:36:12 +04:00
nlmsg_free ( rep ) ;
2007-02-01 10:16:40 +03:00
goto out ;
}
2012-09-08 00:12:54 +04:00
err = netlink_unicast ( net - > diag_nlsk , rep , NETLINK_CB ( in_skb ) . portid ,
2005-08-12 19:51:49 +04:00
MSG_DONTWAIT ) ;
2005-04-17 02:20:36 +04:00
if ( err > 0 )
err = 0 ;
out :
2013-10-11 19:54:49 +04:00
if ( sk )
sock_gen_put ( sk ) ;
2011-12-09 10:22:10 +04:00
return err ;
}
2011-12-09 10:23:18 +04:00
EXPORT_SYMBOL_GPL ( inet_diag_dump_one_icsk ) ;
2011-12-09 10:22:10 +04:00
2015-12-16 06:30:04 +03:00
static int inet_diag_cmd_exact ( int cmd , struct sk_buff * in_skb ,
2011-12-09 10:22:10 +04:00
const struct nlmsghdr * nlh ,
2015-03-10 17:15:54 +03:00
const struct inet_diag_req_v2 * req )
2011-12-09 10:22:10 +04:00
{
const struct inet_diag_handler * handler ;
int err ;
handler = inet_diag_lock_handler ( req - > sdiag_protocol ) ;
if ( IS_ERR ( handler ) )
err = PTR_ERR ( handler ) ;
2015-12-16 06:30:04 +03:00
else if ( cmd = = SOCK_DIAG_BY_FAMILY )
2011-12-09 10:23:18 +04:00
err = handler - > dump_one ( in_skb , nlh , req ) ;
2015-12-16 06:30:04 +03:00
else if ( cmd = = SOCK_DESTROY & & handler - > destroy )
err = handler - > destroy ( in_skb , req ) ;
else
err = - EOPNOTSUPP ;
2007-12-03 07:51:25 +03:00
inet_diag_unlock_handler ( handler ) ;
2011-12-09 10:22:10 +04:00
2005-04-17 02:20:36 +04:00
return err ;
}
2006-09-28 05:44:30 +04:00
/*
 * Compare the leading @bits bits of two big-endian word arrays.
 *
 * Whole 32-bit words are compared with memcmp(); a trailing partial
 * word is compared under a network-order mask that keeps only its
 * most significant (@bits % 32) bits.
 *
 * Returns 1 when the prefixes are equal, 0 otherwise.
 */
static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits)
{
	const int full_words = bits >> 5;
	const int tail_bits = bits & 0x1f;
	__be32 tail_mask;

	if (full_words && memcmp(a1, a2, full_words << 2))
		return 0;

	if (!tail_bits)
		return 1;

	tail_mask = htonl((0xffffffff) << (32 - tail_bits));

	return !((a1[full_words] ^ a2[full_words]) & tail_mask);
}
2011-12-09 10:21:34 +04:00
static int inet_diag_bc_run ( const struct nlattr * _bc ,
2015-03-10 17:15:53 +03:00
const struct inet_diag_entry * entry )
2005-04-17 02:20:36 +04:00
{
2011-12-09 10:21:34 +04:00
const void * bc = nla_data ( _bc ) ;
int len = nla_len ( _bc ) ;
2005-04-17 02:20:36 +04:00
while ( len > 0 ) {
int yes = 1 ;
2005-08-12 19:51:49 +04:00
const struct inet_diag_bc_op * op = bc ;
2005-04-17 02:20:36 +04:00
switch ( op - > code ) {
2005-08-12 19:51:49 +04:00
case INET_DIAG_BC_NOP :
2005-04-17 02:20:36 +04:00
break ;
2005-08-12 19:51:49 +04:00
case INET_DIAG_BC_JMP :
2005-04-17 02:20:36 +04:00
yes = 0 ;
break ;
2005-08-12 19:51:49 +04:00
case INET_DIAG_BC_S_GE :
2005-04-17 02:20:36 +04:00
yes = entry - > sport > = op [ 1 ] . no ;
break ;
2005-08-12 19:51:49 +04:00
case INET_DIAG_BC_S_LE :
2010-01-20 01:12:20 +03:00
yes = entry - > sport < = op [ 1 ] . no ;
2005-04-17 02:20:36 +04:00
break ;
2005-08-12 19:51:49 +04:00
case INET_DIAG_BC_D_GE :
2005-04-17 02:20:36 +04:00
yes = entry - > dport > = op [ 1 ] . no ;
break ;
2005-08-12 19:51:49 +04:00
case INET_DIAG_BC_D_LE :
2005-04-17 02:20:36 +04:00
yes = entry - > dport < = op [ 1 ] . no ;
break ;
2005-08-12 19:51:49 +04:00
case INET_DIAG_BC_AUTO :
2005-04-17 02:20:36 +04:00
yes = ! ( entry - > userlocks & SOCK_BINDPORT_LOCK ) ;
break ;
2005-08-12 19:51:49 +04:00
case INET_DIAG_BC_S_COND :
2005-08-12 19:56:38 +04:00
case INET_DIAG_BC_D_COND : {
2015-03-10 17:15:53 +03:00
const struct inet_diag_hostcond * cond ;
const __be32 * addr ;
2005-04-17 02:20:36 +04:00
2015-03-10 17:15:53 +03:00
cond = ( const struct inet_diag_hostcond * ) ( op + 1 ) ;
2005-04-17 02:20:36 +04:00
if ( cond - > port ! = - 1 & &
2005-08-12 19:51:49 +04:00
cond - > port ! = ( op - > code = = INET_DIAG_BC_S_COND ?
2005-04-17 02:20:36 +04:00
entry - > sport : entry - > dport ) ) {
yes = 0 ;
break ;
}
2006-01-10 01:56:19 +03:00
2005-08-12 19:51:49 +04:00
if ( op - > code = = INET_DIAG_BC_S_COND )
2005-04-17 02:20:36 +04:00
addr = entry - > saddr ;
else
addr = entry - > daddr ;
2012-12-08 23:43:23 +04:00
if ( cond - > family ! = AF_UNSPEC & &
cond - > family ! = entry - > family ) {
if ( entry - > family = = AF_INET6 & &
cond - > family = = AF_INET ) {
if ( addr [ 0 ] = = 0 & & addr [ 1 ] = = 0 & &
addr [ 2 ] = = htonl ( 0xffff ) & &
bitstring_match ( addr + 3 ,
cond - > addr ,
cond - > prefix_len ) )
break ;
}
yes = 0 ;
break ;
}
if ( cond - > prefix_len = = 0 )
break ;
2006-01-10 01:56:19 +03:00
if ( bitstring_match ( addr , cond - > addr ,
cond - > prefix_len ) )
2005-04-17 02:20:36 +04:00
break ;
yes = 0 ;
break ;
}
}
2006-01-10 01:56:19 +03:00
if ( yes ) {
2005-04-17 02:20:36 +04:00
len - = op - > yes ;
bc + = op - > yes ;
} else {
len - = op - > no ;
bc + = op - > no ;
}
}
2010-09-23 00:43:57 +04:00
return len = = 0 ;
2005-04-17 02:20:36 +04:00
}
2015-03-14 01:51:12 +03:00
/* This helper is available for all sockets (ESTABLISH, TIMEWAIT, SYN_RECV)
*/
static void entry_fill_addrs ( struct inet_diag_entry * entry ,
const struct sock * sk )
{
# if IS_ENABLED(CONFIG_IPV6)
if ( sk - > sk_family = = AF_INET6 ) {
entry - > saddr = sk - > sk_v6_rcv_saddr . s6_addr32 ;
entry - > daddr = sk - > sk_v6_daddr . s6_addr32 ;
} else
# endif
{
entry - > saddr = & sk - > sk_rcv_saddr ;
entry - > daddr = & sk - > sk_daddr ;
}
}
2011-12-09 10:22:44 +04:00
int inet_diag_bc_sk ( const struct nlattr * bc , struct sock * sk )
{
struct inet_sock * inet = inet_sk ( sk ) ;
2015-03-10 17:15:53 +03:00
struct inet_diag_entry entry ;
2011-12-09 10:22:44 +04:00
2015-03-10 17:15:53 +03:00
if ( ! bc )
2011-12-09 10:22:44 +04:00
return 1 ;
entry . family = sk - > sk_family ;
2015-03-14 01:51:12 +03:00
entry_fill_addrs ( & entry , sk ) ;
2011-12-09 10:22:44 +04:00
entry . sport = inet - > inet_num ;
entry . dport = ntohs ( inet - > inet_dport ) ;
2015-03-16 07:12:14 +03:00
entry . userlocks = sk_fullsock ( sk ) ? sk - > sk_userlocks : 0 ;
2011-12-09 10:22:44 +04:00
return inet_diag_bc_run ( bc , & entry ) ;
}
EXPORT_SYMBOL_GPL ( inet_diag_bc_sk ) ;
2005-04-17 02:20:36 +04:00
static int valid_cc ( const void * bc , int len , int cc )
{
while ( len > = 0 ) {
2005-08-12 19:51:49 +04:00
const struct inet_diag_bc_op * op = bc ;
2005-04-17 02:20:36 +04:00
if ( cc > len )
return 0 ;
if ( cc = = len )
return 1 ;
2011-06-18 00:25:39 +04:00
if ( op - > yes < 4 | | op - > yes & 3 )
2005-04-17 02:20:36 +04:00
return 0 ;
len - = op - > yes ;
bc + = op - > yes ;
}
return 0 ;
}
2012-12-08 23:43:22 +04:00
/* Validate an inet_diag_hostcond. */
static bool valid_hostcond ( const struct inet_diag_bc_op * op , int len ,
int * min_len )
{
struct inet_diag_hostcond * cond ;
2015-03-10 17:15:53 +03:00
int addr_len ;
2012-12-08 23:43:22 +04:00
/* Check hostcond space. */
* min_len + = sizeof ( struct inet_diag_hostcond ) ;
if ( len < * min_len )
return false ;
cond = ( struct inet_diag_hostcond * ) ( op + 1 ) ;
/* Check address family and address length. */
switch ( cond - > family ) {
case AF_UNSPEC :
addr_len = 0 ;
break ;
case AF_INET :
addr_len = sizeof ( struct in_addr ) ;
break ;
case AF_INET6 :
addr_len = sizeof ( struct in6_addr ) ;
break ;
default :
return false ;
}
* min_len + = addr_len ;
if ( len < * min_len )
return false ;
/* Check prefix length (in bits) vs address length (in bytes). */
if ( cond - > prefix_len > 8 * addr_len )
return false ;
return true ;
}
2012-12-09 15:09:54 +04:00
/* Validate a port comparison operator. */
2015-03-10 17:15:53 +03:00
static bool valid_port_comparison ( const struct inet_diag_bc_op * op ,
int len , int * min_len )
2012-12-09 15:09:54 +04:00
{
/* Port comparisons put the port in a follow-on inet_diag_bc_op. */
* min_len + = sizeof ( struct inet_diag_bc_op ) ;
if ( len < * min_len )
return false ;
return true ;
}
2005-08-12 19:51:49 +04:00
static int inet_diag_bc_audit ( const void * bytecode , int bytecode_len )
2005-04-17 02:20:36 +04:00
{
2011-06-18 00:25:39 +04:00
const void * bc = bytecode ;
2005-04-17 02:20:36 +04:00
int len = bytecode_len ;
while ( len > 0 ) {
2012-12-08 23:43:22 +04:00
int min_len = sizeof ( struct inet_diag_bc_op ) ;
2015-03-10 17:15:53 +03:00
const struct inet_diag_bc_op * op = bc ;
2005-04-17 02:20:36 +04:00
switch ( op - > code ) {
2005-08-12 19:51:49 +04:00
case INET_DIAG_BC_S_COND :
case INET_DIAG_BC_D_COND :
2012-12-08 23:43:22 +04:00
if ( ! valid_hostcond ( bc , len , & min_len ) )
return - EINVAL ;
2012-12-09 15:09:54 +04:00
break ;
2005-08-12 19:51:49 +04:00
case INET_DIAG_BC_S_GE :
case INET_DIAG_BC_S_LE :
case INET_DIAG_BC_D_GE :
case INET_DIAG_BC_D_LE :
2012-12-09 15:09:54 +04:00
if ( ! valid_port_comparison ( bc , len , & min_len ) )
2005-04-17 02:20:36 +04:00
return - EINVAL ;
break ;
2012-12-09 15:09:54 +04:00
case INET_DIAG_BC_AUTO :
case INET_DIAG_BC_JMP :
2005-08-12 19:51:49 +04:00
case INET_DIAG_BC_NOP :
2005-04-17 02:20:36 +04:00
break ;
default :
return - EINVAL ;
}
2012-12-09 15:09:54 +04:00
if ( op - > code ! = INET_DIAG_BC_NOP ) {
if ( op - > no < min_len | | op - > no > len + 4 | | op - > no & 3 )
return - EINVAL ;
if ( op - > no < len & &
! valid_cc ( bytecode , bytecode_len , len - op - > no ) )
return - EINVAL ;
}
2012-12-08 23:43:22 +04:00
if ( op - > yes < min_len | | op - > yes > len + 4 | | op - > yes & 3 )
2011-06-18 00:25:39 +04:00
return - EINVAL ;
2006-01-10 01:56:19 +03:00
bc + = op - > yes ;
2005-04-17 02:20:36 +04:00
len - = op - > yes ;
}
return len = = 0 ? 0 : - EINVAL ;
}
2006-01-10 01:56:56 +03:00
static int inet_csk_diag_dump ( struct sock * sk ,
struct sk_buff * skb ,
2011-12-06 11:57:26 +04:00
struct netlink_callback * cb ,
2015-03-10 17:15:54 +03:00
const struct inet_diag_req_v2 * r ,
2011-12-06 11:57:26 +04:00
const struct nlattr * bc )
2005-04-17 02:20:36 +04:00
{
2011-12-09 10:22:44 +04:00
if ( ! inet_diag_bc_sk ( bc , sk ) )
return 0 ;
2005-04-17 02:20:36 +04:00
2011-12-06 11:59:32 +04:00
return inet_csk_diag_fill ( sk , skb , r ,
2013-04-17 10:46:57 +04:00
sk_user_ns ( NETLINK_CB ( cb - > skb ) . sk ) ,
2012-09-08 00:12:54 +04:00
NETLINK_CB ( cb - > skb ) . portid ,
2006-01-10 01:56:56 +03:00
cb - > nlh - > nlmsg_seq , NLM_F_MULTI , cb - > nlh ) ;
2005-04-17 02:20:36 +04:00
}
2015-03-05 21:18:14 +03:00
static void twsk_build_assert ( void )
{
BUILD_BUG_ON ( offsetof ( struct inet_timewait_sock , tw_family ) ! =
offsetof ( struct sock , sk_family ) ) ;
BUILD_BUG_ON ( offsetof ( struct inet_timewait_sock , tw_num ) ! =
offsetof ( struct inet_sock , inet_num ) ) ;
BUILD_BUG_ON ( offsetof ( struct inet_timewait_sock , tw_dport ) ! =
offsetof ( struct inet_sock , inet_dport ) ) ;
BUILD_BUG_ON ( offsetof ( struct inet_timewait_sock , tw_rcv_saddr ) ! =
offsetof ( struct inet_sock , inet_rcv_saddr ) ) ;
BUILD_BUG_ON ( offsetof ( struct inet_timewait_sock , tw_daddr ) ! =
offsetof ( struct inet_sock , inet_daddr ) ) ;
# if IS_ENABLED(CONFIG_IPV6)
BUILD_BUG_ON ( offsetof ( struct inet_timewait_sock , tw_v6_rcv_saddr ) ! =
offsetof ( struct sock , sk_v6_rcv_saddr ) ) ;
BUILD_BUG_ON ( offsetof ( struct inet_timewait_sock , tw_v6_daddr ) ! =
offsetof ( struct sock , sk_v6_daddr ) ) ;
# endif
}
2011-12-09 10:23:18 +04:00
void inet_diag_dump_icsk ( struct inet_hashinfo * hashinfo , struct sk_buff * skb ,
2015-03-10 17:15:53 +03:00
struct netlink_callback * cb ,
2015-03-10 17:15:54 +03:00
const struct inet_diag_req_v2 * r , struct nlattr * bc )
2005-04-17 02:20:36 +04:00
{
2012-07-16 08:28:49 +04:00
struct net * net = sock_net ( skb - > sk ) ;
2015-03-10 17:15:53 +03:00
int i , num , s_i , s_num ;
2015-10-02 21:43:32 +03:00
u32 idiag_states = r - > idiag_states ;
2006-01-10 01:56:19 +03:00
2015-10-02 21:43:32 +03:00
if ( idiag_states & TCPF_SYN_RECV )
idiag_states | = TCPF_NEW_SYN_RECV ;
2005-04-17 02:20:36 +04:00
s_i = cb - > args [ 1 ] ;
s_num = num = cb - > args [ 2 ] ;
2005-08-12 16:27:49 +04:00
2005-04-17 02:20:36 +04:00
if ( cb - > args [ 0 ] = = 0 ) {
2015-10-02 21:43:32 +03:00
if ( ! ( idiag_states & TCPF_LISTEN ) )
2005-04-17 02:20:36 +04:00
goto skip_listen_ht ;
2005-08-10 12:54:28 +04:00
2005-08-10 06:59:44 +04:00
for ( i = s_i ; i < INET_LHTABLE_SIZE ; i + + ) {
2008-11-20 11:40:07 +03:00
struct inet_listen_hashbucket * ilb ;
2015-03-10 17:15:53 +03:00
struct hlist_nulls_node * node ;
struct sock * sk ;
2005-04-17 02:20:36 +04:00
num = 0 ;
2008-11-20 11:40:07 +03:00
ilb = & hashinfo - > listening_hash [ i ] ;
spin_lock_bh ( & ilb - > lock ) ;
2008-11-24 04:22:55 +03:00
sk_nulls_for_each ( sk , node , & ilb - > head ) {
2005-04-17 02:20:36 +04:00
struct inet_sock * inet = inet_sk ( sk ) ;
2012-07-16 08:28:49 +04:00
if ( ! net_eq ( sock_net ( sk ) , net ) )
continue ;
2005-04-17 02:20:36 +04:00
if ( num < s_num ) {
num + + ;
continue ;
}
2011-12-06 11:59:15 +04:00
if ( r - > sdiag_family ! = AF_UNSPEC & &
2015-03-10 17:15:53 +03:00
sk - > sk_family ! = r - > sdiag_family )
2011-12-06 11:59:15 +04:00
goto next_listen ;
2009-10-15 10:30:45 +04:00
if ( r - > id . idiag_sport ! = inet - > inet_sport & &
2005-08-12 19:51:49 +04:00
r - > id . idiag_sport )
2005-04-17 02:20:36 +04:00
goto next_listen ;
2015-10-02 21:43:32 +03:00
if ( r - > id . idiag_dport | |
2005-04-17 02:20:36 +04:00
cb - > args [ 3 ] > 0 )
goto next_listen ;
2015-10-02 21:43:32 +03:00
if ( inet_csk_diag_dump ( sk , skb , cb , r , bc ) < 0 ) {
2008-11-20 11:40:07 +03:00
spin_unlock_bh ( & ilb - > lock ) ;
2005-04-17 02:20:36 +04:00
goto done ;
}
next_listen :
cb - > args [ 3 ] = 0 ;
cb - > args [ 4 ] = 0 ;
+ + num ;
}
2008-11-20 11:40:07 +03:00
spin_unlock_bh ( & ilb - > lock ) ;
2005-04-17 02:20:36 +04:00
s_num = 0 ;
cb - > args [ 3 ] = 0 ;
cb - > args [ 4 ] = 0 ;
}
skip_listen_ht :
cb - > args [ 0 ] = 1 ;
s_i = num = s_num = 0 ;
}
2015-10-02 21:43:32 +03:00
if ( ! ( idiag_states & ~ TCPF_LISTEN ) )
2011-12-09 10:22:26 +04:00
goto out ;
2005-04-17 02:20:36 +04:00
2009-10-09 04:16:19 +04:00
for ( i = s_i ; i < = hashinfo - > ehash_mask ; i + + ) {
2005-08-10 12:54:28 +04:00
struct inet_ehash_bucket * head = & hashinfo - > ehash [ i ] ;
2008-11-22 03:39:19 +03:00
spinlock_t * lock = inet_ehash_lockp ( hashinfo , i ) ;
2008-11-17 06:40:17 +03:00
struct hlist_nulls_node * node ;
2015-03-10 17:15:53 +03:00
struct sock * sk ;
2005-04-17 02:20:36 +04:00
2008-08-28 12:09:54 +04:00
num = 0 ;
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
if ( hlist_nulls_empty ( & head - > chain ) )
2008-08-28 12:09:54 +04:00
continue ;
2005-04-17 02:20:36 +04:00
if ( i > s_i )
s_num = 0 ;
2008-11-22 03:39:19 +03:00
spin_lock_bh ( lock ) ;
2008-11-17 06:40:17 +03:00
sk_nulls_for_each ( sk , node , & head - > chain ) {
2015-03-10 17:15:53 +03:00
int state , res ;
2005-04-17 02:20:36 +04:00
2012-07-16 08:28:49 +04:00
if ( ! net_eq ( sock_net ( sk ) , net ) )
continue ;
2005-04-17 02:20:36 +04:00
if ( num < s_num )
goto next_normal ;
2014-01-11 00:34:45 +04:00
state = ( sk - > sk_state = = TCP_TIME_WAIT ) ?
inet_twsk ( sk ) - > tw_substate : sk - > sk_state ;
2015-10-02 21:43:32 +03:00
if ( ! ( idiag_states & ( 1 < < state ) ) )
2005-04-17 02:20:36 +04:00
goto next_normal ;
2011-12-06 11:59:15 +04:00
if ( r - > sdiag_family ! = AF_UNSPEC & &
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
sk - > sk_family ! = r - > sdiag_family )
2011-12-06 11:59:15 +04:00
goto next_normal ;
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
if ( r - > id . idiag_sport ! = htons ( sk - > sk_num ) & &
2005-08-12 19:51:49 +04:00
r - > id . idiag_sport )
2005-04-17 02:20:36 +04:00
goto next_normal ;
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
if ( r - > id . idiag_dport ! = sk - > sk_dport & &
2006-01-10 01:56:19 +03:00
r - > id . idiag_dport )
2005-04-17 02:20:36 +04:00
goto next_normal ;
2015-03-16 07:12:14 +03:00
twsk_build_assert ( ) ;
if ( ! inet_diag_bc_sk ( bc , sk ) )
goto next_normal ;
res = sk_diag_fill ( sk , skb , r ,
sk_user_ns ( NETLINK_CB ( cb - > skb ) . sk ) ,
NETLINK_CB ( cb - > skb ) . portid ,
cb - > nlh - > nlmsg_seq , NLM_F_MULTI ,
cb - > nlh ) ;
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
if ( res < 0 ) {
2008-11-22 03:39:19 +03:00
spin_unlock_bh ( lock ) ;
2005-04-17 02:20:36 +04:00
goto done ;
}
next_normal :
+ + num ;
}
2008-11-22 03:39:19 +03:00
spin_unlock_bh ( lock ) ;
2016-03-15 01:40:00 +03:00
cond_resched ( ) ;
2005-04-17 02:20:36 +04:00
}
done :
cb - > args [ 1 ] = i ;
cb - > args [ 2 ] = num ;
2011-12-09 10:22:26 +04:00
out :
;
}
2011-12-09 10:23:18 +04:00
EXPORT_SYMBOL_GPL ( inet_diag_dump_icsk ) ;
2011-12-09 10:22:26 +04:00
static int __inet_diag_dump ( struct sk_buff * skb , struct netlink_callback * cb ,
2015-03-10 17:15:54 +03:00
const struct inet_diag_req_v2 * r ,
2015-03-10 17:15:53 +03:00
struct nlattr * bc )
2011-12-09 10:22:26 +04:00
{
const struct inet_diag_handler * handler ;
2012-11-03 13:30:34 +04:00
int err = 0 ;
2011-12-09 10:22:26 +04:00
handler = inet_diag_lock_handler ( r - > sdiag_protocol ) ;
if ( ! IS_ERR ( handler ) )
2011-12-09 10:23:18 +04:00
handler - > dump ( skb , cb , r , bc ) ;
2012-11-03 13:30:34 +04:00
else
err = PTR_ERR ( handler ) ;
2007-12-03 07:51:25 +03:00
inet_diag_unlock_handler ( handler ) ;
2011-12-09 10:22:26 +04:00
2012-11-03 13:30:34 +04:00
return err ? : skb - > len ;
2005-04-17 02:20:36 +04:00
}
2011-12-06 11:58:58 +04:00
static int inet_diag_dump ( struct sk_buff * skb , struct netlink_callback * cb )
{
2012-01-11 02:36:35 +04:00
int hdrlen = sizeof ( struct inet_diag_req_v2 ) ;
2015-03-10 17:15:53 +03:00
struct nlattr * bc = NULL ;
2011-12-06 11:58:58 +04:00
if ( nlmsg_attrlen ( cb - > nlh , hdrlen ) )
bc = nlmsg_find_attr ( cb - > nlh , hdrlen , INET_DIAG_REQ_BYTECODE ) ;
2012-06-27 08:28:54 +04:00
return __inet_diag_dump ( skb , cb , nlmsg_data ( cb - > nlh ) , bc ) ;
2011-12-06 11:58:58 +04:00
}
2015-03-10 17:15:53 +03:00
static int inet_diag_type2proto ( int type )
2011-12-06 11:59:32 +04:00
{
switch ( type ) {
case TCPDIAG_GETSOCK :
return IPPROTO_TCP ;
case DCCPDIAG_GETSOCK :
return IPPROTO_DCCP ;
default :
return 0 ;
}
}
2015-03-10 17:15:53 +03:00
static int inet_diag_dump_compat ( struct sk_buff * skb ,
struct netlink_callback * cb )
2011-12-06 11:58:58 +04:00
{
2012-06-27 08:28:54 +04:00
struct inet_diag_req * rc = nlmsg_data ( cb - > nlh ) ;
2015-03-10 17:15:53 +03:00
int hdrlen = sizeof ( struct inet_diag_req ) ;
2012-01-11 02:36:35 +04:00
struct inet_diag_req_v2 req ;
2011-12-06 11:58:58 +04:00
struct nlattr * bc = NULL ;
2011-12-06 11:59:15 +04:00
req . sdiag_family = AF_UNSPEC ; /* compatibility */
2011-12-06 11:58:58 +04:00
req . sdiag_protocol = inet_diag_type2proto ( cb - > nlh - > nlmsg_type ) ;
req . idiag_ext = rc - > idiag_ext ;
req . idiag_states = rc - > idiag_states ;
req . id = rc - > id ;
if ( nlmsg_attrlen ( cb - > nlh , hdrlen ) )
bc = nlmsg_find_attr ( cb - > nlh , hdrlen , INET_DIAG_REQ_BYTECODE ) ;
return __inet_diag_dump ( skb , cb , & req , bc ) ;
}
2011-12-06 11:58:39 +04:00
static int inet_diag_get_exact_compat ( struct sk_buff * in_skb ,
2015-03-10 17:15:53 +03:00
const struct nlmsghdr * nlh )
2011-12-06 11:58:39 +04:00
{
2012-06-27 08:28:54 +04:00
struct inet_diag_req * rc = nlmsg_data ( nlh ) ;
2012-01-11 02:36:35 +04:00
struct inet_diag_req_v2 req ;
2011-12-06 11:58:39 +04:00
req . sdiag_family = rc - > idiag_family ;
req . sdiag_protocol = inet_diag_type2proto ( nlh - > nlmsg_type ) ;
req . idiag_ext = rc - > idiag_ext ;
req . idiag_states = rc - > idiag_states ;
req . id = rc - > id ;
2015-12-16 06:30:04 +03:00
return inet_diag_cmd_exact ( SOCK_DIAG_BY_FAMILY , in_skb , nlh , & req ) ;
2011-12-06 11:58:39 +04:00
}
2011-12-06 11:57:06 +04:00
static int inet_diag_rcv_msg_compat ( struct sk_buff * skb , struct nlmsghdr * nlh )
2005-04-17 02:20:36 +04:00
{
2012-01-11 02:37:26 +04:00
int hdrlen = sizeof ( struct inet_diag_req ) ;
2012-07-16 08:28:49 +04:00
struct net * net = sock_net ( skb - > sk ) ;
2005-04-17 02:20:36 +04:00
2007-03-23 09:30:35 +03:00
if ( nlh - > nlmsg_type > = INET_DIAG_GETSOCK_MAX | |
nlmsg_len ( nlh ) < hdrlen )
return - EINVAL ;
2005-04-17 02:20:36 +04:00
2011-01-18 23:40:38 +03:00
if ( nlh - > nlmsg_flags & NLM_F_DUMP ) {
2007-03-23 09:30:35 +03:00
if ( nlmsg_attrlen ( nlh , hdrlen ) ) {
struct nlattr * attr ;
2005-04-17 02:20:36 +04:00
2007-03-23 09:30:35 +03:00
attr = nlmsg_find_attr ( nlh , hdrlen ,
INET_DIAG_REQ_BYTECODE ) ;
2015-03-10 17:15:53 +03:00
if ( ! attr | |
2007-03-23 09:30:35 +03:00
nla_len ( attr ) < sizeof ( struct inet_diag_bc_op ) | |
inet_diag_bc_audit ( nla_data ( attr ) , nla_len ( attr ) ) )
return - EINVAL ;
}
2012-02-24 18:30:15 +04:00
{
struct netlink_dump_control c = {
. dump = inet_diag_dump_compat ,
} ;
2012-07-16 08:28:49 +04:00
return netlink_dump_start ( net - > diag_nlsk , skb , nlh , & c ) ;
2012-02-24 18:30:15 +04:00
}
2005-04-17 02:20:36 +04:00
}
2007-03-23 09:30:35 +03:00
2011-12-06 11:58:39 +04:00
return inet_diag_get_exact_compat ( skb , nlh ) ;
2005-04-17 02:20:36 +04:00
}
2015-12-16 06:30:04 +03:00
static int inet_diag_handler_cmd ( struct sk_buff * skb , struct nlmsghdr * h )
2011-12-06 11:58:03 +04:00
{
2012-01-11 02:36:35 +04:00
int hdrlen = sizeof ( struct inet_diag_req_v2 ) ;
2012-07-16 08:28:49 +04:00
struct net * net = sock_net ( skb - > sk ) ;
2011-12-06 11:58:03 +04:00
if ( nlmsg_len ( h ) < hdrlen )
return - EINVAL ;
2015-12-16 06:30:04 +03:00
if ( h - > nlmsg_type = = SOCK_DIAG_BY_FAMILY & &
h - > nlmsg_flags & NLM_F_DUMP ) {
2011-12-06 11:58:58 +04:00
if ( nlmsg_attrlen ( h , hdrlen ) ) {
struct nlattr * attr ;
2015-03-10 17:15:53 +03:00
2011-12-06 11:58:58 +04:00
attr = nlmsg_find_attr ( h , hdrlen ,
INET_DIAG_REQ_BYTECODE ) ;
2015-03-10 17:15:53 +03:00
if ( ! attr | |
2011-12-06 11:58:58 +04:00
nla_len ( attr ) < sizeof ( struct inet_diag_bc_op ) | |
inet_diag_bc_audit ( nla_data ( attr ) , nla_len ( attr ) ) )
return - EINVAL ;
}
2012-02-24 18:30:15 +04:00
{
struct netlink_dump_control c = {
. dump = inet_diag_dump ,
} ;
2012-07-16 08:28:49 +04:00
return netlink_dump_start ( net - > diag_nlsk , skb , h , & c ) ;
2012-02-24 18:30:15 +04:00
}
2011-12-06 11:58:03 +04:00
}
2015-12-16 06:30:04 +03:00
return inet_diag_cmd_exact ( h - > nlmsg_type , skb , h , nlmsg_data ( h ) ) ;
2011-12-06 11:58:03 +04:00
}
2015-06-15 18:26:20 +03:00
static
int inet_diag_handler_get_info ( struct sk_buff * skb , struct sock * sk )
{
const struct inet_diag_handler * handler ;
struct nlmsghdr * nlh ;
struct nlattr * attr ;
struct inet_diag_msg * r ;
void * info = NULL ;
int err = 0 ;
nlh = nlmsg_put ( skb , 0 , 0 , SOCK_DIAG_BY_FAMILY , sizeof ( * r ) , 0 ) ;
if ( ! nlh )
return - ENOMEM ;
r = nlmsg_data ( nlh ) ;
memset ( r , 0 , sizeof ( * r ) ) ;
inet_diag_msg_common_fill ( r , sk ) ;
2015-06-17 17:59:10 +03:00
if ( sk - > sk_type = = SOCK_DGRAM | | sk - > sk_type = = SOCK_STREAM )
r - > id . idiag_sport = inet_sk ( sk ) - > inet_sport ;
2015-06-15 18:26:20 +03:00
r - > idiag_state = sk - > sk_state ;
if ( ( err = nla_put_u8 ( skb , INET_DIAG_PROTOCOL , sk - > sk_protocol ) ) ) {
nlmsg_cancel ( skb , nlh ) ;
return err ;
}
handler = inet_diag_lock_handler ( sk - > sk_protocol ) ;
if ( IS_ERR ( handler ) ) {
inet_diag_unlock_handler ( handler ) ;
nlmsg_cancel ( skb , nlh ) ;
return PTR_ERR ( handler ) ;
}
attr = handler - > idiag_info_size
? nla_reserve ( skb , INET_DIAG_INFO , handler - > idiag_info_size )
: NULL ;
if ( attr )
info = nla_data ( attr ) ;
handler - > idiag_get_info ( sk , r , info ) ;
inet_diag_unlock_handler ( handler ) ;
nlmsg_end ( skb , nlh ) ;
return 0 ;
}
2012-04-24 22:21:07 +04:00
static const struct sock_diag_handler inet_diag_handler = {
2011-12-06 11:58:03 +04:00
. family = AF_INET ,
2015-12-16 06:30:04 +03:00
. dump = inet_diag_handler_cmd ,
2015-06-15 18:26:20 +03:00
. get_info = inet_diag_handler_get_info ,
2015-12-16 06:30:04 +03:00
. destroy = inet_diag_handler_cmd ,
2011-12-06 11:58:03 +04:00
} ;
2012-04-24 22:21:07 +04:00
static const struct sock_diag_handler inet6_diag_handler = {
2011-12-06 11:58:03 +04:00
. family = AF_INET6 ,
2015-12-16 06:30:04 +03:00
. dump = inet_diag_handler_cmd ,
2015-06-15 18:26:20 +03:00
. get_info = inet_diag_handler_get_info ,
2015-12-16 06:30:04 +03:00
. destroy = inet_diag_handler_cmd ,
2011-12-06 11:58:03 +04:00
} ;
2005-08-12 16:27:49 +04:00
int inet_diag_register ( const struct inet_diag_handler * h )
{
const __u16 type = h - > idiag_type ;
int err = - EINVAL ;
2011-12-06 12:05:24 +04:00
if ( type > = IPPROTO_MAX )
2005-08-12 16:27:49 +04:00
goto out ;
2007-12-03 07:51:25 +03:00
mutex_lock ( & inet_diag_table_mutex ) ;
2005-08-12 16:27:49 +04:00
err = - EEXIST ;
2015-03-10 17:15:53 +03:00
if ( ! inet_diag_table [ type ] ) {
2005-08-12 16:27:49 +04:00
inet_diag_table [ type ] = h ;
err = 0 ;
}
2007-12-03 07:51:25 +03:00
mutex_unlock ( & inet_diag_table_mutex ) ;
2005-08-12 16:27:49 +04:00
out :
return err ;
}
EXPORT_SYMBOL_GPL ( inet_diag_register ) ;
void inet_diag_unregister ( const struct inet_diag_handler * h )
{
const __u16 type = h - > idiag_type ;
2011-12-06 12:05:24 +04:00
if ( type > = IPPROTO_MAX )
2005-08-12 16:27:49 +04:00
return ;
2007-12-03 07:51:25 +03:00
mutex_lock ( & inet_diag_table_mutex ) ;
2005-08-12 16:27:49 +04:00
inet_diag_table [ type ] = NULL ;
2007-12-03 07:51:25 +03:00
mutex_unlock ( & inet_diag_table_mutex ) ;
2005-08-12 16:27:49 +04:00
}
EXPORT_SYMBOL_GPL ( inet_diag_unregister ) ;
2005-08-12 19:51:49 +04:00
/*
 * Module initialisation: allocate the zeroed per-protocol handler table
 * (IPPROTO_MAX pointer slots), register the AF_INET and AF_INET6
 * sock_diag handlers and finally install the compat receive hook.
 *
 * Returns 0 on success, -ENOMEM if the table cannot be allocated, or the
 * error reported by sock_diag_register().  On failure everything set up
 * so far is undone in reverse order.
 */
static int __init inet_diag_init(void)
{
	const size_t table_bytes = IPPROTO_MAX *
				   sizeof(struct inet_diag_handler *);
	int err;

	inet_diag_table = kzalloc(table_bytes, GFP_KERNEL);
	if (!inet_diag_table)
		return -ENOMEM;

	err = sock_diag_register(&inet_diag_handler);
	if (err)
		goto err_free_table;

	err = sock_diag_register(&inet6_diag_handler);
	if (err)
		goto err_unregister_inet;

	sock_diag_register_inet_compat(inet_diag_rcv_msg_compat);
	return 0;

err_unregister_inet:
	sock_diag_unregister(&inet_diag_handler);
err_free_table:
	kfree(inet_diag_table);
	return err;
}
2005-08-12 19:51:49 +04:00
/*
 * Module teardown: unregister the AF_INET6 and AF_INET sock_diag
 * handlers, remove the compat receive hook, then release the handler
 * table allocated by inet_diag_init().
 */
static void __exit inet_diag_exit(void)
{
	sock_diag_unregister(&inet6_diag_handler);
	sock_diag_unregister(&inet_diag_handler);
	sock_diag_unregister_inet_compat(inet_diag_rcv_msg_compat);

	/* Freed last, after every sock_diag entry point has been removed. */
	kfree(inet_diag_table);
}
2005-08-12 19:51:49 +04:00
module_init ( inet_diag_init ) ;
module_exit ( inet_diag_exit ) ;
2005-04-17 02:20:36 +04:00
MODULE_LICENSE ( " GPL " ) ;
2011-12-15 06:43:27 +04:00
MODULE_ALIAS_NET_PF_PROTO_TYPE ( PF_NETLINK , NETLINK_SOCK_DIAG , 2 /* AF_INET */ ) ;
MODULE_ALIAS_NET_PF_PROTO_TYPE ( PF_NETLINK , NETLINK_SOCK_DIAG , 10 /* AF_INET6 */ ) ;