2019-05-27 08:55:01 +02:00
// SPDX-License-Identifier: GPL-2.0-or-later
2005-04-16 15:20:36 -07:00
/*
2005-08-12 12:51:49 -03:00
* inet_diag . c Module for monitoring INET transport protocols sockets .
2005-04-16 15:20:36 -07:00
*
* Authors : Alexey Kuznetsov , < kuznet @ ms2 . inr . ac . ru >
*/
2007-08-28 15:50:33 -07:00
# include <linux/kernel.h>
2005-04-16 15:20:36 -07:00
# include <linux/module.h>
# include <linux/types.h>
# include <linux/fcntl.h>
# include <linux/random.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities to include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and tries to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 17:04:11 +09:00
# include <linux/slab.h>
2005-04-16 15:20:36 -07:00
# include <linux/cache.h>
# include <linux/init.h>
# include <linux/time.h>
# include <net/icmp.h>
# include <net/tcp.h>
# include <net/ipv6.h>
# include <net/inet_common.h>
2005-08-12 09:19:38 -03:00
# include <net/inet_connection_sock.h>
# include <net/inet_hashtables.h>
# include <net/inet_timewait_sock.h>
# include <net/inet6_hashtables.h>
2020-02-25 15:04:27 -08:00
# include <net/bpf_sk_storage.h>
2007-03-25 23:06:12 -07:00
# include <net/netlink.h>
2005-04-16 15:20:36 -07:00
# include <linux/inet.h>
# include <linux/stddef.h>
2005-08-12 12:56:38 -03:00
# include <linux/inet_diag.h>
2011-12-06 07:58:03 +00:00
# include <linux/sock_diag.h>
2005-04-16 15:20:36 -07:00
2024-01-22 11:25:58 +00:00
/* Table of diag handler pointers, indexed by protocol number. Entries are
 * read with READ_ONCE() / rcu_dereference() by the functions in this file.
 */
static const struct inet_diag_handler __rcu * * inet_diag_table ;
2005-08-12 09:27:49 -03:00
2005-08-12 12:51:49 -03:00
/*
 * Per-socket values gathered in one place: address pointers, ports, family,
 * userlocks, interface index, mark and (only with CONFIG_SOCK_CGROUP_DATA)
 * the cgroup id.
 * NOTE(review): the users of this struct are not visible in this part of the
 * file -- presumably it is the input to filter matching; confirm.
 */
struct inet_diag_entry {
2015-03-10 07:15:53 -07:00
/* Addresses are referenced, not copied, as arrays of big-endian words. */
const __be32 * saddr ;
const __be32 * daddr ;
2005-04-16 15:20:36 -07:00
u16 sport ;
u16 dport ;
u16 family ;
u16 userlocks ;
2016-06-23 18:42:51 -07:00
u32 ifindex ;
2016-08-24 15:46:26 +09:00
u32 mark ;
2020-04-30 18:51:15 +03:00
# ifdef CONFIG_SOCK_CGROUP_DATA
u64 cgroup_id ;
# endif
2005-04-16 15:20:36 -07:00
} ;
2011-12-06 08:05:24 +00:00
/*
 * Look up the diag handler registered for @proto and pin its owning module.
 *
 * Returns NULL when @proto is outside [0, IPPROTO_MAX), when the table slot
 * is still empty after the module-load attempt, or when try_module_get()
 * fails. A non-NULL result carries a module reference that the caller must
 * drop with inet_diag_unlock_handler().
 */
static const struct inet_diag_handler * inet_diag_lock_handler ( int proto )
2007-12-03 15:51:25 +11:00
{
2024-01-22 11:25:58 +00:00
const struct inet_diag_handler * handler ;
if ( proto < 0 | | proto > = IPPROTO_MAX )
return NULL ;
2020-07-09 15:12:39 +02:00
2024-01-22 11:25:56 +00:00
/* Slot is empty: ask sock_diag to load the diag module for this protocol. */
if ( ! READ_ONCE ( inet_diag_table [ proto ] ) )
sock_diag: request _diag module only when the family or proto has been registered
Now when using 'ss' in iproute, kernel would try to load all _diag
modules, which also causes corresponding family and proto modules
to be loaded as well due to module dependencies.
Like after running 'ss', sctp, dccp, af_packet (if it works as a module)
would be loaded.
For example:
$ lsmod|grep sctp
$ ss
$ lsmod|grep sctp
sctp_diag 16384 0
sctp 323584 5 sctp_diag
inet_diag 24576 4 raw_diag,tcp_diag,sctp_diag,udp_diag
libcrc32c 16384 3 nf_conntrack,nf_nat,sctp
As these family and proto modules are loaded unintentionally, it
could cause some problems, like:
- Some debug tools use 'ss' to collect the socket info, which loads all
those diag and family and protocol modules. It's noisy for identifying
issues.
- Users usually expect to drop sctp init packet silently when they
have no sense of sctp protocol instead of sending abort back.
- It wastes resources (especially with multiple netns), and SCTP module
can't be unloaded once it's loaded.
...
In short, it's really inappropriate to have these family and proto
modules loaded unexpectedly when just doing debugging with inet_diag.
This patch is to introduce sock_load_diag_module() where it loads
the _diag module only when it's corresponding family or proto has
been already registered.
Note that we can't just load _diag module without the family or
proto loaded, as some symbols used in _diag module are from the
family or proto module.
v1->v2:
- move inet proto check to inet_diag to avoid a compiling err.
v2->v3:
- define sock_load_diag_module in sock.c and export one symbol
only.
- improve the changelog.
Reported-by: Sabrina Dubroca <sd@queasysnail.net>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Phil Sutter <phil@nwl.cc>
Acked-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-03-10 18:57:50 +08:00
sock_load_diag_module ( AF_INET , proto ) ;
2007-12-03 15:51:25 +11:00
2024-01-22 11:25:58 +00:00
/* Re-read the slot and take the module reference while the RCU read
 * lock is held; a failed try_module_get() is reported as "no handler".
 */
rcu_read_lock ( ) ;
handler = rcu_dereference ( inet_diag_table [ proto ] ) ;
if ( handler & & ! try_module_get ( handler - > owner ) )
handler = NULL ;
rcu_read_unlock ( ) ;
2007-12-03 15:51:25 +11:00
2024-01-22 11:25:58 +00:00
return handler ;
2007-12-03 15:51:25 +11:00
}
2015-03-10 07:15:53 -07:00
/*
 * Drop the module reference taken by inet_diag_lock_handler().
 * @handler is dereferenced unconditionally, so it must not be NULL.
 */
static void inet_diag_unlock_handler ( const struct inet_diag_handler * handler )
2007-12-03 15:51:25 +11:00
{
2024-01-22 11:25:58 +00:00
module_put ( handler - > owner ) ;
2007-12-03 15:51:25 +11:00
}
2016-04-14 15:35:32 +08:00
/*
 * Fill the identity fields of @r from @sk: family, source port (sk_num
 * passed through htons()), destination port (sk_dport copied as is), bound
 * interface index, socket cookie and the two addresses.
 *
 * With IPv6 enabled and an AF_INET6 socket, both addresses are copied as
 * whole struct in6_addr values. Otherwise idiag_src/idiag_dst are zeroed
 * first and only word 0 receives the IPv4 address, so the remaining words
 * are always initialized.
 */
void inet_diag_msg_common_fill ( struct inet_diag_msg * r , struct sock * sk )
2015-03-13 15:51:12 -07:00
{
r - > idiag_family = sk - > sk_family ;
r - > id . idiag_sport = htons ( sk - > sk_num ) ;
r - > id . idiag_dport = sk - > sk_dport ;
r - > id . idiag_if = sk - > sk_bound_dev_if ;
sock_diag_save_cookie ( sk , r - > id . idiag_cookie ) ;
# if IS_ENABLED(CONFIG_IPV6)
if ( sk - > sk_family = = AF_INET6 ) {
* ( struct in6_addr * ) r - > id . idiag_src = sk - > sk_v6_rcv_saddr ;
* ( struct in6_addr * ) r - > id . idiag_dst = sk - > sk_v6_daddr ;
} else
# endif
{
memset ( & r - > id . idiag_src , 0 , sizeof ( r - > id . idiag_src ) ) ;
memset ( & r - > id . idiag_dst , 0 , sizeof ( r - > id . idiag_dst ) ) ;
r - > id . idiag_src [ 0 ] = sk - > sk_rcv_saddr ;
r - > id . idiag_dst [ 0 ] = sk - > sk_daddr ;
}
}
2016-04-14 15:35:32 +08:00
EXPORT_SYMBOL_GPL ( inet_diag_msg_common_fill ) ;
2015-03-13 15:51:12 -07:00
2017-08-31 09:59:38 -07:00
/*
 * Return a byte count made of: nla_total_size() for each fixed attribute
 * listed in the return expression, inet_diag_msg_attrs_size(), the optional
 * per-handler value from idiag_get_aux_size(sk, net_admin), and 64 extra
 * bytes.
 *
 * The handler is looked up under rcu_read_lock(); a missing handler only
 * triggers DEBUG_NET_WARN_ON_ONCE() and contributes 0.
 * NOTE(review): req->sdiag_protocol indexes inet_diag_table without a range
 * check here -- presumably validated by the caller; confirm.
 * NOTE(review): presumably used to size a reply skb; the caller is not
 * visible in this part of the file.
 */
static size_t inet_sk_attr_size ( struct sock * sk ,
const struct inet_diag_req_v2 * req ,
bool net_admin )
2015-03-13 09:49:59 -07:00
{
2017-08-31 09:59:38 -07:00
const struct inet_diag_handler * handler ;
size_t aux = 0 ;
2024-01-22 11:25:58 +00:00
rcu_read_lock ( ) ;
handler = rcu_dereference ( inet_diag_table [ req - > sdiag_protocol ] ) ;
DEBUG_NET_WARN_ON_ONCE ( ! handler ) ;
2017-08-31 09:59:38 -07:00
if ( handler & & handler - > idiag_get_aux_size )
aux = handler - > idiag_get_aux_size ( sk , net_admin ) ;
2024-01-22 11:25:58 +00:00
rcu_read_unlock ( ) ;
2017-08-31 09:59:38 -07:00
2015-03-13 09:49:59 -07:00
return nla_total_size ( sizeof ( struct tcp_info ) )
+ nla_total_size ( sizeof ( struct inet_diag_msg ) )
2020-03-05 15:33:12 +03:00
+ inet_diag_msg_attrs_size ( )
+ nla_total_size ( sizeof ( struct inet_diag_meminfo ) )
2015-03-13 09:49:59 -07:00
+ nla_total_size ( SK_MEMINFO_VARS * sizeof ( u32 ) )
+ nla_total_size ( TCP_CA_NAME_MAX )
+ nla_total_size ( sizeof ( struct tcpvegas_info ) )
2017-08-31 09:59:38 -07:00
+ aux
2015-03-13 09:49:59 -07:00
+ 64 ;
}
2016-04-14 15:35:32 +08:00
/*
 * Append the common per-socket attributes for @sk to @skb and fill
 * r->idiag_uid / r->idiag_inode.
 *
 * Attributes emitted:
 *   INET_DIAG_SHUTDOWN   always
 *   INET_DIAG_TOS        when requested in @ext
 *   INET_DIAG_TCLASS     AF_INET6 only, when requested in @ext
 *   INET_DIAG_SKV6ONLY   AF_INET6 only, socket in LISTEN or CLOSE state
 *   INET_DIAG_MARK       only when @net_admin is true
 *   INET_DIAG_CLASS_ID   when CLASS_ID or TCLASS is requested in @ext
 *   INET_DIAG_CGROUP_ID  only with CONFIG_SOCK_CGROUP_DATA
 *   INET_DIAG_SOCKOPT    always
 *
 * @user_ns is used to translate the socket owner uid for idiag_uid.
 *
 * Returns 0 on success and 1 (not a negative errno) as soon as any
 * nla_put*() call fails.
 */
int inet_diag_msg_attrs_fill ( struct sock * sk , struct sk_buff * skb ,
struct inet_diag_msg * r , int ext ,
2016-09-08 00:42:25 +09:00
struct user_namespace * user_ns ,
bool net_admin )
2016-04-14 15:35:32 +08:00
{
const struct inet_sock * inet = inet_sk ( sk ) ;
2020-09-01 15:10:08 -07:00
struct inet_diag_sockopt inet_sockopt ;
2016-04-14 15:35:32 +08:00
if ( nla_put_u8 ( skb , INET_DIAG_SHUTDOWN , sk - > sk_shutdown ) )
goto errout ;
/* IPv6 dual-stack sockets use inet->tos for IPv4 connections,
* hence this needs to be included regardless of socket family .
*/
if ( ext & ( 1 < < ( INET_DIAG_TOS - 1 ) ) )
2023-09-22 03:42:16 +00:00
if ( nla_put_u8 ( skb , INET_DIAG_TOS , READ_ONCE ( inet - > tos ) ) < 0 )
2016-04-14 15:35:32 +08:00
goto errout ;
# if IS_ENABLED(CONFIG_IPV6)
if ( r - > idiag_family = = AF_INET6 ) {
if ( ext & ( 1 < < ( INET_DIAG_TCLASS - 1 ) ) )
if ( nla_put_u8 ( skb , INET_DIAG_TCLASS ,
inet6_sk ( sk ) - > tclass ) < 0 )
goto errout ;
if ( ( ( 1 < < sk - > sk_state ) & ( TCPF_LISTEN | TCPF_CLOSE ) ) & &
nla_put_u8 ( skb , INET_DIAG_SKV6ONLY , ipv6_only_sock ( sk ) ) )
goto errout ;
}
# endif
2023-07-28 15:03:15 +00:00
/* The socket mark is only reported when net_admin is set. */
if ( net_admin & & nla_put_u32 ( skb , INET_DIAG_MARK , READ_ONCE ( sk - > sk_mark ) ) )
2016-09-08 00:42:25 +09:00
goto errout ;
2020-03-05 15:33:12 +03:00
if ( ext & ( 1 < < ( INET_DIAG_CLASS_ID - 1 ) ) | |
ext & ( 1 < < ( INET_DIAG_TCLASS - 1 ) ) ) {
u32 classid = 0 ;
# ifdef CONFIG_SOCK_CGROUP_DATA
classid = sock_cgroup_classid ( & sk - > sk_cgrp_data ) ;
# endif
/* Fallback to socket priority if class id isn't set.
* Classful qdiscs use it as direct reference to class .
* For cgroup2 classid is always zero .
*/
if ( ! classid )
2023-09-21 20:28:11 +00:00
classid = READ_ONCE ( sk - > sk_priority ) ;
2020-03-05 15:33:12 +03:00
if ( nla_put_u32 ( skb , INET_DIAG_CLASS_ID , classid ) )
goto errout ;
}
2020-04-30 18:51:14 +03:00
# ifdef CONFIG_SOCK_CGROUP_DATA
if ( nla_put_u64_64bit ( skb , INET_DIAG_CGROUP_ID ,
cgroup_id ( sock_cgroup_ptr ( & sk - > sk_cgrp_data ) ) ,
INET_DIAG_PAD ) )
goto errout ;
# endif
2016-04-14 15:35:32 +08:00
r - > idiag_uid = from_kuid_munged ( user_ns , sock_i_uid ( sk ) ) ;
r - > idiag_inode = sock_i_ino ( sk ) ;
2020-09-01 15:10:08 -07:00
/* Zero the whole struct first so fields not assigned below are never
 * emitted uninitialized.
 */
memset ( & inet_sockopt , 0 , sizeof ( inet_sockopt ) ) ;
2023-08-16 08:15:35 +00:00
inet_sockopt . recverr = inet_test_bit ( RECVERR , sk ) ;
2023-08-16 08:15:42 +00:00
inet_sockopt . is_icsk = inet_test_bit ( IS_ICSK , sk ) ;
2023-08-16 08:15:37 +00:00
inet_sockopt . freebind = inet_test_bit ( FREEBIND , sk ) ;
2023-08-16 08:15:38 +00:00
inet_sockopt . hdrincl = inet_test_bit ( HDRINCL , sk ) ;
2023-08-16 08:15:39 +00:00
inet_sockopt . mc_loop = inet_test_bit ( MC_LOOP , sk ) ;
2023-08-16 08:15:41 +00:00
inet_sockopt . transparent = inet_test_bit ( TRANSPARENT , sk ) ;
2023-08-16 08:15:40 +00:00
inet_sockopt . mc_all = inet_test_bit ( MC_ALL , sk ) ;
2023-08-16 08:15:43 +00:00
inet_sockopt . nodefrag = inet_test_bit ( NODEFRAG , sk ) ;
2023-08-16 08:15:44 +00:00
inet_sockopt . bind_address_no_port = inet_test_bit ( BIND_ADDRESS_NO_PORT , sk ) ;
2023-08-16 08:15:36 +00:00
inet_sockopt . recverr_rfc4884 = inet_test_bit ( RECVERR_RFC4884 , sk ) ;
2023-08-16 08:15:45 +00:00
inet_sockopt . defer_connect = inet_test_bit ( DEFER_CONNECT , sk ) ;
2020-09-01 15:10:08 -07:00
if ( nla_put ( skb , INET_DIAG_SOCKOPT , sizeof ( inet_sockopt ) ,
& inet_sockopt ) )
goto errout ;
2016-04-14 15:35:32 +08:00
return 0 ;
errout :
return 1 ;
}
EXPORT_SYMBOL_GPL ( inet_diag_msg_attrs_fill ) ;
2020-09-21 07:27:20 -07:00
/*
 * Iterate the attributes of @nlh with nlmsg_for_each_attr() (using @hdrlen)
 * and store each attribute whose type is below __INET_DIAG_REQ_MAX in
 * @req_nlas[type]. A later attribute of the same type overwrites an earlier
 * one; attributes with larger types are ignored.
 *
 * Returns -EINVAL if an INET_DIAG_REQ_PROTOCOL attribute does not have a
 * payload of exactly sizeof(u32) bytes, 0 otherwise.
 */
static int inet_diag_parse_attrs ( const struct nlmsghdr * nlh , int hdrlen ,
struct nlattr * * req_nlas )
2020-07-09 15:12:39 +02:00
{
struct nlattr * nla ;
int remaining ;
nlmsg_for_each_attr ( nla , nlh , hdrlen , remaining ) {
int type = nla_type ( nla ) ;
2020-09-21 07:27:20 -07:00
if ( type = = INET_DIAG_REQ_PROTOCOL & & nla_len ( nla ) ! = sizeof ( u32 ) )
return - EINVAL ;
2020-07-09 15:12:39 +02:00
if ( type < __INET_DIAG_REQ_MAX )
req_nlas [ type ] = nla ;
}
2020-09-21 07:27:20 -07:00
return 0 ;
2020-07-09 15:12:39 +02:00
}
static int inet_diag_get_protocol ( const struct inet_diag_req_v2 * req ,
const struct inet_diag_dump_data * data )
{
if ( data - > req_nlas [ INET_DIAG_REQ_PROTOCOL ] )
return nla_get_u32 ( data - > req_nlas [ INET_DIAG_REQ_PROTOCOL ] ) ;
return req - > sdiag_protocol ;
}
2020-02-25 15:04:27 -08:00
/* Cap applied when inet_sk_diag_fill() raises cb->min_dump_alloc:
 * KMALLOC_MAX_SIZE minus the aligned size of struct skb_shared_info.
 */
# define MAX_DUMP_ALLOC_SIZE (KMALLOC_MAX_SIZE - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
2011-12-09 06:23:00 +00:00
/*
 * Build one inet_diag_msg netlink message describing full socket @sk and
 * append it to @skb.
 *
 * @sk:          socket to report; BUG_ON() fires if it is not a full socket.
 * @icsk:        connection-sock view of @sk, or NULL. With NULL only
 *               handler->idiag_get_info(sk, r, NULL) is called after the
 *               common attributes and the function finishes the message;
 *               the timer, INFO, CONG, congestion-control and BPF storage
 *               parts are skipped.
 * @skb:         destination buffer.
 * @cb:          netlink callback; supplies portid/seq/type for the header,
 *               the inet_diag_dump_data in cb->data and min_dump_alloc.
 * @req:         request; req->idiag_ext selects the optional attributes.
 * @nlmsg_flags: flags for the netlink message header.
 * @net_admin:   forwarded to inet_diag_msg_attrs_fill() and idiag_get_aux().
 *
 * Returns 0 on success, -ENXIO when no handler is registered for the
 * protocol, and -EMSGSIZE when the header or an attribute cannot be added
 * (the partially built message is cancelled in the attribute case).
 */
int inet_sk_diag_fill ( struct sock * sk , struct inet_connection_sock * icsk ,
2020-02-25 15:04:09 -08:00
struct sk_buff * skb , struct netlink_callback * cb ,
const struct inet_diag_req_v2 * req ,
u16 nlmsg_flags , bool net_admin )
2005-04-16 15:20:36 -07:00
{
2015-04-16 18:10:35 -07:00
const struct tcp_congestion_ops * ca_ops ;
2015-03-10 07:15:53 -07:00
const struct inet_diag_handler * handler ;
2020-02-25 15:04:27 -08:00
struct inet_diag_dump_data * cb_data ;
2015-03-10 07:15:53 -07:00
int ext = req - > idiag_ext ;
2005-08-12 12:51:49 -03:00
struct inet_diag_msg * r ;
2005-04-16 15:20:36 -07:00
struct nlmsghdr * nlh ;
2012-06-26 23:36:12 +00:00
struct nlattr * attr ;
2005-08-12 09:27:49 -03:00
void * info = NULL ;
2024-01-22 11:25:58 +00:00
int protocol ;
2005-08-12 09:27:49 -03:00
2020-02-25 15:04:27 -08:00
cb_data = cb - > data ;
2024-01-22 11:25:58 +00:00
protocol = inet_diag_get_protocol ( req , cb_data ) ;
/* inet_diag_lock_handler() made sure inet_diag_table[] is stable. */
handler = rcu_dereference_protected ( inet_diag_table [ protocol ] , 1 ) ;
DEBUG_NET_WARN_ON_ONCE ( ! handler ) ;
if ( ! handler )
return - ENXIO ;
2005-04-16 15:20:36 -07:00
2020-02-25 15:04:09 -08:00
nlh = nlmsg_put ( skb , NETLINK_CB ( cb - > skb ) . portid , cb - > nlh - > nlmsg_seq ,
cb - > nlh - > nlmsg_type , sizeof ( * r ) , nlmsg_flags ) ;
2012-06-26 23:36:12 +00:00
if ( ! nlh )
2012-06-26 21:28:54 -07:00
return - EMSGSIZE ;
2005-08-12 09:27:49 -03:00
2012-06-26 21:28:54 -07:00
r = nlmsg_data ( nlh ) ;
2015-03-15 21:12:14 -07:00
BUG_ON ( ! sk_fullsock ( sk ) ) ;
2006-01-09 14:56:38 -08:00
2015-03-13 15:51:12 -07:00
inet_diag_msg_common_fill ( r , sk ) ;
2005-08-12 12:51:49 -03:00
/* Timer fields default to "no timer"; overwritten below when @icsk has
 * a pending timer.
 */
r - > idiag_state = sk - > sk_state ;
r - > idiag_timer = 0 ;
r - > idiag_retrans = 0 ;
2021-12-09 10:50:58 -08:00
r - > idiag_expires = 0 ;
2005-04-16 15:20:36 -07:00
2020-02-25 15:04:09 -08:00
if ( inet_diag_msg_attrs_fill ( sk , skb , r , ext ,
sk_user_ns ( NETLINK_CB ( cb - > skb ) . sk ) ,
net_admin ) )
2012-10-23 22:29:56 +04:00
goto errout ;
2012-06-26 23:36:12 +00:00
if ( ext & ( 1 < < ( INET_DIAG_MEMINFO - 1 ) ) ) {
struct inet_diag_meminfo minfo = {
. idiag_rmem = sk_rmem_alloc_get ( sk ) ,
2019-10-10 20:17:46 -07:00
. idiag_wmem = READ_ONCE ( sk - > sk_wmem_queued ) ,
2021-10-26 16:29:14 -07:00
. idiag_fmem = sk_forward_alloc_get ( sk ) ,
2012-06-26 23:36:12 +00:00
. idiag_tmem = sk_wmem_alloc_get ( sk ) ,
} ;
if ( nla_put ( skb , INET_DIAG_MEMINFO , sizeof ( minfo ) , & minfo ) < 0 )
goto errout ;
2011-12-09 06:23:00 +00:00
}
2011-12-30 00:53:32 +00:00
if ( ext & ( 1 < < ( INET_DIAG_SKMEMINFO - 1 ) ) )
if ( sock_diag_put_meminfo ( sk , skb , INET_DIAG_SKMEMINFO ) )
2012-06-26 23:36:12 +00:00
goto errout ;
2011-12-30 00:53:32 +00:00
2016-10-21 13:03:44 +03:00
/*
* RAW sockets might have user - defined protocols assigned ,
* so report the one supplied on socket creation .
*/
if ( sk - > sk_type = = SOCK_RAW ) {
if ( nla_put_u8 ( skb , INET_DIAG_PROTOCOL , sk - > sk_protocol ) )
goto errout ;
}
2015-03-10 07:15:53 -07:00
/* Without a connection sock there is nothing more to report beyond
 * what the handler fills into the fixed header.
 */
if ( ! icsk ) {
2012-04-24 18:15:41 +00:00
handler - > idiag_get_info ( sk , r , NULL ) ;
2011-12-09 06:23:00 +00:00
goto out ;
}
tcp: Tail loss probe (TLP)
This patch series implement the Tail loss probe (TLP) algorithm described
in http://tools.ietf.org/html/draft-dukkipati-tcpm-tcp-loss-probe-01. The
first patch implements the basic algorithm.
TLP's goal is to reduce tail latency of short transactions. It achieves
this by converting retransmission timeouts (RTOs) occuring due
to tail losses (losses at end of transactions) into fast recovery.
TLP transmits one packet in two round-trips when a connection is in
Open state and isn't receiving any ACKs. The transmitted packet, aka
loss probe, can be either new or a retransmission. When there is tail
loss, the ACK from a loss probe triggers FACK/early-retransmit based
fast recovery, thus avoiding a costly RTO. In the absence of loss,
there is no change in the connection state.
PTO stands for probe timeout. It is a timer event indicating
that an ACK is overdue and triggers a loss probe packet. The PTO value
is set to max(2*SRTT, 10ms) and is adjusted to account for delayed
ACK timer when there is only one oustanding packet.
TLP Algorithm
On transmission of new data in Open state:
-> packets_out > 1: schedule PTO in max(2*SRTT, 10ms).
-> packets_out == 1: schedule PTO in max(2*RTT, 1.5*RTT + 200ms)
-> PTO = min(PTO, RTO)
Conditions for scheduling PTO:
-> Connection is in Open state.
-> Connection is either cwnd limited or no new data to send.
-> Number of probes per tail loss episode is limited to one.
-> Connection is SACK enabled.
When PTO fires:
new_segment_exists:
-> transmit new segment.
-> packets_out++. cwnd remains same.
no_new_packet:
-> retransmit the last segment.
Its ACK triggers FACK or early retransmit based recovery.
ACK path:
-> rearm RTO at start of ACK processing.
-> reschedule PTO if need be.
In addition, the patch includes a small variation to the Early Retransmit
(ER) algorithm, such that ER and TLP together can in principle recover any
N-degree of tail loss through fast recovery. TLP is controlled by the same
sysctl as ER, tcp_early_retrans sysctl.
tcp_early_retrans==0; disables TLP and ER.
==1; enables RFC5827 ER.
==2; delayed ER.
==3; TLP and delayed ER. [DEFAULT]
==4; TLP only.
The TLP patch series have been extensively tested on Google Web servers.
It is most effective for short Web trasactions, where it reduced RTOs by 15%
and improved HTTP response time (average by 6%, 99th percentile by 10%).
The transmitted probes account for <0.5% of the overall transmissions.
Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-03-11 10:00:43 +00:00
/* idiag_timer values set below:
 *   1 - icsk_pending is RETRANS, REO_TIMEOUT or LOSS_PROBE
 *   4 - icsk_pending is ICSK_TIME_PROBE0
 *   2 - none of the above, but sk->sk_timer is pending
 * idiag_expires is the remaining time passed through
 * jiffies_delta_to_msecs().
 */
if ( icsk - > icsk_pending = = ICSK_TIME_RETRANS | |
2017-01-12 22:11:33 -08:00
icsk - > icsk_pending = = ICSK_TIME_REO_TIMEOUT | |
tcp: Tail loss probe (TLP)
This patch series implement the Tail loss probe (TLP) algorithm described
in http://tools.ietf.org/html/draft-dukkipati-tcpm-tcp-loss-probe-01. The
first patch implements the basic algorithm.
TLP's goal is to reduce tail latency of short transactions. It achieves
this by converting retransmission timeouts (RTOs) occuring due
to tail losses (losses at end of transactions) into fast recovery.
TLP transmits one packet in two round-trips when a connection is in
Open state and isn't receiving any ACKs. The transmitted packet, aka
loss probe, can be either new or a retransmission. When there is tail
loss, the ACK from a loss probe triggers FACK/early-retransmit based
fast recovery, thus avoiding a costly RTO. In the absence of loss,
there is no change in the connection state.
PTO stands for probe timeout. It is a timer event indicating
that an ACK is overdue and triggers a loss probe packet. The PTO value
is set to max(2*SRTT, 10ms) and is adjusted to account for delayed
ACK timer when there is only one oustanding packet.
TLP Algorithm
On transmission of new data in Open state:
-> packets_out > 1: schedule PTO in max(2*SRTT, 10ms).
-> packets_out == 1: schedule PTO in max(2*RTT, 1.5*RTT + 200ms)
-> PTO = min(PTO, RTO)
Conditions for scheduling PTO:
-> Connection is in Open state.
-> Connection is either cwnd limited or no new data to send.
-> Number of probes per tail loss episode is limited to one.
-> Connection is SACK enabled.
When PTO fires:
new_segment_exists:
-> transmit new segment.
-> packets_out++. cwnd remains same.
no_new_packet:
-> retransmit the last segment.
Its ACK triggers FACK or early retransmit based recovery.
ACK path:
-> rearm RTO at start of ACK processing.
-> reschedule PTO if need be.
In addition, the patch includes a small variation to the Early Retransmit
(ER) algorithm, such that ER and TLP together can in principle recover any
N-degree of tail loss through fast recovery. TLP is controlled by the same
sysctl as ER, tcp_early_retrans sysctl.
tcp_early_retrans==0; disables TLP and ER.
==1; enables RFC5827 ER.
==2; delayed ER.
==3; TLP and delayed ER. [DEFAULT]
==4; TLP only.
The TLP patch series have been extensively tested on Google Web servers.
It is most effective for short Web trasactions, where it reduced RTOs by 15%
and improved HTTP response time (average by 6%, 99th percentile by 10%).
The transmitted probes account for <0.5% of the overall transmissions.
Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-03-11 10:00:43 +00:00
icsk - > icsk_pending = = ICSK_TIME_LOSS_PROBE ) {
2005-08-12 12:51:49 -03:00
r - > idiag_timer = 1 ;
r - > idiag_retrans = icsk - > icsk_retransmits ;
2016-04-19 15:10:01 +08:00
r - > idiag_expires =
2019-11-05 14:11:50 -08:00
jiffies_delta_to_msecs ( icsk - > icsk_timeout - jiffies ) ;
2005-08-09 20:10:42 -07:00
} else if ( icsk - > icsk_pending = = ICSK_TIME_PROBE0 ) {
2005-08-12 12:51:49 -03:00
r - > idiag_timer = 4 ;
r - > idiag_retrans = icsk - > icsk_probes_out ;
2016-04-19 15:10:01 +08:00
r - > idiag_expires =
2019-11-05 14:11:50 -08:00
jiffies_delta_to_msecs ( icsk - > icsk_timeout - jiffies ) ;
2005-04-16 15:20:36 -07:00
} else if ( timer_pending ( & sk - > sk_timer ) ) {
2005-08-12 12:51:49 -03:00
r - > idiag_timer = 2 ;
r - > idiag_retrans = icsk - > icsk_probes_out ;
2016-04-19 15:10:01 +08:00
r - > idiag_expires =
2019-11-05 14:11:50 -08:00
jiffies_delta_to_msecs ( sk - > sk_timer . expires - jiffies ) ;
2005-04-16 15:20:36 -07:00
}
2005-08-10 05:54:28 -03:00
2015-06-15 11:26:19 -04:00
/* Only reserve room for INET_DIAG_INFO here; the payload is written
 * later by handler->idiag_get_info() through the info pointer.
 */
if ( ( ext & ( 1 < < ( INET_DIAG_INFO - 1 ) ) ) & & handler - > idiag_info_size ) {
2016-04-26 10:06:14 +02:00
attr = nla_reserve_64bit ( skb , INET_DIAG_INFO ,
handler - > idiag_info_size ,
INET_DIAG_PAD ) ;
2012-06-26 23:36:12 +00:00
if ( ! attr )
goto errout ;
2011-12-09 06:23:00 +00:00
2012-06-26 23:36:12 +00:00
info = nla_data ( attr ) ;
2005-04-16 15:20:36 -07:00
}
2015-04-16 18:10:35 -07:00
if ( ext & ( 1 < < ( INET_DIAG_CONG - 1 ) ) ) {
int err = 0 ;
/* icsk_ca_ops is sampled and its name copied inside one RCU read
 * section.
 */
rcu_read_lock ( ) ;
ca_ops = READ_ONCE ( icsk - > icsk_ca_ops ) ;
if ( ca_ops )
err = nla_put_string ( skb , INET_DIAG_CONG , ca_ops - > name ) ;
rcu_read_unlock ( ) ;
if ( err < 0 )
2012-06-26 23:36:12 +00:00
goto errout ;
2015-04-16 18:10:35 -07:00
}
2012-06-26 23:36:12 +00:00
2005-08-12 09:27:49 -03:00
handler - > idiag_get_info ( sk , r , info ) ;
2005-04-16 15:20:36 -07:00
2017-08-31 09:59:38 -07:00
if ( ext & ( 1 < < ( INET_DIAG_INFO - 1 ) ) & & handler - > idiag_get_aux )
if ( handler - > idiag_get_aux ( sk , net_admin , skb ) < 0 )
goto errout ;
2015-04-16 18:10:35 -07:00
if ( sk - > sk_state < TCP_TIME_WAIT ) {
2015-04-28 16:23:48 -07:00
/* These locals shadow the function-level info and attr. The attribute
 * type and payload come from the congestion-control get_info() hook.
 */
union tcp_cc_info info ;
size_t sz = 0 ;
int attr ;
2015-04-16 18:10:35 -07:00
rcu_read_lock ( ) ;
ca_ops = READ_ONCE ( icsk - > icsk_ca_ops ) ;
if ( ca_ops & & ca_ops - > get_info )
2015-04-28 16:23:48 -07:00
sz = ca_ops - > get_info ( sk , ext , & attr , & info ) ;
2015-04-16 18:10:35 -07:00
rcu_read_unlock ( ) ;
2015-04-28 16:23:48 -07:00
if ( sz & & nla_put ( skb , attr , sz , & info ) < 0 )
2015-04-16 18:10:35 -07:00
goto errout ;
}
2005-04-16 15:20:36 -07:00
2020-02-25 15:04:27 -08:00
/* Keep it at the end for potential retry with a larger skb,
* or else do best - effort fitting , which is only done for the
* first_nlmsg .
*/
if ( cb_data - > bpf_stg_diag ) {
bool first_nlmsg = ( ( unsigned char * ) nlh = = skb - > data ) ;
unsigned int prev_min_dump_alloc ;
unsigned int total_nla_size = 0 ;
unsigned int msg_len ;
int err ;
msg_len = skb_tail_pointer ( skb ) - ( unsigned char * ) nlh ;
err = bpf_sk_storage_diag_put ( cb_data - > bpf_stg_diag , sk , skb ,
INET_DIAG_SK_BPF_STORAGES ,
& total_nla_size ) ;
if ( ! err )
goto out ;
/* The put failed: add the bytes already used by this message to the
 * reported storage size and raise cb->min_dump_alloc to that total,
 * capped at MAX_DUMP_ALLOC_SIZE.
 */
total_nla_size + = msg_len ;
prev_min_dump_alloc = cb - > min_dump_alloc ;
if ( total_nla_size > prev_min_dump_alloc )
cb - > min_dump_alloc = min_t ( u32 , total_nla_size ,
MAX_DUMP_ALLOC_SIZE ) ;
if ( ! first_nlmsg )
goto errout ;
if ( cb - > min_dump_alloc > prev_min_dump_alloc )
/* Retry with pskb_expand_head() with
* __GFP_DIRECT_RECLAIM
*/
goto errout ;
WARN_ON_ONCE ( total_nla_size < = prev_min_dump_alloc ) ;
/* Send what we have for this sk
* and move on to the next sk in the following
* dump ( )
*/
}
2011-12-09 06:23:00 +00:00
out :
2015-01-16 22:09:00 +01:00
nlmsg_end ( skb , nlh ) ;
return 0 ;
2005-04-16 15:20:36 -07:00
2012-06-26 23:36:12 +00:00
errout :
nlmsg_cancel ( skb , nlh ) ;
2007-01-31 23:16:40 -08:00
return - EMSGSIZE ;
2005-04-16 15:20:36 -07:00
}
2011-12-09 06:23:00 +00:00
EXPORT_SYMBOL_GPL ( inet_sk_diag_fill ) ;
2015-03-11 18:53:14 -07:00
/*
 * Build one inet_diag_msg netlink message for timewait socket @sk and
 * append it to @skb.
 *
 * The reported state is tw_substate, idiag_timer is fixed at 3 and
 * idiag_expires is derived from tw_timer.expires. The retrans, rqueue,
 * wqueue, uid and inode fields are reported as 0. INET_DIAG_MARK (tw_mark)
 * is added only when @net_admin is true.
 *
 * BUG_ON() fires if tw_state is not TCP_TIME_WAIT.
 *
 * Returns 0 on success or -EMSGSIZE when the header or the mark attribute
 * does not fit (the message is cancelled in the latter case).
 */
static int inet_twsk_diag_fill ( struct sock * sk ,
2015-03-10 07:15:53 -07:00
struct sk_buff * skb ,
2020-02-25 15:04:09 -08:00
struct netlink_callback * cb ,
2021-06-15 23:06:04 -07:00
u16 nlmsg_flags , bool net_admin )
2006-01-09 14:56:38 -08:00
{
2015-03-11 18:53:14 -07:00
struct inet_timewait_sock * tw = inet_twsk ( sk ) ;
2006-01-09 14:56:38 -08:00
struct inet_diag_msg * r ;
2012-06-26 23:36:12 +00:00
struct nlmsghdr * nlh ;
tcp/dccp: get rid of central timewait timer
Using a timer wheel for timewait sockets was nice ~15 years ago when
memory was expensive and machines had a single processor.
This does not scale, code is ugly and source of huge latencies
(Typically 30 ms have been seen, cpus spinning on death_lock spinlock.)
We can afford to use an extra 64 bytes per timewait sock and spread
timewait load to all cpus to have better behavior.
Tested:
On following test, /proc/sys/net/ipv4/tcp_tw_recycle is set to 1
on the target (lpaa24)
Before patch :
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
419594
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
437171
While test is running, we can observe 25 or even 33 ms latencies.
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20601ms
rtt min/avg/max/mdev = 0.020/0.217/25.771/1.535 ms, pipe 2
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20702ms
rtt min/avg/max/mdev = 0.019/0.183/33.761/1.441 ms, pipe 2
After patch :
About 90% increase of throughput :
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
810442
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
800992
And latencies are kept to minimal values during this load, even
if network utilization is 90% higher :
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 19991ms
rtt min/avg/max/mdev = 0.023/0.064/0.360/0.042 ms
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-04-12 18:51:09 -07:00
long tmo ;
2012-06-26 21:28:54 -07:00
2020-02-25 15:04:09 -08:00
nlh = nlmsg_put ( skb , NETLINK_CB ( cb - > skb ) . portid ,
cb - > nlh - > nlmsg_seq , cb - > nlh - > nlmsg_type ,
sizeof ( * r ) , nlmsg_flags ) ;
2012-06-26 23:36:12 +00:00
if ( ! nlh )
2012-06-26 21:28:54 -07:00
return - EMSGSIZE ;
2006-01-09 14:56:38 -08:00
2012-06-26 21:28:54 -07:00
r = nlmsg_data ( nlh ) ;
2006-01-09 14:56:38 -08:00
BUG_ON ( tw - > tw_state ! = TCP_TIME_WAIT ) ;
2015-03-13 15:51:12 -07:00
inet_diag_msg_common_fill ( r , sk ) ;
2006-01-09 14:56:38 -08:00
r - > idiag_retrans = 0 ;
net: inet_diag: zero out uninitialized idiag_{src,dst} fields
Jakub reported while working with nlmon netlink sniffer that parts of
the inet_diag_sockid are not initialized when r->idiag_family != AF_INET6.
That is, fields of r->id.idiag_src[1 ... 3], r->id.idiag_dst[1 ... 3].
In fact, it seems that we can leak 6 * sizeof(u32) byte of kernel [slab]
memory through this. At least, in udp_dump_one(), we allocate a skb in ...
rep = nlmsg_new(sizeof(struct inet_diag_msg) + ..., GFP_KERNEL);
... and then pass that to inet_sk_diag_fill() that puts the whole struct
inet_diag_msg into the skb, where we only fill out r->id.idiag_src[0],
r->id.idiag_dst[0] and leave the rest untouched:
r->id.idiag_src[0] = inet->inet_rcv_saddr;
r->id.idiag_dst[0] = inet->inet_daddr;
struct inet_diag_msg embeds struct inet_diag_sockid that is correctly /
fully filled out in IPv6 case, but for IPv4 not.
So just zero them out by using plain memset (for this little amount of
bytes it's probably not worth the extra check for idiag_family == AF_INET).
Similarly, fix also other places where we fill that out.
Reported-by: Jakub Zawadzki <darkjames-ws@darkjames.pl>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-12-17 00:38:39 +01:00
2006-01-09 14:56:38 -08:00
r - > idiag_state = tw - > tw_substate ;
r - > idiag_timer = 3 ;
2019-11-05 14:11:50 -08:00
/* Remaining timewait time, as a jiffies delta converted by
 * jiffies_delta_to_msecs().
 */
tmo = tw - > tw_timer . expires - jiffies ;
r - > idiag_expires = jiffies_delta_to_msecs ( tmo ) ;
2006-01-09 14:56:38 -08:00
r - > idiag_rqueue = 0 ;
r - > idiag_wqueue = 0 ;
r - > idiag_uid = 0 ;
r - > idiag_inode = 0 ;
2012-06-26 23:36:12 +00:00
2021-06-15 23:06:04 -07:00
if ( net_admin & & nla_put_u32 ( skb , INET_DIAG_MARK ,
tw - > tw_mark ) ) {
nlmsg_cancel ( skb , nlh ) ;
return - EMSGSIZE ;
}
2015-01-16 22:09:00 +01:00
nlmsg_end ( skb , nlh ) ;
return 0 ;
2006-01-09 14:56:38 -08:00
}
2015-03-15 21:12:14 -07:00
/* Append one inet_diag_msg describing a request socket to @skb.
 *
 * @sk:          the request socket, viewed through its struct sock header
 * @skb:         reply buffer being built
 * @cb:          netlink callback; provides portid, sequence and message type
 * @nlmsg_flags: flags stored in the new netlink message header
 * @net_admin:   when true, an INET_DIAG_MARK attribute is appended
 *
 * The entry is reported with idiag_state TCP_SYN_RECV and the time left on
 * the request sock timer, converted to milliseconds.
 *
 * Returns 0 on success or -EMSGSIZE when the header or the mark attribute
 * does not fit; in the latter case the partially built message is cancelled.
 */
static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      u16 nlmsg_flags, bool net_admin)
{
	struct request_sock *reqsk = inet_reqsk(sk);
	struct inet_diag_msg *msg;
	struct nlmsghdr *nlh;
	long remaining;

	/* NOTE(review): presumably this guards the cookie access done on the
	 * request sock through struct sock in the common fill helper - confirm.
	 */
	BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
		     offsetof(struct sock, sk_cookie));

	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
			cb->nlh->nlmsg_type, sizeof(*msg), nlmsg_flags);
	if (!nlh)
		return -EMSGSIZE;

	msg = nlmsg_data(nlh);
	inet_diag_msg_common_fill(msg, sk);

	msg->idiag_state = TCP_SYN_RECV;
	msg->idiag_timer = 1;
	msg->idiag_retrans = reqsk->num_retrans;

	/* Time left until the request sock timer fires. */
	remaining = reqsk->rsk_timer.expires - jiffies;
	msg->idiag_expires = jiffies_delta_to_msecs(remaining);

	/* Queue sizes, uid and inode are always reported as zero here. */
	msg->idiag_rqueue = 0;
	msg->idiag_wqueue = 0;
	msg->idiag_uid = 0;
	msg->idiag_inode = 0;

	if (net_admin &&
	    nla_put_u32(skb, INET_DIAG_MARK, inet_rsk(reqsk)->ir_mark)) {
		nlmsg_cancel(skb, nlh);
		return -EMSGSIZE;
	}

	nlmsg_end(skb, nlh);
	return 0;
}
2006-01-09 14:56:56 -08:00
static int sk_diag_fill ( struct sock * sk , struct sk_buff * skb ,
2020-02-25 15:04:09 -08:00
struct netlink_callback * cb ,
2015-03-10 07:15:54 -07:00
const struct inet_diag_req_v2 * r ,
2020-02-25 15:04:09 -08:00
u16 nlmsg_flags , bool net_admin )
2006-01-09 14:56:56 -08:00
{
if ( sk - > sk_state = = TCP_TIME_WAIT )
2021-06-15 23:06:04 -07:00
return inet_twsk_diag_fill ( sk , skb , cb , nlmsg_flags , net_admin ) ;
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 15:42:29 -07:00
2015-03-15 21:12:14 -07:00
if ( sk - > sk_state = = TCP_NEW_SYN_RECV )
2020-02-25 15:04:09 -08:00
return inet_req_diag_fill ( sk , skb , cb , nlmsg_flags , net_admin ) ;
2015-03-15 21:12:14 -07:00
2020-02-25 15:04:09 -08:00
return inet_sk_diag_fill ( sk , inet_csk ( sk ) , skb , cb , r , nlmsg_flags ,
net_admin ) ;
2006-01-09 14:56:56 -08:00
}
2015-12-16 12:30:02 +09:00
struct sock * inet_diag_find_one_icsk ( struct net * net ,
struct inet_hashinfo * hashinfo ,
const struct inet_diag_req_v2 * req )
2005-04-16 15:20:36 -07:00
{
2015-03-10 07:15:53 -07:00
struct sock * sk ;
2007-12-03 15:51:25 +11:00
2016-04-01 08:52:15 -07:00
rcu_read_lock ( ) ;
2015-03-10 07:15:53 -07:00
if ( req - > sdiag_family = = AF_INET )
2016-02-10 11:50:38 -05:00
sk = inet_lookup ( net , hashinfo , NULL , 0 , req - > id . idiag_dst [ 0 ] ,
2005-08-12 12:51:49 -03:00
req - > id . idiag_dport , req - > id . idiag_src [ 0 ] ,
req - > id . idiag_sport , req - > id . idiag_if ) ;
2011-12-10 09:48:31 +00:00
# if IS_ENABLED(CONFIG_IPV6)
2016-01-20 16:25:01 -08:00
else if ( req - > sdiag_family = = AF_INET6 ) {
if ( ipv6_addr_v4mapped ( ( struct in6_addr * ) req - > id . idiag_dst ) & &
ipv6_addr_v4mapped ( ( struct in6_addr * ) req - > id . idiag_src ) )
2016-02-10 11:50:38 -05:00
sk = inet_lookup ( net , hashinfo , NULL , 0 , req - > id . idiag_dst [ 3 ] ,
2016-01-20 16:25:01 -08:00
req - > id . idiag_dport , req - > id . idiag_src [ 3 ] ,
req - > id . idiag_sport , req - > id . idiag_if ) ;
else
2016-02-10 11:50:38 -05:00
sk = inet6_lookup ( net , hashinfo , NULL , 0 ,
2016-01-20 16:25:01 -08:00
( struct in6_addr * ) req - > id . idiag_dst ,
req - > id . idiag_dport ,
( struct in6_addr * ) req - > id . idiag_src ,
req - > id . idiag_sport ,
req - > id . idiag_if ) ;
}
2005-04-16 15:20:36 -07:00
# endif
2016-04-01 08:52:15 -07:00
else {
rcu_read_unlock ( ) ;
2015-12-16 12:30:02 +09:00
return ERR_PTR ( - EINVAL ) ;
2016-04-01 08:52:15 -07:00
}
rcu_read_unlock ( ) ;
2015-03-10 07:15:53 -07:00
if ( ! sk )
2015-12-16 12:30:02 +09:00
return ERR_PTR ( - ENOENT ) ;
2005-04-16 15:20:36 -07:00
2015-12-16 12:30:02 +09:00
if ( sock_diag_check_cookie ( sk , req - > id . idiag_cookie ) ) {
sock_gen_put ( sk ) ;
return ERR_PTR ( - ENOENT ) ;
}
return sk ;
}
EXPORT_SYMBOL_GPL ( inet_diag_find_one_icsk ) ;
int inet_diag_dump_one_icsk ( struct inet_hashinfo * hashinfo ,
2020-02-25 15:04:09 -08:00
struct netlink_callback * cb ,
2015-12-16 12:30:02 +09:00
const struct inet_diag_req_v2 * req )
{
2020-02-25 15:04:09 -08:00
struct sk_buff * in_skb = cb - > skb ;
2017-08-31 09:59:38 -07:00
bool net_admin = netlink_net_capable ( in_skb , CAP_NET_ADMIN ) ;
2015-12-16 12:30:02 +09:00
struct net * net = sock_net ( in_skb - > sk ) ;
struct sk_buff * rep ;
struct sock * sk ;
int err ;
sk = inet_diag_find_one_icsk ( net , hashinfo , req ) ;
if ( IS_ERR ( sk ) )
return PTR_ERR ( sk ) ;
2005-04-16 15:20:36 -07:00
2017-08-31 09:59:38 -07:00
rep = nlmsg_new ( inet_sk_attr_size ( sk , req , net_admin ) , GFP_KERNEL ) ;
2012-06-26 23:36:12 +00:00
if ( ! rep ) {
err = - ENOMEM ;
2005-04-16 15:20:36 -07:00
goto out ;
2012-06-26 23:36:12 +00:00
}
2005-04-16 15:20:36 -07:00
2020-02-25 15:04:09 -08:00
err = sk_diag_fill ( sk , rep , cb , req , 0 , net_admin ) ;
2007-01-31 23:16:40 -08:00
if ( err < 0 ) {
WARN_ON ( err = = - EMSGSIZE ) ;
2012-06-26 23:36:12 +00:00
nlmsg_free ( rep ) ;
2007-01-31 23:16:40 -08:00
goto out ;
}
2021-07-13 10:48:24 +08:00
err = nlmsg_unicast ( net - > diag_nlsk , rep , NETLINK_CB ( in_skb ) . portid ) ;
2005-04-16 15:20:36 -07:00
out :
2013-10-11 08:54:49 -07:00
if ( sk )
sock_gen_put ( sk ) ;
2011-12-09 06:22:10 +00:00
return err ;
}
2011-12-09 06:23:18 +00:00
EXPORT_SYMBOL_GPL ( inet_diag_dump_one_icsk ) ;
2011-12-09 06:22:10 +00:00
2015-12-16 12:30:04 +09:00
static int inet_diag_cmd_exact ( int cmd , struct sk_buff * in_skb ,
2011-12-09 06:22:10 +00:00
const struct nlmsghdr * nlh ,
2020-07-09 15:12:39 +02:00
int hdrlen ,
2015-03-10 07:15:54 -07:00
const struct inet_diag_req_v2 * req )
2011-12-09 06:22:10 +00:00
{
const struct inet_diag_handler * handler ;
2020-07-09 15:12:39 +02:00
struct inet_diag_dump_data dump_data ;
int err , protocol ;
memset ( & dump_data , 0 , sizeof ( dump_data ) ) ;
2020-09-21 07:27:20 -07:00
err = inet_diag_parse_attrs ( nlh , hdrlen , dump_data . req_nlas ) ;
if ( err )
return err ;
2020-07-09 15:12:39 +02:00
protocol = inet_diag_get_protocol ( req , & dump_data ) ;
2011-12-09 06:22:10 +00:00
2020-07-09 15:12:39 +02:00
handler = inet_diag_lock_handler ( protocol ) ;
2024-01-22 11:25:58 +00:00
if ( ! handler )
return - ENOENT ;
if ( cmd = = SOCK_DIAG_BY_FAMILY ) {
2020-02-25 15:04:09 -08:00
struct netlink_callback cb = {
. nlh = nlh ,
. skb = in_skb ,
2020-07-09 15:12:39 +02:00
. data = & dump_data ,
2020-02-25 15:04:09 -08:00
} ;
err = handler - > dump_one ( & cb , req ) ;
} else if ( cmd = = SOCK_DESTROY & & handler - > destroy ) {
2015-12-16 12:30:04 +09:00
err = handler - > destroy ( in_skb , req ) ;
2020-02-25 15:04:09 -08:00
} else {
2015-12-16 12:30:04 +09:00
err = - EOPNOTSUPP ;
2020-02-25 15:04:09 -08:00
}
2007-12-03 15:51:25 +11:00
inet_diag_unlock_handler ( handler ) ;
2011-12-09 06:22:10 +00:00
2005-04-16 15:20:36 -07:00
return err ;
}
2006-09-27 18:44:30 -07:00
/* Compare the leading @bits bits of two big-endian 32-bit word arrays.
 *
 * Whole words are compared with memcmp(); a trailing partial word is
 * compared under a mask covering its most significant bits.
 *
 * Returns 1 when the prefixes are equal (always for @bits == 0), else 0.
 */
static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits)
{
	int full_words = bits >> 5;
	int tail_bits = bits & 0x1f;

	if (full_words && memcmp(a1, a2, full_words << 2))
		return 0;

	if (tail_bits) {
		/* Mask built in host order, then converted to match the data */
		__be32 mask = htonl((0xffffffff) << (32 - tail_bits));

		if ((a1[full_words] ^ a2[full_words]) & mask)
			return 0;
	}

	return 1;
}
2011-12-09 06:21:34 +00:00
static int inet_diag_bc_run ( const struct nlattr * _bc ,
2015-03-10 07:15:53 -07:00
const struct inet_diag_entry * entry )
2005-04-16 15:20:36 -07:00
{
2011-12-09 06:21:34 +00:00
const void * bc = nla_data ( _bc ) ;
int len = nla_len ( _bc ) ;
2005-04-16 15:20:36 -07:00
while ( len > 0 ) {
int yes = 1 ;
2005-08-12 12:51:49 -03:00
const struct inet_diag_bc_op * op = bc ;
2005-04-16 15:20:36 -07:00
switch ( op - > code ) {
2005-08-12 12:51:49 -03:00
case INET_DIAG_BC_NOP :
2005-04-16 15:20:36 -07:00
break ;
2005-08-12 12:51:49 -03:00
case INET_DIAG_BC_JMP :
2005-04-16 15:20:36 -07:00
yes = 0 ;
break ;
2017-12-27 18:27:58 +01:00
case INET_DIAG_BC_S_EQ :
yes = entry - > sport = = op [ 1 ] . no ;
break ;
2005-08-12 12:51:49 -03:00
case INET_DIAG_BC_S_GE :
2005-04-16 15:20:36 -07:00
yes = entry - > sport > = op [ 1 ] . no ;
break ;
2005-08-12 12:51:49 -03:00
case INET_DIAG_BC_S_LE :
2010-01-19 14:12:20 -08:00
yes = entry - > sport < = op [ 1 ] . no ;
2005-04-16 15:20:36 -07:00
break ;
2017-12-27 18:27:58 +01:00
case INET_DIAG_BC_D_EQ :
yes = entry - > dport = = op [ 1 ] . no ;
break ;
2005-08-12 12:51:49 -03:00
case INET_DIAG_BC_D_GE :
2005-04-16 15:20:36 -07:00
yes = entry - > dport > = op [ 1 ] . no ;
break ;
2005-08-12 12:51:49 -03:00
case INET_DIAG_BC_D_LE :
2005-04-16 15:20:36 -07:00
yes = entry - > dport < = op [ 1 ] . no ;
break ;
2005-08-12 12:51:49 -03:00
case INET_DIAG_BC_AUTO :
2005-04-16 15:20:36 -07:00
yes = ! ( entry - > userlocks & SOCK_BINDPORT_LOCK ) ;
break ;
2005-08-12 12:51:49 -03:00
case INET_DIAG_BC_S_COND :
2005-08-12 12:56:38 -03:00
case INET_DIAG_BC_D_COND : {
2015-03-10 07:15:53 -07:00
const struct inet_diag_hostcond * cond ;
const __be32 * addr ;
2005-04-16 15:20:36 -07:00
2015-03-10 07:15:53 -07:00
cond = ( const struct inet_diag_hostcond * ) ( op + 1 ) ;
2005-04-16 15:20:36 -07:00
if ( cond - > port ! = - 1 & &
2005-08-12 12:51:49 -03:00
cond - > port ! = ( op - > code = = INET_DIAG_BC_S_COND ?
2005-04-16 15:20:36 -07:00
entry - > sport : entry - > dport ) ) {
yes = 0 ;
break ;
}
2006-01-09 14:56:19 -08:00
2005-08-12 12:51:49 -03:00
if ( op - > code = = INET_DIAG_BC_S_COND )
2005-04-16 15:20:36 -07:00
addr = entry - > saddr ;
else
addr = entry - > daddr ;
2012-12-08 19:43:23 +00:00
if ( cond - > family ! = AF_UNSPEC & &
cond - > family ! = entry - > family ) {
if ( entry - > family = = AF_INET6 & &
cond - > family = = AF_INET ) {
if ( addr [ 0 ] = = 0 & & addr [ 1 ] = = 0 & &
addr [ 2 ] = = htonl ( 0xffff ) & &
bitstring_match ( addr + 3 ,
cond - > addr ,
cond - > prefix_len ) )
break ;
}
yes = 0 ;
break ;
}
if ( cond - > prefix_len = = 0 )
break ;
2006-01-09 14:56:19 -08:00
if ( bitstring_match ( addr , cond - > addr ,
cond - > prefix_len ) )
2005-04-16 15:20:36 -07:00
break ;
yes = 0 ;
break ;
}
2016-06-23 18:42:51 -07:00
case INET_DIAG_BC_DEV_COND : {
u32 ifindex ;
ifindex = * ( ( const u32 * ) ( op + 1 ) ) ;
if ( ifindex ! = entry - > ifindex )
yes = 0 ;
break ;
}
2016-08-24 15:46:26 +09:00
case INET_DIAG_BC_MARK_COND : {
struct inet_diag_markcond * cond ;
cond = ( struct inet_diag_markcond * ) ( op + 1 ) ;
if ( ( entry - > mark & cond - > mask ) ! = cond - > mark )
yes = 0 ;
break ;
}
2020-04-30 18:51:15 +03:00
# ifdef CONFIG_SOCK_CGROUP_DATA
case INET_DIAG_BC_CGROUP_COND : {
u64 cgroup_id ;
cgroup_id = get_unaligned ( ( const u64 * ) ( op + 1 ) ) ;
if ( cgroup_id ! = entry - > cgroup_id )
yes = 0 ;
break ;
}
# endif
2005-04-16 15:20:36 -07:00
}
2006-01-09 14:56:19 -08:00
if ( yes ) {
2005-04-16 15:20:36 -07:00
len - = op - > yes ;
bc + = op - > yes ;
} else {
len - = op - > no ;
bc + = op - > no ;
}
}
2010-09-22 20:43:57 +00:00
return len = = 0 ;
2005-04-16 15:20:36 -07:00
}
2015-03-13 15:51:12 -07:00
/* This helper is available for all sockets (ESTABLISH, TIMEWAIT, SYN_RECV)
*/
static void entry_fill_addrs ( struct inet_diag_entry * entry ,
const struct sock * sk )
{
# if IS_ENABLED(CONFIG_IPV6)
if ( sk - > sk_family = = AF_INET6 ) {
entry - > saddr = sk - > sk_v6_rcv_saddr . s6_addr32 ;
entry - > daddr = sk - > sk_v6_daddr . s6_addr32 ;
} else
# endif
{
entry - > saddr = & sk - > sk_rcv_saddr ;
entry - > daddr = & sk - > sk_daddr ;
}
}
2011-12-09 06:22:44 +00:00
int inet_diag_bc_sk ( const struct nlattr * bc , struct sock * sk )
{
struct inet_sock * inet = inet_sk ( sk ) ;
2015-03-10 07:15:53 -07:00
struct inet_diag_entry entry ;
2011-12-09 06:22:44 +00:00
2015-03-10 07:15:53 -07:00
if ( ! bc )
2011-12-09 06:22:44 +00:00
return 1 ;
entry . family = sk - > sk_family ;
2015-03-13 15:51:12 -07:00
entry_fill_addrs ( & entry , sk ) ;
2011-12-09 06:22:44 +00:00
entry . sport = inet - > inet_num ;
entry . dport = ntohs ( inet - > inet_dport ) ;
2016-06-23 18:42:51 -07:00
entry . ifindex = sk - > sk_bound_dev_if ;
2015-03-15 21:12:14 -07:00
entry . userlocks = sk_fullsock ( sk ) ? sk - > sk_userlocks : 0 ;
2016-08-24 15:46:26 +09:00
if ( sk_fullsock ( sk ) )
2023-07-28 15:03:15 +00:00
entry . mark = READ_ONCE ( sk - > sk_mark ) ;
2016-08-24 15:46:26 +09:00
else if ( sk - > sk_state = = TCP_NEW_SYN_RECV )
entry . mark = inet_rsk ( inet_reqsk ( sk ) ) - > ir_mark ;
2021-06-15 23:06:04 -07:00
else if ( sk - > sk_state = = TCP_TIME_WAIT )
entry . mark = inet_twsk ( sk ) - > tw_mark ;
2016-08-24 15:46:26 +09:00
else
entry . mark = 0 ;
2020-04-30 18:51:15 +03:00
# ifdef CONFIG_SOCK_CGROUP_DATA
2020-05-02 18:34:42 +03:00
entry . cgroup_id = sk_fullsock ( sk ) ?
cgroup_id ( sock_cgroup_ptr ( & sk - > sk_cgrp_data ) ) : 0 ;
2020-04-30 18:51:15 +03:00
# endif
2011-12-09 06:22:44 +00:00
return inet_diag_bc_run ( bc , & entry ) ;
}
EXPORT_SYMBOL_GPL ( inet_diag_bc_sk ) ;
2005-04-16 15:20:36 -07:00
/* Check that a jump target lands on an operation boundary.
 *
 * Walks the bytecode from its start along the "yes" offsets, with @len
 * counting the bytes still ahead.  @cc is the remaining length expected at
 * the jump target.
 *
 * Returns 1 if some step of the walk has exactly @cc bytes remaining.
 * Returns 0 if the walk passes @cc without hitting it, meets a "yes"
 * offset that is below 4 or not a multiple of 4, or runs off the end.
 */
static int valid_cc(const void *bc, int len, int cc)
{
	const struct inet_diag_bc_op *op;

	for (; len >= 0; len -= op->yes, bc += op->yes) {
		op = bc;

		if (cc >= len)
			return cc == len;

		if (op->yes < 4 || (op->yes & 3))
			return 0;
	}

	return 0;
}
2016-06-23 18:42:51 -07:00
/* Validate a device condition; its operand is a u32 ifindex.
 *
 * Grows *@min_len by the operand size and returns true when the remaining
 * bytecode length @len can hold it.
 */
static bool valid_devcond(const struct inet_diag_bc_op *op, int len,
			  int *min_len)
{
	*min_len += sizeof(u32);

	return len >= *min_len;
}
2012-12-08 19:43:22 +00:00
/* Validate the inet_diag_hostcond operand that follows @op.
 *
 * @len is the remaining bytecode length.  *@min_len is grown first by the
 * fixed inet_diag_hostcond size and then by the address length implied by
 * the condition's family (0 for AF_UNSPEC, in_addr for AF_INET, in6_addr
 * for AF_INET6).
 *
 * Returns false when the operand does not fit in @len, the family is not
 * one of the three above, or prefix_len (in bits) exceeds the address
 * length; true otherwise.
 */
static bool valid_hostcond(const struct inet_diag_bc_op *op, int len,
			   int *min_len)
{
	const struct inet_diag_hostcond *cond;
	int addr_len;

	/* The fixed part must fit before the family can be read. */
	*min_len += sizeof(struct inet_diag_hostcond);
	if (len < *min_len)
		return false;

	cond = (const struct inet_diag_hostcond *)(op + 1);

	if (cond->family == AF_UNSPEC)
		addr_len = 0;
	else if (cond->family == AF_INET)
		addr_len = sizeof(struct in_addr);
	else if (cond->family == AF_INET6)
		addr_len = sizeof(struct in6_addr);
	else
		return false;

	*min_len += addr_len;

	/* Prefix length is in bits, address length in bytes. */
	return len >= *min_len && cond->prefix_len <= 8 * addr_len;
}
2012-12-09 11:09:54 +00:00
/* Validate a port comparison operator.
 *
 * The port operand lives in a follow-on inet_diag_bc_op, so *@min_len is
 * grown by one more inet_diag_bc_op.  Returns true when the remaining
 * bytecode length @len can hold it.
 */
static bool valid_port_comparison(const struct inet_diag_bc_op *op,
				  int len, int *min_len)
{
	*min_len += sizeof(struct inet_diag_bc_op);

	return len >= *min_len;
}
2016-08-24 15:46:26 +09:00
/* Validate a mark condition; its operand is an inet_diag_markcond.
 *
 * Grows *@min_len by the operand size and returns true when the remaining
 * bytecode length @len can hold it.
 */
static bool valid_markcond(const struct inet_diag_bc_op *op, int len,
			   int *min_len)
{
	*min_len += sizeof(struct inet_diag_markcond);
	if (len < *min_len)
		return false;

	return true;
}
2020-04-30 18:51:15 +03:00
#ifdef CONFIG_SOCK_CGROUP_DATA
/* Validate a cgroup condition; its operand is a u64 cgroup id.
 *
 * Grows *@min_len by the operand size and returns true when the remaining
 * bytecode length @len can hold it.
 */
static bool valid_cgroupcond(const struct inet_diag_bc_op *op, int len,
			     int *min_len)
{
	*min_len += sizeof(u64);
	if (len < *min_len)
		return false;

	return true;
}
#endif
2016-08-24 15:46:26 +09:00
static int inet_diag_bc_audit ( const struct nlattr * attr ,
const struct sk_buff * skb )
{
bool net_admin = netlink_net_capable ( skb , CAP_NET_ADMIN ) ;
2016-08-24 15:46:25 +09:00
const void * bytecode , * bc ;
int bytecode_len , len ;
if ( ! attr | | nla_len ( attr ) < sizeof ( struct inet_diag_bc_op ) )
return - EINVAL ;
bytecode = bc = nla_data ( attr ) ;
len = bytecode_len = nla_len ( attr ) ;
2005-04-16 15:20:36 -07:00
while ( len > 0 ) {
2012-12-08 19:43:22 +00:00
int min_len = sizeof ( struct inet_diag_bc_op ) ;
2015-03-10 07:15:53 -07:00
const struct inet_diag_bc_op * op = bc ;
2005-04-16 15:20:36 -07:00
switch ( op - > code ) {
2005-08-12 12:51:49 -03:00
case INET_DIAG_BC_S_COND :
case INET_DIAG_BC_D_COND :
2012-12-08 19:43:22 +00:00
if ( ! valid_hostcond ( bc , len , & min_len ) )
return - EINVAL ;
2012-12-09 11:09:54 +00:00
break ;
2016-06-23 18:42:51 -07:00
case INET_DIAG_BC_DEV_COND :
if ( ! valid_devcond ( bc , len , & min_len ) )
return - EINVAL ;
break ;
2017-12-27 18:27:58 +01:00
case INET_DIAG_BC_S_EQ :
2005-08-12 12:51:49 -03:00
case INET_DIAG_BC_S_GE :
case INET_DIAG_BC_S_LE :
2017-12-27 18:27:58 +01:00
case INET_DIAG_BC_D_EQ :
2005-08-12 12:51:49 -03:00
case INET_DIAG_BC_D_GE :
case INET_DIAG_BC_D_LE :
2012-12-09 11:09:54 +00:00
if ( ! valid_port_comparison ( bc , len , & min_len ) )
2005-04-16 15:20:36 -07:00
return - EINVAL ;
break ;
2016-08-24 15:46:26 +09:00
case INET_DIAG_BC_MARK_COND :
if ( ! net_admin )
return - EPERM ;
if ( ! valid_markcond ( bc , len , & min_len ) )
return - EINVAL ;
break ;
2020-04-30 18:51:15 +03:00
# ifdef CONFIG_SOCK_CGROUP_DATA
case INET_DIAG_BC_CGROUP_COND :
if ( ! valid_cgroupcond ( bc , len , & min_len ) )
return - EINVAL ;
break ;
# endif
2012-12-09 11:09:54 +00:00
case INET_DIAG_BC_AUTO :
case INET_DIAG_BC_JMP :
2005-08-12 12:51:49 -03:00
case INET_DIAG_BC_NOP :
2005-04-16 15:20:36 -07:00
break ;
default :
return - EINVAL ;
}
2012-12-09 11:09:54 +00:00
if ( op - > code ! = INET_DIAG_BC_NOP ) {
if ( op - > no < min_len | | op - > no > len + 4 | | op - > no & 3 )
return - EINVAL ;
if ( op - > no < len & &
! valid_cc ( bytecode , bytecode_len , len - op - > no ) )
return - EINVAL ;
}
2012-12-08 19:43:22 +00:00
if ( op - > yes < min_len | | op - > yes > len + 4 | | op - > yes & 3 )
2011-06-17 16:25:39 -04:00
return - EINVAL ;
2006-01-09 14:56:19 -08:00
bc + = op - > yes ;
2005-04-16 15:20:36 -07:00
len - = op - > yes ;
}
return len = = 0 ? 0 : - EINVAL ;
}
2015-03-05 10:18:14 -08:00
static void twsk_build_assert ( void )
{
BUILD_BUG_ON ( offsetof ( struct inet_timewait_sock , tw_family ) ! =
offsetof ( struct sock , sk_family ) ) ;
BUILD_BUG_ON ( offsetof ( struct inet_timewait_sock , tw_num ) ! =
offsetof ( struct inet_sock , inet_num ) ) ;
BUILD_BUG_ON ( offsetof ( struct inet_timewait_sock , tw_dport ) ! =
offsetof ( struct inet_sock , inet_dport ) ) ;
BUILD_BUG_ON ( offsetof ( struct inet_timewait_sock , tw_rcv_saddr ) ! =
offsetof ( struct inet_sock , inet_rcv_saddr ) ) ;
BUILD_BUG_ON ( offsetof ( struct inet_timewait_sock , tw_daddr ) ! =
offsetof ( struct inet_sock , inet_daddr ) ) ;
# if IS_ENABLED(CONFIG_IPV6)
BUILD_BUG_ON ( offsetof ( struct inet_timewait_sock , tw_v6_rcv_saddr ) ! =
offsetof ( struct sock , sk_v6_rcv_saddr ) ) ;
BUILD_BUG_ON ( offsetof ( struct inet_timewait_sock , tw_v6_daddr ) ! =
offsetof ( struct sock , sk_v6_daddr ) ) ;
# endif
}
2011-12-09 06:23:18 +00:00
void inet_diag_dump_icsk ( struct inet_hashinfo * hashinfo , struct sk_buff * skb ,
2015-03-10 07:15:53 -07:00
struct netlink_callback * cb ,
2020-02-25 15:04:15 -08:00
const struct inet_diag_req_v2 * r )
2005-04-16 15:20:36 -07:00
{
2016-11-04 11:54:32 -07:00
bool net_admin = netlink_net_capable ( cb - > skb , CAP_NET_ADMIN ) ;
2020-02-25 15:04:15 -08:00
struct inet_diag_dump_data * cb_data = cb - > data ;
2012-07-16 04:28:49 +00:00
struct net * net = sock_net ( skb - > sk ) ;
2015-10-02 11:43:32 -07:00
u32 idiag_states = r - > idiag_states ;
2016-11-04 11:54:32 -07:00
int i , num , s_i , s_num ;
2020-02-25 15:04:15 -08:00
struct nlattr * bc ;
2016-11-04 11:54:32 -07:00
struct sock * sk ;
2006-01-09 14:56:19 -08:00
2020-02-25 15:04:15 -08:00
bc = cb_data - > inet_diag_nla_bc ;
2015-10-02 11:43:32 -07:00
if ( idiag_states & TCPF_SYN_RECV )
idiag_states | = TCPF_NEW_SYN_RECV ;
2005-04-16 15:20:36 -07:00
s_i = cb - > args [ 1 ] ;
s_num = num = cb - > args [ 2 ] ;
2005-08-12 09:27:49 -03:00
2005-04-16 15:20:36 -07:00
if ( cb - > args [ 0 ] = = 0 ) {
2016-10-19 21:24:58 -07:00
if ( ! ( idiag_states & TCPF_LISTEN ) | | r - > id . idiag_dport )
2005-04-16 15:20:36 -07:00
goto skip_listen_ht ;
2005-08-10 05:54:28 -03:00
2022-05-11 17:06:05 -07:00
for ( i = s_i ; i < = hashinfo - > lhash2_mask ; i + + ) {
2008-11-20 00:40:07 -08:00
struct inet_listen_hashbucket * ilb ;
2019-12-13 18:20:41 -08:00
struct hlist_nulls_node * node ;
2005-04-16 15:20:36 -07:00
num = 0 ;
2022-05-11 17:06:05 -07:00
ilb = & hashinfo - > lhash2 [ i ] ;
2024-01-22 11:26:03 +00:00
if ( hlist_nulls_empty ( & ilb - > nulls_head ) ) {
s_num = 0 ;
continue ;
}
2016-10-19 21:24:58 -07:00
spin_lock ( & ilb - > lock ) ;
2019-12-13 18:20:41 -08:00
sk_nulls_for_each ( sk , node , & ilb - > nulls_head ) {
2005-04-16 15:20:36 -07:00
struct inet_sock * inet = inet_sk ( sk ) ;
2012-07-16 04:28:49 +00:00
if ( ! net_eq ( sock_net ( sk ) , net ) )
continue ;
2005-04-16 15:20:36 -07:00
if ( num < s_num ) {
num + + ;
continue ;
}
2011-12-06 07:59:15 +00:00
if ( r - > sdiag_family ! = AF_UNSPEC & &
2015-03-10 07:15:53 -07:00
sk - > sk_family ! = r - > sdiag_family )
2011-12-06 07:59:15 +00:00
goto next_listen ;
2009-10-15 06:30:45 +00:00
if ( r - > id . idiag_sport ! = inet - > inet_sport & &
2005-08-12 12:51:49 -03:00
r - > id . idiag_sport )
2005-04-16 15:20:36 -07:00
goto next_listen ;
2020-02-25 15:04:09 -08:00
if ( ! inet_diag_bc_sk ( bc , sk ) )
goto next_listen ;
if ( inet_sk_diag_fill ( sk , inet_csk ( sk ) , skb ,
cb , r , NLM_F_MULTI ,
net_admin ) < 0 ) {
2016-10-19 21:24:58 -07:00
spin_unlock ( & ilb - > lock ) ;
2005-04-16 15:20:36 -07:00
goto done ;
}
next_listen :
+ + num ;
}
2016-10-19 21:24:58 -07:00
spin_unlock ( & ilb - > lock ) ;
2005-04-16 15:20:36 -07:00
s_num = 0 ;
}
skip_listen_ht :
cb - > args [ 0 ] = 1 ;
s_i = num = s_num = 0 ;
}
tcp: Dump bound-only sockets in inet_diag.
Walk the hashinfo->bhash2 table so that inet_diag can dump TCP sockets
that are bound but haven't yet called connect() or listen().
The code is inspired by the ->lhash2 loop. However there's no manual
test of the source port, since this kind of filtering is already
handled by inet_diag_bc_sk(). Also, a maximum of 16 sockets are dumped
at a time, to avoid running with bh disabled for too long.
There's no TCP state for bound but otherwise inactive sockets. Such
sockets normally map to TCP_CLOSE. However, "ss -l", which is supposed
to only dump listening sockets, actually requests the kernel to dump
sockets in either the TCP_LISTEN or TCP_CLOSE states. To avoid dumping
bound-only sockets with "ss -l", we therefore need to define a new
pseudo-state (TCP_BOUND_INACTIVE) that user space will be able to set
explicitly.
With an IPv4, an IPv6 and an IPv6-only socket, bound respectively to
40000, 64000, 60000, an updated version of iproute2 could work as
follows:
$ ss -t state bound-inactive
Recv-Q Send-Q Local Address:Port Peer Address:Port Process
0 0 0.0.0.0:40000 0.0.0.0:*
0 0 [::]:60000 [::]:*
0 0 *:64000 *:*
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Guillaume Nault <gnault@redhat.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/b3a84ae61e19c06806eea9c602b3b66e8f0cfc81.1701362867.git.gnault@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2023-12-01 15:49:52 +01:00
/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets
* with bh disabled .
*/
# define SKARR_SZ 16
/* Dump bound but inactive (not listening, connecting, etc.) sockets */
if ( cb - > args [ 0 ] = = 1 ) {
if ( ! ( idiag_states & TCPF_BOUND_INACTIVE ) )
goto skip_bind_ht ;
for ( i = s_i ; i < hashinfo - > bhash_size ; i + + ) {
struct inet_bind_hashbucket * ibb ;
struct inet_bind2_bucket * tb2 ;
struct sock * sk_arr [ SKARR_SZ ] ;
int num_arr [ SKARR_SZ ] ;
int idx , accum , res ;
resume_bind_walk :
num = 0 ;
accum = 0 ;
ibb = & hashinfo - > bhash2 [ i ] ;
2024-01-22 11:26:03 +00:00
if ( hlist_empty ( & ibb - > chain ) ) {
s_num = 0 ;
continue ;
}
tcp: Dump bound-only sockets in inet_diag.
Walk the hashinfo->bhash2 table so that inet_diag can dump TCP sockets
that are bound but haven't yet called connect() or listen().
The code is inspired by the ->lhash2 loop. However there's no manual
test of the source port, since this kind of filtering is already
handled by inet_diag_bc_sk(). Also, a maximum of 16 sockets are dumped
at a time, to avoid running with bh disabled for too long.
There's no TCP state for bound but otherwise inactive sockets. Such
sockets normally map to TCP_CLOSE. However, "ss -l", which is supposed
to only dump listening sockets, actually requests the kernel to dump
sockets in either the TCP_LISTEN or TCP_CLOSE states. To avoid dumping
bound-only sockets with "ss -l", we therefore need to define a new
pseudo-state (TCP_BOUND_INACTIVE) that user space will be able to set
explicitly.
With an IPv4, an IPv6 and an IPv6-only socket, bound respectively to
40000, 64000, 60000, an updated version of iproute2 could work as
follows:
$ ss -t state bound-inactive
Recv-Q Send-Q Local Address:Port Peer Address:Port Process
0 0 0.0.0.0:40000 0.0.0.0:*
0 0 [::]:60000 [::]:*
0 0 *:64000 *:*
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Guillaume Nault <gnault@redhat.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/b3a84ae61e19c06806eea9c602b3b66e8f0cfc81.1701362867.git.gnault@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2023-12-01 15:49:52 +01:00
spin_lock_bh ( & ibb - > lock ) ;
inet_bind_bucket_for_each ( tb2 , & ibb - > chain ) {
if ( ! net_eq ( ib2_net ( tb2 ) , net ) )
continue ;
2023-12-19 09:18:32 +09:00
sk_for_each_bound ( sk , & tb2 - > owners ) {
tcp: Dump bound-only sockets in inet_diag.
Walk the hashinfo->bhash2 table so that inet_diag can dump TCP sockets
that are bound but haven't yet called connect() or listen().
The code is inspired by the ->lhash2 loop. However there's no manual
test of the source port, since this kind of filtering is already
handled by inet_diag_bc_sk(). Also, a maximum of 16 sockets are dumped
at a time, to avoid running with bh disabled for too long.
There's no TCP state for bound but otherwise inactive sockets. Such
sockets normally map to TCP_CLOSE. However, "ss -l", which is supposed
to only dump listening sockets, actually requests the kernel to dump
sockets in either the TCP_LISTEN or TCP_CLOSE states. To avoid dumping
bound-only sockets with "ss -l", we therefore need to define a new
pseudo-state (TCP_BOUND_INACTIVE) that user space will be able to set
explicitly.
With an IPv4, an IPv6 and an IPv6-only socket, bound respectively to
40000, 64000, 60000, an updated version of iproute2 could work as
follows:
$ ss -t state bound-inactive
Recv-Q Send-Q Local Address:Port Peer Address:Port Process
0 0 0.0.0.0:40000 0.0.0.0:*
0 0 [::]:60000 [::]:*
0 0 *:64000 *:*
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Guillaume Nault <gnault@redhat.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/b3a84ae61e19c06806eea9c602b3b66e8f0cfc81.1701362867.git.gnault@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2023-12-01 15:49:52 +01:00
struct inet_sock * inet = inet_sk ( sk ) ;
if ( num < s_num )
goto next_bind ;
if ( sk - > sk_state ! = TCP_CLOSE | |
! inet - > inet_num )
goto next_bind ;
if ( r - > sdiag_family ! = AF_UNSPEC & &
r - > sdiag_family ! = sk - > sk_family )
goto next_bind ;
if ( ! inet_diag_bc_sk ( bc , sk ) )
goto next_bind ;
sock_hold ( sk ) ;
num_arr [ accum ] = num ;
sk_arr [ accum ] = sk ;
if ( + + accum = = SKARR_SZ )
goto pause_bind_walk ;
next_bind :
num + + ;
}
}
pause_bind_walk :
spin_unlock_bh ( & ibb - > lock ) ;
res = 0 ;
for ( idx = 0 ; idx < accum ; idx + + ) {
if ( res > = 0 ) {
res = inet_sk_diag_fill ( sk_arr [ idx ] ,
NULL , skb , cb ,
r , NLM_F_MULTI ,
net_admin ) ;
if ( res < 0 )
num = num_arr [ idx ] ;
}
sock_put ( sk_arr [ idx ] ) ;
}
if ( res < 0 )
goto done ;
cond_resched ( ) ;
if ( accum = = SKARR_SZ ) {
s_num = num + 1 ;
goto resume_bind_walk ;
}
s_num = 0 ;
}
skip_bind_ht :
cb - > args [ 0 ] = 2 ;
s_i = num = s_num = 0 ;
}
2015-10-02 11:43:32 -07:00
if ( ! ( idiag_states & ~ TCPF_LISTEN ) )
2011-12-09 06:22:26 +00:00
goto out ;
2005-04-16 15:20:36 -07:00
2009-10-09 00:16:19 +00:00
for ( i = s_i ; i < = hashinfo - > ehash_mask ; i + + ) {
2005-08-10 05:54:28 -03:00
struct inet_ehash_bucket * head = & hashinfo - > ehash [ i ] ;
2008-11-21 16:39:19 -08:00
spinlock_t * lock = inet_ehash_lockp ( hashinfo , i ) ;
2008-11-16 19:40:17 -08:00
struct hlist_nulls_node * node ;
2016-11-04 11:54:32 -07:00
struct sock * sk_arr [ SKARR_SZ ] ;
int num_arr [ SKARR_SZ ] ;
int idx , accum , res ;
2008-08-28 01:09:54 -07:00
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 00:22:02 -07:00
if ( hlist_nulls_empty ( & head - > chain ) )
2008-08-28 01:09:54 -07:00
continue ;
2005-04-16 15:20:36 -07:00
if ( i > s_i )
s_num = 0 ;
2016-11-04 11:54:32 -07:00
next_chunk :
num = 0 ;
accum = 0 ;
2008-11-21 16:39:19 -08:00
spin_lock_bh ( lock ) ;
2008-11-16 19:40:17 -08:00
sk_nulls_for_each ( sk , node , & head - > chain ) {
2016-11-04 11:54:32 -07:00
int state ;
2005-04-16 15:20:36 -07:00
2012-07-16 04:28:49 +00:00
if ( ! net_eq ( sock_net ( sk ) , net ) )
continue ;
2005-04-16 15:20:36 -07:00
if ( num < s_num )
goto next_normal ;
2014-01-10 15:34:45 -05:00
state = ( sk - > sk_state = = TCP_TIME_WAIT ) ?
inet_twsk ( sk ) - > tw_substate : sk - > sk_state ;
2015-10-02 11:43:32 -07:00
if ( ! ( idiag_states & ( 1 < < state ) ) )
2005-04-16 15:20:36 -07:00
goto next_normal ;
2011-12-06 07:59:15 +00:00
if ( r - > sdiag_family ! = AF_UNSPEC & &
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 00:22:02 -07:00
sk - > sk_family ! = r - > sdiag_family )
2011-12-06 07:59:15 +00:00
goto next_normal ;
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 00:22:02 -07:00
if ( r - > id . idiag_sport ! = htons ( sk - > sk_num ) & &
2005-08-12 12:51:49 -03:00
r - > id . idiag_sport )
2005-04-16 15:20:36 -07:00
goto next_normal ;
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 00:22:02 -07:00
if ( r - > id . idiag_dport ! = sk - > sk_dport & &
2006-01-09 14:56:19 -08:00
r - > id . idiag_dport )
2005-04-16 15:20:36 -07:00
goto next_normal ;
2015-03-15 21:12:14 -07:00
twsk_build_assert ( ) ;
if ( ! inet_diag_bc_sk ( bc , sk ) )
goto next_normal ;
2018-12-20 15:28:56 -08:00
if ( ! refcount_inc_not_zero ( & sk - > sk_refcnt ) )
goto next_normal ;
2016-11-04 11:54:32 -07:00
num_arr [ accum ] = num ;
sk_arr [ accum ] = sk ;
if ( + + accum = = SKARR_SZ )
break ;
next_normal :
+ + num ;
}
spin_unlock_bh ( lock ) ;
res = 0 ;
for ( idx = 0 ; idx < accum ; idx + + ) {
if ( res > = 0 ) {
2020-02-25 15:04:09 -08:00
res = sk_diag_fill ( sk_arr [ idx ] , skb , cb , r ,
NLM_F_MULTI , net_admin ) ;
2016-11-04 11:54:32 -07:00
if ( res < 0 )
num = num_arr [ idx ] ;
2005-04-16 15:20:36 -07:00
}
2016-11-04 11:54:32 -07:00
sock_gen_put ( sk_arr [ idx ] ) ;
2005-04-16 15:20:36 -07:00
}
2016-11-04 11:54:32 -07:00
if ( res < 0 )
break ;
2016-03-14 15:40:00 -07:00
cond_resched ( ) ;
2016-11-04 11:54:32 -07:00
if ( accum = = SKARR_SZ ) {
s_num = num + 1 ;
goto next_chunk ;
}
2005-04-16 15:20:36 -07:00
}
done :
cb - > args [ 1 ] = i ;
cb - > args [ 2 ] = num ;
2011-12-09 06:22:26 +00:00
out :
;
}
2011-12-09 06:23:18 +00:00
EXPORT_SYMBOL_GPL ( inet_diag_dump_icsk ) ;
2011-12-09 06:22:26 +00:00
static int __inet_diag_dump ( struct sk_buff * skb , struct netlink_callback * cb ,
2020-02-25 15:04:15 -08:00
const struct inet_diag_req_v2 * r )
2011-12-09 06:22:26 +00:00
{
2020-07-09 15:12:39 +02:00
struct inet_diag_dump_data * cb_data = cb - > data ;
2011-12-09 06:22:26 +00:00
const struct inet_diag_handler * handler ;
2020-02-25 15:04:27 -08:00
u32 prev_min_dump_alloc ;
2020-07-09 15:12:39 +02:00
int protocol , err = 0 ;
protocol = inet_diag_get_protocol ( r , cb_data ) ;
2011-12-09 06:22:26 +00:00
2020-02-25 15:04:27 -08:00
again :
prev_min_dump_alloc = cb - > min_dump_alloc ;
2020-07-09 15:12:39 +02:00
handler = inet_diag_lock_handler ( protocol ) ;
2024-01-22 11:25:58 +00:00
if ( handler ) {
2020-02-25 15:04:15 -08:00
handler - > dump ( skb , cb , r ) ;
2024-01-22 11:25:58 +00:00
inet_diag_unlock_handler ( handler ) ;
} else {
err = - ENOENT ;
}
2020-02-25 15:04:27 -08:00
/* The skb is not large enough to fit one sk info and
* inet_sk_diag_fill ( ) has requested for a larger skb .
*/
if ( ! skb - > len & & cb - > min_dump_alloc > prev_min_dump_alloc ) {
err = pskb_expand_head ( skb , 0 , cb - > min_dump_alloc , GFP_KERNEL ) ;
if ( ! err )
goto again ;
}
2012-11-03 09:30:34 +00:00
return err ? : skb - > len ;
2005-04-16 15:20:36 -07:00
}
2011-12-06 07:58:58 +00:00
static int inet_diag_dump ( struct sk_buff * skb , struct netlink_callback * cb )
{
2020-02-25 15:04:15 -08:00
return __inet_diag_dump ( skb , cb , nlmsg_data ( cb - > nlh ) ) ;
}
static int __inet_diag_dump_start ( struct netlink_callback * cb , int hdrlen )
{
const struct nlmsghdr * nlh = cb - > nlh ;
struct inet_diag_dump_data * cb_data ;
struct sk_buff * skb = cb - > skb ;
struct nlattr * nla ;
2020-07-09 15:12:39 +02:00
int err ;
2020-02-25 15:04:15 -08:00
cb_data = kzalloc ( sizeof ( * cb_data ) , GFP_KERNEL ) ;
if ( ! cb_data )
return - ENOMEM ;
2020-09-21 07:27:20 -07:00
err = inet_diag_parse_attrs ( nlh , hdrlen , cb_data - > req_nlas ) ;
if ( err ) {
kfree ( cb_data ) ;
return err ;
}
2020-02-25 15:04:15 -08:00
nla = cb_data - > inet_diag_nla_bc ;
if ( nla ) {
err = inet_diag_bc_audit ( nla , skb ) ;
if ( err ) {
kfree ( cb_data ) ;
return err ;
}
}
2011-12-06 07:58:58 +00:00
2020-02-25 15:04:27 -08:00
nla = cb_data - > inet_diag_nla_bpf_stgs ;
if ( nla ) {
struct bpf_sk_storage_diag * bpf_stg_diag ;
bpf_stg_diag = bpf_sk_storage_diag_alloc ( nla ) ;
if ( IS_ERR ( bpf_stg_diag ) ) {
kfree ( cb_data ) ;
return PTR_ERR ( bpf_stg_diag ) ;
}
cb_data - > bpf_stg_diag = bpf_stg_diag ;
}
2020-02-25 15:04:15 -08:00
cb - > data = cb_data ;
return 0 ;
}
2011-12-06 07:58:58 +00:00
2020-02-25 15:04:15 -08:00
/* ->start for v2 dumps: attributes follow a struct inet_diag_req_v2 header. */
static int inet_diag_dump_start(struct netlink_callback *cb)
{
	const int hdrlen = sizeof(struct inet_diag_req_v2);

	return __inet_diag_dump_start(cb, hdrlen);
}
/* ->start for legacy dumps: attributes follow a struct inet_diag_req header. */
static int inet_diag_dump_start_compat(struct netlink_callback *cb)
{
	const int hdrlen = sizeof(struct inet_diag_req);

	return __inet_diag_dump_start(cb, hdrlen);
}
static int inet_diag_dump_done ( struct netlink_callback * cb )
{
2020-02-25 15:04:27 -08:00
struct inet_diag_dump_data * cb_data = cb - > data ;
bpf_sk_storage_diag_free ( cb_data - > bpf_stg_diag ) ;
2020-02-25 15:04:15 -08:00
kfree ( cb - > data ) ;
return 0 ;
2011-12-06 07:58:58 +00:00
}
2015-03-10 07:15:53 -07:00
/* Map a legacy netlink message type to its IP protocol number.
 * Returns 0 for any type other than TCPDIAG_GETSOCK / DCCPDIAG_GETSOCK.
 */
static int inet_diag_type2proto(int type)
{
	if (type == TCPDIAG_GETSOCK)
		return IPPROTO_TCP;
	if (type == DCCPDIAG_GETSOCK)
		return IPPROTO_DCCP;

	return 0;
}
2015-03-10 07:15:53 -07:00
static int inet_diag_dump_compat ( struct sk_buff * skb ,
struct netlink_callback * cb )
2011-12-06 07:58:58 +00:00
{
2012-06-26 21:28:54 -07:00
struct inet_diag_req * rc = nlmsg_data ( cb - > nlh ) ;
2012-01-10 22:36:35 +00:00
struct inet_diag_req_v2 req ;
2011-12-06 07:58:58 +00:00
2011-12-06 07:59:15 +00:00
req . sdiag_family = AF_UNSPEC ; /* compatibility */
2011-12-06 07:58:58 +00:00
req . sdiag_protocol = inet_diag_type2proto ( cb - > nlh - > nlmsg_type ) ;
req . idiag_ext = rc - > idiag_ext ;
req . idiag_states = rc - > idiag_states ;
req . id = rc - > id ;
2020-02-25 15:04:15 -08:00
return __inet_diag_dump ( skb , cb , & req ) ;
2011-12-06 07:58:58 +00:00
}
2011-12-06 07:58:39 +00:00
static int inet_diag_get_exact_compat ( struct sk_buff * in_skb ,
2015-03-10 07:15:53 -07:00
const struct nlmsghdr * nlh )
2011-12-06 07:58:39 +00:00
{
2012-06-26 21:28:54 -07:00
struct inet_diag_req * rc = nlmsg_data ( nlh ) ;
2012-01-10 22:36:35 +00:00
struct inet_diag_req_v2 req ;
2011-12-06 07:58:39 +00:00
req . sdiag_family = rc - > idiag_family ;
req . sdiag_protocol = inet_diag_type2proto ( nlh - > nlmsg_type ) ;
req . idiag_ext = rc - > idiag_ext ;
req . idiag_states = rc - > idiag_states ;
req . id = rc - > id ;
2020-07-09 15:12:39 +02:00
return inet_diag_cmd_exact ( SOCK_DIAG_BY_FAMILY , in_skb , nlh ,
sizeof ( struct inet_diag_req ) , & req ) ;
2011-12-06 07:58:39 +00:00
}
2011-12-06 07:57:06 +00:00
static int inet_diag_rcv_msg_compat ( struct sk_buff * skb , struct nlmsghdr * nlh )
2005-04-16 15:20:36 -07:00
{
2012-01-10 22:37:26 +00:00
int hdrlen = sizeof ( struct inet_diag_req ) ;
2012-07-16 04:28:49 +00:00
struct net * net = sock_net ( skb - > sk ) ;
2005-04-16 15:20:36 -07:00
2007-03-22 23:30:35 -07:00
if ( nlh - > nlmsg_type > = INET_DIAG_GETSOCK_MAX | |
nlmsg_len ( nlh ) < hdrlen )
return - EINVAL ;
2005-04-16 15:20:36 -07:00
2011-01-18 12:40:38 -08:00
if ( nlh - > nlmsg_flags & NLM_F_DUMP ) {
2020-02-25 15:04:15 -08:00
struct netlink_dump_control c = {
. start = inet_diag_dump_start_compat ,
. done = inet_diag_dump_done ,
. dump = inet_diag_dump_compat ,
} ;
return netlink_dump_start ( net - > diag_nlsk , skb , nlh , & c ) ;
2005-04-16 15:20:36 -07:00
}
2007-03-22 23:30:35 -07:00
2011-12-06 07:58:39 +00:00
return inet_diag_get_exact_compat ( skb , nlh ) ;
2005-04-16 15:20:36 -07:00
}
2015-12-16 12:30:04 +09:00
static int inet_diag_handler_cmd ( struct sk_buff * skb , struct nlmsghdr * h )
2011-12-06 07:58:03 +00:00
{
2012-01-10 22:36:35 +00:00
int hdrlen = sizeof ( struct inet_diag_req_v2 ) ;
2012-07-16 04:28:49 +00:00
struct net * net = sock_net ( skb - > sk ) ;
2011-12-06 07:58:03 +00:00
if ( nlmsg_len ( h ) < hdrlen )
return - EINVAL ;
2015-12-16 12:30:04 +09:00
if ( h - > nlmsg_type = = SOCK_DIAG_BY_FAMILY & &
h - > nlmsg_flags & NLM_F_DUMP ) {
2020-02-25 15:04:15 -08:00
struct netlink_dump_control c = {
. start = inet_diag_dump_start ,
. done = inet_diag_dump_done ,
. dump = inet_diag_dump ,
} ;
return netlink_dump_start ( net - > diag_nlsk , skb , h , & c ) ;
2011-12-06 07:58:03 +00:00
}
2020-07-09 15:12:39 +02:00
return inet_diag_cmd_exact ( h - > nlmsg_type , skb , h , hdrlen ,
nlmsg_data ( h ) ) ;
2011-12-06 07:58:03 +00:00
}
2015-06-15 11:26:20 -04:00
static
int inet_diag_handler_get_info ( struct sk_buff * skb , struct sock * sk )
{
const struct inet_diag_handler * handler ;
struct nlmsghdr * nlh ;
struct nlattr * attr ;
struct inet_diag_msg * r ;
void * info = NULL ;
int err = 0 ;
nlh = nlmsg_put ( skb , 0 , 0 , SOCK_DIAG_BY_FAMILY , sizeof ( * r ) , 0 ) ;
if ( ! nlh )
return - ENOMEM ;
r = nlmsg_data ( nlh ) ;
memset ( r , 0 , sizeof ( * r ) ) ;
inet_diag_msg_common_fill ( r , sk ) ;
2015-06-17 10:59:10 -04:00
if ( sk - > sk_type = = SOCK_DGRAM | | sk - > sk_type = = SOCK_STREAM )
r - > id . idiag_sport = inet_sk ( sk ) - > inet_sport ;
2015-06-15 11:26:20 -04:00
r - > idiag_state = sk - > sk_state ;
if ( ( err = nla_put_u8 ( skb , INET_DIAG_PROTOCOL , sk - > sk_protocol ) ) ) {
nlmsg_cancel ( skb , nlh ) ;
return err ;
}
handler = inet_diag_lock_handler ( sk - > sk_protocol ) ;
2024-01-22 11:25:58 +00:00
if ( ! handler ) {
2015-06-15 11:26:20 -04:00
nlmsg_cancel ( skb , nlh ) ;
2024-01-22 11:25:58 +00:00
return - ENOENT ;
2015-06-15 11:26:20 -04:00
}
attr = handler - > idiag_info_size
2016-04-26 10:06:14 +02:00
? nla_reserve_64bit ( skb , INET_DIAG_INFO ,
handler - > idiag_info_size ,
INET_DIAG_PAD )
2015-06-15 11:26:20 -04:00
: NULL ;
if ( attr )
info = nla_data ( attr ) ;
handler - > idiag_get_info ( sk , r , info ) ;
inet_diag_unlock_handler ( handler ) ;
nlmsg_end ( skb , nlh ) ;
return 0 ;
}
2012-04-24 18:21:07 +00:00
static const struct sock_diag_handler inet_diag_handler = {
2024-01-22 11:25:59 +00:00
. owner = THIS_MODULE ,
2011-12-06 07:58:03 +00:00
. family = AF_INET ,
2015-12-16 12:30:04 +09:00
. dump = inet_diag_handler_cmd ,
2015-06-15 11:26:20 -04:00
. get_info = inet_diag_handler_get_info ,
2015-12-16 12:30:04 +09:00
. destroy = inet_diag_handler_cmd ,
2011-12-06 07:58:03 +00:00
} ;
2012-04-24 18:21:07 +00:00
static const struct sock_diag_handler inet6_diag_handler = {
2024-01-22 11:25:59 +00:00
. owner = THIS_MODULE ,
2011-12-06 07:58:03 +00:00
. family = AF_INET6 ,
2015-12-16 12:30:04 +09:00
. dump = inet_diag_handler_cmd ,
2015-06-15 11:26:20 -04:00
. get_info = inet_diag_handler_get_info ,
2015-12-16 12:30:04 +09:00
. destroy = inet_diag_handler_cmd ,
2011-12-06 07:58:03 +00:00
} ;
2005-08-12 09:27:49 -03:00
int inet_diag_register ( const struct inet_diag_handler * h )
{
const __u16 type = h - > idiag_type ;
2011-12-06 08:05:24 +00:00
if ( type > = IPPROTO_MAX )
2024-01-22 11:25:58 +00:00
return - EINVAL ;
2005-08-12 09:27:49 -03:00
2024-01-22 11:25:58 +00:00
return ! cmpxchg ( ( const struct inet_diag_handler * * ) & inet_diag_table [ type ] ,
NULL , h ) ? 0 : - EEXIST ;
2005-08-12 09:27:49 -03:00
}
EXPORT_SYMBOL_GPL ( inet_diag_register ) ;
void inet_diag_unregister ( const struct inet_diag_handler * h )
{
const __u16 type = h - > idiag_type ;
2011-12-06 08:05:24 +00:00
if ( type > = IPPROTO_MAX )
2005-08-12 09:27:49 -03:00
return ;
2024-01-22 11:25:58 +00:00
xchg ( ( const struct inet_diag_handler * * ) & inet_diag_table [ type ] ,
NULL ) ;
2005-08-12 09:27:49 -03:00
}
EXPORT_SYMBOL_GPL ( inet_diag_unregister ) ;
2024-01-22 11:26:01 +00:00
/* Hook for legacy inet_diag messages, registered with sock_diag at init. */
static const struct sock_diag_inet_compat inet_diag_compat = {
	.fn	= inet_diag_rcv_msg_compat,
	.owner	= THIS_MODULE,
};
2005-08-12 12:51:49 -03:00
/*
 * Module init: allocate the zeroed per-protocol handler table
 * (IPPROTO_MAX pointer slots), register the AF_INET and AF_INET6 sock_diag
 * handlers, then register the legacy compat hook.
 *
 * Returns 0 on success, -ENOMEM if the table cannot be allocated, or the
 * sock_diag_register() error; everything set up before a failure is undone.
 */
static int __init inet_diag_init(void)
{
	const int inet_diag_table_size = (IPPROTO_MAX *
					  sizeof(struct inet_diag_handler *));
	int err;

	inet_diag_table = kzalloc(inet_diag_table_size, GFP_KERNEL);
	if (!inet_diag_table)
		return -ENOMEM;

	err = sock_diag_register(&inet_diag_handler);
	if (err)
		goto err_free_table;

	err = sock_diag_register(&inet6_diag_handler);
	if (err)
		goto err_unregister_inet;

	sock_diag_register_inet_compat(&inet_diag_compat);
	return 0;

err_unregister_inet:
	sock_diag_unregister(&inet_diag_handler);
err_free_table:
	kfree(inet_diag_table);
	return err;
}
2005-08-12 12:51:49 -03:00
/* Module exit: drop both sock_diag handlers and the legacy compat hook,
 * then free the per-protocol handler table.
 */
static void __exit inet_diag_exit(void)
{
	/* per-family handlers first, AF_INET6 before AF_INET */
	sock_diag_unregister(&inet6_diag_handler);
	sock_diag_unregister(&inet_diag_handler);

	sock_diag_unregister_inet_compat(&inet_diag_compat);

	kfree(inet_diag_table);
}
2005-08-12 12:51:49 -03:00
module_init ( inet_diag_init ) ;
module_exit ( inet_diag_exit ) ;
2005-04-16 15:20:36 -07:00
MODULE_LICENSE ( " GPL " ) ;
2023-11-18 19:30:06 -08:00
MODULE_DESCRIPTION ( " INET/INET6: socket monitoring via SOCK_DIAG " ) ;
2011-12-15 02:43:27 +00:00
MODULE_ALIAS_NET_PF_PROTO_TYPE ( PF_NETLINK , NETLINK_SOCK_DIAG , 2 /* AF_INET */ ) ;
MODULE_ALIAS_NET_PF_PROTO_TYPE ( PF_NETLINK , NETLINK_SOCK_DIAG , 10 /* AF_INET6 */ ) ;