linux/net/ipv4/inet_diag.c

1196 lines
28 KiB
C
Raw Normal View History

/*
* inet_diag.c Module for monitoring INET transport protocols sockets.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/random.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. 
This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 11:04:11 +03:00
#include <linux/slab.h>
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/time.h>
#include <net/icmp.h>
#include <net/tcp.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/inet6_hashtables.h>
#include <net/netlink.h>
#include <linux/inet.h>
#include <linux/stddef.h>
#include <linux/inet_diag.h>
#include <linux/sock_diag.h>
/* Table of registered diag handlers, indexed by protocol number
 * (see inet_diag_lock_handler(), which looks up inet_diag_table[proto]).
 * An empty slot is NULL.
 */
static const struct inet_diag_handler **inet_diag_table;
/* Socket description handed to the bytecode filter (inet_diag_bc_run()).
 * Callers fill it from a full socket, a time-wait socket or a request sock.
 */
struct inet_diag_entry {
__be32 *saddr; /* first 32-bit word of the local address */
__be32 *daddr; /* first 32-bit word of the remote address */
u16 sport; /* local port, as stored in inet_num / tw_num by the callers */
u16 dport; /* remote port, converted with ntohs() by the callers */
u16 family; /* address family of the socket */
u16 userlocks; /* sk_userlocks; time-wait entries use 0 */
#if IS_ENABLED(CONFIG_IPV6)
struct in6_addr saddr_storage; /* for IPv4-mapped-IPv6 addresses */
struct in6_addr daddr_storage; /* for IPv4-mapped-IPv6 addresses */
#endif
};
/* Taken by inet_diag_lock_handler() and released by
 * inet_diag_unlock_handler() around each use of a handler.
 */
static DEFINE_MUTEX(inet_diag_table_mutex);
static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
{
if (!inet_diag_table[proto])
request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
NETLINK_SOCK_DIAG, AF_INET, proto);
mutex_lock(&inet_diag_table_mutex);
if (!inet_diag_table[proto])
return ERR_PTR(-ENOENT);
return inet_diag_table[proto];
}
/* Drop the table mutex taken by inet_diag_lock_handler().
 * @handler is accepted for symmetry with the lock call and is not used.
 */
static inline void
inet_diag_unlock_handler(const struct inet_diag_handler *handler)
{
	mutex_unlock(&inet_diag_table_mutex);
}
/**
 * inet_sk_diag_fill - append one inet_diag reply message for a socket
 * @sk: socket to describe; must not be in TCP_TIME_WAIT (BUG_ON otherwise)
 * @icsk: connection-sock view of @sk, or NULL; with NULL the timer,
 *        INET_DIAG_INFO and INET_DIAG_CONG sections are skipped
 * @skb: buffer the netlink message is appended to
 * @req: request; selects the handler (sdiag_protocol) and the optional
 *       attributes (idiag_ext bitmask)
 * @user_ns: namespace used to translate the socket owner's uid
 * @portid: netlink port id for the message header
 * @seq: netlink sequence number for the message header
 * @nlmsg_flags: netlink flags for the message header
 * @unlh: request header; its nlmsg_type is reused for the reply
 *
 * Returns 0 on success.  Returns -EMSGSIZE when @skb has no room; if the
 * header had already been added the partial message is cancelled first.
 */
int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
		      struct sk_buff *skb, struct inet_diag_req_v2 *req,
		      struct user_namespace *user_ns,
		      u32 portid, u32 seq, u16 nlmsg_flags,
		      const struct nlmsghdr *unlh)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct inet_diag_handler *handler;
	struct inet_diag_msg *r;
	struct nlmsghdr *nlh;
	struct nlattr *attr;
	void *info = NULL;
	int ext = req->idiag_ext;

	handler = inet_diag_table[req->sdiag_protocol];
	BUG_ON(handler == NULL);

	nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
			nlmsg_flags);
	if (!nlh)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	BUG_ON(sk->sk_state == TCP_TIME_WAIT);

	r->idiag_family = sk->sk_family;
	r->idiag_state = sk->sk_state;
	r->idiag_timer = 0;
	r->idiag_retrans = 0;
	/* The message is written field by field.  Without this store,
	 * idiag_expires is never assigned on the !icsk path below and would
	 * be reported with whatever the buffer held before (nlmsg_put() is
	 * not relied upon to clear the payload).
	 */
	r->idiag_expires = 0;

	r->id.idiag_if = sk->sk_bound_dev_if;
	sock_diag_save_cookie(sk, r->id.idiag_cookie);

	r->id.idiag_sport = inet->inet_sport;
	r->id.idiag_dport = inet->inet_dport;

	/* Clear the full address arrays first: IPv4 only fills word 0. */
	memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
	memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));

	r->id.idiag_src[0] = inet->inet_rcv_saddr;
	r->id.idiag_dst[0] = inet->inet_daddr;

	if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
		goto errout;

	/* IPv6 dual-stack sockets use inet->tos for IPv4 connections,
	 * hence this needs to be included regardless of socket family.
	 */
	if (ext & (1 << (INET_DIAG_TOS - 1)))
		if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
			goto errout;

#if IS_ENABLED(CONFIG_IPV6)
	if (r->idiag_family == AF_INET6) {
		/* Overwrites the IPv4 words stored above. */
		*(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr;
		*(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr;

		if (ext & (1 << (INET_DIAG_TCLASS - 1)))
			if (nla_put_u8(skb, INET_DIAG_TCLASS,
				       inet6_sk(sk)->tclass) < 0)
				goto errout;
	}
#endif

	r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
	r->idiag_inode = sock_i_ino(sk);

	if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
		struct inet_diag_meminfo minfo = {
			.idiag_rmem = sk_rmem_alloc_get(sk),
			.idiag_wmem = sk->sk_wmem_queued,
			.idiag_fmem = sk->sk_forward_alloc,
			.idiag_tmem = sk_wmem_alloc_get(sk),
		};

		if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0)
			goto errout;
	}

	if (ext & (1 << (INET_DIAG_SKMEMINFO - 1)))
		if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
			goto errout;

	if (icsk == NULL) {
		/* No connection-sock state: let the handler fill its part of
		 * the message and finish.
		 */
		handler->idiag_get_info(sk, r, NULL);
		goto out;
	}

#define EXPIRES_IN_MS(tmo)  DIV_ROUND_UP(((tmo) - jiffies) * 1000, HZ)

	/* Report the pending timer, if any.  When none of the cases match,
	 * idiag_timer and idiag_expires keep the zeroes stored above.
	 */
	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		r->idiag_timer = 1;
		r->idiag_retrans = icsk->icsk_retransmits;
		r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		r->idiag_timer = 4;
		r->idiag_retrans = icsk->icsk_probes_out;
		r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
	} else if (timer_pending(&sk->sk_timer)) {
		r->idiag_timer = 2;
		r->idiag_retrans = icsk->icsk_probes_out;
		r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
	}
#undef EXPIRES_IN_MS

	if (ext & (1 << (INET_DIAG_INFO - 1))) {
		/* Reserve room now; the handler fills it in below. */
		attr = nla_reserve(skb, INET_DIAG_INFO,
				   sizeof(struct tcp_info));
		if (!attr)
			goto errout;

		info = nla_data(attr);
	}

	if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops)
		if (nla_put_string(skb, INET_DIAG_CONG,
				   icsk->icsk_ca_ops->name) < 0)
			goto errout;

	handler->idiag_get_info(sk, r, info);

	if (sk->sk_state < TCP_TIME_WAIT &&
	    icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info)
		icsk->icsk_ca_ops->get_info(sk, ext, skb);

out:
	nlmsg_end(skb, nlh);
	return 0;

errout:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
/* Fill a reply for a socket that has connection-sock state: derives the
 * icsk from @sk and forwards everything to inet_sk_diag_fill().
 */
static int inet_csk_diag_fill(struct sock *sk,
			      struct sk_buff *skb, struct inet_diag_req_v2 *req,
			      struct user_namespace *user_ns,
			      u32 portid, u32 seq, u16 nlmsg_flags,
			      const struct nlmsghdr *unlh)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	return inet_sk_diag_fill(sk, icsk, skb, req, user_ns,
				 portid, seq, nlmsg_flags, unlh);
}
/* Append one inet_diag reply message describing the time-wait socket @tw.
 *
 * @req is not consulted.  Queue sizes, uid and inode are reported as 0 and
 * the timer type as 3, with the time left until tw_ttd (never negative) as
 * the expiry.  Returns 0, or -EMSGSIZE when @skb has no room for the message.
 */
static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
			       struct sk_buff *skb, struct inet_diag_req_v2 *req,
			       u32 portid, u32 seq, u16 nlmsg_flags,
			       const struct nlmsghdr *unlh)
{
	struct inet_diag_msg *msg;
	struct nlmsghdr *hdr;
	s32 remaining;

	hdr = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*msg),
			nlmsg_flags);
	if (hdr == NULL)
		return -EMSGSIZE;

	msg = nlmsg_data(hdr);
	BUG_ON(tw->tw_state != TCP_TIME_WAIT);

	/* Time left before the entry is due to die, clamped at zero. */
	remaining = tw->tw_ttd - inet_tw_time_stamp();
	if (remaining < 0)
		remaining = 0;

	/* Fixed part of the message. */
	msg->idiag_family = tw->tw_family;
	msg->idiag_state = tw->tw_substate;
	msg->idiag_timer = 3;
	msg->idiag_retrans = 0;
	msg->idiag_expires = jiffies_to_msecs(remaining);
	msg->idiag_rqueue = 0;
	msg->idiag_wqueue = 0;
	msg->idiag_uid = 0;
	msg->idiag_inode = 0;

	/* Socket identity. */
	msg->id.idiag_if = tw->tw_bound_dev_if;
	sock_diag_save_cookie(tw, msg->id.idiag_cookie);
	msg->id.idiag_sport = tw->tw_sport;
	msg->id.idiag_dport = tw->tw_dport;

	/* Zero all address words, then store the IPv4 pair in word 0. */
	memset(&msg->id.idiag_src, 0, sizeof(msg->id.idiag_src));
	memset(&msg->id.idiag_dst, 0, sizeof(msg->id.idiag_dst));
	msg->id.idiag_src[0] = tw->tw_rcv_saddr;
	msg->id.idiag_dst[0] = tw->tw_daddr;

#if IS_ENABLED(CONFIG_IPV6)
	/* IPv6 entries replace the words written above with full addresses. */
	if (tw->tw_family == AF_INET6) {
		*(struct in6_addr *)msg->id.idiag_src = tw->tw_v6_rcv_saddr;
		*(struct in6_addr *)msg->id.idiag_dst = tw->tw_v6_daddr;
	}
#endif

	nlmsg_end(skb, hdr);
	return 0;
}
/* Fill a reply for @sk, choosing the time-wait or the connection-sock
 * variant from sk->sk_state.  @user_ns is only used by the latter.
 */
static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
			struct inet_diag_req_v2 *r,
			struct user_namespace *user_ns,
			u32 portid, u32 seq, u16 nlmsg_flags,
			const struct nlmsghdr *unlh)
{
	if (sk->sk_state != TCP_TIME_WAIT)
		return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
					  nlmsg_flags, unlh);

	return inet_twsk_diag_fill(inet_twsk(sk), skb, r, portid, seq,
				   nlmsg_flags, unlh);
}
int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb,
const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req)
{
int err;
struct sock *sk;
struct sk_buff *rep;
struct net *net = sock_net(in_skb->sk);
err = -EINVAL;
if (req->sdiag_family == AF_INET) {
sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0],
req->id.idiag_dport, req->id.idiag_src[0],
req->id.idiag_sport, req->id.idiag_if);
}
#if IS_ENABLED(CONFIG_IPV6)
else if (req->sdiag_family == AF_INET6) {
sk = inet6_lookup(net, hashinfo,
(struct in6_addr *)req->id.idiag_dst,
req->id.idiag_dport,
(struct in6_addr *)req->id.idiag_src,
req->id.idiag_sport,
req->id.idiag_if);
}
#endif
else {
goto out_nosk;
}
err = -ENOENT;
if (sk == NULL)
goto out_nosk;
err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
if (err)
goto out;
rep = nlmsg_new(sizeof(struct inet_diag_msg) +
sizeof(struct inet_diag_meminfo) +
sizeof(struct tcp_info) + 64, GFP_KERNEL);
if (!rep) {
err = -ENOMEM;
goto out;
}
err = sk_diag_fill(sk, rep, req,
sk_user_ns(NETLINK_CB(in_skb).sk),
NETLINK_CB(in_skb).portid,
nlh->nlmsg_seq, 0, nlh);
if (err < 0) {
WARN_ON(err == -EMSGSIZE);
nlmsg_free(rep);
goto out;
}
err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
MSG_DONTWAIT);
if (err > 0)
err = 0;
out:
if (sk)
sock_gen_put(sk);
out_nosk:
return err;
}
EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk);
/* Serve a single-socket request through the handler registered for
 * req->sdiag_protocol.  Returns the handler's dump_one() result, or the
 * lookup error when no handler is available.
 */
static int inet_diag_get_exact(struct sk_buff *in_skb,
			       const struct nlmsghdr *nlh,
			       struct inet_diag_req_v2 *req)
{
	const struct inet_diag_handler *handler;
	int err;

	handler = inet_diag_lock_handler(req->sdiag_protocol);
	if (!IS_ERR(handler))
		err = handler->dump_one(in_skb, nlh, req);
	else
		err = PTR_ERR(handler);

	/* The lock call leaves the mutex held even on failure. */
	inet_diag_unlock_handler(handler);

	return err;
}
/* Compare the leading @bits bits of two big-endian word arrays.
 * Returns 1 when they are equal (always for @bits == 0), 0 otherwise.
 */
static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits)
{
	const int full_words = bits >> 5;
	const int tail_bits = bits & 0x1f;

	/* Whole 32-bit words are compared bytewise. */
	if (full_words && memcmp(a1, a2, full_words << 2) != 0)
		return 0;

	/* A partial word is compared under a mask of its top tail_bits bits. */
	if (tail_bits) {
		const __be32 mask = htonl((0xffffffff) << (32 - tail_bits));

		if ((a1[full_words] ^ a2[full_words]) & mask)
			return 0;
	}

	return 1;
}
static int inet_diag_bc_run(const struct nlattr *_bc,
const struct inet_diag_entry *entry)
{
const void *bc = nla_data(_bc);
int len = nla_len(_bc);
while (len > 0) {
int yes = 1;
const struct inet_diag_bc_op *op = bc;
switch (op->code) {
case INET_DIAG_BC_NOP:
break;
case INET_DIAG_BC_JMP:
yes = 0;
break;
case INET_DIAG_BC_S_GE:
yes = entry->sport >= op[1].no;
break;
case INET_DIAG_BC_S_LE:
yes = entry->sport <= op[1].no;
break;
case INET_DIAG_BC_D_GE:
yes = entry->dport >= op[1].no;
break;
case INET_DIAG_BC_D_LE:
yes = entry->dport <= op[1].no;
break;
case INET_DIAG_BC_AUTO:
yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
break;
case INET_DIAG_BC_S_COND:
case INET_DIAG_BC_D_COND: {
struct inet_diag_hostcond *cond;
__be32 *addr;
cond = (struct inet_diag_hostcond *)(op + 1);
if (cond->port != -1 &&
cond->port != (op->code == INET_DIAG_BC_S_COND ?
entry->sport : entry->dport)) {
yes = 0;
break;
}
if (op->code == INET_DIAG_BC_S_COND)
addr = entry->saddr;
else
addr = entry->daddr;
if (cond->family != AF_UNSPEC &&
cond->family != entry->family) {
if (entry->family == AF_INET6 &&
cond->family == AF_INET) {
if (addr[0] == 0 && addr[1] == 0 &&
addr[2] == htonl(0xffff) &&
bitstring_match(addr + 3,
cond->addr,
cond->prefix_len))
break;
}
yes = 0;
break;
}
if (cond->prefix_len == 0)
break;
if (bitstring_match(addr, cond->addr,
cond->prefix_len))
break;
yes = 0;
break;
}
}
if (yes) {
len -= op->yes;
bc += op->yes;
} else {
len -= op->no;
bc += op->no;
}
}
return len == 0;
}
/**
 * inet_diag_bc_sk - run a bytecode filter against a full socket
 * @bc: filter attribute, or NULL for "no filter"
 * @sk: socket to test
 *
 * Returns 1 when @bc is NULL, otherwise the result of inet_diag_bc_run()
 * on an entry built from @sk.
 */
int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct inet_diag_entry entry;

	if (bc == NULL)
		return 1;

	entry.family = sk->sk_family;
	entry.sport = inet->inet_num;
	entry.dport = ntohs(inet->inet_dport);
	entry.userlocks = sk->sk_userlocks;

	/* Default to the IPv4 address pair ... */
	entry.saddr = &inet->inet_rcv_saddr;
	entry.daddr = &inet->inet_daddr;
#if IS_ENABLED(CONFIG_IPV6)
	/* ... and switch to the IPv6 pair for IPv6 sockets. */
	if (entry.family == AF_INET6) {
		entry.saddr = sk->sk_v6_rcv_saddr.s6_addr32;
		entry.daddr = sk->sk_v6_daddr.s6_addr32;
	}
#endif

	return inet_diag_bc_run(bc, &entry);
}
EXPORT_SYMBOL_GPL(inet_diag_bc_sk);
/* Check that a jump lands on an op boundary.
 *
 * Walks the bytecode from its start along the 'yes' offsets, tracking the
 * number of bytes left.  Returns 1 when some op begins with exactly @cc
 * bytes left, 0 when the walk passes that point or meets a 'yes' offset
 * that is below 4 or not a multiple of 4.
 */
static int valid_cc(const void *bc, int len, int cc)
{
	const unsigned char *pos = bc;

	while (len >= 0) {
		const struct inet_diag_bc_op *op;

		/* Reached or passed the target: only an exact hit is valid. */
		if (cc >= len)
			return cc == len;

		op = (const void *)pos;
		if (op->yes < 4 || (op->yes & 3))
			return 0;

		len -= op->yes;
		pos += op->yes;
	}

	return 0;
}
/* Validate the inet_diag_hostcond that follows @op.
 *
 * @len is the number of bytecode bytes available starting at @op.
 * *@min_len is grown by the size of the hostcond and then by the size of
 * the address implied by its family; it is left at the value reached when
 * validation fails.  Returns true when the condition fits in @len, the
 * family is AF_UNSPEC, AF_INET or AF_INET6, and the prefix length does not
 * exceed the address width.
 */
static bool valid_hostcond(const struct inet_diag_bc_op *op, int len,
			   int *min_len)
{
	const struct inet_diag_hostcond *cond;
	int addr_len;

	/* The fixed part of the hostcond must fit. */
	*min_len += sizeof(struct inet_diag_hostcond);
	if (*min_len > len)
		return false;

	cond = (const struct inet_diag_hostcond *)(op + 1);

	/* Address bytes expected for this family. */
	if (cond->family == AF_UNSPEC)
		addr_len = 0;
	else if (cond->family == AF_INET)
		addr_len = sizeof(struct in_addr);
	else if (cond->family == AF_INET6)
		addr_len = sizeof(struct in6_addr);
	else
		return false;

	*min_len += addr_len;
	if (*min_len > len)
		return false;

	/* Prefix length is in bits, address length in bytes. */
	return cond->prefix_len <= 8 * addr_len;
}
/* Validate a port comparison operator.
 *
 * The port operand lives in a second inet_diag_bc_op slot after @op, so
 * *@min_len grows by one op; returns whether @len bytes can hold that.
 * @op itself is not inspected.
 */
static inline bool valid_port_comparison(const struct inet_diag_bc_op *op,
					 int len, int *min_len)
{
	*min_len += sizeof(struct inet_diag_bc_op);

	return len >= *min_len;
}
static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
{
const void *bc = bytecode;
int len = bytecode_len;
while (len > 0) {
const struct inet_diag_bc_op *op = bc;
int min_len = sizeof(struct inet_diag_bc_op);
//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
switch (op->code) {
case INET_DIAG_BC_S_COND:
case INET_DIAG_BC_D_COND:
if (!valid_hostcond(bc, len, &min_len))
return -EINVAL;
break;
case INET_DIAG_BC_S_GE:
case INET_DIAG_BC_S_LE:
case INET_DIAG_BC_D_GE:
case INET_DIAG_BC_D_LE:
if (!valid_port_comparison(bc, len, &min_len))
return -EINVAL;
break;
case INET_DIAG_BC_AUTO:
case INET_DIAG_BC_JMP:
case INET_DIAG_BC_NOP:
break;
default:
return -EINVAL;
}
if (op->code != INET_DIAG_BC_NOP) {
if (op->no < min_len || op->no > len + 4 || op->no & 3)
return -EINVAL;
if (op->no < len &&
!valid_cc(bytecode, bytecode_len, len - op->no))
return -EINVAL;
}
if (op->yes < min_len || op->yes > len + 4 || op->yes & 3)
return -EINVAL;
bc += op->yes;
len -= op->yes;
}
return len == 0 ? 0 : -EINVAL;
}
/* Dump one full socket into @skb as part of a multi-part reply, provided
 * it passes the optional bytecode filter @bc.  Returns 0 when the socket is
 * filtered out, otherwise the result of inet_csk_diag_fill().
 */
static int inet_csk_diag_dump(struct sock *sk,
			      struct sk_buff *skb,
			      struct netlink_callback *cb,
			      struct inet_diag_req_v2 *r,
			      const struct nlattr *bc)
{
	if (inet_diag_bc_sk(bc, sk))
		return inet_csk_diag_fill(sk, skb, r,
					  sk_user_ns(NETLINK_CB(cb->skb).sk),
					  NETLINK_CB(cb->skb).portid,
					  cb->nlh->nlmsg_seq, NLM_F_MULTI,
					  cb->nlh);

	return 0;
}
tcp/dccp: remove twchain TCP listener refactoring, part 3 : Our goal is to hash SYN_RECV sockets into main ehash for fast lookup, and parallel SYN processing. Current inet_ehash_bucket contains two chains, one for ESTABLISH (and friend states) sockets, another for TIME_WAIT sockets only. As the hash table is sized to get at most one socket per bucket, it makes little sense to have separate twchain, as it makes the lookup slightly more complicated, and doubles hash table memory usage. If we make sure all socket types have the lookup keys at the same offsets, we can use a generic and faster lookup. It turns out TIME_WAIT and ESTABLISHED sockets already have common lookup fields for IPv4. [ INET_TW_MATCH() is no longer needed ] I'll provide a follow-up to factorize IPv6 lookup as well, to remove INET6_TW_MATCH() This way, SYN_RECV pseudo sockets will be supported the same. A new sock_gen_put() helper is added, doing either a sock_put() or inet_twsk_put() [ and will support SYN_RECV later ]. Note this helper should only be called in real slow path, when rcu lookup found a socket that was moved to another identity (freed/reused immediately), but could eventually be used in other contexts, like sock_edemux() Before patch : dmesg | grep "TCP established" TCP established hash table entries: 524288 (order: 11, 8388608 bytes) After patch : TCP established hash table entries: 524288 (order: 10, 4194304 bytes) Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
static int inet_twsk_diag_dump(struct sock *sk,
struct sk_buff *skb,
struct netlink_callback *cb,
struct inet_diag_req_v2 *r,
const struct nlattr *bc)
{
tcp/dccp: remove twchain TCP listener refactoring, part 3 : Our goal is to hash SYN_RECV sockets into main ehash for fast lookup, and parallel SYN processing. Current inet_ehash_bucket contains two chains, one for ESTABLISH (and friend states) sockets, another for TIME_WAIT sockets only. As the hash table is sized to get at most one socket per bucket, it makes little sense to have separate twchain, as it makes the lookup slightly more complicated, and doubles hash table memory usage. If we make sure all socket types have the lookup keys at the same offsets, we can use a generic and faster lookup. It turns out TIME_WAIT and ESTABLISHED sockets already have common lookup fields for IPv4. [ INET_TW_MATCH() is no longer needed ] I'll provide a follow-up to factorize IPv6 lookup as well, to remove INET6_TW_MATCH() This way, SYN_RECV pseudo sockets will be supported the same. A new sock_gen_put() helper is added, doing either a sock_put() or inet_twsk_put() [ and will support SYN_RECV later ]. Note this helper should only be called in real slow path, when rcu lookup found a socket that was moved to another identity (freed/reused immediately), but could eventually be used in other contexts, like sock_edemux() Before patch : dmesg | grep "TCP established" TCP established hash table entries: 524288 (order: 11, 8388608 bytes) After patch : TCP established hash table entries: 524288 (order: 10, 4194304 bytes) Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
struct inet_timewait_sock *tw = inet_twsk(sk);
if (bc != NULL) {
struct inet_diag_entry entry;
entry.family = tw->tw_family;
#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == AF_INET6) {
entry.saddr = tw->tw_v6_rcv_saddr.s6_addr32;
entry.daddr = tw->tw_v6_daddr.s6_addr32;
} else
#endif
{
entry.saddr = &tw->tw_rcv_saddr;
entry.daddr = &tw->tw_daddr;
}
entry.sport = tw->tw_num;
entry.dport = ntohs(tw->tw_dport);
entry.userlocks = 0;
if (!inet_diag_bc_run(bc, &entry))
return 0;
}
return inet_twsk_diag_fill(tw, skb, r,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
}
/* Point entry->saddr / entry->daddr at the local and remote addresses of
 * the request sock @req, which belongs to listener @sk.
 *
 * An IPv4 request on an IPv6 listener is presented as IPv4-mapped-IPv6:
 * the mapped addresses are built in entry->{s,d}addr_storage.  For an IPv6
 * listener whose request is neither AF_INET6 nor AF_INET the pointers are
 * left untouched.
 */
static inline void inet_diag_req_addrs(const struct sock *sk,
				       const struct request_sock *req,
				       struct inet_diag_entry *entry)
{
	struct inet_request_sock *ireq = inet_rsk(req);

#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6) {
		switch (req->rsk_ops->family) {
		case AF_INET6:
			entry->saddr = ireq->ir_v6_loc_addr.s6_addr32;
			entry->daddr = ireq->ir_v6_rmt_addr.s6_addr32;
			break;
		case AF_INET:
			ipv6_addr_set_v4mapped(ireq->ir_loc_addr,
					       &entry->saddr_storage);
			ipv6_addr_set_v4mapped(ireq->ir_rmt_addr,
					       &entry->daddr_storage);
			entry->saddr = entry->saddr_storage.s6_addr32;
			entry->daddr = entry->daddr_storage.s6_addr32;
			break;
		default:
			break;
		}
		return;
	}
#endif
	entry->saddr = &ireq->ir_loc_addr;
	entry->daddr = &ireq->ir_rmt_addr;
}
static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
struct request_sock *req,
struct user_namespace *user_ns,
u32 portid, u32 seq,
const struct nlmsghdr *unlh)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct inet_sock *inet = inet_sk(sk);
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
long tmo;
nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
NLM_F_MULTI);
if (!nlh)
return -EMSGSIZE;
r = nlmsg_data(nlh);
r->idiag_family = sk->sk_family;
r->idiag_state = TCP_SYN_RECV;
r->idiag_timer = 1;
tcp: better retrans tracking for defer-accept For passive TCP connections using TCP_DEFER_ACCEPT facility, we incorrectly increment req->retrans each time timeout triggers while no SYNACK is sent. SYNACK are not sent for TCP_DEFER_ACCEPT that were established (for which we received the ACK from client). Only the last SYNACK is sent so that we can receive again an ACK from client, to move the req into accept queue. We plan to change this later to avoid the useless retransmit (and potential problem as this SYNACK could be lost) TCP_INFO later gives wrong information to user, claiming imaginary retransmits. Decouple req->retrans field into two independent fields : num_retrans : number of retransmit num_timeout : number of timeouts num_timeout is the counter that is incremented at each timeout, regardless of actual SYNACK being sent or not, and used to compute the exponential timeout. Introduce inet_rtx_syn_ack() helper to increment num_retrans only if ->rtx_syn_ack() succeeded. Use inet_rtx_syn_ack() from tcp_check_req() to increment num_retrans when we re-send a SYNACK in answer to a (retransmitted) SYN. Prior to this patch, we were not counting these retransmits. Change tcp_v[46]_rtx_synack() to increment TCP_MIB_RETRANSSEGS only if a synack packet was successfully queued. Reported-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Julian Anastasov <ja@ssi.bg> Cc: Vijay Subramanian <subramanian.vijay@gmail.com> Cc: Elliott Hughes <enh@google.com> Cc: Neal Cardwell <ncardwell@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-10-28 03:16:46 +04:00
r->idiag_retrans = req->num_retrans;
r->id.idiag_if = sk->sk_bound_dev_if;
sock_diag_save_cookie(req, r->id.idiag_cookie);
tmo = req->expires - jiffies;
if (tmo < 0)
tmo = 0;
r->id.idiag_sport = inet->inet_sport;
r->id.idiag_dport = ireq->ir_rmt_port;
memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
r->id.idiag_src[0] = ireq->ir_loc_addr;
r->id.idiag_dst[0] = ireq->ir_rmt_addr;
r->idiag_expires = jiffies_to_msecs(tmo);
r->idiag_rqueue = 0;
r->idiag_wqueue = 0;
r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
r->idiag_inode = 0;
#if IS_ENABLED(CONFIG_IPV6)
if (r->idiag_family == AF_INET6) {
struct inet_diag_entry entry;
inet_diag_req_addrs(sk, req, &entry);
memcpy(r->id.idiag_src, entry.saddr, sizeof(struct in6_addr));
memcpy(r->id.idiag_dst, entry.daddr, sizeof(struct in6_addr));
}
#endif
netlink: make nlmsg_end() and genlmsg_end() void Contrary to common expectations for an "int" return, these functions return only a positive value -- if used correctly they cannot even return 0 because the message header will necessarily be in the skb. This makes the very common pattern of if (genlmsg_end(...) < 0) { ... } be a whole bunch of dead code. Many places also simply do return nlmsg_end(...); and the caller is expected to deal with it. This also commonly (at least for me) causes errors, because it is very common to write if (my_function(...)) /* error condition */ and if my_function() does "return nlmsg_end()" this is of course wrong. Additionally, there's not a single place in the kernel that actually needs the message length returned, and if anyone needs it later then it'll be very easy to just use skb->len there. Remove this, and make the functions void. This removes a bunch of dead code as described above. The patch adds lines because I did - return nlmsg_end(...); + nlmsg_end(...); + return 0; I could have preserved all the function's return values by returning skb->len, but instead I've audited all the places calling the affected functions and found that none cared. A few places actually compared the return value with <= 0 in dump functionality, but that could just be changed to < 0 with no change in behaviour, so I opted for the more efficient version. One instance of the error I've made numerous times now is also present in net/phonet/pn_netlink.c in the route_dumpit() function - it didn't check for <0 or <=0 and thus broke out of the loop every single time. I've preserved this since it will (I think) have caused the messages to userspace to be formatted differently with just a single message for every SKB returned to userspace. It's possible that this isn't needed for the tools that actually use this, but I don't even know what they are so couldn't test that changing this behaviour would be acceptable. 
Signed-off-by: Johannes Berg <johannes.berg@intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-17 00:09:00 +03:00
nlmsg_end(skb, nlh);
return 0;
}
/*
 * Emit one message per pending request of listener @sk that matches the
 * destination port in @r and, when present, the bytecode filter @bc.
 *
 * The walk is resumable: cb->args[3] holds the SYN-table bucket to restart
 * from, stored biased by one, and cb->args[4] the number of requests to
 * skip in that bucket.  Both are written back when filling @skb fails.
 *
 * The SYN table is traversed with syn_wait_lock read-held (BH disabled).
 *
 * Returns 0 when the whole table was walked, or the negative value
 * returned by inet_diag_fill_req().
 */
static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
			       struct netlink_callback *cb,
			       struct inet_diag_req_v2 *r,
			       const struct nlattr *bc)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet = inet_sk(sk);
	struct inet_diag_entry entry;
	struct listen_sock *lopt;
	int first_bucket = cb->args[3];
	int resume_pos = cb->args[4];
	int bucket, pos;
	int err = 0;

	/* Undo the +1 bias applied when the cursor was saved. */
	if (first_bucket > 0)
		first_bucket--;

	entry.family = sk->sk_family;

	read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);

	lopt = icsk->icsk_accept_queue.listen_opt;
	if (!lopt || !lopt->qlen)
		goto out;

	/* These filter inputs are the same for every request of @sk. */
	if (bc) {
		entry.sport = inet->inet_num;
		entry.userlocks = sk->sk_userlocks;
	}

	for (bucket = first_bucket; bucket < lopt->nr_table_entries; bucket++) {
		struct request_sock *req;

		pos = 0;
		for (req = lopt->syn_table[bucket]; req;
		     pos++, req = req->dl_next) {
			struct inet_request_sock *ireq = inet_rsk(req);

			/* Already reported by an earlier pass. */
			if (pos < resume_pos)
				continue;

			/* A zero idiag_dport in the request matches any port. */
			if (r->id.idiag_dport &&
			    r->id.idiag_dport != ireq->ir_rmt_port)
				continue;

			if (bc) {
				inet_diag_req_addrs(sk, req, &entry);
				entry.dport = ntohs(ireq->ir_rmt_port);

				if (!inet_diag_bc_run(bc, &entry))
					continue;
			}

			err = inet_diag_fill_req(skb, sk, req,
						 sk_user_ns(NETLINK_CB(cb->skb).sk),
						 NETLINK_CB(cb->skb).portid,
						 cb->nlh->nlmsg_seq, cb->nlh);
			if (err < 0) {
				/* Remember where to pick up next time. */
				cb->args[3] = bucket + 1;
				cb->args[4] = pos;
				goto out;
			}
		}

		/* The skip count only applies to the first bucket visited. */
		resume_pos = 0;
	}

out:
	read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);

	return err;
}
void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
struct netlink_callback *cb, struct inet_diag_req_v2 *r, struct nlattr *bc)
{
int i, num;
int s_i, s_num;
struct net *net = sock_net(skb->sk);
s_i = cb->args[1];
s_num = num = cb->args[2];
if (cb->args[0] == 0) {
if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
goto skip_listen_ht;
for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
struct sock *sk;
struct hlist_nulls_node *node;
struct inet_listen_hashbucket *ilb;
num = 0;
ilb = &hashinfo->listening_hash[i];
spin_lock_bh(&ilb->lock);
sk_nulls_for_each(sk, node, &ilb->head) {
struct inet_sock *inet = inet_sk(sk);
if (!net_eq(sock_net(sk), net))
continue;
if (num < s_num) {
num++;
continue;
}
if (r->sdiag_family != AF_UNSPEC &&
sk->sk_family != r->sdiag_family)
goto next_listen;
if (r->id.idiag_sport != inet->inet_sport &&
r->id.idiag_sport)
goto next_listen;
if (!(r->idiag_states & TCPF_LISTEN) ||
r->id.idiag_dport ||
cb->args[3] > 0)
goto syn_recv;
if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
spin_unlock_bh(&ilb->lock);
goto done;
}
syn_recv:
if (!(r->idiag_states & TCPF_SYN_RECV))
goto next_listen;
if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) {
spin_unlock_bh(&ilb->lock);
goto done;
}
next_listen:
cb->args[3] = 0;
cb->args[4] = 0;
++num;
}
spin_unlock_bh(&ilb->lock);
s_num = 0;
cb->args[3] = 0;
cb->args[4] = 0;
}
skip_listen_ht:
cb->args[0] = 1;
s_i = num = s_num = 0;
}
if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
goto out;
for (i = s_i; i <= hashinfo->ehash_mask; i++) {
struct inet_ehash_bucket *head = &hashinfo->ehash[i];
spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
struct sock *sk;
struct hlist_nulls_node *node;
num = 0;
tcp/dccp: remove twchain TCP listener refactoring, part 3 : Our goal is to hash SYN_RECV sockets into main ehash for fast lookup, and parallel SYN processing. Current inet_ehash_bucket contains two chains, one for ESTABLISH (and friend states) sockets, another for TIME_WAIT sockets only. As the hash table is sized to get at most one socket per bucket, it makes little sense to have separate twchain, as it makes the lookup slightly more complicated, and doubles hash table memory usage. If we make sure all socket types have the lookup keys at the same offsets, we can use a generic and faster lookup. It turns out TIME_WAIT and ESTABLISHED sockets already have common lookup fields for IPv4. [ INET_TW_MATCH() is no longer needed ] I'll provide a follow-up to factorize IPv6 lookup as well, to remove INET6_TW_MATCH() This way, SYN_RECV pseudo sockets will be supported the same. A new sock_gen_put() helper is added, doing either a sock_put() or inet_twsk_put() [ and will support SYN_RECV later ]. Note this helper should only be called in real slow path, when rcu lookup found a socket that was moved to another identity (freed/reused immediately), but could eventually be used in other contexts, like sock_edemux() Before patch : dmesg | grep "TCP established" TCP established hash table entries: 524288 (order: 11, 8388608 bytes) After patch : TCP established hash table entries: 524288 (order: 10, 4194304 bytes) Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
if (hlist_nulls_empty(&head->chain))
continue;
if (i > s_i)
s_num = 0;
spin_lock_bh(lock);
sk_nulls_for_each(sk, node, &head->chain) {
tcp/dccp: remove twchain TCP listener refactoring, part 3 : Our goal is to hash SYN_RECV sockets into main ehash for fast lookup, and parallel SYN processing. Current inet_ehash_bucket contains two chains, one for ESTABLISH (and friend states) sockets, another for TIME_WAIT sockets only. As the hash table is sized to get at most one socket per bucket, it makes little sense to have separate twchain, as it makes the lookup slightly more complicated, and doubles hash table memory usage. If we make sure all socket types have the lookup keys at the same offsets, we can use a generic and faster lookup. It turns out TIME_WAIT and ESTABLISHED sockets already have common lookup fields for IPv4. [ INET_TW_MATCH() is no longer needed ] I'll provide a follow-up to factorize IPv6 lookup as well, to remove INET6_TW_MATCH() This way, SYN_RECV pseudo sockets will be supported the same. A new sock_gen_put() helper is added, doing either a sock_put() or inet_twsk_put() [ and will support SYN_RECV later ]. Note this helper should only be called in real slow path, when rcu lookup found a socket that was moved to another identity (freed/reused immediately), but could eventually be used in other contexts, like sock_edemux() Before patch : dmesg | grep "TCP established" TCP established hash table entries: 524288 (order: 11, 8388608 bytes) After patch : TCP established hash table entries: 524288 (order: 10, 4194304 bytes) Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
int res;
int state;
if (!net_eq(sock_net(sk), net))
continue;
if (num < s_num)
goto next_normal;
state = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_substate : sk->sk_state;
if (!(r->idiag_states & (1 << state)))
goto next_normal;
if (r->sdiag_family != AF_UNSPEC &&
tcp/dccp: remove twchain TCP listener refactoring, part 3 : Our goal is to hash SYN_RECV sockets into main ehash for fast lookup, and parallel SYN processing. Current inet_ehash_bucket contains two chains, one for ESTABLISH (and friend states) sockets, another for TIME_WAIT sockets only. As the hash table is sized to get at most one socket per bucket, it makes little sense to have separate twchain, as it makes the lookup slightly more complicated, and doubles hash table memory usage. If we make sure all socket types have the lookup keys at the same offsets, we can use a generic and faster lookup. It turns out TIME_WAIT and ESTABLISHED sockets already have common lookup fields for IPv4. [ INET_TW_MATCH() is no longer needed ] I'll provide a follow-up to factorize IPv6 lookup as well, to remove INET6_TW_MATCH() This way, SYN_RECV pseudo sockets will be supported the same. A new sock_gen_put() helper is added, doing either a sock_put() or inet_twsk_put() [ and will support SYN_RECV later ]. Note this helper should only be called in real slow path, when rcu lookup found a socket that was moved to another identity (freed/reused immediately), but could eventually be used in other contexts, like sock_edemux() Before patch : dmesg | grep "TCP established" TCP established hash table entries: 524288 (order: 11, 8388608 bytes) After patch : TCP established hash table entries: 524288 (order: 10, 4194304 bytes) Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
sk->sk_family != r->sdiag_family)
goto next_normal;
tcp/dccp: remove twchain TCP listener refactoring, part 3 : Our goal is to hash SYN_RECV sockets into main ehash for fast lookup, and parallel SYN processing. Current inet_ehash_bucket contains two chains, one for ESTABLISH (and friend states) sockets, another for TIME_WAIT sockets only. As the hash table is sized to get at most one socket per bucket, it makes little sense to have separate twchain, as it makes the lookup slightly more complicated, and doubles hash table memory usage. If we make sure all socket types have the lookup keys at the same offsets, we can use a generic and faster lookup. It turns out TIME_WAIT and ESTABLISHED sockets already have common lookup fields for IPv4. [ INET_TW_MATCH() is no longer needed ] I'll provide a follow-up to factorize IPv6 lookup as well, to remove INET6_TW_MATCH() This way, SYN_RECV pseudo sockets will be supported the same. A new sock_gen_put() helper is added, doing either a sock_put() or inet_twsk_put() [ and will support SYN_RECV later ]. Note this helper should only be called in real slow path, when rcu lookup found a socket that was moved to another identity (freed/reused immediately), but could eventually be used in other contexts, like sock_edemux() Before patch : dmesg | grep "TCP established" TCP established hash table entries: 524288 (order: 11, 8388608 bytes) After patch : TCP established hash table entries: 524288 (order: 10, 4194304 bytes) Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
if (r->id.idiag_sport != htons(sk->sk_num) &&
r->id.idiag_sport)
goto next_normal;
tcp/dccp: remove twchain TCP listener refactoring, part 3 : Our goal is to hash SYN_RECV sockets into main ehash for fast lookup, and parallel SYN processing. Current inet_ehash_bucket contains two chains, one for ESTABLISH (and friend states) sockets, another for TIME_WAIT sockets only. As the hash table is sized to get at most one socket per bucket, it makes little sense to have separate twchain, as it makes the lookup slightly more complicated, and doubles hash table memory usage. If we make sure all socket types have the lookup keys at the same offsets, we can use a generic and faster lookup. It turns out TIME_WAIT and ESTABLISHED sockets already have common lookup fields for IPv4. [ INET_TW_MATCH() is no longer needed ] I'll provide a follow-up to factorize IPv6 lookup as well, to remove INET6_TW_MATCH() This way, SYN_RECV pseudo sockets will be supported the same. A new sock_gen_put() helper is added, doing either a sock_put() or inet_twsk_put() [ and will support SYN_RECV later ]. Note this helper should only be called in real slow path, when rcu lookup found a socket that was moved to another identity (freed/reused immediately), but could eventually be used in other contexts, like sock_edemux() Before patch : dmesg | grep "TCP established" TCP established hash table entries: 524288 (order: 11, 8388608 bytes) After patch : TCP established hash table entries: 524288 (order: 10, 4194304 bytes) Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
if (r->id.idiag_dport != sk->sk_dport &&
r->id.idiag_dport)
goto next_normal;
tcp/dccp: remove twchain TCP listener refactoring, part 3 : Our goal is to hash SYN_RECV sockets into main ehash for fast lookup, and parallel SYN processing. Current inet_ehash_bucket contains two chains, one for ESTABLISH (and friend states) sockets, another for TIME_WAIT sockets only. As the hash table is sized to get at most one socket per bucket, it makes little sense to have separate twchain, as it makes the lookup slightly more complicated, and doubles hash table memory usage. If we make sure all socket types have the lookup keys at the same offsets, we can use a generic and faster lookup. It turns out TIME_WAIT and ESTABLISHED sockets already have common lookup fields for IPv4. [ INET_TW_MATCH() is no longer needed ] I'll provide a follow-up to factorize IPv6 lookup as well, to remove INET6_TW_MATCH() This way, SYN_RECV pseudo sockets will be supported the same. A new sock_gen_put() helper is added, doing either a sock_put() or inet_twsk_put() [ and will support SYN_RECV later ]. Note this helper should only be called in real slow path, when rcu lookup found a socket that was moved to another identity (freed/reused immediately), but could eventually be used in other contexts, like sock_edemux() Before patch : dmesg | grep "TCP established" TCP established hash table entries: 524288 (order: 11, 8388608 bytes) After patch : TCP established hash table entries: 524288 (order: 10, 4194304 bytes) Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
if (sk->sk_state == TCP_TIME_WAIT)
res = inet_twsk_diag_dump(sk, skb, cb, r, bc);
else
res = inet_csk_diag_dump(sk, skb, cb, r, bc);
if (res < 0) {
spin_unlock_bh(lock);
goto done;
}
next_normal:
++num;
}
spin_unlock_bh(lock);
}
done:
cb->args[1] = i;
cb->args[2] = num;
out:
;
}
EXPORT_SYMBOL_GPL(inet_diag_dump_icsk);
/*
 * Run the dump callback of the handler registered for r->sdiag_protocol.
 *
 * Returns skb->len when a handler was found, otherwise the error encoded
 * in the pointer returned by inet_diag_lock_handler().
 */
static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
			    struct inet_diag_req_v2 *r, struct nlattr *bc)
{
	const struct inet_diag_handler *handler;
	int ret;

	handler = inet_diag_lock_handler(r->sdiag_protocol);
	if (IS_ERR(handler)) {
		ret = PTR_ERR(handler);
	} else {
		handler->dump(skb, cb, r, bc);
		ret = 0;
	}
	/* Called on the error path too, exactly as on success. */
	inet_diag_unlock_handler(handler);

	if (ret)
		return ret;

	return skb->len;
}
/*
 * Netlink dump callback for inet_diag_req_v2 requests: looks up the
 * optional INET_DIAG_REQ_BYTECODE attribute that follows the request
 * header and hands everything to __inet_diag_dump().
 */
static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	const int hdrlen = sizeof(struct inet_diag_req_v2);
	struct nlattr *bc;

	bc = nlmsg_attrlen(cb->nlh, hdrlen) ?
		nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE) :
		NULL;

	return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc);
}
/*
 * Translate a legacy netlink message type into the IP protocol it refers
 * to.  Returns 0 for any type other than TCPDIAG_GETSOCK and
 * DCCPDIAG_GETSOCK.
 */
static inline int inet_diag_type2proto(int type)
{
	if (type == TCPDIAG_GETSOCK)
		return IPPROTO_TCP;

	if (type == DCCPDIAG_GETSOCK)
		return IPPROTO_DCCP;

	return 0;
}
static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb)
{
struct inet_diag_req *rc = nlmsg_data(cb->nlh);
struct inet_diag_req_v2 req;
struct nlattr *bc = NULL;
int hdrlen = sizeof(struct inet_diag_req);
req.sdiag_family = AF_UNSPEC; /* compatibility */
req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type);
req.idiag_ext = rc->idiag_ext;
req.idiag_states = rc->idiag_states;
req.id = rc->id;
if (nlmsg_attrlen(cb->nlh, hdrlen))
bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
return __inet_diag_dump(skb, cb, &req, bc);
}
static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
const struct nlmsghdr *nlh)
{
struct inet_diag_req *rc = nlmsg_data(nlh);
struct inet_diag_req_v2 req;
req.sdiag_family = rc->idiag_family;
req.sdiag_protocol = inet_diag_type2proto(nlh->nlmsg_type);
req.idiag_ext = rc->idiag_ext;
req.idiag_states = rc->idiag_states;
req.id = rc->id;
return inet_diag_get_exact(in_skb, nlh, &req);
}
/*
 * Entry point for legacy (inet_diag_req) messages.
 *
 * Rejects with -EINVAL any message whose type is out of range or whose
 * payload is shorter than struct inet_diag_req.  Non-dump requests are
 * answered through inet_diag_get_exact_compat().  For dump requests any
 * trailing attributes must contain a bytecode attribute that passes
 * inet_diag_bc_audit(); the dump is then started with
 * inet_diag_dump_compat() as its callback.
 */
static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	const int hdrlen = sizeof(struct inet_diag_req);
	struct net *net = sock_net(skb->sk);
	struct netlink_dump_control c = {
		.dump = inet_diag_dump_compat,
	};

	if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX ||
	    nlmsg_len(nlh) < hdrlen)
		return -EINVAL;

	if (!(nlh->nlmsg_flags & NLM_F_DUMP))
		return inet_diag_get_exact_compat(skb, nlh);

	if (nlmsg_attrlen(nlh, hdrlen)) {
		struct nlattr *attr = nlmsg_find_attr(nlh, hdrlen,
						      INET_DIAG_REQ_BYTECODE);

		/* Attributes without valid bytecode are refused outright. */
		if (!attr)
			return -EINVAL;
		if (nla_len(attr) < sizeof(struct inet_diag_bc_op))
			return -EINVAL;
		if (inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
			return -EINVAL;
	}

	return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
}
/*
 * Entry point for inet_diag_req_v2 messages.
 *
 * Rejects with -EINVAL any message whose payload is shorter than struct
 * inet_diag_req_v2.  Non-dump requests are answered through
 * inet_diag_get_exact().  For dump requests any trailing attributes must
 * contain a bytecode attribute that passes inet_diag_bc_audit(); the dump
 * is then started with inet_diag_dump() as its callback.
 */
static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
{
	const int hdrlen = sizeof(struct inet_diag_req_v2);
	struct net *net = sock_net(skb->sk);
	struct netlink_dump_control c = {
		.dump = inet_diag_dump,
	};

	if (nlmsg_len(h) < hdrlen)
		return -EINVAL;

	if (!(h->nlmsg_flags & NLM_F_DUMP))
		return inet_diag_get_exact(skb, h, nlmsg_data(h));

	if (nlmsg_attrlen(h, hdrlen)) {
		struct nlattr *attr = nlmsg_find_attr(h, hdrlen,
						      INET_DIAG_REQ_BYTECODE);

		/* Attributes without valid bytecode are refused outright. */
		if (!attr)
			return -EINVAL;
		if (nla_len(attr) < sizeof(struct inet_diag_bc_op))
			return -EINVAL;
		if (inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
			return -EINVAL;
	}

	return netlink_dump_start(net->diag_nlsk, skb, h, &c);
}
/*
 * sock_diag handlers for AF_INET and AF_INET6.  Both families share
 * inet_diag_handler_dump() as their entry point; they are registered in
 * inet_diag_init() and unregistered in inet_diag_exit().
 */
static const struct sock_diag_handler inet_diag_handler = {
.family = AF_INET,
.dump = inet_diag_handler_dump,
};
/* Same entry point as above, registered for the IPv6 family. */
static const struct sock_diag_handler inet6_diag_handler = {
.family = AF_INET6,
.dump = inet_diag_handler_dump,
};
/*
 * Install @h in inet_diag_table[] under its idiag_type.
 *
 * Returns 0 on success, -EINVAL when idiag_type is not below IPPROTO_MAX,
 * or -EEXIST when a handler is already installed for that type.  The table
 * is only touched with inet_diag_table_mutex held.
 */
int inet_diag_register(const struct inet_diag_handler *h)
{
	const __u16 type = h->idiag_type;
	int err;

	if (type >= IPPROTO_MAX)
		return -EINVAL;

	mutex_lock(&inet_diag_table_mutex);
	if (inet_diag_table[type]) {
		err = -EEXIST;
	} else {
		inet_diag_table[type] = h;
		err = 0;
	}
	mutex_unlock(&inet_diag_table_mutex);

	return err;
}
EXPORT_SYMBOL_GPL(inet_diag_register);
/*
 * Clear the inet_diag_table[] slot selected by @h->idiag_type.  A type
 * that is not below IPPROTO_MAX is silently ignored.  The slot is cleared
 * with inet_diag_table_mutex held.
 */
void inet_diag_unregister(const struct inet_diag_handler *h)
{
	const __u16 type = h->idiag_type;

	if (type < IPPROTO_MAX) {
		mutex_lock(&inet_diag_table_mutex);
		inet_diag_table[type] = NULL;
		mutex_unlock(&inet_diag_table_mutex);
	}
}
EXPORT_SYMBOL_GPL(inet_diag_unregister);
/*
 * Module init: allocate the zeroed per-protocol handler table (one slot
 * per protocol number below IPPROTO_MAX), register the AF_INET and
 * AF_INET6 sock_diag handlers, then hook up the legacy message receiver.
 *
 * Returns 0 on success, -ENOMEM when the table cannot be allocated, or the
 * error from sock_diag_register(); on failure everything set up so far is
 * undone.
 */
static int __init inet_diag_init(void)
{
	const int table_bytes = (IPPROTO_MAX *
				 sizeof(struct inet_diag_handler *));
	int err;

	inet_diag_table = kzalloc(table_bytes, GFP_KERNEL);
	if (!inet_diag_table)
		return -ENOMEM;

	err = sock_diag_register(&inet_diag_handler);
	if (err)
		goto err_free_table;

	err = sock_diag_register(&inet6_diag_handler);
	if (err)
		goto err_unregister_inet;

	sock_diag_register_inet_compat(inet_diag_rcv_msg_compat);

	return 0;

err_unregister_inet:
	sock_diag_unregister(&inet_diag_handler);
err_free_table:
	kfree(inet_diag_table);

	return err;
}
/*
 * Module exit: unregister both sock_diag handlers and the legacy message
 * receiver that inet_diag_init() installed, then free the handler table.
 */
static void __exit inet_diag_exit(void)
{
sock_diag_unregister(&inet6_diag_handler);
sock_diag_unregister(&inet_diag_handler);
sock_diag_unregister_inet_compat(inet_diag_rcv_msg_compat);
/* Freed last, after all three entry points have been unregistered. */
kfree(inet_diag_table);
}
/* Module entry/exit points, licence and NETLINK_SOCK_DIAG aliases. */
module_init(inet_diag_init);
module_exit(inet_diag_exit);
MODULE_LICENSE("GPL");
/* One alias per address family; the numeric family is named alongside. */
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2 /* AF_INET */);
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10 /* AF_INET6 */);