From 5682d393b40e1fe7426a7b8c3471f05262f42010 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Feb 2020 15:04:09 -0800 Subject: [PATCH 1/4] inet_diag: Refactor inet_sk_diag_fill(), dump(), and dump_one() In a latter patch, there is a need to update "cb->min_dump_alloc" in inet_sk_diag_fill() as it learns the diffierent bpf_sk_storages stored in a sk while dumping all sk(s) (e.g. tcp_hashinfo). The inet_sk_diag_fill() currently does not take the "cb" as an argument. One of the reason is inet_sk_diag_fill() is used by both dump_one() and dump() (which belong to the "struct inet_diag_handler". The dump_one() interface does not pass the "cb" along. This patch is to make dump_one() pass a "cb". The "cb" is created in inet_diag_cmd_exact(). The "nlh" and "in_skb" are stored in "cb" as the dump() interface does. The total number of args in inet_sk_diag_fill() is also cut from 10 to 7 and that helps many callers to pass fewer args. In particular, "struct user_namespace *user_ns", "u32 pid", and "u32 seq" can be replaced by accessing "cb->nlh" and "cb->skb". A similar argument reduction is also made to inet_twsk_diag_fill() and inet_req_diag_fill(). inet_csk_diag_dump() and inet_csk_diag_fill() are also removed. They are mostly equivalent to inet_sk_diag_fill(). Their repeated usages are very limited. Thus, inet_sk_diag_fill() is directly used in those occasions. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200225230409.1975173-1-kafai@fb.com --- include/linux/inet_diag.h | 12 ++-- net/dccp/diag.c | 5 +- net/ipv4/inet_diag.c | 116 +++++++++++++++----------------------- net/ipv4/raw_diag.c | 18 ++---- net/ipv4/tcp_diag.c | 4 +- net/ipv4/udp_diag.c | 26 ++++----- net/sctp/diag.c | 5 +- 7 files changed, 73 insertions(+), 113 deletions(-) diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 39faaaf843e1..6b157ce07d74 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -18,8 +18,7 @@ struct inet_diag_handler { const struct inet_diag_req_v2 *r, struct nlattr *bc); - int (*dump_one)(struct sk_buff *in_skb, - const struct nlmsghdr *nlh, + int (*dump_one)(struct netlink_callback *cb, const struct inet_diag_req_v2 *req); void (*idiag_get_info)(struct sock *sk, @@ -42,16 +41,15 @@ struct inet_diag_handler { struct inet_connection_sock; int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, - struct sk_buff *skb, const struct inet_diag_req_v2 *req, - struct user_namespace *user_ns, - u32 pid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh, bool net_admin); + struct sk_buff *skb, struct netlink_callback *cb, + const struct inet_diag_req_v2 *req, + u16 nlmsg_flags, bool net_admin); void inet_diag_dump_icsk(struct inet_hashinfo *h, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc); int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, - struct sk_buff *in_skb, const struct nlmsghdr *nlh, + struct netlink_callback *cb, const struct inet_diag_req_v2 *req); struct sock *inet_diag_find_one_icsk(struct net *net, diff --git a/net/dccp/diag.c b/net/dccp/diag.c index 73ef73a218ff..8f1e2a653f6d 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -51,11 +51,10 @@ static void dccp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r, bc); } -static int dccp_diag_dump_one(struct sk_buff *in_skb, - const struct nlmsghdr *nlh, +static int dccp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { - return inet_diag_dump_one_icsk(&dccp_hashinfo, in_skb, nlh, req); + return inet_diag_dump_one_icsk(&dccp_hashinfo, cb, req); } static const struct inet_diag_handler dccp_diag_handler = { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index f11e997e517b..d2ecff3195ba 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -157,11 +157,9 @@ errout: EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill); int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, - struct sk_buff *skb, const struct inet_diag_req_v2 *req, - struct user_namespace *user_ns, - u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh, - bool net_admin) + struct sk_buff *skb, struct netlink_callback *cb, + const struct inet_diag_req_v2 *req, + u16 nlmsg_flags, bool net_admin) { const struct tcp_congestion_ops *ca_ops; const struct inet_diag_handler *handler; @@ -174,8 +172,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, handler = inet_diag_table[req->sdiag_protocol]; BUG_ON(!handler); - nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), - nlmsg_flags); + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); if (!nlh) return -EMSGSIZE; @@ -187,7 +185,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, r->idiag_timer = 0; r->idiag_retrans = 0; - if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin)) + if (inet_diag_msg_attrs_fill(sk, skb, r, ext, + sk_user_ns(NETLINK_CB(cb->skb).sk), + net_admin)) goto errout; if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { @@ -312,30 +312,19 @@ errout: } EXPORT_SYMBOL_GPL(inet_sk_diag_fill); -static int inet_csk_diag_fill(struct sock *sk, - struct sk_buff *skb, - const struct inet_diag_req_v2 *req, - struct user_namespace *user_ns, - u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh, - bool net_admin) -{ - return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, user_ns, - portid, seq, nlmsg_flags, unlh, net_admin); -} - static int inet_twsk_diag_fill(struct sock *sk, struct sk_buff *skb, - u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + struct netlink_callback *cb, + u16 nlmsg_flags) { struct inet_timewait_sock *tw = inet_twsk(sk); struct inet_diag_msg *r; struct nlmsghdr *nlh; long tmo; - nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), - nlmsg_flags); + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, + sizeof(*r), nlmsg_flags); if (!nlh) return -EMSGSIZE; @@ -359,16 +348,16 @@ static int inet_twsk_diag_fill(struct sock *sk, } static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, - u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh, bool net_admin) + struct netlink_callback *cb, + u16 nlmsg_flags, bool net_admin) { struct request_sock *reqsk = inet_reqsk(sk); struct inet_diag_msg *r; struct nlmsghdr *nlh; long tmo; - nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), - nlmsg_flags); + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); if (!nlh) return -EMSGSIZE; @@ -397,21 +386,18 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, } static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, const struct inet_diag_req_v2 *r, - struct user_namespace *user_ns, - u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh, bool net_admin) + u16 nlmsg_flags, bool net_admin) { if (sk->sk_state == TCP_TIME_WAIT) - return inet_twsk_diag_fill(sk, skb, portid, seq, - nlmsg_flags, unlh); + return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags); if (sk->sk_state == TCP_NEW_SYN_RECV) - return inet_req_diag_fill(sk, skb, portid, seq, - nlmsg_flags, unlh, net_admin); + return inet_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); - return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, - nlmsg_flags, unlh, net_admin); + return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags, + net_admin); } struct sock *inet_diag_find_one_icsk(struct net *net, @@ -459,10 +445,10 @@ struct sock *inet_diag_find_one_icsk(struct net *net, EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk); int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, - struct sk_buff *in_skb, - const struct nlmsghdr *nlh, + struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { + struct sk_buff *in_skb = cb->skb; bool net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN); struct net *net = sock_net(in_skb->sk); struct sk_buff *rep; @@ -479,10 +465,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, goto out; } - err = sk_diag_fill(sk, rep, req, - sk_user_ns(NETLINK_CB(in_skb).sk), - NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh, net_admin); + err = sk_diag_fill(sk, rep, cb, req, 0, net_admin); if (err < 0) { WARN_ON(err == -EMSGSIZE); nlmsg_free(rep); @@ -509,14 +492,19 @@ static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, int err; handler = inet_diag_lock_handler(req->sdiag_protocol); - if (IS_ERR(handler)) + if (IS_ERR(handler)) { err = PTR_ERR(handler); - else if (cmd == SOCK_DIAG_BY_FAMILY) - err = handler->dump_one(in_skb, nlh, req); - else if (cmd == SOCK_DESTROY && handler->destroy) + } else if (cmd == SOCK_DIAG_BY_FAMILY) { + struct netlink_callback cb = { + .nlh = nlh, + .skb = in_skb, + }; + err = handler->dump_one(&cb, req); + } else if (cmd == SOCK_DESTROY && handler->destroy) { err = handler->destroy(in_skb, req); - else + } else { err = -EOPNOTSUPP; + } inet_diag_unlock_handler(handler); return err; @@ -847,23 +835,6 @@ static int inet_diag_bc_audit(const struct nlattr *attr, return len == 0 ? 0 : -EINVAL; } -static int inet_csk_diag_dump(struct sock *sk, - struct sk_buff *skb, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - const struct nlattr *bc, - bool net_admin) -{ - if (!inet_diag_bc_sk(bc, sk)) - return 0; - - return inet_csk_diag_fill(sk, skb, r, - sk_user_ns(NETLINK_CB(cb->skb).sk), - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, - net_admin); -} - static void twsk_build_assert(void) { BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) != @@ -935,8 +906,12 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, r->id.idiag_sport) goto next_listen; - if (inet_csk_diag_dump(sk, skb, cb, r, - bc, net_admin) < 0) { + if (!inet_diag_bc_sk(bc, sk)) + goto next_listen; + + if (inet_sk_diag_fill(sk, inet_csk(sk), skb, + cb, r, NLM_F_MULTI, + net_admin) < 0) { spin_unlock(&ilb->lock); goto done; } @@ -1014,11 +989,8 @@ next_normal: res = 0; for (idx = 0; idx < accum; idx++) { if (res >= 0) { - res = sk_diag_fill(sk_arr[idx], skb, r, - sk_user_ns(NETLINK_CB(cb->skb).sk), - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->nlh, net_admin); + res = sk_diag_fill(sk_arr[idx], skb, cb, r, + NLM_F_MULTI, net_admin); if (res < 0) num = num_arr[idx]; } diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index e35736b99300..a2933eeabd91 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -87,15 +87,16 @@ out_unlock: return sk ? sk : ERR_PTR(-ENOENT); } -static int raw_diag_dump_one(struct sk_buff *in_skb, - const struct nlmsghdr *nlh, +static int raw_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { - struct net *net = sock_net(in_skb->sk); + struct sk_buff *in_skb = cb->skb; struct sk_buff *rep; struct sock *sk; + struct net *net; int err; + net = sock_net(in_skb->sk); sk = raw_sock_get(net, r); if (IS_ERR(sk)) return PTR_ERR(sk); @@ -108,10 +109,7 @@ static int raw_diag_dump_one(struct sk_buff *in_skb, return -ENOMEM; } - err = inet_sk_diag_fill(sk, NULL, rep, r, - sk_user_ns(NETLINK_CB(in_skb).sk), - NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh, + err = inet_sk_diag_fill(sk, NULL, rep, cb, r, 0, netlink_net_capable(in_skb, CAP_NET_ADMIN)); sock_put(sk); @@ -136,11 +134,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, if (!inet_diag_bc_sk(bc, sk)) return 0; - return inet_sk_diag_fill(sk, NULL, skb, r, - sk_user_ns(NETLINK_CB(cb->skb).sk), - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->nlh, net_admin); + return inet_sk_diag_fill(sk, NULL, skb, cb, r, NLM_F_MULTI, net_admin); } static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 0d08f9e2d8d0..bcd3a26efff1 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -184,10 +184,10 @@ static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc); } -static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, +static int tcp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { - return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req); + return inet_diag_dump_one_icsk(&tcp_hashinfo, cb, req); } #ifdef CONFIG_INET_DIAG_DESTROY diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 910555a4d9fe..7d65a6a5cd51 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -21,16 +21,15 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, if (!inet_diag_bc_sk(bc, sk)) return 0; - return inet_sk_diag_fill(sk, NULL, skb, req, - sk_user_ns(NETLINK_CB(cb->skb).sk), - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, net_admin); + return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI, + net_admin); } -static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, - const struct nlmsghdr *nlh, +static int udp_dump_one(struct udp_table *tbl, + struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { + struct sk_buff *in_skb = cb->skb; int err = -EINVAL; struct sock *sk = NULL; struct sk_buff *rep; @@ -70,11 +69,8 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, if (!rep) goto out; - err = inet_sk_diag_fill(sk, NULL, rep, req, - sk_user_ns(NETLINK_CB(in_skb).sk), - NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh, - netlink_net_capable(in_skb, CAP_NET_ADMIN)); + err = inet_sk_diag_fill(sk, NULL, rep, cb, req, 0, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(rep); @@ -151,10 +147,10 @@ static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, udp_dump(&udp_table, skb, cb, r, bc); } -static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, +static int udp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { - return udp_dump_one(&udp_table, in_skb, nlh, req); + return udp_dump_one(&udp_table, cb, req); } static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, @@ -255,10 +251,10 @@ static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, udp_dump(&udplite_table, skb, cb, r, bc); } -static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, +static int udplite_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { - return udp_dump_one(&udplite_table, in_skb, nlh, req); + return udp_dump_one(&udplite_table, cb, req); } static const struct inet_diag_handler udplite_diag_handler = { diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 8a15146faaeb..bed6436cd0af 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -432,11 +432,12 @@ static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, sctp_get_sctp_info(sk, infox->asoc, infox->sctpinfo); } -static int sctp_diag_dump_one(struct sk_buff *in_skb, - const struct nlmsghdr *nlh, +static int sctp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { + struct sk_buff *in_skb = cb->skb; struct net *net = sock_net(in_skb->sk); + const struct nlmsghdr *nlh = cb->nlh; union sctp_addr laddr, paddr; struct sctp_comm_param commp = { .skb = in_skb, From 0df6d32842b9a5f97a29ea90c8adc5cfac38341d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Feb 2020 15:04:15 -0800 Subject: [PATCH 2/4] inet_diag: Move the INET_DIAG_REQ_BYTECODE nlattr to cb->data The INET_DIAG_REQ_BYTECODE nlattr is currently re-found every time when the "dump()" is re-started. In a latter patch, it will also need to parse the new INET_DIAG_REQ_SK_BPF_STORAGES nlattr to learn the map_fds. Thus, this patch takes this chance to store the parsed nlattr in cb->data during the "start" time of a dump. By doing this, the "bc" argument also becomes unnecessary and is removed. Also, the two copies of the INET_DIAG_REQ_BYTECODE parsing-audit logic between compat/current version can be consolidated to one. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200225230415.1975555-1-kafai@fb.com --- include/linux/inet_diag.h | 11 ++-- include/uapi/linux/inet_diag.h | 3 +- net/dccp/diag.c | 4 +- net/ipv4/inet_diag.c | 117 ++++++++++++++++++++------------- net/ipv4/raw_diag.c | 6 +- net/ipv4/tcp_diag.c | 4 +- net/ipv4/udp_diag.c | 15 +++-- net/sctp/diag.c | 2 +- 8 files changed, 98 insertions(+), 64 deletions(-) diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 6b157ce07d74..1bb94cac265f 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -15,8 +15,7 @@ struct netlink_callback; struct inet_diag_handler { void (*dump)(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - struct nlattr *bc); + const struct inet_diag_req_v2 *r); int (*dump_one)(struct netlink_callback *cb, const struct inet_diag_req_v2 *req); @@ -39,6 +38,11 @@ struct inet_diag_handler { __u16 idiag_info_size; }; +struct inet_diag_dump_data { + struct nlattr *req_nlas[__INET_DIAG_REQ_MAX]; +#define inet_diag_nla_bc req_nlas[INET_DIAG_REQ_BYTECODE] +}; + struct inet_connection_sock; int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, struct netlink_callback *cb, @@ -46,8 +50,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, u16 nlmsg_flags, bool net_admin); void inet_diag_dump_icsk(struct inet_hashinfo *h, struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - struct nlattr *bc); + const struct inet_diag_req_v2 *r); int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct netlink_callback *cb, const struct inet_diag_req_v2 *req); diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index a1ff345b3f33..bab9a9f8da12 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -64,9 +64,10 @@ struct inet_diag_req_raw { enum { INET_DIAG_REQ_NONE, INET_DIAG_REQ_BYTECODE, + __INET_DIAG_REQ_MAX, }; -#define INET_DIAG_REQ_MAX INET_DIAG_REQ_BYTECODE +#define INET_DIAG_REQ_MAX (__INET_DIAG_REQ_MAX - 1) /* Bytecode is sequence of 4 byte commands followed by variable arguments. * All the commands identified by "code" are conditional jumps forward: diff --git a/net/dccp/diag.c b/net/dccp/diag.c index 8f1e2a653f6d..8a82c5a2c5a8 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -46,9 +46,9 @@ static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, } static void dccp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { - inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r, bc); + inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r); } static int dccp_diag_dump_one(struct netlink_callback *cb, diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index d2ecff3195ba..4bce8a477699 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -495,9 +495,11 @@ static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, if (IS_ERR(handler)) { err = PTR_ERR(handler); } else if (cmd == SOCK_DIAG_BY_FAMILY) { + struct inet_diag_dump_data empty_dump_data = {}; struct netlink_callback cb = { .nlh = nlh, .skb = in_skb, + .data = &empty_dump_data, }; err = handler->dump_one(&cb, req); } else if (cmd == SOCK_DESTROY && handler->destroy) { @@ -863,14 +865,17 @@ static void twsk_build_assert(void) void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + struct inet_diag_dump_data *cb_data = cb->data; struct net *net = sock_net(skb->sk); u32 idiag_states = r->idiag_states; int i, num, s_i, s_num; + struct nlattr *bc; struct sock *sk; + bc = cb_data->inet_diag_nla_bc; if (idiag_states & TCPF_SYN_RECV) idiag_states |= TCPF_NEW_SYN_RECV; s_i = cb->args[1]; @@ -1014,15 +1019,14 @@ out: EXPORT_SYMBOL_GPL(inet_diag_dump_icsk); static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - struct nlattr *bc) + const struct inet_diag_req_v2 *r) { const struct inet_diag_handler *handler; int err = 0; handler = inet_diag_lock_handler(r->sdiag_protocol); if (!IS_ERR(handler)) - handler->dump(skb, cb, r, bc); + handler->dump(skb, cb, r); else err = PTR_ERR(handler); inet_diag_unlock_handler(handler); @@ -1032,13 +1036,57 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { - int hdrlen = sizeof(struct inet_diag_req_v2); - struct nlattr *bc = NULL; + return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh)); +} - if (nlmsg_attrlen(cb->nlh, hdrlen)) - bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE); +static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen) +{ + const struct nlmsghdr *nlh = cb->nlh; + struct inet_diag_dump_data *cb_data; + struct sk_buff *skb = cb->skb; + struct nlattr *nla; + int rem, err; - return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc); + cb_data = kzalloc(sizeof(*cb_data), GFP_KERNEL); + if (!cb_data) + return -ENOMEM; + + nla_for_each_attr(nla, nlmsg_attrdata(nlh, hdrlen), + nlmsg_attrlen(nlh, hdrlen), rem) { + int type = nla_type(nla); + + if (type < __INET_DIAG_REQ_MAX) + cb_data->req_nlas[type] = nla; + } + + nla = cb_data->inet_diag_nla_bc; + if (nla) { + err = inet_diag_bc_audit(nla, skb); + if (err) { + kfree(cb_data); + return err; + } + } + + cb->data = cb_data; + return 0; +} + +static int inet_diag_dump_start(struct netlink_callback *cb) +{ + return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req_v2)); +} + +static int inet_diag_dump_start_compat(struct netlink_callback *cb) +{ + return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req)); +} + +static int inet_diag_dump_done(struct netlink_callback *cb) +{ + kfree(cb->data); + + return 0; } static int inet_diag_type2proto(int type) @@ -1057,9 +1105,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb) { struct inet_diag_req *rc = nlmsg_data(cb->nlh); - int hdrlen = sizeof(struct inet_diag_req); struct inet_diag_req_v2 req; - struct nlattr *bc = NULL; req.sdiag_family = AF_UNSPEC; /* compatibility */ req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type); @@ -1067,10 +1113,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, req.idiag_states = rc->idiag_states; req.id = rc->id; - if (nlmsg_attrlen(cb->nlh, hdrlen)) - bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE); - - return __inet_diag_dump(skb, cb, &req, bc); + return __inet_diag_dump(skb, cb, &req); } static int inet_diag_get_exact_compat(struct sk_buff *in_skb, @@ -1098,22 +1141,12 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; if (nlh->nlmsg_flags & NLM_F_DUMP) { - if (nlmsg_attrlen(nlh, hdrlen)) { - struct nlattr *attr; - int err; - - attr = nlmsg_find_attr(nlh, hdrlen, - INET_DIAG_REQ_BYTECODE); - err = inet_diag_bc_audit(attr, skb); - if (err) - return err; - } - { - struct netlink_dump_control c = { - .dump = inet_diag_dump_compat, - }; - return netlink_dump_start(net->diag_nlsk, skb, nlh, &c); - } + struct netlink_dump_control c = { + .start = inet_diag_dump_start_compat, + .done = inet_diag_dump_done, + .dump = inet_diag_dump_compat, + }; + return netlink_dump_start(net->diag_nlsk, skb, nlh, &c); } return inet_diag_get_exact_compat(skb, nlh); @@ -1129,22 +1162,12 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h) if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY && h->nlmsg_flags & NLM_F_DUMP) { - if (nlmsg_attrlen(h, hdrlen)) { - struct nlattr *attr; - int err; - - attr = nlmsg_find_attr(h, hdrlen, - INET_DIAG_REQ_BYTECODE); - err = inet_diag_bc_audit(attr, skb); - if (err) - return err; - } - { - struct netlink_dump_control c = { - .dump = inet_diag_dump, - }; - return netlink_dump_start(net->diag_nlsk, skb, h, &c); - } + struct netlink_dump_control c = { + .start = inet_diag_dump_start, + .done = inet_diag_dump_done, + .dump = inet_diag_dump, + }; + return netlink_dump_start(net->diag_nlsk, skb, h, &c); } return inet_diag_cmd_exact(h->nlmsg_type, skb, h, nlmsg_data(h)); diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index a2933eeabd91..d19cce39be1b 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -138,17 +138,21 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, } static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); struct net *net = sock_net(skb->sk); + struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; struct sock *sk = NULL; + struct nlattr *bc; if (IS_ERR(hashinfo)) return; + cb_data = cb->data; + bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index bcd3a26efff1..75a1c985f49a 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -179,9 +179,9 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) } static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { - inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc); + inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r); } static int tcp_diag_dump_one(struct netlink_callback *cb, diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 7d65a6a5cd51..93884696abdd 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -89,12 +89,16 @@ out_nosk: static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct net *net = sock_net(skb->sk); + struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; + struct nlattr *bc; + cb_data = cb->data; + bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; @@ -142,9 +146,9 @@ done: } static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { - udp_dump(&udp_table, skb, cb, r, bc); + udp_dump(&udp_table, skb, cb, r); } static int udp_diag_dump_one(struct netlink_callback *cb, @@ -245,10 +249,9 @@ static const struct inet_diag_handler udp_diag_handler = { }; static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - struct nlattr *bc) + const struct inet_diag_req_v2 *r) { - udp_dump(&udplite_table, skb, cb, r, bc); + udp_dump(&udplite_table, skb, cb, r); } static int udplite_diag_dump_one(struct netlink_callback *cb, diff --git a/net/sctp/diag.c b/net/sctp/diag.c index bed6436cd0af..69743a6aaf6f 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -471,7 +471,7 @@ static int sctp_diag_dump_one(struct netlink_callback *cb, } static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { u32 idiag_states = r->idiag_states; struct net *net = sock_net(skb->sk); From 1ed4d92458a969e71e7914550b6f0c730c14d84e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Feb 2020 15:04:21 -0800 Subject: [PATCH 3/4] bpf: INET_DIAG support in bpf_sk_storage This patch adds INET_DIAG support to bpf_sk_storage. 1. Although this series adds bpf_sk_storage diag capability to inet sk, bpf_sk_storage is in general applicable to all fullsock. Hence, the bpf_sk_storage logic will operate on SK_DIAG_* nlattr. The caller will pass in its specific nesting nlattr (e.g. INET_DIAG_*) as the argument. 2. The request will be like: INET_DIAG_REQ_SK_BPF_STORAGES (nla_nest) (defined in latter patch) SK_DIAG_BPF_STORAGE_REQ_MAP_FD (nla_put_u32) SK_DIAG_BPF_STORAGE_REQ_MAP_FD (nla_put_u32) ...... Considering there could have multiple bpf_sk_storages in a sk, instead of reusing INET_DIAG_INFO ("ss -i"), the user can select some specific bpf_sk_storage to dump by specifying an array of SK_DIAG_BPF_STORAGE_REQ_MAP_FD. If no SK_DIAG_BPF_STORAGE_REQ_MAP_FD is specified (i.e. an empty INET_DIAG_REQ_SK_BPF_STORAGES), it will dump all bpf_sk_storages of a sk. 3. The reply will be like: INET_DIAG_BPF_SK_STORAGES (nla_nest) (defined in latter patch) SK_DIAG_BPF_STORAGE (nla_nest) SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) SK_DIAG_BPF_STORAGE (nla_nest) SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) ...... 4. Unlike other INET_DIAG info of a sk which is pretty static, the size required to dump the bpf_sk_storage(s) of a sk is dynamic as the system adding more bpf_sk_storage_map. It is hard to set a static min_dump_alloc size. Hence, this series learns it at the runtime and adjust the cb->min_dump_alloc as it iterates all sk(s) of a system. The "unsigned int *res_diag_size" in bpf_sk_storage_diag_put() is for this purpose. The next patch will update the cb->min_dump_alloc as it iterates the sk(s). Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200225230421.1975729-1-kafai@fb.com --- include/linux/bpf.h | 1 + include/net/bpf_sk_storage.h | 27 ++++ include/uapi/linux/sock_diag.h | 26 +++ kernel/bpf/syscall.c | 15 ++ net/core/bpf_sk_storage.c | 283 ++++++++++++++++++++++++++++++++- 5 files changed, 346 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9aa33b8f3d55..6015a4daf118 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1023,6 +1023,7 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux, void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); +struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); void bpf_map_inc(struct bpf_map *map); diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index 8e4f831d2e52..5036c94c0503 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -10,14 +10,41 @@ void bpf_sk_storage_free(struct sock *sk); extern const struct bpf_func_proto bpf_sk_storage_get_proto; extern const struct bpf_func_proto bpf_sk_storage_delete_proto; +struct bpf_sk_storage_diag; +struct sk_buff; +struct nlattr; +struct sock; + #ifdef CONFIG_BPF_SYSCALL int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk); +struct bpf_sk_storage_diag * +bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs); +void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag); +int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, + struct sock *sk, struct sk_buff *skb, + int stg_array_type, + unsigned int *res_diag_size); #else static inline int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) { return 0; } +static inline struct bpf_sk_storage_diag * +bpf_sk_storage_diag_alloc(const struct nlattr *nla) +{ + return NULL; +} +static inline void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag) +{ +} +static inline int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, + struct sock *sk, struct sk_buff *skb, + int stg_array_type, + unsigned int *res_diag_size) +{ + return 0; +} #endif #endif /* _BPF_SK_STORAGE_H */ diff --git a/include/uapi/linux/sock_diag.h b/include/uapi/linux/sock_diag.h index e5925009a652..5f74a5f6091d 100644 --- a/include/uapi/linux/sock_diag.h +++ b/include/uapi/linux/sock_diag.h @@ -36,4 +36,30 @@ enum sknetlink_groups { }; #define SKNLGRP_MAX (__SKNLGRP_MAX - 1) +enum { + SK_DIAG_BPF_STORAGE_REQ_NONE, + SK_DIAG_BPF_STORAGE_REQ_MAP_FD, + __SK_DIAG_BPF_STORAGE_REQ_MAX, +}; + +#define SK_DIAG_BPF_STORAGE_REQ_MAX (__SK_DIAG_BPF_STORAGE_REQ_MAX - 1) + +enum { + SK_DIAG_BPF_STORAGE_REP_NONE, + SK_DIAG_BPF_STORAGE, + __SK_DIAG_BPF_STORAGE_REP_MAX, +}; + +#define SK_DIAB_BPF_STORAGE_REP_MAX (__SK_DIAG_BPF_STORAGE_REP_MAX - 1) + +enum { + SK_DIAG_BPF_STORAGE_NONE, + SK_DIAG_BPF_STORAGE_PAD, + SK_DIAG_BPF_STORAGE_MAP_ID, + SK_DIAG_BPF_STORAGE_MAP_VALUE, + __SK_DIAG_BPF_STORAGE_MAX, +}; + +#define SK_DIAG_BPF_STORAGE_MAX (__SK_DIAG_BPF_STORAGE_MAX - 1) + #endif /* _UAPI__SOCK_DIAG_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a79743a89815..c536c65256ad 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -902,6 +902,21 @@ void bpf_map_inc_with_uref(struct bpf_map *map) } EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref); +struct bpf_map *bpf_map_get(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_map *map; + + map = __bpf_map_get(f); + if (IS_ERR(map)) + return map; + + bpf_map_inc(map); + fdput(f); + + return map; +} + struct bpf_map *bpf_map_get_with_uref(u32 ufd) { struct fd f = fdget(ufd); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 3ab23f698221..3415a4896c59 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -8,6 +8,7 @@ #include #include #include +#include #include static atomic_t cache_idx; @@ -606,6 +607,14 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) kfree(map); } +/* U16_MAX is much more than enough for sk local storage + * considering a tcp_sock is ~2k. + */ +#define MAX_VALUE_SIZE \ + min_t(u32, \ + (KMALLOC_MAX_SIZE - MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem)), \ + (U16_MAX - sizeof(struct bpf_sk_storage_elem))) + static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) { if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK || @@ -619,12 +628,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (attr->value_size >= KMALLOC_MAX_SIZE - - MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem) || - /* U16_MAX is much more than enough for sk local storage - * considering a tcp_sock is ~2k. - */ - attr->value_size > U16_MAX - sizeof(struct bpf_sk_storage_elem)) + if (attr->value_size > MAX_VALUE_SIZE) return -E2BIG; return 0; @@ -910,3 +914,270 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_SOCKET, }; + +struct bpf_sk_storage_diag { + u32 nr_maps; + struct bpf_map *maps[]; +}; + +/* The reply will be like: + * INET_DIAG_BPF_SK_STORAGES (nla_nest) + * SK_DIAG_BPF_STORAGE (nla_nest) + * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) + * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) + * SK_DIAG_BPF_STORAGE (nla_nest) + * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) + * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) + * .... + */ +static int nla_value_size(u32 value_size) +{ + /* SK_DIAG_BPF_STORAGE (nla_nest) + * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) + * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) + */ + return nla_total_size(0) + nla_total_size(sizeof(u32)) + + nla_total_size_64bit(value_size); +} + +void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag) +{ + u32 i; + + if (!diag) + return; + + for (i = 0; i < diag->nr_maps; i++) + bpf_map_put(diag->maps[i]); + + kfree(diag); +} +EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_free); + +static bool diag_check_dup(const struct bpf_sk_storage_diag *diag, + const struct bpf_map *map) +{ + u32 i; + + for (i = 0; i < diag->nr_maps; i++) { + if (diag->maps[i] == map) + return true; + } + + return false; +} + +struct bpf_sk_storage_diag * +bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) +{ + struct bpf_sk_storage_diag *diag; + struct nlattr *nla; + u32 nr_maps = 0; + int rem, err; + + /* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as + * the map_alloc_check() side also does. + */ + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + nla_for_each_nested(nla, nla_stgs, rem) { + if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD) + nr_maps++; + } + + diag = kzalloc(sizeof(*diag) + sizeof(diag->maps[0]) * nr_maps, + GFP_KERNEL); + if (!diag) + return ERR_PTR(-ENOMEM); + + nla_for_each_nested(nla, nla_stgs, rem) { + struct bpf_map *map; + int map_fd; + + if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD) + continue; + + map_fd = nla_get_u32(nla); + map = bpf_map_get(map_fd); + if (IS_ERR(map)) { + err = PTR_ERR(map); + goto err_free; + } + if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) { + bpf_map_put(map); + err = -EINVAL; + goto err_free; + } + if (diag_check_dup(diag, map)) { + bpf_map_put(map); + err = -EEXIST; + goto err_free; + } + diag->maps[diag->nr_maps++] = map; + } + + return diag; + +err_free: + bpf_sk_storage_diag_free(diag); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc); + +static int diag_get(struct bpf_sk_storage_data *sdata, struct sk_buff *skb) +{ + struct nlattr *nla_stg, *nla_value; + struct bpf_sk_storage_map *smap; + + /* It cannot exceed max nlattr's payload */ + BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < MAX_VALUE_SIZE); + + nla_stg = nla_nest_start(skb, SK_DIAG_BPF_STORAGE); + if (!nla_stg) + return -EMSGSIZE; + + smap = rcu_dereference(sdata->smap); + if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id)) + goto errout; + + nla_value = nla_reserve_64bit(skb, SK_DIAG_BPF_STORAGE_MAP_VALUE, + smap->map.value_size, + SK_DIAG_BPF_STORAGE_PAD); + if (!nla_value) + goto errout; + + if (map_value_has_spin_lock(&smap->map)) + copy_map_value_locked(&smap->map, nla_data(nla_value), + sdata->data, true); + else + copy_map_value(&smap->map, nla_data(nla_value), sdata->data); + + nla_nest_end(skb, nla_stg); + return 0; + +errout: + nla_nest_cancel(skb, nla_stg); + return -EMSGSIZE; +} + +static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb, + int stg_array_type, + unsigned int *res_diag_size) +{ + /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */ + unsigned int diag_size = nla_total_size(0); + struct bpf_sk_storage *sk_storage; + struct bpf_sk_storage_elem *selem; + struct bpf_sk_storage_map *smap; + struct nlattr *nla_stgs; + unsigned int saved_len; + int err = 0; + + rcu_read_lock(); + + sk_storage = rcu_dereference(sk->sk_bpf_storage); + if (!sk_storage || hlist_empty(&sk_storage->list)) { + rcu_read_unlock(); + return 0; + } + + nla_stgs = nla_nest_start(skb, stg_array_type); + if (!nla_stgs) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + + saved_len = skb->len; + hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { + smap = rcu_dereference(SDATA(selem)->smap); + diag_size += nla_value_size(smap->map.value_size); + + if (nla_stgs && diag_get(SDATA(selem), skb)) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + } + + rcu_read_unlock(); + + if (nla_stgs) { + if (saved_len == skb->len) + nla_nest_cancel(skb, nla_stgs); + else + nla_nest_end(skb, nla_stgs); + } + + if (diag_size == nla_total_size(0)) { + *res_diag_size = 0; + return 0; + } + + *res_diag_size = diag_size; + return err; +} + +int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, + struct sock *sk, struct sk_buff *skb, + int stg_array_type, + unsigned int *res_diag_size) +{ + /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */ + unsigned int diag_size = nla_total_size(0); + struct bpf_sk_storage *sk_storage; + struct bpf_sk_storage_data *sdata; + struct nlattr *nla_stgs; + unsigned int saved_len; + int err = 0; + u32 i; + + *res_diag_size = 0; + + /* No map has been specified. Dump all. */ + if (!diag->nr_maps) + return bpf_sk_storage_diag_put_all(sk, skb, stg_array_type, + res_diag_size); + + rcu_read_lock(); + sk_storage = rcu_dereference(sk->sk_bpf_storage); + if (!sk_storage || hlist_empty(&sk_storage->list)) { + rcu_read_unlock(); + return 0; + } + + nla_stgs = nla_nest_start(skb, stg_array_type); + if (!nla_stgs) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + + saved_len = skb->len; + for (i = 0; i < diag->nr_maps; i++) { + sdata = __sk_storage_lookup(sk_storage, + (struct bpf_sk_storage_map *)diag->maps[i], + false); + + if (!sdata) + continue; + + diag_size += nla_value_size(diag->maps[i]->value_size); + + if (nla_stgs && diag_get(sdata, skb)) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + } + rcu_read_unlock(); + + if (nla_stgs) { + if (saved_len == skb->len) + nla_nest_cancel(skb, nla_stgs); + else + nla_nest_end(skb, nla_stgs); + } + + if (diag_size == nla_total_size(0)) { + *res_diag_size = 0; + return 0; + } + + *res_diag_size = diag_size; + return err; +} +EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_put); From 085c20cacf2b72991ce1c9d99a5e2f1d9e73bb68 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Feb 2020 15:04:27 -0800 Subject: [PATCH 4/4] bpf: inet_diag: Dump bpf_sk_storages in inet_diag_dump() This patch will dump out the bpf_sk_storages of a sk if the request has the INET_DIAG_REQ_SK_BPF_STORAGES nlattr. An array of SK_DIAG_BPF_STORAGE_REQ_MAP_FD can be specified in INET_DIAG_REQ_SK_BPF_STORAGES to select which bpf_sk_storage to dump. If no map_fd is specified, all bpf_sk_storages of a sk will be dumped. bpf_sk_storages can be added to the system at runtime. It is difficult to find a proper static value for cb->min_dump_alloc. This patch learns the nlattr size required to dump the bpf_sk_storages of a sk. If it happens to be the very first nlmsg of a dump and it cannot fit the needed bpf_sk_storages, it will try to expand the skb by "pskb_expand_head()". Instead of expanding it in inet_sk_diag_fill(), it is expanded at a sleepable context in __inet_diag_dump() so __GFP_DIRECT_RECLAIM can be used. In __inet_diag_dump(), it will retry as long as the skb is empty and the cb->min_dump_alloc becomes larger than before. cb->min_dump_alloc is bounded by KMALLOC_MAX_SIZE. The min_dump_alloc is also changed from 'u16' to 'u32' to accommodate a sk that may have a few large bpf_sk_storages. The updated cb->min_dump_alloc will also be used to allocate the skb in the next dump. This logic already exists in netlink_dump(). Here is the sample output of a locally modified 'ss' and it could be made more readable by using BTF later: [root@arch-fb-vm1 ~]# ss --bpf-map-id 14 --bpf-map-id 13 -t6an 'dst [::1]:8989' State Recv-Q Send-Q Local Address:Port Peer Address:PortProcess ESTAB 0 0 [::1]:51072 [::1]:8989 bpf_map_id:14 value:[ 3feb ] bpf_map_id:13 value:[ 3f ] ESTAB 0 0 [::1]:51070 [::1]:8989 bpf_map_id:14 value:[ 3feb ] bpf_map_id:13 value:[ 3f ] [root@arch-fb-vm1 ~]# ~/devshare/github/iproute2/misc/ss --bpf-maps -t6an 'dst [::1]:8989' State Recv-Q Send-Q Local Address:Port Peer Address:Port Process ESTAB 0 0 [::1]:51072 [::1]:8989 bpf_map_id:14 value:[ 3feb ] bpf_map_id:13 value:[ 3f ] bpf_map_id:12 value:[ 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000... total:65407 ] ESTAB 0 0 [::1]:51070 [::1]:8989 bpf_map_id:14 value:[ 3feb ] bpf_map_id:13 value:[ 3f ] bpf_map_id:12 value:[ 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000... total:65407 ] Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200225230427.1976129-1-kafai@fb.com --- include/linux/inet_diag.h | 4 ++ include/linux/netlink.h | 4 +- include/uapi/linux/inet_diag.h | 2 + net/ipv4/inet_diag.c | 74 ++++++++++++++++++++++++++++++++++ 4 files changed, 82 insertions(+), 2 deletions(-) diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 1bb94cac265f..e4ba25d63913 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -38,9 +38,13 @@ struct inet_diag_handler { __u16 idiag_info_size; }; +struct bpf_sk_storage_diag; struct inet_diag_dump_data { struct nlattr *req_nlas[__INET_DIAG_REQ_MAX]; #define inet_diag_nla_bc req_nlas[INET_DIAG_REQ_BYTECODE] +#define inet_diag_nla_bpf_stgs req_nlas[INET_DIAG_REQ_SK_BPF_STORAGES] + + struct bpf_sk_storage_diag *bpf_stg_diag; }; struct inet_connection_sock; diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 205fa7b1f07a..788969ccbbde 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -188,10 +188,10 @@ struct netlink_callback { struct module *module; struct netlink_ext_ack *extack; u16 family; - u16 min_dump_alloc; - bool strict_check; u16 answer_flags; + u32 min_dump_alloc; unsigned int prev_seq, seq; + bool strict_check; union { u8 ctx[48]; diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index bab9a9f8da12..75dffd78363a 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -64,6 +64,7 @@ struct inet_diag_req_raw { enum { INET_DIAG_REQ_NONE, INET_DIAG_REQ_BYTECODE, + INET_DIAG_REQ_SK_BPF_STORAGES, __INET_DIAG_REQ_MAX, }; @@ -155,6 +156,7 @@ enum { INET_DIAG_CLASS_ID, /* request as INET_DIAG_TCLASS */ INET_DIAG_MD5SIG, INET_DIAG_ULP_INFO, + INET_DIAG_SK_BPF_STORAGES, __INET_DIAG_MAX, }; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 4bce8a477699..e1cad25909df 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -156,6 +157,8 @@ errout: } EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill); +#define MAX_DUMP_ALLOC_SIZE (KMALLOC_MAX_SIZE - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) + int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, @@ -163,12 +166,14 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, { const struct tcp_congestion_ops *ca_ops; const struct inet_diag_handler *handler; + struct inet_diag_dump_data *cb_data; int ext = req->idiag_ext; struct inet_diag_msg *r; struct nlmsghdr *nlh; struct nlattr *attr; void *info = NULL; + cb_data = cb->data; handler = inet_diag_table[req->sdiag_protocol]; BUG_ON(!handler); @@ -302,6 +307,48 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, goto errout; } + /* Keep it at the end for potential retry with a larger skb, + * or else do best-effort fitting, which is only done for the + * first_nlmsg. + */ + if (cb_data->bpf_stg_diag) { + bool first_nlmsg = ((unsigned char *)nlh == skb->data); + unsigned int prev_min_dump_alloc; + unsigned int total_nla_size = 0; + unsigned int msg_len; + int err; + + msg_len = skb_tail_pointer(skb) - (unsigned char *)nlh; + err = bpf_sk_storage_diag_put(cb_data->bpf_stg_diag, sk, skb, + INET_DIAG_SK_BPF_STORAGES, + &total_nla_size); + + if (!err) + goto out; + + total_nla_size += msg_len; + prev_min_dump_alloc = cb->min_dump_alloc; + if (total_nla_size > prev_min_dump_alloc) + cb->min_dump_alloc = min_t(u32, total_nla_size, + MAX_DUMP_ALLOC_SIZE); + + if (!first_nlmsg) + goto errout; + + if (cb->min_dump_alloc > prev_min_dump_alloc) + /* Retry with pskb_expand_head() with + * __GFP_DIRECT_RECLAIM + */ + goto errout; + + WARN_ON_ONCE(total_nla_size <= prev_min_dump_alloc); + + /* Send what we have for this sk + * and move on to the next sk in the following + * dump() + */ + } + out: nlmsg_end(skb, nlh); return 0; @@ -1022,8 +1069,11 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { const struct inet_diag_handler *handler; + u32 prev_min_dump_alloc; int err = 0; +again: + prev_min_dump_alloc = cb->min_dump_alloc; handler = inet_diag_lock_handler(r->sdiag_protocol); if (!IS_ERR(handler)) handler->dump(skb, cb, r); @@ -1031,6 +1081,15 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, err = PTR_ERR(handler); inet_diag_unlock_handler(handler); + /* The skb is not large enough to fit one sk info and + * inet_sk_diag_fill() has requested for a larger skb. + */ + if (!skb->len && cb->min_dump_alloc > prev_min_dump_alloc) { + err = pskb_expand_head(skb, 0, cb->min_dump_alloc, GFP_KERNEL); + if (!err) + goto again; + } + return err ? : skb->len; } @@ -1068,6 +1127,18 @@ static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen) } } + nla = cb_data->inet_diag_nla_bpf_stgs; + if (nla) { + struct bpf_sk_storage_diag *bpf_stg_diag; + + bpf_stg_diag = bpf_sk_storage_diag_alloc(nla); + if (IS_ERR(bpf_stg_diag)) { + kfree(cb_data); + return PTR_ERR(bpf_stg_diag); + } + cb_data->bpf_stg_diag = bpf_stg_diag; + } + cb->data = cb_data; return 0; } @@ -1084,6 +1155,9 @@ static int inet_diag_dump_start_compat(struct netlink_callback *cb) static int inet_diag_dump_done(struct netlink_callback *cb) { + struct inet_diag_dump_data *cb_data = cb->data; + + bpf_sk_storage_diag_free(cb_data->bpf_stg_diag); kfree(cb->data); return 0;