From 49ca0d8bfaf3bc46d5eef60ce67b00eb195bd392 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 30 Jan 2015 13:29:31 -0500 Subject: [PATCH 1/3] net-timestamp: no-payload option Add timestamping option SOF_TIMESTAMPING_OPT_TSONLY. For transmit timestamps, this loops timestamps on top of empty packets. Doing so reduces the pressure on SO_RCVBUF. Payload inspection and cmsg reception (aside from timestamps) are no longer possible. This works together with a follow on patch that allows administrators to only allow tx timestamping if it does not loop payload or metadata. Signed-off-by: Willem de Bruijn ---- Changes (rfc -> v1) - add documentation - remove unnecessary skb->len test (thanks to Richard Cochran) Signed-off-by: David S. Miller --- Documentation/networking/timestamping.txt | 21 +++++++++++++++++++++ include/uapi/linux/net_tstamp.h | 3 ++- net/core/skbuff.c | 19 ++++++++++++++----- net/ipv4/ip_sockglue.c | 7 ++++--- net/ipv6/datagram.c | 5 ++--- net/rxrpc/ar-error.c | 5 +++++ 6 files changed, 48 insertions(+), 12 deletions(-) diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt index a5c784c89312..5f0922613f1a 100644 --- a/Documentation/networking/timestamping.txt +++ b/Documentation/networking/timestamping.txt @@ -162,6 +162,27 @@ SOF_TIMESTAMPING_OPT_CMSG: option IP_PKTINFO simultaneously. +SOF_TIMESTAMPING_OPT_TSONLY: + + Applies to transmit timestamps only. Makes the kernel return the + timestamp as a cmsg alongside an empty packet, as opposed to + alongside the original packet. This reduces the amount of memory + charged to the socket's receive budget (SO_RCVBUF) and delivers + the timestamp even if sysctl net.core.tstamp_allow_data is 0. + This option disables SOF_TIMESTAMPING_OPT_CMSG. + + +New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to +disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate +regardless of the setting of sysctl net.core.tstamp_allow_data. + +An exception is when a process needs additional cmsg data, for +instance SOL_IP/IP_PKTINFO to detect the egress network interface. +Then pass option SOF_TIMESTAMPING_OPT_CMSG. This option depends on +having access to the contents of the original packet, so cannot be +combined with SOF_TIMESTAMPING_OPT_TSONLY. + + 1.4 Bytestream Timestamps The SO_TIMESTAMPING interface supports timestamping of bytes in a diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index edbc888ceb51..6d1abea9746e 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -24,8 +24,9 @@ enum { SOF_TIMESTAMPING_TX_SCHED = (1<<8), SOF_TIMESTAMPING_TX_ACK = (1<<9), SOF_TIMESTAMPING_OPT_CMSG = (1<<10), + SOF_TIMESTAMPING_OPT_TSONLY = (1<<11), - SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_CMSG, + SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_TSONLY, SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) | SOF_TIMESTAMPING_LAST }; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 56db472e9b86..65a3798f43e6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3710,19 +3710,28 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, struct sock *sk, int tstype) { struct sk_buff *skb; + bool tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; if (!sk) return; - if (hwtstamps) - *skb_hwtstamps(orig_skb) = *hwtstamps; + if (tsonly) + skb = alloc_skb(0, GFP_ATOMIC); else - orig_skb->tstamp = ktime_get_real(); - - skb = skb_clone(orig_skb, GFP_ATOMIC); + skb = skb_clone(orig_skb, GFP_ATOMIC); if (!skb) return; + if (tsonly) { + skb_shinfo(skb)->tx_flags = skb_shinfo(orig_skb)->tx_flags; + skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey; + } + + if (hwtstamps) + *skb_hwtstamps(skb) = *hwtstamps; + else + skb->tstamp = ktime_get_real(); + __skb_complete_tx_timestamp(skb, sk, tstype); } EXPORT_SYMBOL_GPL(__skb_tstamp_tx); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index db5e0f81ce0a..31d8c71986b4 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -483,7 +483,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) serr = SKB_EXT_ERR(skb); - if (sin) { + if (sin && skb->len) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset); @@ -496,8 +496,9 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) sin = &errhdr.offender; memset(sin, 0, sizeof(*sin)); - if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || - ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin)) { + if (skb->len && + (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || + ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin))) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; if (inet_sk(sk)->cmsg_flags) diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 49f5e73db122..c215be70cac0 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -369,7 +369,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) serr = SKB_EXT_ERR(skb); - if (sin) { + if (sin && skb->len) { const unsigned char *nh = skb_network_header(skb); sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; @@ -394,8 +394,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); sin = &errhdr.offender; memset(sin, 0, sizeof(*sin)); - - if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL) { + if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL && skb->len) { sin->sin6_family = AF_INET6; if (np->rxopt.all) { if (serr->ee.ee_origin != SO_EE_ORIGIN_ICMP && diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c index 74c0fcd36838..5394b6be46ec 100644 --- a/net/rxrpc/ar-error.c +++ b/net/rxrpc/ar-error.c @@ -42,6 +42,11 @@ void rxrpc_UDP_error_report(struct sock *sk) _leave("UDP socket errqueue empty"); return; } + if (!skb->len) { + _leave("UDP empty message"); + kfree_skb(skb); + return; + } rxrpc_new_skb(skb); From b245be1f4db1a0394e4b6eb66059814b46670ac3 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 30 Jan 2015 13:29:32 -0500 Subject: [PATCH 2/3] net-timestamp: no-payload only sysctl Tx timestamps are looped onto the error queue on top of an skb. This mechanism leaks packet headers to processes unless the no-payload options SOF_TIMESTAMPING_OPT_TSONLY is set. Add a sysctl that optionally drops looped timestamp with data. This only affects processes without CAP_NET_RAW. The policy is checked when timestamps are generated in the stack. It is possible for timestamps with data to be reported after the sysctl is set, if these were queued internally earlier. No vulnerability is immediately known that exploits knowledge gleaned from packet headers, but it may still be preferable to allow administrators to lock down this path at the cost of possible breakage of legacy applications. Signed-off-by: Willem de Bruijn ---- Changes (v1 -> v2) - test socket CAP_NET_RAW instead of capable(CAP_NET_RAW) (rfc -> v1) - document the sysctl in Documentation/sysctl/net.txt - fix access control race: read .._OPT_TSONLY only once, use same value for permission check and skb generation. Signed-off-by: David S. Miller --- Documentation/sysctl/net.txt | 8 ++++++++ include/net/sock.h | 1 + net/core/skbuff.c | 21 ++++++++++++++++++++- net/core/sock.c | 3 +++ net/core/sysctl_net_core.c | 9 +++++++++ 5 files changed, 41 insertions(+), 1 deletion(-) diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index 666594b43cff..6294b5186ae5 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -97,6 +97,14 @@ rmem_max The maximum receive socket buffer size in bytes. +tstamp_allow_data +----------------- +Allow processes to receive tx timestamps looped together with the original +packet contents. If disabled, transmit timestamp requests from unprivileged +processes are dropped unless socket option SOF_TIMESTAMPING_OPT_TSONLY is set. +Default: 1 (on) + + wmem_default ------------ diff --git a/include/net/sock.h b/include/net/sock.h index 15341499786c..511ef7c8889b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2239,6 +2239,7 @@ bool sk_net_capable(const struct sock *sk, int cap); extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; +extern int sysctl_tstamp_allow_data; extern int sysctl_optmem_max; extern __u32 sysctl_wmem_default; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 65a3798f43e6..a5bff2767f15 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -74,6 +74,8 @@ #include #include #include +#include +#include struct kmem_cache *skbuff_head_cache __read_mostly; static struct kmem_cache *skbuff_fclone_cache __read_mostly; @@ -3690,11 +3692,28 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb, kfree_skb(skb); } +static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) +{ + bool ret; + + if (likely(sysctl_tstamp_allow_data || tsonly)) + return true; + + read_lock_bh(&sk->sk_callback_lock); + ret = sk->sk_socket && sk->sk_socket->file && + file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW); + read_unlock_bh(&sk->sk_callback_lock); + return ret; +} + void skb_complete_tx_timestamp(struct sk_buff *skb, struct skb_shared_hwtstamps *hwtstamps) { struct sock *sk = skb->sk; + if (!skb_may_tx_timestamp(sk, false)) + return; + /* take a reference to prevent skb_orphan() from freeing the socket */ sock_hold(sk); @@ -3712,7 +3731,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, struct sk_buff *skb; bool tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; - if (!sk) + if (!sk || !skb_may_tx_timestamp(sk, tsonly)) return; if (tsonly) diff --git a/net/core/sock.c b/net/core/sock.c index 1c7a33db1314..93c8b20c91e4 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -325,6 +325,8 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); EXPORT_SYMBOL(sysctl_optmem_max); +int sysctl_tstamp_allow_data __read_mostly = 1; + struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE; EXPORT_SYMBOL_GPL(memalloc_socks); @@ -840,6 +842,7 @@ set_rcvbuf: ret = -EINVAL; break; } + if (val & SOF_TIMESTAMPING_OPT_ID && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { if (sk->sk_protocol == IPPROTO_TCP) { diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 31baba2a71ce..fde21d19e61b 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -321,6 +321,15 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tstamp_allow_data", + .data = &sysctl_tstamp_allow_data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one + }, #ifdef CONFIG_RPS { .procname = "rps_sock_flow_entries", From 2368592365bc97e941d0c641a3ba24b06d2c469b Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 30 Jan 2015 13:29:33 -0500 Subject: [PATCH 3/3] net-timestamp: no-payload option in txtimestamp test Demonstrate how SOF_TIMESTAMPING_OPT_TSONLY can be used and test the implementation. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- .../networking/timestamping/txtimestamp.c | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/Documentation/networking/timestamping/txtimestamp.c b/Documentation/networking/timestamping/txtimestamp.c index 05694febc238..8217510d3842 100644 --- a/Documentation/networking/timestamping/txtimestamp.c +++ b/Documentation/networking/timestamping/txtimestamp.c @@ -70,6 +70,7 @@ static int do_ipv6 = 1; static int cfg_payload_len = 10; static bool cfg_show_payload; static bool cfg_do_pktinfo; +static bool cfg_loop_nodata; static uint16_t dest_port = 9000; static struct sockaddr_in daddr; @@ -141,6 +142,9 @@ static void print_payload(char *data, int len) { int i; + if (!len) + return; + if (len > 70) len = 70; @@ -177,6 +181,7 @@ static void __recv_errmsg_cmsg(struct msghdr *msg, int payload_len) struct sock_extended_err *serr = NULL; struct scm_timestamping *tss = NULL; struct cmsghdr *cm; + int batch = 0; for (cm = CMSG_FIRSTHDR(msg); cm && cm->cmsg_len; @@ -209,10 +214,18 @@ static void __recv_errmsg_cmsg(struct msghdr *msg, int payload_len) } else fprintf(stderr, "unknown cmsg %d,%d\n", cm->cmsg_level, cm->cmsg_type); + + if (serr && tss) { + print_timestamp(tss, serr->ee_info, serr->ee_data, + payload_len); + serr = NULL; + tss = NULL; + batch++; + } } - if (serr && tss) - print_timestamp(tss, serr->ee_info, serr->ee_data, payload_len); + if (batch > 1) + fprintf(stderr, "batched %d timestamps\n", batch); } static int recv_errmsg(int fd) @@ -244,7 +257,7 @@ static int recv_errmsg(int fd) if (ret == -1 && errno != EAGAIN) error(1, errno, "recvmsg"); - if (ret > 0) { + if (ret >= 0) { __recv_errmsg_cmsg(&msg, ret); if (cfg_show_payload) print_payload(data, cfg_payload_len); @@ -309,6 +322,9 @@ static void do_test(int family, unsigned int opt) opt |= SOF_TIMESTAMPING_SOFTWARE | SOF_TIMESTAMPING_OPT_CMSG | SOF_TIMESTAMPING_OPT_ID; + if (cfg_loop_nodata) + opt |= SOF_TIMESTAMPING_OPT_TSONLY; + if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, (char *) &opt, sizeof(opt))) error(1, 0, "setsockopt timestamping"); @@ -378,6 +394,7 @@ static void __attribute__((noreturn)) usage(const char *filepath) " -h: show this message\n" " -I: request PKTINFO\n" " -l N: send N bytes at a time\n" + " -n: set no-payload option\n" " -r: use raw\n" " -R: use raw (IP_HDRINCL)\n" " -p N: connect to port N\n" @@ -392,7 +409,7 @@ static void parse_opt(int argc, char **argv) int proto_count = 0; char c; - while ((c = getopt(argc, argv, "46hIl:p:rRux")) != -1) { + while ((c = getopt(argc, argv, "46hIl:np:rRux")) != -1) { switch (c) { case '4': do_ipv6 = 0; @@ -403,6 +420,9 @@ static void parse_opt(int argc, char **argv) case 'I': cfg_do_pktinfo = true; break; + case 'n': + cfg_loop_nodata = true; + break; case 'r': proto_count++; cfg_proto = SOCK_RAW;