From c47d8c2f38f805ba541496ddd7d8c3aee59b49d5 Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Tue, 3 Jul 2018 15:42:47 -0700 Subject: [PATCH 01/14] net: Clear skb->tstamp only on the forwarding path This is done in preparation for the upcoming time based transmission patchset. Now that skb->tstamp will be used to hold packet's txtime, we must ensure that it is being cleared when traversing namespaces. Also, doing that from skb_scrub_packet() before the early return would break our feature when tunnels are used. Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1357f36c8a5e..c4e24ac27464 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4898,7 +4898,6 @@ EXPORT_SYMBOL(skb_try_coalesce); */ void skb_scrub_packet(struct sk_buff *skb, bool xnet) { - skb->tstamp = 0; skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; skb->ignore_df = 0; @@ -4912,6 +4911,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) ipvs_reset(skb); skb->mark = 0; + skb->tstamp = 0; } EXPORT_SYMBOL_GPL(skb_scrub_packet); From 80b14dee2bea128928537d61c333f24cb8cbb62f Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 3 Jul 2018 15:42:48 -0700 Subject: [PATCH 02/14] net: Add a new socket option for a future transmit time. This patch introduces SO_TXTIME. User space enables this option in order to pass a desired future transmit time in a CMSG when calling sendmsg(2). The argument to this socket option is a 8-bytes long struct provided by the uapi header net_tstamp.h defined as: struct sock_txtime { clockid_t clockid; u32 flags; }; Note that new fields were added to struct sock by filling a 2-bytes hole found in the struct. For that reason, neither the struct size or number of cachelines were altered. Signed-off-by: Richard Cochran Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- arch/alpha/include/uapi/asm/socket.h | 3 +++ arch/ia64/include/uapi/asm/socket.h | 3 +++ arch/mips/include/uapi/asm/socket.h | 3 +++ arch/parisc/include/uapi/asm/socket.h | 3 +++ arch/s390/include/uapi/asm/socket.h | 3 +++ arch/sparc/include/uapi/asm/socket.h | 3 +++ arch/xtensa/include/uapi/asm/socket.h | 3 +++ include/net/sock.h | 10 ++++++++ include/uapi/asm-generic/socket.h | 3 +++ include/uapi/linux/net_tstamp.h | 15 ++++++++++++ net/core/sock.c | 35 +++++++++++++++++++++++++++ 11 files changed, 84 insertions(+) diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index be14f16149d5..065fb372e355 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -112,4 +112,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h index 3efba40adc54..c872c4e6bafb 100644 --- a/arch/ia64/include/uapi/asm/socket.h +++ b/arch/ia64/include/uapi/asm/socket.h @@ -114,4 +114,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* _ASM_IA64_SOCKET_H */ diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 49c3d4795963..71370fb3ceef 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -123,4 +123,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 1d0fdc3b5d22..061b9cf2a779 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -104,4 +104,7 @@ #define SO_ZEROCOPY 0x4035 +#define SO_TXTIME 0x4036 +#define SCM_TXTIME SO_TXTIME + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h index 3510c0fd06f4..39d901476ee5 100644 --- a/arch/s390/include/uapi/asm/socket.h +++ b/arch/s390/include/uapi/asm/socket.h @@ -111,4 +111,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* _ASM_SOCKET_H */ diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index d58520c2e6ff..7ea35e5601b6 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -101,6 +101,9 @@ #define SO_ZEROCOPY 0x003e +#define SO_TXTIME 0x003f +#define SCM_TXTIME SO_TXTIME + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h index 75a07b8119a9..1de07a7f7680 100644 --- a/arch/xtensa/include/uapi/asm/socket.h +++ b/arch/xtensa/include/uapi/asm/socket.h @@ -116,4 +116,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* _XTENSA_SOCKET_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 2ed99bfa4595..68347b9821c6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -319,6 +319,9 @@ struct sock_common { * @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0 * @sk_reuseport_cb: reuseport group container * @sk_rcu: used during RCU grace period + * @sk_clockid: clockid used by time-based scheduling (SO_TXTIME) + * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME + * @sk_txtime_unused: unused txtime flags */ struct sock { /* @@ -475,6 +478,11 @@ struct sock { u8 sk_shutdown; u32 sk_tskey; atomic_t sk_zckey; + + u8 sk_clockid; + u8 sk_txtime_deadline_mode : 1, + sk_txtime_unused : 7; + struct socket *sk_socket; void *sk_user_data; #ifdef CONFIG_SECURITY @@ -790,6 +798,7 @@ enum sock_flags { SOCK_FILTER_LOCKED, /* Filter cannot be changed anymore */ SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */ SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */ + SOCK_TXTIME, }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) @@ -1585,6 +1594,7 @@ void sock_kzfree_s(struct sock *sk, void *mem, int size); void sk_send_sigurg(struct sock *sk); struct sockcm_cookie { + u64 transmit_time; u32 mark; u16 tsflags; }; diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 0ae758c90e54..a12692e5f7a8 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -107,4 +107,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* __ASM_GENERIC_SOCKET_H */ diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 4fe104b2411f..c9a77c353b98 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -141,4 +141,19 @@ struct scm_ts_pktinfo { __u32 reserved[2]; }; +/* + * SO_TXTIME gets a struct sock_txtime with flags being an integer bit + * field comprised of these values. + */ +enum txtime_flags { + SOF_TXTIME_DEADLINE_MODE = (1 << 0), + + SOF_TXTIME_FLAGS_MASK = (SOF_TXTIME_DEADLINE_MODE) +}; + +struct sock_txtime { + clockid_t clockid; /* reference clockid */ + u32 flags; /* flags defined by enum txtime_flags */ +}; + #endif /* _NET_TIMESTAMPING_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 6429982eb976..fe64b839f1b2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -91,6 +91,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -697,6 +698,7 @@ EXPORT_SYMBOL(sk_mc_loop); int sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { + struct sock_txtime sk_txtime; struct sock *sk = sock->sk; int val; int valbool; @@ -1070,6 +1072,24 @@ set_rcvbuf: } break; + case SO_TXTIME: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + } else if (optlen != sizeof(struct sock_txtime)) { + ret = -EINVAL; + } else if (copy_from_user(&sk_txtime, optval, + sizeof(struct sock_txtime))) { + ret = -EFAULT; + } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { + ret = -EINVAL; + } else { + sock_valbool_flag(sk, SOCK_TXTIME, true); + sk->sk_clockid = sk_txtime.clockid; + sk->sk_txtime_deadline_mode = + !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); + } + break; + default: ret = -ENOPROTOOPT; break; @@ -1115,6 +1135,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, u64 val64; struct linger ling; struct timeval tm; + struct sock_txtime txtime; } v; int lv = sizeof(int); @@ -1403,6 +1424,13 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sock_flag(sk, SOCK_ZEROCOPY); break; + case SO_TXTIME: + lv = sizeof(v.txtime); + v.txtime.clockid = sk->sk_clockid; + v.txtime.flags |= sk->sk_txtime_deadline_mode ? + SOF_TXTIME_DEADLINE_MODE : 0; + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). @@ -2137,6 +2165,13 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; sockc->tsflags |= tsflags; break; + case SCM_TXTIME: + if (!sock_flag(sk, SOCK_TXTIME)) + return -EINVAL; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) + return -EINVAL; + sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); + break; /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ case SCM_RIGHTS: case SCM_CREDENTIALS: From bc969a977880511057053642a81371196303ca01 Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Tue, 3 Jul 2018 15:42:49 -0700 Subject: [PATCH 03/14] net: ipv4: Hook into time based transmission Add a transmit_time field to struct inet_cork, then copy the timestamp from the CMSG cookie at ip_setup_cork() so we can safely copy it into the skb later during __ip_make_skb(). For the raw fast path, just perform the copy at raw_send_hdrinc(). Signed-off-by: Richard Cochran Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- include/net/inet_sock.h | 1 + net/ipv4/icmp.c | 2 ++ net/ipv4/ip_output.c | 3 +++ net/ipv4/ping.c | 1 + net/ipv4/raw.c | 2 ++ net/ipv4/udp.c | 1 + 6 files changed, 10 insertions(+) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 83d5b3c2ac42..314be484c696 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -148,6 +148,7 @@ struct inet_cork { __s16 tos; char priority; __u16 gso_size; + u64 transmit_time; }; struct inet_cork_full { diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 1617604c9284..937239afd68d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -437,6 +437,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; + ipc.sockc.transmit_time = 0; if (icmp_param->replyopts.opt.opt.optlen) { ipc.opt = &icmp_param->replyopts.opt; @@ -715,6 +716,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; + ipc.sockc.transmit_time = 0; rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, type, code, &icmp_param); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 188cc586e7ff..570e3ebc3974 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1154,6 +1154,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->tos = ipc->tos; cork->priority = ipc->priority; cork->tx_flags = ipc->tx_flags; + cork->transmit_time = ipc->sockc.transmit_time; return 0; } @@ -1414,6 +1415,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = cork->transmit_time; /* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec * on dst refcount @@ -1551,6 +1553,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; + ipc.sockc.transmit_time = 0; if (replyopts.opt.opt.optlen) { ipc.opt = &replyopts.opt; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 2ed64bca54e3..b47492205507 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -746,6 +746,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; + ipc.sockc.transmit_time = 0; if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index abb3c9490c55..446af7be2b55 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -381,6 +381,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = sockc->transmit_time; skb_dst_set(skb, &rt->dst); *rtp = NULL; @@ -562,6 +563,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } ipc.sockc.tsflags = sk->sk_tsflags; + ipc.sockc.transmit_time = 0; ipc.addr = inet->inet_saddr; ipc.opt = NULL; ipc.tx_flags = 0; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 24e116ddae79..5c76ba0666ec 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -930,6 +930,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; + ipc.sockc.transmit_time = 0; getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; From a818f75e311c23cdac528888c60ae6e43a8958d0 Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Tue, 3 Jul 2018 15:42:50 -0700 Subject: [PATCH 04/14] net: ipv6: Hook into time based transmission Add a struct sockcm_cookie parameter to ip6_setup_cork() so we can easily re-use the transmit_time field from struct inet_cork for most paths, by copying the timestamp from the CMSG cookie. This is later copied into the skb during __ip6_make_skb(). For the raw fast path, also pass the sockcm_cookie as a parameter so we can just perform the copy at rawv6_send_hdrinc() directly. Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 11 ++++++++--- net/ipv6/raw.c | 7 +++++-- net/ipv6/udp.c | 1 + 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index a14fb4fcdf18..f48af7e62f12 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1158,7 +1158,8 @@ static void ip6_append_data_mtu(unsigned int *mtu, static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6, - struct rt6_info *rt, struct flowi6 *fl6) + struct rt6_info *rt, struct flowi6 *fl6, + const struct sockcm_cookie *sockc) { struct ipv6_pinfo *np = inet6_sk(sk); unsigned int mtu; @@ -1226,6 +1227,8 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, cork->base.flags |= IPCORK_ALLFRAG; cork->base.length = 0; + cork->base.transmit_time = sockc->transmit_time; + return 0; } @@ -1575,7 +1578,7 @@ int ip6_append_data(struct sock *sk, * setup for corking */ err = ip6_setup_cork(sk, &inet->cork, &np->cork, - ipc6, rt, fl6); + ipc6, rt, fl6, sockc); if (err) return err; @@ -1673,6 +1676,8 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = cork->base.transmit_time; + skb_dst_set(skb, dst_clone(&rt->dst)); IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); if (proto == IPPROTO_ICMPV6) { @@ -1765,7 +1770,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, cork->base.opt = NULL; cork->base.dst = NULL; v6_cork.opt = NULL; - err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6); + err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6, sockc); if (err) { ip6_cork_release(cork, &v6_cork); return ERR_PTR(err); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index afc307c89d1a..5737c50f16eb 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -620,7 +620,7 @@ out: static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, struct flowi6 *fl6, struct dst_entry **dstp, - unsigned int flags) + unsigned int flags, const struct sockcm_cookie *sockc) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); @@ -650,6 +650,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->protocol = htons(ETH_P_IPV6); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = sockc->transmit_time; skb_dst_set(skb, &rt->dst); *dstp = NULL; @@ -848,6 +849,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_oif = sk->sk_bound_dev_if; sockc.tsflags = sk->sk_tsflags; + sockc.transmit_time = 0; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); @@ -921,7 +923,8 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) back_from_confirm: if (inet->hdrincl) - err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, msg->msg_flags); + err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, + msg->msg_flags, &sockc); else { ipc6.opt = opt; lock_sock(sk); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e6645cae403e..ac6fc6728903 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1148,6 +1148,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc6.dontfrag = -1; ipc6.gso_size = up->gso_size; sockc.tsflags = sk->sk_tsflags; + sockc.transmit_time = 0; /* destination address check */ if (sin6) { From 3d0ba8c03ca9c49ffcb79d989312f123dd1bdc7a Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 3 Jul 2018 15:42:51 -0700 Subject: [PATCH 05/14] net: packet: Hook into time based transmission. For raw layer-2 packets, copy the desired future transmit time from the CMSG cookie into the skb. Signed-off-by: Richard Cochran Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- net/packet/af_packet.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 57634bc3da74..3428f7739ae9 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1951,6 +1951,7 @@ retry: goto out_unlock; } + sockc.transmit_time = 0; sockc.tsflags = sk->sk_tsflags; if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); @@ -1962,6 +1963,7 @@ retry: skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = sockc.transmit_time; sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags); @@ -2457,6 +2459,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb->dev = dev; skb->priority = po->sk.sk_priority; skb->mark = po->sk.sk_mark; + skb->tstamp = sockc->transmit_time; sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags); skb_shinfo(skb)->destructor_arg = ph.raw; @@ -2633,6 +2636,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) if (unlikely(!(dev->flags & IFF_UP))) goto out_put; + sockc.transmit_time = 0; sockc.tsflags = po->sk.sk_tsflags; if (msg->msg_controllen) { err = sock_cmsg_send(&po->sk, msg, &sockc); @@ -2829,6 +2833,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (unlikely(!(dev->flags & IFF_UP))) goto out_unlock; + sockc.transmit_time = 0; sockc.tsflags = sk->sk_tsflags; sockc.mark = sk->sk_mark; if (msg->msg_controllen) { @@ -2903,6 +2908,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sockc.mark; + skb->tstamp = sockc.transmit_time; if (has_vnet_hdr) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); From 860b642b9c33ea4a6ae2f416607b0b98a9d11bb0 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Tue, 3 Jul 2018 15:42:52 -0700 Subject: [PATCH 06/14] net/sched: Allow creating a Qdisc watchdog with other clocks This adds 'qdisc_watchdog_init_clockid()' that allows a clockid to be passed, this allows other time references to be used when scheduling the Qdisc to run. Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 2 ++ net/sched/sch_api.c | 11 +++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 815b92a23936..2466ea143d01 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -72,6 +72,8 @@ struct qdisc_watchdog { struct Qdisc *qdisc; }; +void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, + clockid_t clockid); void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc); void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 54eca685420f..98541c6399db 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -596,12 +596,19 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) return HRTIMER_NORESTART; } -void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) +void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, + clockid_t clockid) { - hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED); wd->timer.function = qdisc_watchdog; wd->qdisc = qdisc; } +EXPORT_SYMBOL(qdisc_watchdog_init_clockid); + +void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) +{ + qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC); +} EXPORT_SYMBOL(qdisc_watchdog_init); void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) From 25db26a91364db00f5a30da2fea8e9afe14a163c Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Tue, 3 Jul 2018 15:42:53 -0700 Subject: [PATCH 07/14] net/sched: Introduce the ETF Qdisc The ETF (Earliest TxTime First) qdisc uses the information added earlier in this series (the socket option SO_TXTIME and the new role of sk_buff->tstamp) to schedule packets transmission based on absolute time. For some workloads, just bandwidth enforcement is not enough, and precise control of the transmission of packets is necessary. Example: $ tc qdisc replace dev enp2s0 parent root handle 100 mqprio num_tc 3 \ map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 0 $ tc qdisc add dev enp2s0 parent 100:1 etf delta 100000 \ clockid CLOCK_TAI In this example, the Qdisc will provide SW best-effort for the control of the transmission time to the network adapter, the time stamp in the socket will be in reference to the clockid CLOCK_TAI and packets will leave the qdisc "delta" (100000) nanoseconds before its transmission time. The ETF qdisc will buffer packets sorted by their txtime. It will drop packets on enqueue() if their skbuff clockid does not match the clock reference of the Qdisc. Moreover, on dequeue(), a packet will be dropped if it expires while being enqueued. The qdisc also supports the SO_TXTIME deadline mode. For this mode, it will dequeue a packet as soon as possible and change the skb timestamp to 'now' during etf_dequeue(). Note that both the qdisc's and the SO_TXTIME ABIs allow for a clockid to be configured, but it's been decided that usage of CLOCK_TAI should be enforced until we decide to allow for other clockids to be used. The rationale here is that PTP times are usually in the TAI scale, thus no other clocks should be necessary. For now, the qdisc will return EINVAL if any clocks other than CLOCK_TAI are used. Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + include/uapi/linux/pkt_sched.h | 17 ++ net/sched/Kconfig | 11 + net/sched/Makefile | 1 + net/sched/sch_etf.c | 384 +++++++++++++++++++++++++++++++++ 5 files changed, 414 insertions(+) create mode 100644 net/sched/sch_etf.c diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c1ef749b6f9f..f06ee8f91e74 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -798,6 +798,7 @@ enum tc_setup_type { TC_SETUP_QDISC_RED, TC_SETUP_QDISC_PRIO, TC_SETUP_QDISC_MQ, + TC_SETUP_QDISC_ETF, }; /* These structures hold the attributes of bpf state that are being passed diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index bad3c03bcf43..d5e933ce1447 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -937,4 +937,21 @@ enum { #define TCA_CBS_MAX (__TCA_CBS_MAX - 1) + +/* ETF */ +struct tc_etf_qopt { + __s32 delta; + __s32 clockid; + __u32 flags; +#define TC_ETF_DEADLINE_MODE_ON BIT(0) +}; + +enum { + TCA_ETF_UNSPEC, + TCA_ETF_PARMS, + __TCA_ETF_MAX, +}; + +#define TCA_ETF_MAX (__TCA_ETF_MAX - 1) + #endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index a01169fb5325..fcc89706745b 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -183,6 +183,17 @@ config NET_SCH_CBS To compile this code as a module, choose M here: the module will be called sch_cbs. +config NET_SCH_ETF + tristate "Earliest TxTime First (ETF)" + help + Say Y here if you want to use the Earliest TxTime First (ETF) packet + scheduling algorithm. + + See the top of for more details. + + To compile this code as a module, choose M here: the + module will be called sch_etf. + config NET_SCH_GRED tristate "Generic Random Early Detection (GRED)" ---help--- diff --git a/net/sched/Makefile b/net/sched/Makefile index 8811d3804878..9a5a7077d217 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -54,6 +54,7 @@ obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o +obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c new file mode 100644 index 000000000000..4b7f4903ac17 --- /dev/null +++ b/net/sched/sch_etf.c @@ -0,0 +1,384 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* net/sched/sch_etf.c Earliest TxTime First queueing discipline. + * + * Authors: Jesus Sanchez-Palencia + * Vinicius Costa Gomes + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON) + +struct etf_sched_data { + bool deadline_mode; + int clockid; + int queue; + s32 delta; /* in ns */ + ktime_t last; /* The txtime of the last skb sent to the netdevice. */ + struct rb_root head; + struct qdisc_watchdog watchdog; + ktime_t (*get_time)(void); +}; + +static const struct nla_policy etf_policy[TCA_ETF_MAX + 1] = { + [TCA_ETF_PARMS] = { .len = sizeof(struct tc_etf_qopt) }, +}; + +static inline int validate_input_params(struct tc_etf_qopt *qopt, + struct netlink_ext_ack *extack) +{ + /* Check if params comply to the following rules: + * * Clockid and delta must be valid. + * + * * Dynamic clockids are not supported. + * + * * Delta must be a positive integer. + */ + if (qopt->clockid < 0) { + NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported"); + return -ENOTSUPP; + } + + if (qopt->clockid != CLOCK_TAI) { + NL_SET_ERR_MSG(extack, "Invalid clockid. CLOCK_TAI must be used"); + return -EINVAL; + } + + if (qopt->delta < 0) { + NL_SET_ERR_MSG(extack, "Delta must be positive"); + return -EINVAL; + } + + return 0; +} + +static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb) +{ + struct etf_sched_data *q = qdisc_priv(sch); + ktime_t txtime = nskb->tstamp; + struct sock *sk = nskb->sk; + ktime_t now; + + if (!sk) + return false; + + if (!sock_flag(sk, SOCK_TXTIME)) + return false; + + /* We don't perform crosstimestamping. + * Drop if packet's clockid differs from qdisc's. + */ + if (sk->sk_clockid != q->clockid) + return false; + + if (sk->sk_txtime_deadline_mode != q->deadline_mode) + return false; + + now = q->get_time(); + if (ktime_before(txtime, now) || ktime_before(txtime, q->last)) + return false; + + return true; +} + +static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct rb_node *p; + + p = rb_first(&q->head); + if (!p) + return NULL; + + return rb_to_skb(p); +} + +static void reset_watchdog(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = etf_peek_timesortedlist(sch); + ktime_t next; + + if (!skb) + return; + + next = ktime_sub_ns(skb->tstamp, q->delta); + qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next)); +} + +static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct rb_node **p = &q->head.rb_node, *parent = NULL; + ktime_t txtime = nskb->tstamp; + + if (!is_packet_valid(sch, nskb)) + return qdisc_drop(nskb, sch, to_free); + + while (*p) { + struct sk_buff *skb; + + parent = *p; + skb = rb_to_skb(parent); + if (ktime_after(txtime, skb->tstamp)) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&nskb->rbnode, parent, p); + rb_insert_color(&nskb->rbnode, &q->head); + + qdisc_qstats_backlog_inc(sch, nskb); + sch->q.qlen++; + + /* Now we may need to re-arm the qdisc watchdog for the next packet. */ + reset_watchdog(sch); + + return NET_XMIT_SUCCESS; +} + +static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb, + bool drop) +{ + struct etf_sched_data *q = qdisc_priv(sch); + + rb_erase(&skb->rbnode, &q->head); + + /* The rbnode field in the skb re-uses these fields, now that + * we are done with the rbnode, reset them. + */ + skb->next = NULL; + skb->prev = NULL; + skb->dev = qdisc_dev(sch); + + qdisc_qstats_backlog_dec(sch, skb); + + if (drop) { + struct sk_buff *to_free = NULL; + + qdisc_drop(skb, sch, &to_free); + kfree_skb_list(to_free); + qdisc_qstats_overlimit(sch); + } else { + qdisc_bstats_update(sch, skb); + + q->last = skb->tstamp; + } + + sch->q.qlen--; +} + +static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + ktime_t now, next; + + skb = etf_peek_timesortedlist(sch); + if (!skb) + return NULL; + + now = q->get_time(); + + /* Drop if packet has expired while in queue. */ + /* FIXME: Must return error on the socket's error queue */ + if (ktime_before(skb->tstamp, now)) { + timesortedlist_erase(sch, skb, true); + skb = NULL; + goto out; + } + + /* When in deadline mode, dequeue as soon as possible and change the + * txtime from deadline to (now + delta). + */ + if (q->deadline_mode) { + timesortedlist_erase(sch, skb, false); + skb->tstamp = now; + goto out; + } + + next = ktime_sub_ns(skb->tstamp, q->delta); + + /* Dequeue only if now is within the [txtime - delta, txtime] range. */ + if (ktime_after(now, next)) + timesortedlist_erase(sch, skb, false); + else + skb = NULL; + +out: + /* Now we may need to re-arm the qdisc watchdog for the next packet. */ + reset_watchdog(sch); + + return skb; +} + +static int etf_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct nlattr *tb[TCA_ETF_MAX + 1]; + struct tc_etf_qopt *qopt; + int err; + + if (!opt) { + NL_SET_ERR_MSG(extack, + "Missing ETF qdisc options which are mandatory"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_ETF_MAX, opt, etf_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_ETF_PARMS]) { + NL_SET_ERR_MSG(extack, "Missing mandatory ETF parameters"); + return -EINVAL; + } + + qopt = nla_data(tb[TCA_ETF_PARMS]); + + pr_debug("delta %d clockid %d deadline %s\n", + qopt->delta, qopt->clockid, + DEADLINE_MODE_IS_ON(qopt) ? "on" : "off"); + + err = validate_input_params(qopt, extack); + if (err < 0) + return err; + + q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); + + /* Everything went OK, save the parameters used. */ + q->delta = qopt->delta; + q->clockid = qopt->clockid; + q->deadline_mode = DEADLINE_MODE_IS_ON(qopt); + + switch (q->clockid) { + case CLOCK_REALTIME: + q->get_time = ktime_get_real; + break; + case CLOCK_MONOTONIC: + q->get_time = ktime_get; + break; + case CLOCK_BOOTTIME: + q->get_time = ktime_get_boottime; + break; + case CLOCK_TAI: + q->get_time = ktime_get_clocktai; + break; + default: + NL_SET_ERR_MSG(extack, "Clockid is not supported"); + return -ENOTSUPP; + } + + qdisc_watchdog_init_clockid(&q->watchdog, sch, q->clockid); + + return 0; +} + +static void timesortedlist_clear(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct rb_node *p = rb_first(&q->head); + + while (p) { + struct sk_buff *skb = rb_to_skb(p); + + p = rb_next(p); + + rb_erase(&skb->rbnode, &q->head); + rtnl_kfree_skbs(skb, skb); + sch->q.qlen--; + } +} + +static void etf_reset(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + + /* Only cancel watchdog if it's been initialized. */ + if (q->watchdog.qdisc == sch) + qdisc_watchdog_cancel(&q->watchdog); + + /* No matter which mode we are on, it's safe to clear both lists. */ + timesortedlist_clear(sch); + __qdisc_reset_queue(&sch->q); + + sch->qstats.backlog = 0; + sch->q.qlen = 0; + + q->last = 0; +} + +static void etf_destroy(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + + /* Only cancel watchdog if it's been initialized. */ + if (q->watchdog.qdisc == sch) + qdisc_watchdog_cancel(&q->watchdog); +} + +static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct tc_etf_qopt opt = { }; + struct nlattr *nest; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + opt.delta = q->delta; + opt.clockid = q->clockid; + if (q->deadline_mode) + opt.flags |= TC_ETF_DEADLINE_MODE_ON; + + if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static struct Qdisc_ops etf_qdisc_ops __read_mostly = { + .id = "etf", + .priv_size = sizeof(struct etf_sched_data), + .enqueue = etf_enqueue_timesortedlist, + .dequeue = etf_dequeue_timesortedlist, + .peek = etf_peek_timesortedlist, + .init = etf_init, + .reset = etf_reset, + .destroy = etf_destroy, + .dump = etf_dump, + .owner = THIS_MODULE, +}; + +static int __init etf_module_init(void) +{ + return register_qdisc(&etf_qdisc_ops); +} + +static void __exit etf_module_exit(void) +{ + unregister_qdisc(&etf_qdisc_ops); +} +module_init(etf_module_init) +module_exit(etf_module_exit) +MODULE_LICENSE("GPL"); From 88cab77162e86e0f6a2b7e4f859c1435c4e24feb Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Tue, 3 Jul 2018 15:42:54 -0700 Subject: [PATCH 08/14] net/sched: Add HW offloading capability to ETF Add infra so etf qdisc supports HW offload of time-based transmission. For hw offload, the time sorted list is still used, so packets are dequeued always in order of txtime. Example: $ tc qdisc replace dev enp2s0 parent root handle 100 mqprio num_tc 3 \ map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 0 $ tc qdisc add dev enp2s0 parent 100:1 etf offload delta 100000 \ clockid CLOCK_REALTIME In this example, the Qdisc will use HW offload for the control of the transmission time through the network adapter. The hrtimer used for packets scheduling inside the qdisc will use the clockid CLOCK_REALTIME as reference and packets leave the Qdisc "delta" (100000) nanoseconds before their transmission time. Because this will be using HW offload and since dynamic clocks are not supported by the hrtimer, the system clock and the PHC clock must be synchronized for this mode to behave as expected. Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 5 +++ include/uapi/linux/pkt_sched.h | 1 + net/sched/sch_etf.c | 71 +++++++++++++++++++++++++++++++++- 3 files changed, 76 insertions(+), 1 deletion(-) diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 2466ea143d01..7dc769e5452b 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -155,4 +155,9 @@ struct tc_cbs_qopt_offload { s32 sendslope; }; +struct tc_etf_qopt_offload { + u8 enable; + s32 queue; +}; + #endif diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index d5e933ce1447..949118461009 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -944,6 +944,7 @@ struct tc_etf_qopt { __s32 clockid; __u32 flags; #define TC_ETF_DEADLINE_MODE_ON BIT(0) +#define TC_ETF_OFFLOAD_ON BIT(1) }; enum { diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index 4b7f4903ac17..932a136db568 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -20,8 +20,10 @@ #include #define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON) +#define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON) struct etf_sched_data { + bool offload; bool deadline_mode; int clockid; int queue; @@ -45,6 +47,9 @@ static inline int validate_input_params(struct tc_etf_qopt *qopt, * * Dynamic clockids are not supported. * * * Delta must be a positive integer. + * + * Also note that for the HW offload case, we must + * expect that system clocks have been synchronized to PHC. */ if (qopt->clockid < 0) { NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported"); @@ -225,6 +230,56 @@ out: return skb; } +static void etf_disable_offload(struct net_device *dev, + struct etf_sched_data *q) +{ + struct tc_etf_qopt_offload etf = { }; + const struct net_device_ops *ops; + int err; + + if (!q->offload) + return; + + ops = dev->netdev_ops; + if (!ops->ndo_setup_tc) + return; + + etf.queue = q->queue; + etf.enable = 0; + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf); + if (err < 0) + pr_warn("Couldn't disable ETF offload for queue %d\n", + etf.queue); +} + +static int etf_enable_offload(struct net_device *dev, struct etf_sched_data *q, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct tc_etf_qopt_offload etf = { }; + int err; + + if (q->offload) + return 0; + + if (!ops->ndo_setup_tc) { + NL_SET_ERR_MSG(extack, "Specified device does not support ETF offload"); + return -EOPNOTSUPP; + } + + etf.queue = q->queue; + etf.enable = 1; + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Specified device failed to setup ETF hardware offload"); + return err; + } + + return 0; +} + static int etf_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -251,8 +306,9 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt, qopt = nla_data(tb[TCA_ETF_PARMS]); - pr_debug("delta %d clockid %d deadline %s\n", + pr_debug("delta %d clockid %d offload %s deadline %s\n", qopt->delta, qopt->clockid, + OFFLOAD_IS_ON(qopt) ? "on" : "off", DEADLINE_MODE_IS_ON(qopt) ? "on" : "off"); err = validate_input_params(qopt, extack); @@ -261,9 +317,16 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt, q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); + if (OFFLOAD_IS_ON(qopt)) { + err = etf_enable_offload(dev, q, extack); + if (err < 0) + return err; + } + /* Everything went OK, save the parameters used. */ q->delta = qopt->delta; q->clockid = qopt->clockid; + q->offload = OFFLOAD_IS_ON(qopt); q->deadline_mode = DEADLINE_MODE_IS_ON(qopt); switch (q->clockid) { @@ -326,10 +389,13 @@ static void etf_reset(struct Qdisc *sch) static void etf_destroy(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); /* Only cancel watchdog if it's been initialized. */ if (q->watchdog.qdisc == sch) qdisc_watchdog_cancel(&q->watchdog); + + etf_disable_offload(dev, q); } static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -344,6 +410,9 @@ static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) opt.delta = q->delta; opt.clockid = q->clockid; + if (q->offload) + opt.flags |= TC_ETF_OFFLOAD_ON; + if (q->deadline_mode) opt.flags |= TC_ETF_DEADLINE_MODE_ON; From 91db364236c8ae1af976d9794e5fec98e859dae7 Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Tue, 3 Jul 2018 15:42:55 -0700 Subject: [PATCH 09/14] igb: Refactor igb_configure_cbs() Make this function retrieve what it needs from the Tx ring being addressed since it already relies on what had been saved on it before. Also, since this function will be used by the upcoming Launchtime patches rename it to better reflect its intention. Note that Launchtime is not part of what 802.1Qav specifies, but the i210 datasheet refers to this set of functionality as "Qav Transmission Mode". Here we also perform a tiny refactor at is_any_cbs_enabled(), and add further documentation to igb_setup_tx_mode(). Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/igb/igb_main.c | 60 +++++++++++------------ 1 file changed, 28 insertions(+), 32 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index f1e3397bd405..15f6b9c57ccf 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -1655,23 +1655,17 @@ static void set_queue_mode(struct e1000_hw *hw, int queue, enum queue_mode mode) } /** - * igb_configure_cbs - Configure Credit-Based Shaper (CBS) + * igb_config_tx_modes - Configure "Qav Tx mode" features on igb * @adapter: pointer to adapter struct * @queue: queue number - * @enable: true = enable CBS, false = disable CBS - * @idleslope: idleSlope in kbps - * @sendslope: sendSlope in kbps - * @hicredit: hiCredit in bytes - * @locredit: loCredit in bytes * - * Configure CBS for a given hardware queue. When disabling, idleslope, - * sendslope, hicredit, locredit arguments are ignored. Returns 0 if - * success. Negative otherwise. + * Configure CBS for a given hardware queue. Parameters are retrieved + * from the correct Tx ring, so igb_save_cbs_params() should be used + * for setting those correctly prior to this function being called. **/ -static void igb_configure_cbs(struct igb_adapter *adapter, int queue, - bool enable, int idleslope, int sendslope, - int hicredit, int locredit) +static void igb_config_tx_modes(struct igb_adapter *adapter, int queue) { + struct igb_ring *ring = adapter->tx_ring[queue]; struct net_device *netdev = adapter->netdev; struct e1000_hw *hw = &adapter->hw; u32 tqavcc; @@ -1680,7 +1674,7 @@ static void igb_configure_cbs(struct igb_adapter *adapter, int queue, WARN_ON(hw->mac.type != e1000_i210); WARN_ON(queue < 0 || queue > 1); - if (enable || queue == 0) { + if (ring->cbs_enable || queue == 0) { /* i210 does not allow the queue 0 to be in the Strict * Priority mode while the Qav mode is enabled, so, * instead of disabling strict priority mode, we give @@ -1690,10 +1684,10 @@ static void igb_configure_cbs(struct igb_adapter *adapter, int queue, * Queue0 QueueMode must be set to 1b when * TransmitMode is set to Qav." */ - if (queue == 0 && !enable) { + if (queue == 0 && !ring->cbs_enable) { /* max "linkspeed" idleslope in kbps */ - idleslope = 1000000; - hicredit = ETH_FRAME_LEN; + ring->idleslope = 1000000; + ring->hicredit = ETH_FRAME_LEN; } set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_HIGH); @@ -1756,14 +1750,15 @@ static void igb_configure_cbs(struct igb_adapter *adapter, int queue, * calculated value, so the resulting bandwidth might * be slightly higher for some configurations. */ - value = DIV_ROUND_UP_ULL(idleslope * 61034ULL, 1000000); + value = DIV_ROUND_UP_ULL(ring->idleslope * 61034ULL, 1000000); tqavcc = rd32(E1000_I210_TQAVCC(queue)); tqavcc &= ~E1000_TQAVCC_IDLESLOPE_MASK; tqavcc |= value; wr32(E1000_I210_TQAVCC(queue), tqavcc); - wr32(E1000_I210_TQAVHC(queue), 0x80000000 + hicredit * 0x7735); + wr32(E1000_I210_TQAVHC(queue), + 0x80000000 + ring->hicredit * 0x7735); } else { set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_LOW); set_queue_mode(hw, queue, QUEUE_MODE_STRICT_PRIORITY); @@ -1783,8 +1778,9 @@ static void igb_configure_cbs(struct igb_adapter *adapter, int queue, */ netdev_dbg(netdev, "CBS %s: queue %d idleslope %d sendslope %d hiCredit %d locredit %d\n", - (enable) ? "enabled" : "disabled", queue, - idleslope, sendslope, hicredit, locredit); + (ring->cbs_enable) ? "enabled" : "disabled", queue, + ring->idleslope, ring->sendslope, ring->hicredit, + ring->locredit); } static int igb_save_cbs_params(struct igb_adapter *adapter, int queue, @@ -1809,19 +1805,25 @@ static int igb_save_cbs_params(struct igb_adapter *adapter, int queue, static bool is_any_cbs_enabled(struct igb_adapter *adapter) { - struct igb_ring *ring; int i; for (i = 0; i < adapter->num_tx_queues; i++) { - ring = adapter->tx_ring[i]; - - if (ring->cbs_enable) + if (adapter->tx_ring[i]->cbs_enable) return true; } return false; } +/** + * igb_setup_tx_mode - Switch to/from Qav Tx mode when applicable + * @adapter: pointer to adapter struct + * + * Configure TQAVCTRL register switching the controller's Tx mode + * if FQTSS mode is enabled or disabled. Additionally, will issue + * a call to igb_config_tx_modes() per queue so any previously saved + * Tx parameters are applied. + **/ static void igb_setup_tx_mode(struct igb_adapter *adapter) { struct net_device *netdev = adapter->netdev; @@ -1881,11 +1883,7 @@ static void igb_setup_tx_mode(struct igb_adapter *adapter) adapter->num_tx_queues : I210_SR_QUEUES_NUM; for (i = 0; i < max_queue; i++) { - struct igb_ring *ring = adapter->tx_ring[i]; - - igb_configure_cbs(adapter, i, ring->cbs_enable, - ring->idleslope, ring->sendslope, - ring->hicredit, ring->locredit); + igb_config_tx_modes(adapter, i); } } else { wr32(E1000_RXPBS, I210_RXPBSIZE_DEFAULT); @@ -2480,9 +2478,7 @@ static int igb_offload_cbs(struct igb_adapter *adapter, return err; if (is_fqtss_enabled(adapter)) { - igb_configure_cbs(adapter, qopt->queue, qopt->enable, - qopt->idleslope, qopt->sendslope, - qopt->hicredit, qopt->locredit); + igb_config_tx_modes(adapter, qopt->queue); if (!is_any_cbs_enabled(adapter)) enable_fqtss(adapter, false); From 0364a0d0e7a124e7c821cc536a95f7ef421349cb Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Tue, 3 Jul 2018 15:42:56 -0700 Subject: [PATCH 10/14] igb: Only change Tx arbitration when CBS is on Currently the data transmission arbitration algorithm - DataTranARB field on TQAVCTRL reg - is always set to CBS when the Tx mode is changed from legacy to 'Qav' mode. Make that configuration a bit more granular in preparation for the upcoming Launchtime enabling patches, since CBS and Launchtime can be enabled separately. That is achieved by moving the DataTranARB setup to igb_config_tx_modes() instead. Similarly, when disabling CBS we must check if it has been disabled for all queues, and clear the DataTranARB accordingly. Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/igb/igb_main.c | 49 +++++++++++++++-------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 15f6b9c57ccf..8c90f1e51add 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -1654,6 +1654,18 @@ static void set_queue_mode(struct e1000_hw *hw, int queue, enum queue_mode mode) wr32(E1000_I210_TQAVCC(queue), val); } +static bool is_any_cbs_enabled(struct igb_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_tx_queues; i++) { + if (adapter->tx_ring[i]->cbs_enable) + return true; + } + + return false; +} + /** * igb_config_tx_modes - Configure "Qav Tx mode" features on igb * @adapter: pointer to adapter struct @@ -1668,7 +1680,7 @@ static void igb_config_tx_modes(struct igb_adapter *adapter, int queue) struct igb_ring *ring = adapter->tx_ring[queue]; struct net_device *netdev = adapter->netdev; struct e1000_hw *hw = &adapter->hw; - u32 tqavcc; + u32 tqavcc, tqavctrl; u16 value; WARN_ON(hw->mac.type != e1000_i210); @@ -1693,6 +1705,14 @@ static void igb_config_tx_modes(struct igb_adapter *adapter, int queue) set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_HIGH); set_queue_mode(hw, queue, QUEUE_MODE_STREAM_RESERVATION); + /* Always set data transfer arbitration to credit-based + * shaper algorithm on TQAVCTRL if CBS is enabled for any of + * the queues. + */ + tqavctrl = rd32(E1000_I210_TQAVCTRL); + tqavctrl |= E1000_TQAVCTRL_DATATRANARB; + wr32(E1000_I210_TQAVCTRL, tqavctrl); + /* According to i210 datasheet section 7.2.7.7, we should set * the 'idleSlope' field from TQAVCC register following the * equation: @@ -1770,6 +1790,16 @@ static void igb_config_tx_modes(struct igb_adapter *adapter, int queue) /* Set hiCredit to zero. */ wr32(E1000_I210_TQAVHC(queue), 0); + + /* If CBS is not enabled for any queues anymore, then return to + * the default state of Data Transmission Arbitration on + * TQAVCTRL. + */ + if (!is_any_cbs_enabled(adapter)) { + tqavctrl = rd32(E1000_I210_TQAVCTRL); + tqavctrl &= ~E1000_TQAVCTRL_DATATRANARB; + wr32(E1000_I210_TQAVCTRL, tqavctrl); + } } /* XXX: In i210 controller the sendSlope and loCredit parameters from @@ -1803,18 +1833,6 @@ static int igb_save_cbs_params(struct igb_adapter *adapter, int queue, return 0; } -static bool is_any_cbs_enabled(struct igb_adapter *adapter) -{ - int i; - - for (i = 0; i < adapter->num_tx_queues; i++) { - if (adapter->tx_ring[i]->cbs_enable) - return true; - } - - return false; -} - /** * igb_setup_tx_mode - Switch to/from Qav Tx mode when applicable * @adapter: pointer to adapter struct @@ -1838,11 +1856,10 @@ static void igb_setup_tx_mode(struct igb_adapter *adapter) int i, max_queue; /* Configure TQAVCTRL register: set transmit mode to 'Qav', - * set data fetch arbitration to 'round robin' and set data - * transfer arbitration to 'credit shaper algorithm. + * set data fetch arbitration to 'round robin'. */ val = rd32(E1000_I210_TQAVCTRL); - val |= E1000_TQAVCTRL_XMIT_MODE | E1000_TQAVCTRL_DATATRANARB; + val |= E1000_TQAVCTRL_XMIT_MODE; val &= ~E1000_TQAVCTRL_DATAFETCHARB; wr32(E1000_I210_TQAVCTRL, val); From 8080e6ab4e99216f414c5c314264fd7cf3b6e4c1 Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Tue, 3 Jul 2018 15:42:57 -0700 Subject: [PATCH 11/14] igb: Refactor igb_offload_cbs() Split code into a separate function (igb_offload_apply()) that will be used by ETF offload implementation. Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/igb/igb_main.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 8c90f1e51add..c30ab7b260cc 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2474,6 +2474,19 @@ igb_features_check(struct sk_buff *skb, struct net_device *dev, return features; } +static void igb_offload_apply(struct igb_adapter *adapter, s32 queue) +{ + if (!is_fqtss_enabled(adapter)) { + enable_fqtss(adapter, true); + return; + } + + igb_config_tx_modes(adapter, queue); + + if (!is_any_cbs_enabled(adapter)) + enable_fqtss(adapter, false); +} + static int igb_offload_cbs(struct igb_adapter *adapter, struct tc_cbs_qopt_offload *qopt) { @@ -2494,15 +2507,7 @@ static int igb_offload_cbs(struct igb_adapter *adapter, if (err) return err; - if (is_fqtss_enabled(adapter)) { - igb_config_tx_modes(adapter, qopt->queue); - - if (!is_any_cbs_enabled(adapter)) - enable_fqtss(adapter, false); - - } else { - enable_fqtss(adapter, true); - } + igb_offload_apply(adapter, qopt->queue); return 0; } From 1b9231e7e148520a3ba63a604b27f11093f21bee Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Tue, 3 Jul 2018 15:42:58 -0700 Subject: [PATCH 12/14] igb: Only call skb_tx_timestamp after descriptors are ready Currently, skb_tx_timestamp() is being called before the Tx descriptors are prepared in igb_xmit_frame_ring(), which happens during either the igb_tso() or igb_tx_csum() calls. Given that now the skb->tstamp might be used to carry the timestamp for SO_TXTIME, we must only call skb_tx_timestamp() after the information has been copied into the Tx descriptors. Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/igb/igb_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index c30ab7b260cc..445da8285d9b 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -6033,8 +6033,6 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb, } } - skb_tx_timestamp(skb); - if (skb_vlan_tag_present(skb)) { tx_flags |= IGB_TX_FLAGS_VLAN; tx_flags |= (skb_vlan_tag_get(skb) << IGB_TX_FLAGS_VLAN_SHIFT); @@ -6050,6 +6048,8 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb, else if (!tso) igb_tx_csum(tx_ring, first); + skb_tx_timestamp(skb); + if (igb_tx_map(tx_ring, first, hdr_len)) goto cleanup_tx_tstamp; From 3048cf84d152344f874e993558770bba73a65c8f Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Tue, 3 Jul 2018 15:42:59 -0700 Subject: [PATCH 13/14] igb: Add support for ETF offload Implement HW offload support for SO_TXTIME through igb's Launchtime feature. This is done by extending igb_setup_tc() so it supports TC_SETUP_QDISC_ETF and configuring i210 so time based transmit arbitration is enabled. The FQTSS transmission mode added before is extended so strict priority (SP) queues wait for stream reservation (SR) ones. igb_config_tx_modes() is extended so it can support enabling/disabling Launchtime following the previous approach used for the credit-based shaper (CBS). As the previous flow, FQTSS transmission mode is enabled automatically by the driver once Launchtime (or CBS, as before) is enabled. Similarly, it's automatically disabled when the feature is disabled for the last queue that had it setup on. The driver just consumes the transmit times from the skbuffs directly, so no special handling is done in case an 'invalid' time is provided. We assume this has been handled by the ETF qdisc already. Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- .../net/ethernet/intel/igb/e1000_defines.h | 16 ++ drivers/net/ethernet/intel/igb/igb.h | 1 + drivers/net/ethernet/intel/igb/igb_main.c | 138 +++++++++++++++--- 3 files changed, 138 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/e1000_defines.h b/drivers/net/ethernet/intel/igb/e1000_defines.h index 252440a418dc..8a28f3388f69 100644 --- a/drivers/net/ethernet/intel/igb/e1000_defines.h +++ b/drivers/net/ethernet/intel/igb/e1000_defines.h @@ -1048,6 +1048,22 @@ #define E1000_TQAVCTRL_XMIT_MODE BIT(0) #define E1000_TQAVCTRL_DATAFETCHARB BIT(4) #define E1000_TQAVCTRL_DATATRANARB BIT(8) +#define E1000_TQAVCTRL_DATATRANTIM BIT(9) +#define E1000_TQAVCTRL_SP_WAIT_SR BIT(10) +/* Fetch Time Delta - bits 31:16 + * + * This field holds the value to be reduced from the launch time for + * fetch time decision. The FetchTimeDelta value is defined in 32 ns + * granularity. + * + * This field is 16 bits wide, and so the maximum value is: + * + * 65535 * 32 = 2097120 ~= 2.1 msec + * + * XXX: We are configuring the max value here since we couldn't come up + * with a reason for not doing so. + */ +#define E1000_TQAVCTRL_FETCHTIME_DELTA (0xFFFF << 16) /* TX Qav Credit Control fields */ #define E1000_TQAVCC_IDLESLOPE_MASK 0xFFFF diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index 9643b5b3d444..ca54e268d157 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -262,6 +262,7 @@ struct igb_ring { u16 count; /* number of desc. in the ring */ u8 queue_index; /* logical index of the ring*/ u8 reg_idx; /* physical index of the ring */ + bool launchtime_enable; /* true if LaunchTime is enabled */ bool cbs_enable; /* indicates if CBS is enabled */ s32 idleslope; /* idleSlope in kbps */ s32 sendslope; /* sendSlope in kbps */ diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 445da8285d9b..e3a0c02721c9 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -1666,13 +1666,26 @@ static bool is_any_cbs_enabled(struct igb_adapter *adapter) return false; } +static bool is_any_txtime_enabled(struct igb_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_tx_queues; i++) { + if (adapter->tx_ring[i]->launchtime_enable) + return true; + } + + return false; +} + /** * igb_config_tx_modes - Configure "Qav Tx mode" features on igb * @adapter: pointer to adapter struct * @queue: queue number * - * Configure CBS for a given hardware queue. Parameters are retrieved - * from the correct Tx ring, so igb_save_cbs_params() should be used + * Configure CBS and Launchtime for a given hardware queue. + * Parameters are retrieved from the correct Tx ring, so + * igb_save_cbs_params() and igb_save_txtime_params() should be used * for setting those correctly prior to this function being called. **/ static void igb_config_tx_modes(struct igb_adapter *adapter, int queue) @@ -1686,6 +1699,19 @@ static void igb_config_tx_modes(struct igb_adapter *adapter, int queue) WARN_ON(hw->mac.type != e1000_i210); WARN_ON(queue < 0 || queue > 1); + /* If any of the Qav features is enabled, configure queues as SR and + * with HIGH PRIO. If none is, then configure them with LOW PRIO and + * as SP. + */ + if (ring->cbs_enable || ring->launchtime_enable) { + set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_HIGH); + set_queue_mode(hw, queue, QUEUE_MODE_STREAM_RESERVATION); + } else { + set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_LOW); + set_queue_mode(hw, queue, QUEUE_MODE_STRICT_PRIORITY); + } + + /* If CBS is enabled, set DataTranARB and config its parameters. */ if (ring->cbs_enable || queue == 0) { /* i210 does not allow the queue 0 to be in the Strict * Priority mode while the Qav mode is enabled, so, @@ -1702,9 +1728,6 @@ static void igb_config_tx_modes(struct igb_adapter *adapter, int queue) ring->hicredit = ETH_FRAME_LEN; } - set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_HIGH); - set_queue_mode(hw, queue, QUEUE_MODE_STREAM_RESERVATION); - /* Always set data transfer arbitration to credit-based * shaper algorithm on TQAVCTRL if CBS is enabled for any of * the queues. @@ -1780,8 +1803,6 @@ static void igb_config_tx_modes(struct igb_adapter *adapter, int queue) wr32(E1000_I210_TQAVHC(queue), 0x80000000 + ring->hicredit * 0x7735); } else { - set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_LOW); - set_queue_mode(hw, queue, QUEUE_MODE_STRICT_PRIORITY); /* Set idleSlope to zero. */ tqavcc = rd32(E1000_I210_TQAVCC(queue)); @@ -1802,17 +1823,61 @@ static void igb_config_tx_modes(struct igb_adapter *adapter, int queue) } } + /* If LaunchTime is enabled, set DataTranTIM. */ + if (ring->launchtime_enable) { + /* Always set DataTranTIM on TQAVCTRL if LaunchTime is enabled + * for any of the SR queues, and configure fetchtime delta. + * XXX NOTE: + * - LaunchTime will be enabled for all SR queues. + * - A fixed offset can be added relative to the launch + * time of all packets if configured at reg LAUNCH_OS0. + * We are keeping it as 0 for now (default value). + */ + tqavctrl = rd32(E1000_I210_TQAVCTRL); + tqavctrl |= E1000_TQAVCTRL_DATATRANTIM | + E1000_TQAVCTRL_FETCHTIME_DELTA; + wr32(E1000_I210_TQAVCTRL, tqavctrl); + } else { + /* If Launchtime is not enabled for any SR queues anymore, + * then clear DataTranTIM on TQAVCTRL and clear fetchtime delta, + * effectively disabling Launchtime. + */ + if (!is_any_txtime_enabled(adapter)) { + tqavctrl = rd32(E1000_I210_TQAVCTRL); + tqavctrl &= ~E1000_TQAVCTRL_DATATRANTIM; + tqavctrl &= ~E1000_TQAVCTRL_FETCHTIME_DELTA; + wr32(E1000_I210_TQAVCTRL, tqavctrl); + } + } + /* XXX: In i210 controller the sendSlope and loCredit parameters from * CBS are not configurable by software so we don't do any 'controller * configuration' in respect to these parameters. */ - netdev_dbg(netdev, "CBS %s: queue %d idleslope %d sendslope %d hiCredit %d locredit %d\n", - (ring->cbs_enable) ? "enabled" : "disabled", queue, + netdev_dbg(netdev, "Qav Tx mode: cbs %s, launchtime %s, queue %d \ + idleslope %d sendslope %d hiCredit %d \ + locredit %d\n", + (ring->cbs_enable) ? "enabled" : "disabled", + (ring->launchtime_enable) ? "enabled" : "disabled", queue, ring->idleslope, ring->sendslope, ring->hicredit, ring->locredit); } +static int igb_save_txtime_params(struct igb_adapter *adapter, int queue, + bool enable) +{ + struct igb_ring *ring; + + if (queue < 0 || queue > adapter->num_tx_queues) + return -EINVAL; + + ring = adapter->tx_ring[queue]; + ring->launchtime_enable = enable; + + return 0; +} + static int igb_save_cbs_params(struct igb_adapter *adapter, int queue, bool enable, int idleslope, int sendslope, int hicredit, int locredit) @@ -1856,10 +1921,11 @@ static void igb_setup_tx_mode(struct igb_adapter *adapter) int i, max_queue; /* Configure TQAVCTRL register: set transmit mode to 'Qav', - * set data fetch arbitration to 'round robin'. + * set data fetch arbitration to 'round robin', set SP_WAIT_SR + * so SP queues wait for SR ones. */ val = rd32(E1000_I210_TQAVCTRL); - val |= E1000_TQAVCTRL_XMIT_MODE; + val |= E1000_TQAVCTRL_XMIT_MODE | E1000_TQAVCTRL_SP_WAIT_SR; val &= ~E1000_TQAVCTRL_DATAFETCHARB; wr32(E1000_I210_TQAVCTRL, val); @@ -2483,7 +2549,7 @@ static void igb_offload_apply(struct igb_adapter *adapter, s32 queue) igb_config_tx_modes(adapter, queue); - if (!is_any_cbs_enabled(adapter)) + if (!is_any_cbs_enabled(adapter) && !is_any_txtime_enabled(adapter)) enable_fqtss(adapter, false); } @@ -2756,6 +2822,29 @@ static int igb_setup_tc_block(struct igb_adapter *adapter, } } +static int igb_offload_txtime(struct igb_adapter *adapter, + struct tc_etf_qopt_offload *qopt) +{ + struct e1000_hw *hw = &adapter->hw; + int err; + + /* Launchtime offloading is only supported by i210 controller. */ + if (hw->mac.type != e1000_i210) + return -EOPNOTSUPP; + + /* Launchtime offloading is only supported by queues 0 and 1. */ + if (qopt->queue < 0 || qopt->queue > 1) + return -EINVAL; + + err = igb_save_txtime_params(adapter, qopt->queue, qopt->enable); + if (err) + return err; + + igb_offload_apply(adapter, qopt->queue); + + return 0; +} + static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { @@ -2766,6 +2855,8 @@ static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type, return igb_offload_cbs(adapter, type_data); case TC_SETUP_BLOCK: return igb_setup_tc_block(adapter, type_data); + case TC_SETUP_QDISC_ETF: + return igb_offload_txtime(adapter, type_data); default: return -EOPNOTSUPP; @@ -5586,11 +5677,14 @@ set_itr_now: } } -static void igb_tx_ctxtdesc(struct igb_ring *tx_ring, u32 vlan_macip_lens, - u32 type_tucmd, u32 mss_l4len_idx) +static void igb_tx_ctxtdesc(struct igb_ring *tx_ring, + struct igb_tx_buffer *first, + u32 vlan_macip_lens, u32 type_tucmd, + u32 mss_l4len_idx) { struct e1000_adv_tx_context_desc *context_desc; u16 i = tx_ring->next_to_use; + struct timespec64 ts; context_desc = IGB_TX_CTXTDESC(tx_ring, i); @@ -5605,9 +5699,18 @@ static void igb_tx_ctxtdesc(struct igb_ring *tx_ring, u32 vlan_macip_lens, mss_l4len_idx |= tx_ring->reg_idx << 4; context_desc->vlan_macip_lens = cpu_to_le32(vlan_macip_lens); - context_desc->seqnum_seed = 0; context_desc->type_tucmd_mlhl = cpu_to_le32(type_tucmd); context_desc->mss_l4len_idx = cpu_to_le32(mss_l4len_idx); + + /* We assume there is always a valid tx time available. Invalid times + * should have been handled by the upper layers. + */ + if (tx_ring->launchtime_enable) { + ts = ns_to_timespec64(first->skb->tstamp); + context_desc->seqnum_seed = cpu_to_le32(ts.tv_nsec / 32); + } else { + context_desc->seqnum_seed = 0; + } } static int igb_tso(struct igb_ring *tx_ring, @@ -5690,7 +5793,8 @@ static int igb_tso(struct igb_ring *tx_ring, vlan_macip_lens |= (ip.hdr - skb->data) << E1000_ADVTXD_MACLEN_SHIFT; vlan_macip_lens |= first->tx_flags & IGB_TX_FLAGS_VLAN_MASK; - igb_tx_ctxtdesc(tx_ring, vlan_macip_lens, type_tucmd, mss_l4len_idx); + igb_tx_ctxtdesc(tx_ring, first, vlan_macip_lens, + type_tucmd, mss_l4len_idx); return 1; } @@ -5745,7 +5849,7 @@ no_csum: vlan_macip_lens |= skb_network_offset(skb) << E1000_ADVTXD_MACLEN_SHIFT; vlan_macip_lens |= first->tx_flags & IGB_TX_FLAGS_VLAN_MASK; - igb_tx_ctxtdesc(tx_ring, vlan_macip_lens, type_tucmd, 0); + igb_tx_ctxtdesc(tx_ring, first, vlan_macip_lens, type_tucmd, 0); } #define IGB_SET_FLAG(_input, _flag, _result) \ From 4b15c7075352668d4467ced7594b676707d11cae Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Tue, 3 Jul 2018 15:43:00 -0700 Subject: [PATCH 14/14] net/sched: Make etf report drops on error_queue Use the socket error queue for reporting dropped packets if the socket has enabled that feature through the SO_TXTIME API. Packets are dropped either on enqueue() if they aren't accepted by the qdisc or on dequeue() if the system misses their deadline. Those are reported as different errors so applications can react accordingly. Userspace can retrieve the errors through the socket error queue and the corresponding cmsg interfaces. A struct sock_extended_err* is used for returning the error data, and the packet's timestamp can be retrieved by adding both ee_data and ee_info fields as e.g.: ((__u64) serr->ee_data << 32) + serr->ee_info This feature is disabled by default and must be explicitly enabled by applications. Enabling it can bring some overhead for the Tx cycles of the application. Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- include/net/sock.h | 3 ++- include/uapi/linux/errqueue.h | 4 ++++ include/uapi/linux/net_tstamp.h | 5 ++++- net/core/sock.c | 4 ++++ net/sched/sch_etf.c | 35 +++++++++++++++++++++++++++++++-- 5 files changed, 47 insertions(+), 4 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 68347b9821c6..e0eac9ef44b5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -481,7 +481,8 @@ struct sock { u8 sk_clockid; u8 sk_txtime_deadline_mode : 1, - sk_txtime_unused : 7; + sk_txtime_report_errors : 1, + sk_txtime_unused : 6; struct socket *sk_socket; void *sk_user_data; diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index dc64cfaf13da..c0151200f7d1 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -20,12 +20,16 @@ struct sock_extended_err { #define SO_EE_ORIGIN_ICMP6 3 #define SO_EE_ORIGIN_TXSTATUS 4 #define SO_EE_ORIGIN_ZEROCOPY 5 +#define SO_EE_ORIGIN_TXTIME 6 #define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS #define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1)) #define SO_EE_CODE_ZEROCOPY_COPIED 1 +#define SO_EE_CODE_TXTIME_INVALID_PARAM 1 +#define SO_EE_CODE_TXTIME_MISSED 2 + /** * struct scm_timestamping - timestamps exposed through cmsg * diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index c9a77c353b98..f8f4539f1135 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -147,8 +147,11 @@ struct scm_ts_pktinfo { */ enum txtime_flags { SOF_TXTIME_DEADLINE_MODE = (1 << 0), + SOF_TXTIME_REPORT_ERRORS = (1 << 1), - SOF_TXTIME_FLAGS_MASK = (SOF_TXTIME_DEADLINE_MODE) + SOF_TXTIME_FLAGS_LAST = SOF_TXTIME_REPORT_ERRORS, + SOF_TXTIME_FLAGS_MASK = (SOF_TXTIME_FLAGS_LAST - 1) | + SOF_TXTIME_FLAGS_LAST }; struct sock_txtime { diff --git a/net/core/sock.c b/net/core/sock.c index fe64b839f1b2..03fdea5b0f57 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1087,6 +1087,8 @@ set_rcvbuf: sk->sk_clockid = sk_txtime.clockid; sk->sk_txtime_deadline_mode = !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); + sk->sk_txtime_report_errors = + !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); } break; @@ -1429,6 +1431,8 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.txtime.clockid = sk->sk_clockid; v.txtime.flags |= sk->sk_txtime_deadline_mode ? SOF_TXTIME_DEADLINE_MODE : 0; + v.txtime.flags |= sk->sk_txtime_report_errors ? + SOF_TXTIME_REPORT_ERRORS : 0; break; default: diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index 932a136db568..1538d6fa8165 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -123,6 +124,32 @@ static void reset_watchdog(struct Qdisc *sch) qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next)); } +static void report_sock_error(struct sk_buff *skb, u32 err, u8 code) +{ + struct sock_exterr_skb *serr; + struct sk_buff *clone; + ktime_t txtime = skb->tstamp; + + if (!skb->sk || !(skb->sk->sk_txtime_report_errors)) + return; + + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) + return; + + serr = SKB_EXT_ERR(clone); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_TXTIME; + serr->ee.ee_type = 0; + serr->ee.ee_code = code; + serr->ee.ee_pad = 0; + serr->ee.ee_data = (txtime >> 32); /* high part of tstamp */ + serr->ee.ee_info = txtime; /* low part of tstamp */ + + if (sock_queue_err_skb(skb->sk, clone)) + kfree_skb(clone); +} + static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -130,8 +157,11 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, struct rb_node **p = &q->head.rb_node, *parent = NULL; ktime_t txtime = nskb->tstamp; - if (!is_packet_valid(sch, nskb)) + if (!is_packet_valid(sch, nskb)) { + report_sock_error(nskb, EINVAL, + SO_EE_CODE_TXTIME_INVALID_PARAM); return qdisc_drop(nskb, sch, to_free); + } while (*p) { struct sk_buff *skb; @@ -174,6 +204,8 @@ static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb, if (drop) { struct sk_buff *to_free = NULL; + report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED); + qdisc_drop(skb, sch, &to_free); kfree_skb_list(to_free); qdisc_qstats_overlimit(sch); @@ -199,7 +231,6 @@ static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) now = q->get_time(); /* Drop if packet has expired while in queue. */ - /* FIXME: Must return error on the socket's error queue */ if (ktime_before(skb->tstamp, now)) { timesortedlist_erase(sch, skb, true); skb = NULL;