net-gro: Prepare GRO stack for the upcoming tunneling support

This patch modifies the GRO stack to avoid the use of "network_header"
and associated macros like ip_hdr() and ipv6_hdr() in order to allow
an arbitrary number of IP hdrs (v4 or v6) to be used in the
encapsulation chain. This lays the foundation for various IP
tunneling support (IP-in-IP, GRE, VXLAN, SIT,...) to be added later.

With this patch, the GRO stack traversing now is mostly based on
skb_gro_offset rather than special hdr offsets saved in skb (e.g.,
skb->network_header). As a result all but the top layer (i.e., the
transport layer) must have hdrs of the same length in order for
a pkt to be considered for aggregation. Therefore when adding a new
encap layer (e.g., for tunneling), one must check and skip flows
(e.g., by setting NAPI_GRO_CB(p)->same_flow to 0) that have a
different hdr length.

Note that unlike the network header, the transport header can and
will continue to be set by the GRO code since there will be at
most one "transport layer" in the encap chain.

Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
Suggested-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Jerry Chu 2013-12-11 20:53:45 -08:00 committed by David S. Miller
parent a46dc748ca
commit 299603e837
6 changed files with 97 additions and 75 deletions

View File

@ -1676,7 +1676,7 @@ struct offload_callbacks {
int (*gso_send_check)(struct sk_buff *skb); int (*gso_send_check)(struct sk_buff *skb);
struct sk_buff **(*gro_receive)(struct sk_buff **head, struct sk_buff **(*gro_receive)(struct sk_buff **head,
struct sk_buff *skb); struct sk_buff *skb);
int (*gro_complete)(struct sk_buff *skb); int (*gro_complete)(struct sk_buff *skb, int nhoff);
}; };
struct packet_offload { struct packet_offload {

View File

@ -3752,7 +3752,7 @@ static int napi_gro_complete(struct sk_buff *skb)
if (ptype->type != type || !ptype->callbacks.gro_complete) if (ptype->type != type || !ptype->callbacks.gro_complete)
continue; continue;
err = ptype->callbacks.gro_complete(skb); err = ptype->callbacks.gro_complete(skb, 0);
break; break;
} }
rcu_read_unlock(); rcu_read_unlock();
@ -3818,6 +3818,23 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
} }
} }
/* Reset the per-skb GRO parsing state kept in NAPI_GRO_CB(skb).
 *
 * data_offset is set back to 0 and the frag0 fast-path pointer/length are
 * cleared. frag0/frag0_len are then pointed at the first page fragment
 * when all three conditions below hold.
 */
static void skb_gro_reset_offset(struct sk_buff *skb)
{
const struct skb_shared_info *pinfo = skb_shinfo(skb);
const skb_frag_t *frag0 = &pinfo->frags[0];
/* Start from a clean state: no bytes consumed, no frag0 shortcut. */
NAPI_GRO_CB(skb)->data_offset = 0;
NAPI_GRO_CB(skb)->frag0 = NULL;
NAPI_GRO_CB(skb)->frag0_len = 0;
/* Use frag0 only if the mac header sits exactly at the tail pointer
 * (presumably meaning the linear area holds no packet data and the
 * headers live in the first fragment -- confirm against callers),
 * there is at least one fragment, and its page is not in highmem.
 */
if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
pinfo->nr_frags &&
!PageHighMem(skb_frag_page(frag0))) {
NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
}
}
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{ {
struct sk_buff **pp = NULL; struct sk_buff **pp = NULL;
@ -3833,6 +3850,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (skb_is_gso(skb) || skb_has_frag_list(skb)) if (skb_is_gso(skb) || skb_has_frag_list(skb))
goto normal; goto normal;
skb_gro_reset_offset(skb);
gro_list_prepare(napi, skb); gro_list_prepare(napi, skb);
rcu_read_lock(); rcu_read_lock();
@ -3938,27 +3956,8 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
return ret; return ret;
} }
/* Reset the GRO state stored in NAPI_GRO_CB(skb): zero data_offset, clear
 * frag0/frag0_len, and re-arm frag0 from the first page fragment if the
 * conditions checked below are met.
 */
static void skb_gro_reset_offset(struct sk_buff *skb)
{
const struct skb_shared_info *pinfo = skb_shinfo(skb);
const skb_frag_t *frag0 = &pinfo->frags[0];
NAPI_GRO_CB(skb)->data_offset = 0;
NAPI_GRO_CB(skb)->frag0 = NULL;
NAPI_GRO_CB(skb)->frag0_len = 0;
/* frag0 is only set when the mac header equals the tail pointer, at
 * least one fragment exists, and that fragment's page is not highmem.
 */
if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
pinfo->nr_frags &&
!PageHighMem(skb_frag_page(frag0))) {
NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
}
}
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{ {
skb_gro_reset_offset(skb);
return napi_skb_finish(dev_gro_receive(napi, skb), skb); return napi_skb_finish(dev_gro_receive(napi, skb), skb);
} }
EXPORT_SYMBOL(napi_gro_receive); EXPORT_SYMBOL(napi_gro_receive);
@ -3992,12 +3991,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *
{ {
switch (ret) { switch (ret) {
case GRO_NORMAL: case GRO_NORMAL:
case GRO_HELD: if (netif_receive_skb(skb))
skb->protocol = eth_type_trans(skb, skb->dev);
if (ret == GRO_HELD)
skb_gro_pull(skb, -ETH_HLEN);
else if (netif_receive_skb(skb))
ret = GRO_DROP; ret = GRO_DROP;
break; break;
@ -4006,6 +4000,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *
napi_reuse_skb(napi, skb); napi_reuse_skb(napi, skb);
break; break;
case GRO_HELD:
case GRO_MERGED: case GRO_MERGED:
break; break;
} }
@ -4016,36 +4011,15 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *
static struct sk_buff *napi_frags_skb(struct napi_struct *napi) static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
{ {
struct sk_buff *skb = napi->skb; struct sk_buff *skb = napi->skb;
struct ethhdr *eth;
unsigned int hlen;
unsigned int off;
napi->skb = NULL; napi->skb = NULL;
skb_reset_mac_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) {
skb_gro_reset_offset(skb);
off = skb_gro_offset(skb);
hlen = off + sizeof(*eth);
eth = skb_gro_header_fast(skb, off);
if (skb_gro_header_hard(skb, hlen)) {
eth = skb_gro_header_slow(skb, hlen, off);
if (unlikely(!eth)) {
napi_reuse_skb(napi, skb); napi_reuse_skb(napi, skb);
skb = NULL; return NULL;
goto out;
}
} }
skb->protocol = eth_type_trans(skb, skb->dev);
skb_gro_pull(skb, sizeof(*eth));
/*
* This works because the only protocols we care about don't require
* special handling. We'll fix it up properly at the end.
*/
skb->protocol = eth->h_proto;
out:
return skb; return skb;
} }

View File

@ -1377,8 +1377,12 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
if (!NAPI_GRO_CB(p)->same_flow) if (!NAPI_GRO_CB(p)->same_flow)
continue; continue;
iph2 = ip_hdr(p); iph2 = (struct iphdr *)(p->data + off);
/* The above works because, with the exception of the top
* (inner most) layer, we only aggregate pkts with the same
* hdr length so all the hdrs we'll need to verify will start
* at the same offset.
*/
if ((iph->protocol ^ iph2->protocol) | if ((iph->protocol ^ iph2->protocol) |
((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
@ -1397,6 +1401,11 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
} }
NAPI_GRO_CB(skb)->flush |= flush; NAPI_GRO_CB(skb)->flush |= flush;
skb_set_network_header(skb, off);
/* The above will be needed by the transport layer if there is one
* immediately following this IP hdr.
*/
skb_gro_pull(skb, sizeof(*iph)); skb_gro_pull(skb, sizeof(*iph));
skb_set_transport_header(skb, skb_gro_offset(skb)); skb_set_transport_header(skb, skb_gro_offset(skb));
@ -1411,10 +1420,10 @@ out:
return pp; return pp;
} }
static int inet_gro_complete(struct sk_buff *skb) static int inet_gro_complete(struct sk_buff *skb, int nhoff)
{ {
__be16 newlen = htons(skb->len - skb_network_offset(skb)); __be16 newlen = htons(skb->len - nhoff);
struct iphdr *iph = ip_hdr(skb); struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
const struct net_offload *ops; const struct net_offload *ops;
int proto = iph->protocol; int proto = iph->protocol;
int err = -ENOSYS; int err = -ENOSYS;
@ -1427,7 +1436,11 @@ static int inet_gro_complete(struct sk_buff *skb)
if (WARN_ON(!ops || !ops->callbacks.gro_complete)) if (WARN_ON(!ops || !ops->callbacks.gro_complete))
goto out_unlock; goto out_unlock;
err = ops->callbacks.gro_complete(skb); /* Only need to add sizeof(*iph) to get to the next hdr below
* because any hdr with option will have been flushed in
* inet_gro_receive().
*/
err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph));
out_unlock: out_unlock:
rcu_read_unlock(); rcu_read_unlock();

View File

@ -240,7 +240,7 @@ int tcp_gro_complete(struct sk_buff *skb)
{ {
struct tcphdr *th = tcp_hdr(skb); struct tcphdr *th = tcp_hdr(skb);
skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_start = (unsigned char *)th - skb->head;
skb->csum_offset = offsetof(struct tcphdr, check); skb->csum_offset = offsetof(struct tcphdr, check);
skb->ip_summed = CHECKSUM_PARTIAL; skb->ip_summed = CHECKSUM_PARTIAL;
@ -272,6 +272,7 @@ static int tcp_v4_gso_send_check(struct sk_buff *skb)
static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{ {
/* Use the IP hdr immediately preceding this transport hdr */
const struct iphdr *iph = skb_gro_network_header(skb); const struct iphdr *iph = skb_gro_network_header(skb);
__wsum wsum; __wsum wsum;
@ -303,13 +304,13 @@ skip_csum:
return tcp_gro_receive(head, skb); return tcp_gro_receive(head, skb);
} }
static int tcp4_gro_complete(struct sk_buff *skb) static int tcp4_gro_complete(struct sk_buff *skb, int thoff)
{ {
const struct iphdr *iph = ip_hdr(skb); const struct iphdr *iph = ip_hdr(skb);
struct tcphdr *th = tcp_hdr(skb); struct tcphdr *th = tcp_hdr(skb);
th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb), th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr,
iph->saddr, iph->daddr, 0); iph->daddr, 0);
skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
return tcp_gro_complete(skb); return tcp_gro_complete(skb);

View File

@ -154,6 +154,35 @@ out:
return segs; return segs;
} }
/* Return the total length of all the extension hdrs, following the same
 * logic in ipv6_gso_pull_exthdrs() when parsing ext-hdrs.
 *
 * @iph:  IPv6 header; the walk starts at iph->nexthdr and the first
 *        extension hdr is taken to begin right after the fixed header.
 * @opps: out parameter; receives the inet6_offloads[] entry looked up for
 *        the protocol at which the walk stopped (NULL if there is none).
 *        It is not written on an iteration whose proto is NEXTHDR_HOP.
 *
 * The walk stops at the first protocol (other than NEXTHDR_HOP) that has
 * no offload entry or whose entry lacks INET6_PROTO_GSO_EXTHDR.
 *
 * NOTE(review): rcu_dereference() is used, so this presumably must run
 * under rcu_read_lock() -- confirm at call sites. No bound on the walk is
 * checked here; presumably the hdrs were validated earlier -- confirm.
 */
static int ipv6_exthdrs_len(struct ipv6hdr *iph,
const struct net_offload **opps)
{
struct ipv6_opt_hdr *opth = NULL;
int len = 0, proto, optlen;
proto = iph->nexthdr;
for (;;) {
/* Hop-by-hop is always treated as an extension hdr, without
 * consulting the offload table.
 */
if (proto != NEXTHDR_HOP) {
*opps = rcu_dereference(inet6_offloads[proto]);
if (unlikely(!(*opps)))
break;
if (!((*opps)->flags & INET6_PROTO_GSO_EXTHDR))
break;
}
/* First ext hdr follows the fixed IPv6 hdr; later ones follow the
 * previous ext hdr. optlen is only read once it has been set by a
 * prior iteration.
 */
if (opth == NULL)
opth = (void *)(iph+1);
else
opth = (void *)opth + optlen;
optlen = ipv6_optlen(opth);
len += optlen;
proto = opth->nexthdr;
}
return len;
}
static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
struct sk_buff *skb) struct sk_buff *skb)
{ {
@ -177,6 +206,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
goto out; goto out;
} }
skb_set_network_header(skb, off);
skb_gro_pull(skb, sizeof(*iph)); skb_gro_pull(skb, sizeof(*iph));
skb_set_transport_header(skb, skb_gro_offset(skb)); skb_set_transport_header(skb, skb_gro_offset(skb));
@ -211,12 +241,16 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
if (!NAPI_GRO_CB(p)->same_flow) if (!NAPI_GRO_CB(p)->same_flow)
continue; continue;
iph2 = ipv6_hdr(p); iph2 = (struct ipv6hdr *)(p->data + off);
first_word = *(__be32 *)iph ^ *(__be32 *)iph2 ; first_word = *(__be32 *)iph ^ *(__be32 *)iph2 ;
/* All fields must match except length and Traffic Class. */ /* All fields must match except length and Traffic Class.
if (nlen != skb_network_header_len(p) || * XXX skbs on the gro_list have all been parsed and pulled
(first_word & htonl(0xF00FFFFF)) || * already so we don't need to compare nlen
* (nlen != (sizeof(*iph2) + ipv6_exthdrs_len(iph2, &ops)))
* memcmp() alone below is sufficient, right?
*/
if ((first_word & htonl(0xF00FFFFF)) ||
memcmp(&iph->nexthdr, &iph2->nexthdr, memcmp(&iph->nexthdr, &iph2->nexthdr,
nlen - offsetof(struct ipv6hdr, nexthdr))) { nlen - offsetof(struct ipv6hdr, nexthdr))) {
NAPI_GRO_CB(p)->same_flow = 0; NAPI_GRO_CB(p)->same_flow = 0;
@ -245,21 +279,21 @@ out:
return pp; return pp;
} }
static int ipv6_gro_complete(struct sk_buff *skb) static int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
{ {
const struct net_offload *ops; const struct net_offload *ops;
struct ipv6hdr *iph = ipv6_hdr(skb); struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff);
int err = -ENOSYS; int err = -ENOSYS;
iph->payload_len = htons(skb->len - skb_network_offset(skb) - iph->payload_len = htons(skb->len - nhoff - sizeof(*iph));
sizeof(*iph));
rcu_read_lock(); rcu_read_lock();
ops = rcu_dereference(inet6_offloads[NAPI_GRO_CB(skb)->proto]);
nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &ops);
if (WARN_ON(!ops || !ops->callbacks.gro_complete)) if (WARN_ON(!ops || !ops->callbacks.gro_complete))
goto out_unlock; goto out_unlock;
err = ops->callbacks.gro_complete(skb); err = ops->callbacks.gro_complete(skb, nhoff);
out_unlock: out_unlock:
rcu_read_unlock(); rcu_read_unlock();

View File

@ -66,13 +66,13 @@ skip_csum:
return tcp_gro_receive(head, skb); return tcp_gro_receive(head, skb);
} }
static int tcp6_gro_complete(struct sk_buff *skb) static int tcp6_gro_complete(struct sk_buff *skb, int thoff)
{ {
const struct ipv6hdr *iph = ipv6_hdr(skb); const struct ipv6hdr *iph = ipv6_hdr(skb);
struct tcphdr *th = tcp_hdr(skb); struct tcphdr *th = tcp_hdr(skb);
th->check = ~tcp_v6_check(skb->len - skb_transport_offset(skb), th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr,
&iph->saddr, &iph->daddr, 0); &iph->daddr, 0);
skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
return tcp_gro_complete(skb); return tcp_gro_complete(skb);