From a0d9a8604f29ee3340126ec3f90c9421f930aa50 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:45 +0200 Subject: [PATCH 01/22] rtnetlink: introduce new RTA_ENCAP_TYPE and RTA_ENCAP attributes This patch introduces two new RTA attributes to attach encap data to fib routes. Example iproute2 command to attach mpls encap data to ipv4 routes $ip route add 10.1.1.0/30 encap mpls 200 via inet 10.1.1.1 dev swp1 Signed-off-by: Roopa Prabhu Suggested-by: Eric W. Biederman Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index fdd8f07f1d34..0d3d3cc43356 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -308,6 +308,8 @@ enum rtattr_type_t { RTA_VIA, RTA_NEWDST, RTA_PREF, + RTA_ENCAP_TYPE, + RTA_ENCAP, __RTA_MAX }; From 499a24256862714539e902c0499b67da2bb3ab72 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:46 +0200 Subject: [PATCH 02/22] lwtunnel: infrastructure for handling light weight tunnels like mpls Provides infrastructure to parse/dump/store encap information for light weight tunnels like mpls. Encap information for such tunnels is associated with fib routes. This infrastructure is based on previous suggestions from Eric Biederman to follow the xfrm infrastructure. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/linux/lwtunnel.h | 6 ++ include/net/lwtunnel.h | 132 +++++++++++++++++++++++++ include/uapi/linux/lwtunnel.h | 15 +++ net/Kconfig | 7 ++ net/core/Makefile | 1 + net/core/lwtunnel.c | 179 ++++++++++++++++++++++++++++++++++ 6 files changed, 340 insertions(+) create mode 100644 include/linux/lwtunnel.h create mode 100644 include/net/lwtunnel.h create mode 100644 include/uapi/linux/lwtunnel.h create mode 100644 net/core/lwtunnel.c diff --git a/include/linux/lwtunnel.h b/include/linux/lwtunnel.h new file mode 100644 index 000000000000..97f32f8b4ae1 --- /dev/null +++ b/include/linux/lwtunnel.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_LWTUNNEL_H_ +#define _LINUX_LWTUNNEL_H_ + +#include + +#endif /* _LINUX_LWTUNNEL_H_ */ diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h new file mode 100644 index 000000000000..df24b3611ff4 --- /dev/null +++ b/include/net/lwtunnel.h @@ -0,0 +1,132 @@ +#ifndef __NET_LWTUNNEL_H +#define __NET_LWTUNNEL_H 1 + +#include +#include +#include +#include +#include + +#define LWTUNNEL_HASH_BITS 7 +#define LWTUNNEL_HASH_SIZE (1 << LWTUNNEL_HASH_BITS) + +/* lw tunnel state flags */ +#define LWTUNNEL_STATE_OUTPUT_REDIRECT 0x1 + +struct lwtunnel_state { + __u16 type; + __u16 flags; + atomic_t refcnt; + int len; + __u8 data[0]; +}; + +struct lwtunnel_encap_ops { + int (*build_state)(struct net_device *dev, struct nlattr *encap, + struct lwtunnel_state **ts); + int (*output)(struct sock *sk, struct sk_buff *skb); + int (*fill_encap)(struct sk_buff *skb, + struct lwtunnel_state *lwtstate); + int (*get_encap_size)(struct lwtunnel_state *lwtstate); + int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b); +}; + +extern const struct lwtunnel_encap_ops __rcu * + lwtun_encaps[LWTUNNEL_ENCAP_MAX+1]; + +#ifdef CONFIG_LWTUNNEL +static inline void lwtunnel_state_get(struct lwtunnel_state *lws) +{ + atomic_inc(&lws->refcnt); +} + +static inline void lwtunnel_state_put(struct lwtunnel_state *lws) +{ + if (!lws) + return; + + if (atomic_dec_and_test(&lws->refcnt)) + kfree(lws); +} + +static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) +{ + if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_OUTPUT_REDIRECT)) + return true; + + return false; +} + +int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, + unsigned int num); +int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, + unsigned int num); +int lwtunnel_build_state(struct net_device *dev, u16 encap_type, + struct nlattr *encap, + struct lwtunnel_state **lws); +int lwtunnel_fill_encap(struct sk_buff *skb, + struct lwtunnel_state *lwtstate); +int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate); +struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); +int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); + +#else + +static inline void lwtunnel_state_get(struct lwtunnel_state *lws) +{ +} + +static inline void lwtunnel_state_put(struct lwtunnel_state *lws) +{ +} + +static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) +{ + return false; +} + +static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, + unsigned int num) +{ + return -EOPNOTSUPP; + +} + +static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, + unsigned int num) +{ + return -EOPNOTSUPP; +} + +static inline int lwtunnel_build_state(struct net_device *dev, u16 encap_type, + struct nlattr *encap, + struct lwtunnel_state **lws) +{ + return -EOPNOTSUPP; +} + +static inline int lwtunnel_fill_encap(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + return 0; +} + +static inline int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) +{ + return 0; +} + +static inline struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len) +{ + return NULL; +} + +static inline int lwtunnel_cmp_encap(struct lwtunnel_state *a, + struct lwtunnel_state *b) +{ + return 0; +} + +#endif + +#endif /* __NET_LWTUNNEL_H */ diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h new file mode 100644 index 000000000000..aa611d931a31 --- /dev/null +++ b/include/uapi/linux/lwtunnel.h @@ -0,0 +1,15 @@ +#ifndef _UAPI_LWTUNNEL_H_ +#define _UAPI_LWTUNNEL_H_ + +#include + +enum lwtunnel_encap_types { + LWTUNNEL_ENCAP_NONE, + LWTUNNEL_ENCAP_MPLS, + __LWTUNNEL_ENCAP_MAX, +}; + +#define LWTUNNEL_ENCAP_MAX (__LWTUNNEL_ENCAP_MAX - 1) + + +#endif /* _UAPI_LWTUNNEL_H_ */ diff --git a/net/Kconfig b/net/Kconfig index 57a7c5af3175..7021c1bf44d6 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -374,6 +374,13 @@ source "net/caif/Kconfig" source "net/ceph/Kconfig" source "net/nfc/Kconfig" +config LWTUNNEL + bool "Network light weight tunnels" + ---help--- + This feature provides an infrastructure to support light weight + tunnels like mpls. There is no netdevice associated with a light + weight tunnel endpoint. Tunnel encapsulation parameters are stored + with light weight tunnel state associated with fib routes. endif # if NET diff --git a/net/core/Makefile b/net/core/Makefile index fec0856dd6c0..086b01fbe1bd 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -23,3 +23,4 @@ obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o +obj-$(CONFIG_LWTUNNEL) += lwtunnel.o diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c new file mode 100644 index 000000000000..d7ae3a235b4b --- /dev/null +++ b/net/core/lwtunnel.c @@ -0,0 +1,179 @@ +/* + * lwtunnel Infrastructure for light weight tunnels like mpls + * + * Authors: Roopa Prabhu, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct lwtunnel_state *lwtunnel_state_alloc(int encap_len) +{ + struct lwtunnel_state *lws; + + lws = kzalloc(sizeof(*lws) + encap_len, GFP_ATOMIC); + + return lws; +} +EXPORT_SYMBOL(lwtunnel_state_alloc); + +const struct lwtunnel_encap_ops __rcu * + lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly; + +int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops, + unsigned int num) +{ + if (num > LWTUNNEL_ENCAP_MAX) + return -ERANGE; + + return !cmpxchg((const struct lwtunnel_encap_ops **) + &lwtun_encaps[num], + NULL, ops) ? 0 : -1; +} +EXPORT_SYMBOL(lwtunnel_encap_add_ops); + +int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, + unsigned int encap_type) +{ + int ret; + + if (encap_type == LWTUNNEL_ENCAP_NONE || + encap_type > LWTUNNEL_ENCAP_MAX) + return -ERANGE; + + ret = (cmpxchg((const struct lwtunnel_encap_ops **) + &lwtun_encaps[encap_type], + ops, NULL) == ops) ? 0 : -1; + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_encap_del_ops); + +int lwtunnel_build_state(struct net_device *dev, u16 encap_type, + struct nlattr *encap, struct lwtunnel_state **lws) +{ + const struct lwtunnel_encap_ops *ops; + int ret = -EINVAL; + + if (encap_type == LWTUNNEL_ENCAP_NONE || + encap_type > LWTUNNEL_ENCAP_MAX) + return ret; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[encap_type]); + if (likely(ops && ops->build_state)) + ret = ops->build_state(dev, encap, lws); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_build_state); + +int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) +{ + const struct lwtunnel_encap_ops *ops; + struct nlattr *nest; + int ret = -EINVAL; + + if (!lwtstate) + return 0; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + ret = -EOPNOTSUPP; + nest = nla_nest_start(skb, RTA_ENCAP); + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->fill_encap)) + ret = ops->fill_encap(skb, lwtstate); + rcu_read_unlock(); + + if (ret) + goto nla_put_failure; + nla_nest_end(skb, nest); + ret = nla_put_u16(skb, RTA_ENCAP_TYPE, lwtstate->type); + if (ret) + goto nla_put_failure; + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + + return (ret == -EOPNOTSUPP ? 0 : ret); +} +EXPORT_SYMBOL(lwtunnel_fill_encap); + +int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) +{ + const struct lwtunnel_encap_ops *ops; + int ret = 0; + + if (!lwtstate) + return 0; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->get_encap_size)) + ret = nla_total_size(ops->get_encap_size(lwtstate)); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_get_encap_size); + +int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + const struct lwtunnel_encap_ops *ops; + int ret = 0; + + if (!a && !b) + return 0; + + if (!a || !b) + return 1; + + if (a->type != b->type) + return 1; + + if (a->type == LWTUNNEL_ENCAP_NONE || + a->type > LWTUNNEL_ENCAP_MAX) + return 0; + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[a->type]); + if (likely(ops && ops->cmp_encap)) + ret = ops->cmp_encap(a, b); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_cmp_encap); From 571e722676fe386bb66f72a75b64a6ebf535c077 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:47 +0200 Subject: [PATCH 03/22] ipv4: support for fib route lwtunnel encap attributes This patch adds support in ipv4 fib functions to parse user provided encap attributes and attach encap state data to fib_nh and rtable. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/ip_fib.h | 5 ++- include/net/route.h | 1 + net/ipv4/fib_frontend.c | 8 ++++ net/ipv4/fib_semantics.c | 96 +++++++++++++++++++++++++++++++++++++++- net/ipv4/route.c | 16 ++++++- 5 files changed, 122 insertions(+), 4 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 49c142bdf01e..5e0196084f1e 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -44,7 +44,9 @@ struct fib_config { u32 fc_flow; u32 fc_nlflags; struct nl_info fc_nlinfo; - }; + struct nlattr *fc_encap; + u16 fc_encap_type; +}; struct fib_info; struct rtable; @@ -89,6 +91,7 @@ struct fib_nh { struct rtable __rcu * __percpu *nh_pcpu_rth_output; struct rtable __rcu *nh_rth_input; struct fnhe_hash_bucket __rcu *nh_exceptions; + struct lwtunnel_state *nh_lwtstate; }; /* diff --git a/include/net/route.h b/include/net/route.h index fe22d03afb6a..2d45f419477f 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -66,6 +66,7 @@ struct rtable { struct list_head rt_uncached; struct uncached_list *rt_uncached_list; + struct lwtunnel_state *rt_lwtstate; }; static inline bool rt_is_input_route(const struct rtable *rt) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 6bbc54940eb4..9b2019cc3586 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -591,6 +591,8 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, [RTA_FLOW] = { .type = NLA_U32 }, + [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, + [RTA_ENCAP] = { .type = NLA_NESTED }, }; static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, @@ -656,6 +658,12 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, case RTA_TABLE: cfg->fc_table = nla_get_u32(attr); break; + case RTA_ENCAP: + cfg->fc_encap = attr; + break; + case RTA_ENCAP_TYPE: + cfg->fc_encap_type = nla_get_u16(attr); + break; } } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c7358ea4ae93..6754c64b2fe0 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "fib_lookup.h" @@ -208,6 +209,7 @@ static void free_fib_info_rcu(struct rcu_head *head) change_nexthops(fi) { if (nexthop_nh->nh_dev) dev_put(nexthop_nh->nh_dev); + lwtunnel_state_put(nexthop_nh->nh_lwtstate); free_nh_exceptions(nexthop_nh); rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output); rt_fibinfo_free(&nexthop_nh->nh_rth_input); @@ -266,6 +268,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif + lwtunnel_cmp_encap(nh->nh_lwtstate, onh->nh_lwtstate) || ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK)) return -1; onh++; @@ -366,6 +369,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) payload += nla_total_size((RTAX_MAX * nla_total_size(4))); if (fi->fib_nhs) { + size_t nh_encapsize = 0; /* Also handles the special case fib_nhs == 1 */ /* each nexthop is packed in an attribute */ @@ -374,8 +378,21 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) /* may contain flow and gateway attribute */ nhsize += 2 * nla_total_size(4); + /* grab encap info */ + for_nexthops(fi) { + if (nh->nh_lwtstate) { + /* RTA_ENCAP_TYPE */ + nh_encapsize += lwtunnel_get_encap_size( + nh->nh_lwtstate); + /* RTA_ENCAP */ + nh_encapsize += nla_total_size(2); + } + } endfor_nexthops(fi); + /* all nexthops are packed in a nested attribute */ - payload += nla_total_size(fi->fib_nhs * nhsize); + payload += nla_total_size((fi->fib_nhs * nhsize) + + nh_encapsize); + } return payload; @@ -452,6 +469,9 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining) static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, int remaining, struct fib_config *cfg) { + struct net *net = cfg->fc_nlinfo.nl_net; + int ret; + change_nexthops(fi) { int attrlen; @@ -475,18 +495,66 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, if (nexthop_nh->nh_tclassid) fi->fib_net->ipv4.fib_num_tclassid_users++; #endif + nla = nla_find(attrs, attrlen, RTA_ENCAP); + if (nla) { + struct lwtunnel_state *lwtstate; + struct net_device *dev = NULL; + struct nlattr *nla_entype; + + nla_entype = nla_find(attrs, attrlen, + RTA_ENCAP_TYPE); + if (!nla_entype) + goto err_inval; + if (cfg->fc_oif) + dev = __dev_get_by_index(net, cfg->fc_oif); + ret = lwtunnel_build_state(dev, nla_get_u16( + nla_entype), + nla, &lwtstate); + if (ret) + goto errout; + lwtunnel_state_get(lwtstate); + nexthop_nh->nh_lwtstate = lwtstate; + } } rtnh = rtnh_next(rtnh, &remaining); } endfor_nexthops(fi); return 0; + +err_inval: + ret = -EINVAL; + +errout: + return ret; } #endif +int fib_encap_match(struct net *net, u16 encap_type, + struct nlattr *encap, + int oif, const struct fib_nh *nh) +{ + struct lwtunnel_state *lwtstate; + struct net_device *dev = NULL; + int ret; + + if (encap_type == LWTUNNEL_ENCAP_NONE) + return 0; + + if (oif) + dev = __dev_get_by_index(net, oif); + ret = lwtunnel_build_state(dev, encap_type, + encap, &lwtstate); + if (!ret) + return lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate); + + return 0; +} + int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) { + struct net *net = cfg->fc_nlinfo.nl_net; #ifdef CONFIG_IP_ROUTE_MULTIPATH struct rtnexthop *rtnh; int remaining; @@ -496,6 +564,12 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) return 1; if (cfg->fc_oif || cfg->fc_gw) { + if (cfg->fc_encap) { + if (fib_encap_match(net, cfg->fc_encap_type, + cfg->fc_encap, cfg->fc_oif, + fi->fib_nh)) + return 1; + } if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) && (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw)) return 0; @@ -882,6 +956,22 @@ struct fib_info *fib_create_info(struct fib_config *cfg) } else { struct fib_nh *nh = fi->fib_nh; + if (cfg->fc_encap) { + struct lwtunnel_state *lwtstate; + struct net_device *dev = NULL; + + if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) + goto err_inval; + if (cfg->fc_oif) + dev = __dev_get_by_index(net, cfg->fc_oif); + err = lwtunnel_build_state(dev, cfg->fc_encap_type, + cfg->fc_encap, &lwtstate); + if (err) + goto failure; + + lwtunnel_state_get(lwtstate); + nh->nh_lwtstate = lwtstate; + } nh->nh_oif = cfg->fc_oif; nh->nh_gw = cfg->fc_gw; nh->nh_flags = cfg->fc_flags; @@ -1055,6 +1145,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) goto nla_put_failure; #endif + if (fi->fib_nh->nh_lwtstate) + lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate); } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (fi->fib_nhs > 1) { @@ -1090,6 +1182,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) goto nla_put_failure; #endif + if (nh->nh_lwtstate) + lwtunnel_fill_encap(skb, nh->nh_lwtstate); /* length of rtnetlink header + attributes */ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh; } endfor_nexthops(fi); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 04c83de4f79e..226570ba1ced 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -102,6 +102,7 @@ #include #include #include +#include #include #include #ifdef CONFIG_SYSCTL @@ -1355,6 +1356,7 @@ static void ipv4_dst_destroy(struct dst_entry *dst) list_del(&rt->rt_uncached); spin_unlock_bh(&ul->lock); } + lwtunnel_state_put(rt->rt_lwtstate); } void rt_flush_dev(struct net_device *dev) @@ -1403,6 +1405,12 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif + if (nh->nh_lwtstate) { + lwtunnel_state_get(nh->nh_lwtstate); + rt->rt_lwtstate = nh->nh_lwtstate; + } else { + rt->rt_lwtstate = NULL; + } if (unlikely(fnhe)) cached = rt_bind_exception(rt, fnhe, daddr); else if (!(rt->dst.flags & DST_NOCACHE)) @@ -1488,6 +1496,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); + rth->rt_lwtstate = NULL; if (our) { rth->dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; @@ -1617,6 +1626,7 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); + rth->rt_lwtstate = NULL; RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; @@ -1791,6 +1801,8 @@ local_input: rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); + rth->rt_lwtstate = NULL; + RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; @@ -1980,7 +1992,7 @@ add: rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); - + rth->rt_lwtstate = NULL; RT_CACHE_STAT_INC(out_slow_tot); if (flags & RTCF_LOCAL) @@ -2260,7 +2272,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_uses_gateway = ort->rt_uses_gateway; INIT_LIST_HEAD(&rt->rt_uncached); - + rt->rt_lwtstate = NULL; dst_free(new); } From 19e42e45150672124b6a4341e2bc7982d247f0ac Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:48 +0200 Subject: [PATCH 04/22] ipv6: support for fib route lwtunnel encap attributes This patch adds support in ipv6 fib functions to parse Netlink RTA encap attributes and attach encap state data to rt6_info. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 3 +++ net/ipv6/ip6_fib.c | 2 ++ net/ipv6/route.c | 33 ++++++++++++++++++++++++++++++--- 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 3b76849c190f..276328e3daa6 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -51,6 +51,8 @@ struct fib6_config { struct nlattr *fc_mp; struct nl_info fc_nlinfo; + struct nlattr *fc_encap; + u16 fc_encap_type; }; struct fib6_node { @@ -131,6 +133,7 @@ struct rt6_info { /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; u8 rt6i_protocol; + struct lwtunnel_state *rt6i_lwtstate; }; static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 55d19861ab20..d715f2e0c4e7 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -177,6 +178,7 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) static void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) { + lwtunnel_state_put(rt->rt6i_lwtstate); rt6_free_pcpu(rt); dst_free(&rt->dst); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6090969937f8..b3431b79dfb1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -58,6 +58,7 @@ #include #include #include +#include #include @@ -1770,6 +1771,17 @@ int ip6_route_add(struct fib6_config *cfg) rt->dst.output = ip6_output; + if (cfg->fc_encap) { + struct lwtunnel_state *lwtstate; + + err = lwtunnel_build_state(dev, cfg->fc_encap_type, + cfg->fc_encap, &lwtstate); + if (err) + goto out; + lwtunnel_state_get(lwtstate); + rt->rt6i_lwtstate = lwtstate; + } + ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); rt->rt6i_dst.plen = cfg->fc_dst_len; if (rt->rt6i_dst.plen == 128) @@ -2595,6 +2607,8 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, [RTA_PREF] = { .type = NLA_U8 }, + [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, + [RTA_ENCAP] = { .type = NLA_NESTED }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -2689,6 +2703,12 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_flags |= RTF_PREF(pref); } + if (tb[RTA_ENCAP]) + cfg->fc_encap = tb[RTA_ENCAP]; + + if (tb[RTA_ENCAP_TYPE]) + cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); + err = 0; errout: return err; @@ -2721,6 +2741,10 @@ beginning: r_cfg.fc_gateway = nla_get_in6_addr(nla); r_cfg.fc_flags |= RTF_GATEWAY; } + r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); + nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); + if (nla) + r_cfg.fc_encap_type = nla_get_u16(nla); } err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg); if (err) { @@ -2783,7 +2807,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) return ip6_route_add(&cfg); } -static inline size_t rt6_nlmsg_size(void) +static inline size_t rt6_nlmsg_size(struct rt6_info *rt) { return NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(16) /* RTA_SRC */ @@ -2797,7 +2821,8 @@ static inline size_t rt6_nlmsg_size(void) + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ - + nla_total_size(1); /* RTA_PREF */ + + nla_total_size(1) /* RTA_PREF */ + + lwtunnel_get_encap_size(rt->rt6i_lwtstate); } static int rt6_fill_node(struct net *net, @@ -2945,6 +2970,8 @@ static int rt6_fill_node(struct net *net, if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) goto nla_put_failure; + lwtunnel_fill_encap(skb, rt->rt6i_lwtstate); + nlmsg_end(skb, nlh); return 0; @@ -3071,7 +3098,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) err = -ENOBUFS; seq = info->nlh ? info->nlh->nlmsg_seq : 0; - skb = nlmsg_new(rt6_nlmsg_size(), gfp_any()); + skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); if (!skb) goto errout; From ffce41962ef64b8e685e5b621caf24bf381addd9 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:49 +0200 Subject: [PATCH 05/22] lwtunnel: support dst output redirect function This patch introduces lwtunnel_output function to call corresponding lwtunnels output function to xmit the packet. It adds two variants lwtunnel_output and lwtunnel_output6 for ipv4 and ipv6 respectively today. But this is subject to change when lwtstate will reside in dst or dst_metadata (as per upstream discussions). Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 12 +++++++++ net/core/lwtunnel.c | 56 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index df24b3611ff4..918e03c1dafa 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -69,6 +69,8 @@ int lwtunnel_fill_encap(struct sk_buff *skb, int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate); struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); +int lwtunnel_output(struct sock *sk, struct sk_buff *skb); +int lwtunnel_output6(struct sock *sk, struct sk_buff *skb); #else @@ -127,6 +129,16 @@ static inline int lwtunnel_cmp_encap(struct lwtunnel_state *a, return 0; } +static inline int lwtunnel_output(struct sock *sk, struct sk_buff *skb) +{ + return -EOPNOTSUPP; +} + +static inline int lwtunnel_output6(struct sock *sk, struct sk_buff *skb) +{ + return -EOPNOTSUPP; +} + #endif #endif /* __NET_LWTUNNEL_H */ diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index d7ae3a235b4b..bb58826c708d 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -25,6 +25,7 @@ #include #include +#include struct lwtunnel_state *lwtunnel_state_alloc(int encap_len) { @@ -177,3 +178,58 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) return ret; } EXPORT_SYMBOL(lwtunnel_cmp_encap); + +int __lwtunnel_output(struct sock *sk, struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + const struct lwtunnel_encap_ops *ops; + int ret = -EINVAL; + + if (!lwtstate) + goto drop; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->output)) + ret = ops->output(sk, skb); + rcu_read_unlock(); + + if (ret == -EOPNOTSUPP) + goto drop; + + return ret; + +drop: + kfree(skb); + + return ret; +} + +int lwtunnel_output6(struct sock *sk, struct sk_buff *skb) +{ + struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + struct lwtunnel_state *lwtstate = NULL; + + if (rt) + lwtstate = rt->rt6i_lwtstate; + + return __lwtunnel_output(sk, skb, lwtstate); +} +EXPORT_SYMBOL(lwtunnel_output6); + +int lwtunnel_output(struct sock *sk, struct sk_buff *skb) +{ + struct rtable *rt = (struct rtable *)skb_dst(skb); + struct lwtunnel_state *lwtstate = NULL; + + if (rt) + lwtstate = rt->rt_lwtstate; + + return __lwtunnel_output(sk, skb, lwtstate); +} +EXPORT_SYMBOL(lwtunnel_output); From 8602a625024737602630fe0dee423b3096d31524 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:50 +0200 Subject: [PATCH 06/22] ipv4: redirect dst output to lwtunnel output For input routes with tunnel encap state this patch redirects dst output functions to lwtunnel_output which later resolves to the corresponding lwtunnel output function. This has been tested to work with mpls ip tunnels. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 226570ba1ced..cd3157c464e6 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1633,6 +1633,8 @@ static int __mkroute_input(struct sk_buff *skb, rth->dst.output = ip_output; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); + if (lwtunnel_output_redirect(rth->rt_lwtstate)) + rth->dst.output = lwtunnel_output; skb_dst_set(skb, &rth->dst); out: err = 0; From 74a0f2fe8ed51e3adbb1c882be04672fe7bb6996 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:51 +0200 Subject: [PATCH 07/22] ipv6: rt6_info output redirect to tunnel output This is similar to ipv4 redirect of dst output to lwtunnel output function for encapsulation and xmit. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/ipv6/route.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b3431b79dfb1..7f2214f8fde7 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1780,6 +1780,7 @@ int ip6_route_add(struct fib6_config *cfg) goto out; lwtunnel_state_get(lwtstate); rt->rt6i_lwtstate = lwtstate; + rt->dst.output = lwtunnel_output6; } ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); From face0188e31b3cfc598d8dc3470e28e00fb3b07c Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:52 +0200 Subject: [PATCH 08/22] mpls: export mpls functions for use by mpls iptunnels Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 11 ++++++++--- net/mpls/internal.h | 9 +++++++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 1f93a5978f2a..6e669114f829 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -58,10 +58,11 @@ static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev) return rcu_dereference_rtnl(dev->mpls_ptr); } -static bool mpls_output_possible(const struct net_device *dev) +bool mpls_output_possible(const struct net_device *dev) { return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev); } +EXPORT_SYMBOL_GPL(mpls_output_possible); static unsigned int mpls_rt_header_size(const struct mpls_route *rt) { @@ -69,13 +70,14 @@ static unsigned int mpls_rt_header_size(const struct mpls_route *rt) return rt->rt_labels * sizeof(struct mpls_shim_hdr); } -static unsigned int mpls_dev_mtu(const struct net_device *dev) +unsigned int mpls_dev_mtu(const struct net_device *dev) { /* The amount of data the layer 2 frame can hold */ return dev->mtu; } +EXPORT_SYMBOL_GPL(mpls_dev_mtu); -static bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) +bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) return false; @@ -85,6 +87,7 @@ static bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) return true; } +EXPORT_SYMBOL_GPL(mpls_pkt_too_big); static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb, struct mpls_entry_decoded dec) @@ -626,6 +629,7 @@ int nla_put_labels(struct sk_buff *skb, int attrtype, return 0; } +EXPORT_SYMBOL_GPL(nla_put_labels); int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels, u32 label[]) @@ -671,6 +675,7 @@ int nla_get_labels(const struct nlattr *nla, *labels = nla_labels; return 0; } +EXPORT_SYMBOL_GPL(nla_get_labels); static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, struct mpls_route_config *cfg) diff --git a/net/mpls/internal.h b/net/mpls/internal.h index 8cabeb5a1cb9..2681a4ba6c37 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -50,7 +50,12 @@ static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr * return result; } -int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]); -int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels, u32 label[]); +int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, + const u32 label[]); +int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels, + u32 label[]); +bool mpls_output_possible(const struct net_device *dev); +unsigned int mpls_dev_mtu(const struct net_device *dev); +bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu); #endif /* MPLS_INTERNAL_H */ From e3e4712ec0961ed586a8db340bd994c4ad7f5dba Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:53 +0200 Subject: [PATCH 09/22] mpls: ip tunnel support This implementation uses lwtunnel infrastructure to register hooks for mpls tunnel encaps. It picks cues from iptunnel_encaps infrastructure and previous mpls iptunnel RFC patches from Eric W. Biederman and Robert Shearman Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/linux/mpls_iptunnel.h | 6 + include/net/mpls_iptunnel.h | 29 ++++ include/uapi/linux/mpls_iptunnel.h | 28 ++++ net/mpls/Kconfig | 8 +- net/mpls/Makefile | 1 + net/mpls/mpls_iptunnel.c | 233 +++++++++++++++++++++++++++++ 6 files changed, 304 insertions(+), 1 deletion(-) create mode 100644 include/linux/mpls_iptunnel.h create mode 100644 include/net/mpls_iptunnel.h create mode 100644 include/uapi/linux/mpls_iptunnel.h create mode 100644 net/mpls/mpls_iptunnel.c diff --git a/include/linux/mpls_iptunnel.h b/include/linux/mpls_iptunnel.h new file mode 100644 index 000000000000..ef29eb2d6dfd --- /dev/null +++ b/include/linux/mpls_iptunnel.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_MPLS_IPTUNNEL_H +#define _LINUX_MPLS_IPTUNNEL_H + +#include + +#endif /* _LINUX_MPLS_IPTUNNEL_H */ diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h new file mode 100644 index 000000000000..4757997f76ed --- /dev/null +++ b/include/net/mpls_iptunnel.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2015 Cumulus Networks, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef _NET_MPLS_IPTUNNEL_H +#define _NET_MPLS_IPTUNNEL_H 1 + +#define MAX_NEW_LABELS 2 + +struct mpls_iptunnel_encap { + u32 label[MAX_NEW_LABELS]; + u32 labels; +}; + +static inline struct mpls_iptunnel_encap *mpls_lwtunnel_encap(struct lwtunnel_state *lwtstate) +{ + return (struct mpls_iptunnel_encap *)lwtstate->data; +} + +#endif diff --git a/include/uapi/linux/mpls_iptunnel.h b/include/uapi/linux/mpls_iptunnel.h new file mode 100644 index 000000000000..d80a0498f77e --- /dev/null +++ b/include/uapi/linux/mpls_iptunnel.h @@ -0,0 +1,28 @@ +/* + * mpls tunnel api + * + * Authors: + * Roopa Prabhu + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _UAPI_LINUX_MPLS_IPTUNNEL_H +#define _UAPI_LINUX_MPLS_IPTUNNEL_H + +/* MPLS tunnel attributes + * [RTA_ENCAP] = { + * [MPLS_IPTUNNEL_DST] + * } + */ +enum { + MPLS_IPTUNNEL_UNSPEC, + MPLS_IPTUNNEL_DST, + __MPLS_IPTUNNEL_MAX, +}; +#define MPLS_IPTUNNEL_MAX (__MPLS_IPTUNNEL_MAX - 1) + +#endif /* _UAPI_LINUX_MPLS_IPTUNNEL_H */ diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig index 17bde799c854..5c467ef97311 100644 --- a/net/mpls/Kconfig +++ b/net/mpls/Kconfig @@ -24,7 +24,13 @@ config NET_MPLS_GSO config MPLS_ROUTING tristate "MPLS: routing support" - help + ---help--- Add support for forwarding of mpls packets. +config MPLS_IPTUNNEL + tristate "MPLS: IP over MPLS tunnel support" + depends on LWTUNNEL && MPLS_ROUTING + ---help--- + mpls ip tunnel support. + endif # MPLS diff --git a/net/mpls/Makefile b/net/mpls/Makefile index 65bbe68c72e6..9ca923625016 100644 --- a/net/mpls/Makefile +++ b/net/mpls/Makefile @@ -3,5 +3,6 @@ # obj-$(CONFIG_NET_MPLS_GSO) += mpls_gso.o obj-$(CONFIG_MPLS_ROUTING) += mpls_router.o +obj-$(CONFIG_MPLS_IPTUNNEL) += mpls_iptunnel.o mpls_router-y := af_mpls.o diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c new file mode 100644 index 000000000000..eea096f21ba5 --- /dev/null +++ b/net/mpls/mpls_iptunnel.c @@ -0,0 +1,233 @@ +/* + * mpls tunnels An implementation mpls tunnels using the light weight tunnel + * infrastructure + * + * Authors: Roopa Prabhu, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static const struct nla_policy mpls_iptunnel_policy[MPLS_IPTUNNEL_MAX + 1] = { + [MPLS_IPTUNNEL_DST] = { .type = NLA_U32 }, +}; + +static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en) +{ + /* The size of the layer 2.5 labels to be added for this route */ + return en->labels * sizeof(struct mpls_shim_hdr); +} + +int mpls_output(struct sock *sk, struct sk_buff *skb) +{ + struct mpls_iptunnel_encap *tun_encap_info; + struct mpls_shim_hdr *hdr; + struct net_device *out_dev; + unsigned int hh_len; + unsigned int new_header_size; + unsigned int mtu; + struct dst_entry *dst = skb_dst(skb); + struct rtable *rt = NULL; + struct rt6_info *rt6 = NULL; + struct lwtunnel_state *lwtstate = NULL; + int err = 0; + bool bos; + int i; + unsigned int ttl; + + /* Obtain the ttl */ + if (skb->protocol == htons(ETH_P_IP)) { + ttl = ip_hdr(skb)->ttl; + rt = (struct rtable *)dst; + lwtstate = rt->rt_lwtstate; + } else if (skb->protocol == htons(ETH_P_IPV6)) { + ttl = ipv6_hdr(skb)->hop_limit; + rt6 = (struct rt6_info *)dst; + lwtstate = rt6->rt6i_lwtstate; + } else { + goto drop; + } + + skb_orphan(skb); + + /* Find the output device */ + out_dev = rcu_dereference(dst->dev); + if (!mpls_output_possible(out_dev) || + !lwtstate || skb_warn_if_lro(skb)) + goto drop; + + skb_forward_csum(skb); + + tun_encap_info = mpls_lwtunnel_encap(lwtstate); + + /* Verify the destination can hold the packet */ + new_header_size = mpls_encap_size(tun_encap_info); + mtu = mpls_dev_mtu(out_dev); + if (mpls_pkt_too_big(skb, mtu - new_header_size)) + goto drop; + + hh_len = LL_RESERVED_SPACE(out_dev); + if (!out_dev->header_ops) + hh_len = 0; + + /* Ensure there is enough space for the headers in the skb */ + if (skb_cow(skb, hh_len + new_header_size)) + goto drop; + + skb_push(skb, new_header_size); + skb_reset_network_header(skb); + + skb->dev = out_dev; + skb->protocol = htons(ETH_P_MPLS_UC); + + /* Push the new labels */ + hdr = mpls_hdr(skb); + bos = true; + for (i = tun_encap_info->labels - 1; i >= 0; i--) { + hdr[i] = mpls_entry_encode(tun_encap_info->label[i], + ttl, 0, bos); + bos = false; + } + + if (rt) + err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway, + skb); + else if (rt6) + err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt6->rt6i_gateway, + skb); + if (err) + net_dbg_ratelimited("%s: packet transmission failed: %d\n", + __func__, err); + + return 0; + +drop: + kfree_skb(skb); + return -EINVAL; +} + +static int mpls_build_state(struct net_device *dev, struct nlattr *nla, + struct lwtunnel_state **ts) +{ + struct mpls_iptunnel_encap *tun_encap_info; + struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1]; + struct lwtunnel_state *newts; + int tun_encap_info_len; + int ret; + + ret = nla_parse_nested(tb, MPLS_IPTUNNEL_MAX, nla, + mpls_iptunnel_policy); + if (ret < 0) + return ret; + + if (!tb[MPLS_IPTUNNEL_DST]) + return -EINVAL; + + tun_encap_info_len = sizeof(*tun_encap_info); + + newts = lwtunnel_state_alloc(tun_encap_info_len); + if (!newts) + return -ENOMEM; + + newts->len = tun_encap_info_len; + tun_encap_info = mpls_lwtunnel_encap(newts); + ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS, + &tun_encap_info->labels, tun_encap_info->label); + if (ret) + goto errout; + newts->type = LWTUNNEL_ENCAP_MPLS; + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + + *ts = newts; + + return 0; + +errout: + kfree(newts); + *ts = NULL; + + return ret; +} + +static int mpls_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct mpls_iptunnel_encap *tun_encap_info; + + tun_encap_info = mpls_lwtunnel_encap(lwtstate); + + if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels, + tun_encap_info->label)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int mpls_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + struct mpls_iptunnel_encap *tun_encap_info; + + tun_encap_info = mpls_lwtunnel_encap(lwtstate); + + return nla_total_size(tun_encap_info->labels * 4); +} + +static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct mpls_iptunnel_encap *a_hdr = mpls_lwtunnel_encap(a); + struct mpls_iptunnel_encap *b_hdr = mpls_lwtunnel_encap(b); + int l; + + if (a_hdr->labels != b_hdr->labels) + return 1; + + for (l = 0; l < MAX_NEW_LABELS; l++) + if (a_hdr->label[l] != b_hdr->label[l]) + return 1; + return 0; +} + +static const struct lwtunnel_encap_ops mpls_iptun_ops = { + .build_state = mpls_build_state, + .output = mpls_output, + .fill_encap = mpls_fill_encap_info, + .get_encap_size = mpls_encap_nlsize, + .cmp_encap = mpls_encap_cmp, +}; + +static int __init mpls_iptunnel_init(void) +{ + return lwtunnel_encap_add_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS); +} +module_init(mpls_iptunnel_init); + +static void __exit mpls_iptunnel_exit(void) +{ + lwtunnel_encap_del_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS); +} +module_exit(mpls_iptunnel_exit); + +MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels"); +MODULE_LICENSE("GPL v2"); From 1d8fff907342d2339796dbd27ea47d0e76a6a2d0 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:43:54 +0200 Subject: [PATCH 10/22] ip_tunnel: Make ovs_tunnel_info and ovs_key_ipv4_tunnel generic Rename the tunnel metadata data structures currently internal to OVS and make them generic for use by all IP tunnels. Both structures are kernel internal and will stay that way. Their members are exposed to user space through individual Netlink attributes by OVS. It will therefore be possible to extend/modify these structures without affecting user ABI. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 63 ++++++++++++++++++++++++++ include/uapi/linux/openvswitch.h | 2 +- net/openvswitch/actions.c | 2 +- net/openvswitch/datapath.h | 5 ++- net/openvswitch/flow.c | 4 +- net/openvswitch/flow.h | 76 ++------------------------------ net/openvswitch/flow_netlink.c | 16 +++---- net/openvswitch/flow_netlink.h | 2 +- net/openvswitch/vport-geneve.c | 17 ++++--- net/openvswitch/vport-gre.c | 16 +++---- net/openvswitch/vport-vxlan.c | 18 ++++---- net/openvswitch/vport.c | 30 ++++++------- net/openvswitch/vport.h | 12 ++--- 13 files changed, 128 insertions(+), 135 deletions(-) diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index d8214cb88bbc..6b9d559ce5f5 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -22,6 +22,28 @@ /* Keep error state on tunnel for 30 sec */ #define IPTUNNEL_ERR_TIMEO (30*HZ) +/* Used to memset ip_tunnel padding. */ +#define IP_TUNNEL_KEY_SIZE \ + (offsetof(struct ip_tunnel_key, tp_dst) + \ + FIELD_SIZEOF(struct ip_tunnel_key, tp_dst)) + +struct ip_tunnel_key { + __be64 tun_id; + __be32 ipv4_src; + __be32 ipv4_dst; + __be16 tun_flags; + __u8 ipv4_tos; + __u8 ipv4_ttl; + __be16 tp_src; + __be16 tp_dst; +} __packed __aligned(4); /* Minimize padding. */ + +struct ip_tunnel_info { + struct ip_tunnel_key key; + const void *options; + u8 options_len; +}; + /* 6rd prefix/relay information */ #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd_parm { @@ -136,6 +158,47 @@ int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *op, int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, unsigned int num); +static inline void __ip_tunnel_info_init(struct ip_tunnel_info *tun_info, + __be32 saddr, __be32 daddr, + u8 tos, u8 ttl, + __be16 tp_src, __be16 tp_dst, + __be64 tun_id, __be16 tun_flags, + const void *opts, u8 opts_len) +{ + tun_info->key.tun_id = tun_id; + tun_info->key.ipv4_src = saddr; + tun_info->key.ipv4_dst = daddr; + tun_info->key.ipv4_tos = tos; + tun_info->key.ipv4_ttl = ttl; + tun_info->key.tun_flags = tun_flags; + + /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of + * the upper tunnel are used. + * E.g: GRE over IPSEC, the tp_src and tp_port are zero. + */ + tun_info->key.tp_src = tp_src; + tun_info->key.tp_dst = tp_dst; + + /* Clear struct padding. */ + if (sizeof(tun_info->key) != IP_TUNNEL_KEY_SIZE) + memset((unsigned char *)&tun_info->key + IP_TUNNEL_KEY_SIZE, + 0, sizeof(tun_info->key) - IP_TUNNEL_KEY_SIZE); + + tun_info->options = opts; + tun_info->options_len = opts_len; +} + +static inline void ip_tunnel_info_init(struct ip_tunnel_info *tun_info, + const struct iphdr *iph, + __be16 tp_src, __be16 tp_dst, + __be64 tun_id, __be16 tun_flags, + const void *opts, u8 opts_len) +{ + __ip_tunnel_info_init(tun_info, iph->saddr, iph->daddr, + iph->tos, iph->ttl, tp_src, tp_dst, + tun_id, tun_flags, opts, opts_len); +} + #ifdef CONFIG_INET int ip_tunnel_init(struct net_device *dev); diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 1dab77601c21..d6b885460187 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -321,7 +321,7 @@ enum ovs_key_attr { * the accepted length of the array. */ #ifdef __KERNEL__ - OVS_KEY_ATTR_TUNNEL_INFO, /* struct ovs_tunnel_info */ + OVS_KEY_ATTR_TUNNEL_INFO, /* struct ip_tunnel_info */ #endif __OVS_KEY_ATTR_MAX }; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 8a8c0b8b4f63..27c1687cfd92 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -611,7 +611,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, const struct nlattr *actions, int actions_len) { - struct ovs_tunnel_info info; + struct ip_tunnel_info info; struct dp_upcall_info upcall; const struct nlattr *a; int rem; diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index cd691e935e08..6b28c5cedb23 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -25,6 +25,7 @@ #include #include #include +#include #include "flow.h" #include "flow_table.h" @@ -98,7 +99,7 @@ struct datapath { * when a packet is received by OVS. */ struct ovs_skb_cb { - struct ovs_tunnel_info *egress_tun_info; + struct ip_tunnel_info *egress_tun_info; struct vport *input_vport; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -114,7 +115,7 @@ struct ovs_skb_cb { * @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY. */ struct dp_upcall_info { - const struct ovs_tunnel_info *egress_tun_info; + const struct ip_tunnel_info *egress_tun_info; const struct nlattr *userdata; const struct nlattr *actions; int actions_len; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index bc7b0aba994a..8db22ef73626 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -682,12 +682,12 @@ int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) return key_extract(skb, key); } -int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info, +int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key) { /* Extract metadata from packet. */ if (tun_info) { - memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key)); + memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key)); if (tun_info->options) { BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) * diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index a076e445ccc2..cadc6c5c3545 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -32,31 +32,10 @@ #include #include #include +#include struct sk_buff; -/* Used to memset ovs_key_ipv4_tunnel padding. */ -#define OVS_TUNNEL_KEY_SIZE \ - (offsetof(struct ovs_key_ipv4_tunnel, tp_dst) + \ - FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, tp_dst)) - -struct ovs_key_ipv4_tunnel { - __be64 tun_id; - __be32 ipv4_src; - __be32 ipv4_dst; - __be16 tun_flags; - u8 ipv4_tos; - u8 ipv4_ttl; - __be16 tp_src; - __be16 tp_dst; -} __packed __aligned(4); /* Minimize padding. */ - -struct ovs_tunnel_info { - struct ovs_key_ipv4_tunnel tunnel; - const void *options; - u8 options_len; -}; - /* Store options at the end of the array if they are less than the * maximum size. This allows us to get the benefits of variable length * matching for small options. @@ -66,55 +45,6 @@ struct ovs_tunnel_info { #define TUN_METADATA_OPTS(flow_key, opt_len) \ ((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len))) -static inline void __ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, - __be32 saddr, __be32 daddr, - u8 tos, u8 ttl, - __be16 tp_src, - __be16 tp_dst, - __be64 tun_id, - __be16 tun_flags, - const void *opts, - u8 opts_len) -{ - tun_info->tunnel.tun_id = tun_id; - tun_info->tunnel.ipv4_src = saddr; - tun_info->tunnel.ipv4_dst = daddr; - tun_info->tunnel.ipv4_tos = tos; - tun_info->tunnel.ipv4_ttl = ttl; - tun_info->tunnel.tun_flags = tun_flags; - - /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of - * the upper tunnel are used. - * E.g: GRE over IPSEC, the tp_src and tp_port are zero. - */ - tun_info->tunnel.tp_src = tp_src; - tun_info->tunnel.tp_dst = tp_dst; - - /* Clear struct padding. */ - if (sizeof(tun_info->tunnel) != OVS_TUNNEL_KEY_SIZE) - memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, - 0, sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE); - - tun_info->options = opts; - tun_info->options_len = opts_len; -} - -static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, - const struct iphdr *iph, - __be16 tp_src, - __be16 tp_dst, - __be64 tun_id, - __be16 tun_flags, - const void *opts, - u8 opts_len) -{ - __ovs_flow_tun_info_init(tun_info, iph->saddr, iph->daddr, - iph->tos, iph->ttl, - tp_src, tp_dst, - tun_id, tun_flags, - opts, opts_len); -} - #define OVS_SW_FLOW_KEY_METADATA_SIZE \ (offsetof(struct sw_flow_key, recirc_id) + \ FIELD_SIZEOF(struct sw_flow_key, recirc_id)) @@ -122,7 +52,7 @@ static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, struct sw_flow_key { u8 tun_opts[255]; u8 tun_opts_len; - struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */ + struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */ struct { u32 priority; /* Packet QoS priority. */ u32 skb_mark; /* SKB mark. */ @@ -273,7 +203,7 @@ void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key); -int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info, +int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key); /* Extract key from packet coming from userspace. */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 624e41c4267f..ecfa530d3461 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -641,7 +641,7 @@ static int vxlan_opt_to_nlattr(struct sk_buff *skb, } static int __ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *output, + const struct ip_tunnel_key *output, const void *tun_opts, int swkey_tun_opts_len) { if (output->tun_flags & TUNNEL_KEY && @@ -689,7 +689,7 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb, } static int ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *output, + const struct ip_tunnel_key *output, const void *tun_opts, int swkey_tun_opts_len) { struct nlattr *nla; @@ -708,9 +708,9 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, } int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb, - const struct ovs_tunnel_info *egress_tun_info) + const struct ip_tunnel_info *egress_tun_info) { - return __ipv4_tun_to_nlattr(skb, &egress_tun_info->tunnel, + return __ipv4_tun_to_nlattr(skb, &egress_tun_info->key, egress_tun_info->options, egress_tun_info->options_len); } @@ -1746,7 +1746,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, { struct sw_flow_match match; struct sw_flow_key key; - struct ovs_tunnel_info *tun_info; + struct ip_tunnel_info *tun_info; struct nlattr *a; int err = 0, start, opts_type; @@ -1777,7 +1777,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, return PTR_ERR(a); tun_info = nla_data(a); - tun_info->tunnel = key.tun_key; + tun_info->key = key.tun_key; tun_info->options_len = key.tun_opts_len; if (tun_info->options_len) { @@ -2227,13 +2227,13 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) switch (key_type) { case OVS_KEY_ATTR_TUNNEL_INFO: { - struct ovs_tunnel_info *tun_info = nla_data(ovs_key); + struct ip_tunnel_info *tun_info = nla_data(ovs_key); start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); if (!start) return -EMSGSIZE; - err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel, + err = ipv4_tun_to_nlattr(skb, &tun_info->key, tun_info->options_len ? tun_info->options : NULL, tun_info->options_len); diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 5c3d75bff310..ec53eb6e632b 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -55,7 +55,7 @@ int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb); int ovs_nla_get_match(struct sw_flow_match *, const struct nlattr *key, const struct nlattr *mask, bool log); int ovs_nla_put_egress_tunnel_key(struct sk_buff *, - const struct ovs_tunnel_info *); + const struct ip_tunnel_info *); bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log); int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 208c576bd1b6..1da3a14d1010 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -77,7 +77,7 @@ static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb) struct vport *vport = gs->rcv_data; struct genevehdr *geneveh = geneve_hdr(skb); int opts_len; - struct ovs_tunnel_info tun_info; + struct ip_tunnel_info tun_info; __be64 key; __be16 flags; @@ -90,10 +90,9 @@ static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb) key = vni_to_tunnel_id(geneveh->vni); - ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), - udp_hdr(skb)->source, udp_hdr(skb)->dest, - key, flags, - geneveh->options, opts_len); + ip_tunnel_info_init(&tun_info, ip_hdr(skb), + udp_hdr(skb)->source, udp_hdr(skb)->dest, + key, flags, geneveh->options, opts_len); ovs_vport_receive(vport, skb, &tun_info); } @@ -165,8 +164,8 @@ error: static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) { - const struct ovs_key_ipv4_tunnel *tun_key; - struct ovs_tunnel_info *tun_info; + const struct ip_tunnel_key *tun_key; + struct ip_tunnel_info *tun_info; struct net *net = ovs_dp_get_net(vport->dp); struct geneve_port *geneve_port = geneve_vport(vport); __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport; @@ -183,7 +182,7 @@ static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) goto error; } - tun_key = &tun_info->tunnel; + tun_key = &tun_info->key; rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP); if (IS_ERR(rt)) { err = PTR_ERR(rt); @@ -225,7 +224,7 @@ static const char *geneve_get_name(const struct vport *vport) } static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *egress_tun_info) + struct ip_tunnel_info *egress_tun_info) { struct geneve_port *geneve_port = geneve_vport(vport); struct net *net = ovs_dp_get_net(vport->dp); diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index f17ac9642f4e..b87656c66aaf 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -67,9 +67,9 @@ static struct sk_buff *__build_header(struct sk_buff *skb, int tunnel_hlen) { struct tnl_ptk_info tpi; - const struct ovs_key_ipv4_tunnel *tun_key; + const struct ip_tunnel_key *tun_key; - tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; + tun_key = &OVS_CB(skb)->egress_tun_info->key; skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM)); if (IS_ERR(skb)) @@ -97,7 +97,7 @@ static __be64 key_to_tunnel_id(__be32 key, __be32 seq) static int gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) { - struct ovs_tunnel_info tun_info; + struct ip_tunnel_info tun_info; struct ovs_net *ovs_net; struct vport *vport; __be64 key; @@ -108,8 +108,8 @@ static int gre_rcv(struct sk_buff *skb, return PACKET_REJECT; key = key_to_tunnel_id(tpi->key, tpi->seq); - ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), 0, 0, key, - filter_tnl_flags(tpi->flags), NULL, 0); + ip_tunnel_info_init(&tun_info, ip_hdr(skb), 0, 0, key, + filter_tnl_flags(tpi->flags), NULL, 0); ovs_vport_receive(vport, skb, &tun_info); return PACKET_RCVD; @@ -134,7 +134,7 @@ static int gre_err(struct sk_buff *skb, u32 info, static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) { struct net *net = ovs_dp_get_net(vport->dp); - const struct ovs_key_ipv4_tunnel *tun_key; + const struct ip_tunnel_key *tun_key; struct flowi4 fl; struct rtable *rt; int min_headroom; @@ -147,7 +147,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) goto err_free_skb; } - tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; + tun_key = &OVS_CB(skb)->egress_tun_info->key; rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_GRE); if (IS_ERR(rt)) { err = PTR_ERR(rt); @@ -277,7 +277,7 @@ static void gre_tnl_destroy(struct vport *vport) } static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *egress_tun_info) + struct ip_tunnel_info *egress_tun_info) { return ovs_tunnel_get_egress_info(egress_tun_info, ovs_dp_get_net(vport->dp), diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 6d39766e7828..6f7986fabb70 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -64,7 +64,7 @@ static inline struct vxlan_port *vxlan_vport(const struct vport *vport) static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, struct vxlan_metadata *md) { - struct ovs_tunnel_info tun_info; + struct ip_tunnel_info tun_info; struct vxlan_port *vxlan_port; struct vport *vport = vs->data; struct iphdr *iph; @@ -82,9 +82,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, /* Save outer tunnel values */ iph = ip_hdr(skb); key = cpu_to_be64(ntohl(md->vni) >> 8); - ovs_flow_tun_info_init(&tun_info, iph, - udp_hdr(skb)->source, udp_hdr(skb)->dest, - key, flags, &opts, sizeof(opts)); + ip_tunnel_info_init(&tun_info, iph, + udp_hdr(skb)->source, udp_hdr(skb)->dest, + key, flags, &opts, sizeof(opts)); ovs_vport_receive(vport, skb, &tun_info); } @@ -205,13 +205,13 @@ error: static int vxlan_ext_gbp(struct sk_buff *skb) { - const struct ovs_tunnel_info *tun_info; + const struct ip_tunnel_info *tun_info; const struct ovs_vxlan_opts *opts; tun_info = OVS_CB(skb)->egress_tun_info; opts = tun_info->options; - if (tun_info->tunnel.tun_flags & TUNNEL_VXLAN_OPT && + if (tun_info->key.tun_flags & TUNNEL_VXLAN_OPT && tun_info->options_len >= sizeof(*opts)) return opts->gbp; else @@ -224,7 +224,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) struct vxlan_port *vxlan_port = vxlan_vport(vport); struct sock *sk = vxlan_port->vs->sock->sk; __be16 dst_port = inet_sk(sk)->inet_sport; - const struct ovs_key_ipv4_tunnel *tun_key; + const struct ip_tunnel_key *tun_key; struct vxlan_metadata md = {0}; struct rtable *rt; struct flowi4 fl; @@ -238,7 +238,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) goto error; } - tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; + tun_key = &OVS_CB(skb)->egress_tun_info->key; rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP); if (IS_ERR(rt)) { err = PTR_ERR(rt); @@ -269,7 +269,7 @@ error: } static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *egress_tun_info) + struct ip_tunnel_info *egress_tun_info) { struct net *net = ovs_dp_get_net(vport->dp); struct vxlan_port *vxlan_port = vxlan_vport(vport); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 067a3fff1d2c..af23ba077836 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -469,7 +469,7 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) * skb->data should point to the Ethernet header. */ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, - const struct ovs_tunnel_info *tun_info) + const struct ip_tunnel_info *tun_info) { struct pcpu_sw_netstats *stats; struct sw_flow_key key; @@ -572,22 +572,22 @@ void ovs_vport_deferred_free(struct vport *vport) } EXPORT_SYMBOL_GPL(ovs_vport_deferred_free); -int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, +int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info, struct net *net, - const struct ovs_tunnel_info *tun_info, + const struct ip_tunnel_info *tun_info, u8 ipproto, u32 skb_mark, __be16 tp_src, __be16 tp_dst) { - const struct ovs_key_ipv4_tunnel *tun_key; + const struct ip_tunnel_key *tun_key; struct rtable *rt; struct flowi4 fl; if (unlikely(!tun_info)) return -EINVAL; - tun_key = &tun_info->tunnel; + tun_key = &tun_info->key; /* Route lookup to get srouce IP address. * The process may need to be changed if the corresponding process @@ -602,22 +602,22 @@ int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, /* Generate egress_tun_info based on tun_info, * saddr, tp_src and tp_dst */ - __ovs_flow_tun_info_init(egress_tun_info, - fl.saddr, tun_key->ipv4_dst, - tun_key->ipv4_tos, - tun_key->ipv4_ttl, - tp_src, tp_dst, - tun_key->tun_id, - tun_key->tun_flags, - tun_info->options, - tun_info->options_len); + __ip_tunnel_info_init(egress_tun_info, + fl.saddr, tun_key->ipv4_dst, + tun_key->ipv4_tos, + tun_key->ipv4_ttl, + tp_src, tp_dst, + tun_key->tun_id, + tun_key->tun_flags, + tun_info->options, + tun_info->options_len); return 0; } EXPORT_SYMBOL_GPL(ovs_tunnel_get_egress_info); int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *info) + struct ip_tunnel_info *info) { /* get_egress_tun_info() is only implemented on tunnel ports. */ if (unlikely(!vport->ops->get_egress_tun_info)) diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index bc85331a6c60..4750fb673a9f 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -58,15 +58,15 @@ u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *); int ovs_vport_send(struct vport *, struct sk_buff *); -int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, +int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info, struct net *net, - const struct ovs_tunnel_info *tun_info, + const struct ip_tunnel_info *tun_info, u8 ipproto, u32 skb_mark, __be16 tp_src, __be16 tp_dst); int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *info); + struct ip_tunnel_info *info); /* The following definitions are for implementers of vport devices: */ @@ -176,7 +176,7 @@ struct vport_ops { int (*send)(struct vport *, struct sk_buff *); int (*get_egress_tun_info)(struct vport *, struct sk_buff *, - struct ovs_tunnel_info *); + struct ip_tunnel_info *); struct module *owner; struct list_head list; @@ -226,7 +226,7 @@ static inline struct vport *vport_from_priv(void *priv) } void ovs_vport_receive(struct vport *, struct sk_buff *, - const struct ovs_tunnel_info *); + const struct ip_tunnel_info *); static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) @@ -239,7 +239,7 @@ int ovs_vport_ops_register(struct vport_ops *ops); void ovs_vport_ops_unregister(struct vport_ops *ops); static inline struct rtable *ovs_tunnel_route_lookup(struct net *net, - const struct ovs_key_ipv4_tunnel *key, + const struct ip_tunnel_key *key, u32 mark, struct flowi4 *fl, u8 protocol) From 773a69d64bf65eb6c212c97e9737963a2cf668fd Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:43:55 +0200 Subject: [PATCH 11/22] icmp: Don't leak original dst into ip_route_input() ip_route_input() unconditionally overwrites the dst. Hide the original dst attached to the skb by calling skb_dst_set(skb, NULL) prior to ip_route_input(). Reported-by: Julian Anastasov Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index f5203fba6236..c0556f1e4bf0 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -496,6 +496,7 @@ static struct rtable *icmp_route_lookup(struct net *net, } /* Ugh! */ orefdst = skb_in->_skb_refdst; /* save old refdst */ + skb_dst_set(skb_in, NULL); err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, RT_TOS(tos), rt2->dst.dev); From f38a9eb1f77b296ff07e000823884a0f64d67b2a Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:43:56 +0200 Subject: [PATCH 12/22] dst: Metadata destinations Introduces a new dst_metadata which enables to carry per packet metadata between forwarding and processing elements via the skb->dst pointer. The structure is set up to be a union. Thus, each separate type of metadata requires its own dst instance. If demand arises to carry multiple types of metadata concurrently, metadata dst entries can be made stackable. The metadata dst entry is refcnt'ed as expected for now but a non reference counted use is possible if the reference is forced before queueing the skb. In order to allow allocating dsts with variable length, the existing dst_alloc() is split into a dst_alloc() and dst_init() function. The existing dst_init() function to initialize the subsystem is being renamed to dst_subsys_init() to make it clear what is what. The check before ip_route_input() is changed to ignore metadata dsts and drop the dst inside the routing function thus allowing to interpret metadata in a later commit. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/dst.h | 6 ++- include/net/dst_metadata.h | 32 +++++++++++++++ net/core/dev.c | 2 +- net/core/dst.c | 84 +++++++++++++++++++++++++++++++------- net/ipv4/ip_input.c | 3 +- net/ipv4/route.c | 2 + 6 files changed, 112 insertions(+), 17 deletions(-) create mode 100644 include/net/dst_metadata.h diff --git a/include/net/dst.h b/include/net/dst.h index 2bc73f8a00a9..2578811cef51 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -57,6 +57,7 @@ struct dst_entry { #define DST_FAKE_RTABLE 0x0040 #define DST_XFRM_TUNNEL 0x0080 #define DST_XFRM_QUEUE 0x0100 +#define DST_METADATA 0x0200 unsigned short pending_confirm; @@ -356,6 +357,9 @@ static inline int dst_discard(struct sk_buff *skb) } void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags); +void dst_init(struct dst_entry *dst, struct dst_ops *ops, + struct net_device *dev, int initial_ref, int initial_obsolete, + unsigned short flags); void __dst_free(struct dst_entry *dst); struct dst_entry *dst_destroy(struct dst_entry *dst); @@ -457,7 +461,7 @@ static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie) return dst; } -void dst_init(void); +void dst_subsys_init(void); /* Flags for xfrm_lookup flags argument. */ enum { diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h new file mode 100644 index 000000000000..4f7694f3c7d0 --- /dev/null +++ b/include/net/dst_metadata.h @@ -0,0 +1,32 @@ +#ifndef __NET_DST_METADATA_H +#define __NET_DST_METADATA_H 1 + +#include +#include +#include + +struct metadata_dst { + struct dst_entry dst; + size_t opts_len; +}; + +static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) +{ + struct metadata_dst *md_dst = (struct metadata_dst *) skb_dst(skb); + + if (md_dst && md_dst->dst.flags & DST_METADATA) + return md_dst; + + return NULL; +} + +static inline bool skb_valid_dst(const struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + + return dst && !(dst->flags & DST_METADATA); +} + +struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags); + +#endif /* __NET_DST_METADATA_H */ diff --git a/net/core/dev.c b/net/core/dev.c index 2ee15afb412d..cb52cba30ae8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7669,7 +7669,7 @@ static int __init net_dev_init(void) open_softirq(NET_RX_SOFTIRQ, net_rx_action); hotcpu_notifier(dev_cpu_callback, 0); - dst_init(); + dst_subsys_init(); rc = 0; out: return rc; diff --git a/net/core/dst.c b/net/core/dst.c index e956ce6d1378..917364f0d0be 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -22,6 +22,7 @@ #include #include +#include /* * Theory of operations: @@ -158,19 +159,10 @@ const u32 dst_default_metrics[RTAX_MAX + 1] = { [RTAX_MAX] = 0xdeadbeef, }; - -void *dst_alloc(struct dst_ops *ops, struct net_device *dev, - int initial_ref, int initial_obsolete, unsigned short flags) +void dst_init(struct dst_entry *dst, struct dst_ops *ops, + struct net_device *dev, int initial_ref, int initial_obsolete, + unsigned short flags) { - struct dst_entry *dst; - - if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { - if (ops->gc(ops)) - return NULL; - } - dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); - if (!dst) - return NULL; dst->child = NULL; dst->dev = dev; if (dev) @@ -200,6 +192,25 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, dst->next = NULL; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); +} +EXPORT_SYMBOL(dst_init); + +void *dst_alloc(struct dst_ops *ops, struct net_device *dev, + int initial_ref, int initial_obsolete, unsigned short flags) +{ + struct dst_entry *dst; + + if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { + if (ops->gc(ops)) + return NULL; + } + + dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); + if (!dst) + return NULL; + + dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags); + return dst; } EXPORT_SYMBOL(dst_alloc); @@ -248,7 +259,11 @@ again: dst->ops->destroy(dst); if (dst->dev) dev_put(dst->dev); - kmem_cache_free(dst->ops->kmem_cachep, dst); + + if (dst->flags & DST_METADATA) + kfree(dst); + else + kmem_cache_free(dst->ops->kmem_cachep, dst); dst = child; if (dst) { @@ -327,6 +342,47 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) } EXPORT_SYMBOL(__dst_destroy_metrics_generic); +static struct dst_ops md_dst_ops = { + .family = AF_UNSPEC, +}; + +static int dst_md_discard_sk(struct sock *sk, struct sk_buff *skb) +{ + WARN_ONCE(1, "Attempting to call output on metadata dst\n"); + kfree_skb(skb); + return 0; +} + +static int dst_md_discard(struct sk_buff *skb) +{ + WARN_ONCE(1, "Attempting to call input on metadata dst\n"); + kfree_skb(skb); + return 0; +} + +struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) +{ + struct metadata_dst *md_dst; + struct dst_entry *dst; + + md_dst = kmalloc(sizeof(*md_dst) + optslen, flags); + if (!md_dst) + return ERR_PTR(-ENOMEM); + + dst = &md_dst->dst; + dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE, + DST_METADATA | DST_NOCACHE | DST_NOCOUNT); + + dst->input = dst_md_discard; + dst->output = dst_md_discard_sk; + + memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); + md_dst->opts_len = optslen; + + return md_dst; +} +EXPORT_SYMBOL_GPL(metadata_dst_alloc); + /* Dirty hack. We did it in 2.2 (in __dst_free), * we have _very_ good reasons not to repeat * this mistake in 2.3, but we have no choice @@ -391,7 +447,7 @@ static struct notifier_block dst_dev_notifier = { .priority = -10, /* must be called after other network notifiers */ }; -void __init dst_init(void) +void __init dst_subsys_init(void) { register_netdevice_notifier(&dst_dev_notifier); } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 2db4c8773c1b..f4fc8a77aaa7 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -146,6 +146,7 @@ #include #include #include +#include /* * Process Router Attention IP option (RFC 2113) @@ -331,7 +332,7 @@ static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb) * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. */ - if (!skb_dst(skb)) { + if (!skb_valid_dst(skb)) { int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, iph->tos, skb->dev); if (unlikely(err)) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index cd3157c464e6..4c8e84e75871 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1690,6 +1690,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, by fib_lookup. */ + skb_dst_drop(skb); + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) goto martian_source; From 0accfc268f4d3345693d3af3d5780aae3ad93d8d Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:43:57 +0200 Subject: [PATCH 13/22] arp: Inherit metadata dst when creating ARP requests If output device wants to see the dst, inherit the dst of the original skb and pass it on to generate the ARP request. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/arp.c | 65 ++++++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 933a92820d26..1d59e50ce8b7 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -291,6 +291,40 @@ static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb) kfree_skb(skb); } +/* Create and send an arp packet. */ +static void arp_send_dst(int type, int ptype, __be32 dest_ip, + struct net_device *dev, __be32 src_ip, + const unsigned char *dest_hw, + const unsigned char *src_hw, + const unsigned char *target_hw, struct sk_buff *oskb) +{ + struct sk_buff *skb; + + /* arp on this interface. */ + if (dev->flags & IFF_NOARP) + return; + + skb = arp_create(type, ptype, dest_ip, dev, src_ip, + dest_hw, src_hw, target_hw); + if (!skb) + return; + + if (oskb) + skb_dst_copy(skb, oskb); + + arp_xmit(skb); +} + +void arp_send(int type, int ptype, __be32 dest_ip, + struct net_device *dev, __be32 src_ip, + const unsigned char *dest_hw, const unsigned char *src_hw, + const unsigned char *target_hw) +{ + arp_send_dst(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, + target_hw, NULL); +} +EXPORT_SYMBOL(arp_send); + static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) { __be32 saddr = 0; @@ -346,8 +380,9 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) } } - arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, - dst_hw, dev->dev_addr, NULL); + arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, + dst_hw, dev->dev_addr, NULL, + dev->priv_flags & IFF_XMIT_DST_RELEASE ? NULL : skb); } static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) @@ -596,32 +631,6 @@ void arp_xmit(struct sk_buff *skb) } EXPORT_SYMBOL(arp_xmit); -/* - * Create and send an arp packet. - */ -void arp_send(int type, int ptype, __be32 dest_ip, - struct net_device *dev, __be32 src_ip, - const unsigned char *dest_hw, const unsigned char *src_hw, - const unsigned char *target_hw) -{ - struct sk_buff *skb; - - /* - * No arp on this interface. - */ - - if (dev->flags&IFF_NOARP) - return; - - skb = arp_create(type, ptype, dest_ip, dev, src_ip, - dest_hw, src_hw, target_hw); - if (!skb) - return; - - arp_xmit(skb); -} -EXPORT_SYMBOL(arp_send); - /* * Process an arp request. */ From ee122c79d4227f6ec642157834b6a90fcffa4382 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:43:58 +0200 Subject: [PATCH 14/22] vxlan: Flow based tunneling Allows putting a VXLAN device into a new flow-based mode in which skbs with a ip_tunnel_info dst metadata attached will be encapsulated according to the instructions stored in there with the VXLAN device defaults taken into consideration. Similar on the receive side, if the VXLAN_F_COLLECT_METADATA flag is set, the packet processing will populate a ip_tunnel_info struct for each packet received and attach it to the skb using the new metadata dst. The metadata structure will contain the outer header and tunnel header fields which have been stripped off. Layers further up in the stack such as routing, tc or netfitler can later match on these fields and perform forwarding. It is the responsibility of upper layers to ensure that the flag is set if the metadata is needed. The flag limits the additional cost of metadata collecting based on demand. This prepares the VXLAN device to be steered by the routing and other subsystems which allows to support encapsulation for a large number of tunnel endpoints and tunnel ids through a single net_device which improves the scalability. It also allows for OVS to leverage this mode which in turn allows for the removal of the OVS specific VXLAN code. Because the skb is currently scrubed in vxlan_rcv(), the attachment of the new dst metadata is postponed until after scrubing which requires the temporary addition of a new member to vxlan_metadata. This member is removed again in a later commit after the indirect VXLAN receive API has been removed. Signed-off-by: Thomas Graf Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 149 +++++++++++++++++++++++++++++------ include/linux/skbuff.h | 1 + include/net/dst_metadata.h | 13 +++ include/net/ip_tunnels.h | 14 ++++ include/net/vxlan.h | 10 ++- include/uapi/linux/if_link.h | 1 + 6 files changed, 165 insertions(+), 23 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index ec86a11743fd..06c092b05a51 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -49,6 +49,7 @@ #include #include #endif +#include #define VXLAN_VERSION "0.1" @@ -140,6 +141,11 @@ struct vxlan_dev { static u32 vxlan_salt __read_mostly; static struct workqueue_struct *vxlan_wq; +static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) +{ + return vs->flags & VXLAN_F_COLLECT_METADATA; +} + #if IS_ENABLED(CONFIG_IPV6) static inline bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) @@ -1164,10 +1170,13 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, /* Callback from net/ipv4/udp.c to receive packets */ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { + struct metadata_dst *tun_dst = NULL; + struct ip_tunnel_info *info; struct vxlan_sock *vs; struct vxlanhdr *vxh; u32 flags, vni; - struct vxlan_metadata md = {0}; + struct vxlan_metadata _md; + struct vxlan_metadata *md = &_md; /* Need Vxlan and inner Ethernet header to be present */ if (!pskb_may_pull(skb, VXLAN_HLEN)) @@ -1202,6 +1211,33 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) vni &= VXLAN_VNI_MASK; } + if (vxlan_collect_metadata(vs)) { + const struct iphdr *iph = ip_hdr(skb); + + tun_dst = metadata_dst_alloc(sizeof(*md), GFP_ATOMIC); + if (!tun_dst) + goto drop; + + info = &tun_dst->u.tun_info; + info->key.ipv4_src = iph->saddr; + info->key.ipv4_dst = iph->daddr; + info->key.ipv4_tos = iph->tos; + info->key.ipv4_ttl = iph->ttl; + info->key.tp_src = udp_hdr(skb)->source; + info->key.tp_dst = udp_hdr(skb)->dest; + + info->mode = IP_TUNNEL_INFO_RX; + info->key.tun_flags = TUNNEL_KEY; + info->key.tun_id = cpu_to_be64(vni >> 8); + if (udp_hdr(skb)->check != 0) + info->key.tun_flags |= TUNNEL_CSUM; + + md = ip_tunnel_info_opts(info, sizeof(*md)); + md->tun_dst = tun_dst; + } else { + memset(md, 0, sizeof(*md)); + } + /* For backwards compatibility, only allow reserved fields to be * used by VXLAN extensions if explicitly requested. */ @@ -1209,13 +1245,16 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) struct vxlanhdr_gbp *gbp; gbp = (struct vxlanhdr_gbp *)vxh; - md.gbp = ntohs(gbp->policy_id); + md->gbp = ntohs(gbp->policy_id); + + if (tun_dst) + info->key.tun_flags |= TUNNEL_VXLAN_OPT; if (gbp->dont_learn) - md.gbp |= VXLAN_GBP_DONT_LEARN; + md->gbp |= VXLAN_GBP_DONT_LEARN; if (gbp->policy_applied) - md.gbp |= VXLAN_GBP_POLICY_APPLIED; + md->gbp |= VXLAN_GBP_POLICY_APPLIED; flags &= ~VXLAN_GBP_USED_BITS; } @@ -1233,8 +1272,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) goto bad_flags; } - md.vni = vxh->vx_vni; - vs->rcv(vs, skb, &md); + md->vni = vxh->vx_vni; + vs->rcv(vs, skb, md); return 0; drop: @@ -1247,6 +1286,9 @@ bad_flags: ntohl(vxh->vx_flags), ntohl(vxh->vx_vni)); error: + if (tun_dst) + dst_release((struct dst_entry *)tun_dst); + /* Return non vxlan pkt */ return 1; } @@ -1263,7 +1305,12 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, int err = 0; union vxlan_addr *remote_ip; - vni = ntohl(md->vni) >> 8; + /* For flow based devices, map all packets to VNI 0 */ + if (vs->flags & VXLAN_F_FLOW_BASED) + vni = 0; + else + vni = ntohl(md->vni) >> 8; + /* Is this VNI defined? */ vxlan = vxlan_vs_find_vni(vs, vni); if (!vxlan) @@ -1292,12 +1339,19 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, #endif } + if (md->tun_dst) { + skb_dst_set(skb, (struct dst_entry *)md->tun_dst); + md->tun_dst = NULL; + } + if ((vxlan->flags & VXLAN_F_LEARN) && vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source)) goto drop; skb_reset_network_header(skb); - skb->mark = md->gbp; + /* In flow-based mode, GBP is carried in dst_metadata */ + if (!(vs->flags & VXLAN_F_FLOW_BASED)) + skb->mark = md->gbp; if (oip6) err = IP6_ECN_decapsulate(oip6, skb); @@ -1330,6 +1384,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, return; drop: + if (md->tun_dst) + dst_release((struct dst_entry *)md->tun_dst); + /* Consume bad packet */ kfree_skb(skb); } @@ -1878,22 +1935,40 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, struct vxlan_rdst *rdst, bool did_rsc) { + struct ip_tunnel_info *info = skb_tunnel_info(skb); struct vxlan_dev *vxlan = netdev_priv(dev); struct sock *sk = vxlan->vn_sock->sock->sk; struct rtable *rt = NULL; const struct iphdr *old_iph; struct flowi4 fl4; union vxlan_addr *dst; - struct vxlan_metadata md; + union vxlan_addr remote_ip; + struct vxlan_metadata _md; + struct vxlan_metadata *md = &_md; __be16 src_port = 0, dst_port; u32 vni; __be16 df = 0; __u8 tos, ttl; int err; + u32 flags = vxlan->flags; - dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port; - vni = rdst->remote_vni; - dst = &rdst->remote_ip; + if (rdst) { + dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port; + vni = rdst->remote_vni; + dst = &rdst->remote_ip; + } else { + if (!info) { + WARN_ONCE(1, "%s: Missing encapsulation instructions\n", + dev->name); + goto drop; + } + + dst_port = info->key.tp_dst ? : vxlan->dst_port; + vni = be64_to_cpu(info->key.tun_id); + remote_ip.sin.sin_family = AF_INET; + remote_ip.sin.sin_addr.s_addr = info->key.ipv4_dst; + dst = &remote_ip; + } if (vxlan_addr_any(dst)) { if (did_rsc) { @@ -1918,8 +1993,25 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, vxlan->port_max, true); if (dst->sa.sa_family == AF_INET) { + if (info) { + if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) + df = htons(IP_DF); + if (info->key.tun_flags & TUNNEL_CSUM) + flags |= VXLAN_F_UDP_CSUM; + else + flags &= ~VXLAN_F_UDP_CSUM; + + ttl = info->key.ipv4_ttl; + tos = info->key.ipv4_tos; + + if (info->options_len) + md = ip_tunnel_info_opts(info, sizeof(*md)); + } else { + md->gbp = skb->mark; + } + memset(&fl4, 0, sizeof(fl4)); - fl4.flowi4_oif = rdst->remote_ifindex; + fl4.flowi4_oif = rdst ? rdst->remote_ifindex : 0; fl4.flowi4_tos = RT_TOS(tos); fl4.flowi4_mark = skb->mark; fl4.flowi4_proto = IPPROTO_UDP; @@ -1958,14 +2050,12 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); - md.vni = htonl(vni << 8); - md.gbp = skb->mark; - + md->vni = htonl(vni << 8); err = vxlan_xmit_skb(rt, sk, skb, fl4.saddr, dst->sin.sin_addr.s_addr, tos, ttl, df, - src_port, dst_port, &md, + src_port, dst_port, md, !net_eq(vxlan->net, dev_net(vxlan->dev)), - vxlan->flags); + flags); if (err < 0) { /* skb is already freed. */ skb = NULL; @@ -1980,7 +2070,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, u32 flags; memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_oif = rdst->remote_ifindex; + fl6.flowi6_oif = rdst ? rdst->remote_ifindex : 0; fl6.daddr = dst->sin6.sin6_addr; fl6.saddr = vxlan->saddr.sin6.sin6_addr; fl6.flowi6_mark = skb->mark; @@ -2018,11 +2108,11 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, } ttl = ttl ? : ip6_dst_hoplimit(ndst); - md.vni = htonl(vni << 8); - md.gbp = skb->mark; + md->vni = htonl(vni << 8); + md->gbp = skb->mark; err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr, - 0, ttl, src_port, dst_port, &md, + 0, ttl, src_port, dst_port, md, !net_eq(vxlan->net, dev_net(vxlan->dev)), vxlan->flags); #endif @@ -2051,6 +2141,7 @@ tx_free: static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); + const struct ip_tunnel_info *info = skb_tunnel_info(skb); struct ethhdr *eth; bool did_rsc = false; struct vxlan_rdst *rdst, *fdst = NULL; @@ -2078,6 +2169,12 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) #endif } + if (vxlan->flags & VXLAN_F_FLOW_BASED && + info && info->mode == IP_TUNNEL_INFO_TX) { + vxlan_xmit_one(skb, dev, NULL, false); + return NETDEV_TX_OK; + } + f = vxlan_find_mac(vxlan, eth->h_dest); did_rsc = false; @@ -2405,6 +2502,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_RSC] = { .type = NLA_U8 }, [IFLA_VXLAN_L2MISS] = { .type = NLA_U8 }, [IFLA_VXLAN_L3MISS] = { .type = NLA_U8 }, + [IFLA_VXLAN_FLOWBASED] = { .type = NLA_U8 }, [IFLA_VXLAN_PORT] = { .type = NLA_U16 }, [IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 }, [IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, @@ -2681,6 +2779,10 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev, if (data[IFLA_VXLAN_LIMIT]) vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]); + if (data[IFLA_VXLAN_FLOWBASED] && + nla_get_u8(data[IFLA_VXLAN_FLOWBASED])) + vxlan->flags |= VXLAN_F_FLOW_BASED; + if (data[IFLA_VXLAN_PORT_RANGE]) { const struct ifla_vxlan_port_range *p = nla_data(data[IFLA_VXLAN_PORT_RANGE]); @@ -2777,6 +2879,7 @@ static size_t vxlan_get_size(const struct net_device *dev) nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_FLOWBASED */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */ nla_total_size(sizeof(struct ifla_vxlan_port_range)) + @@ -2843,6 +2946,8 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) !!(vxlan->flags & VXLAN_F_L2MISS)) || nla_put_u8(skb, IFLA_VXLAN_L3MISS, !!(vxlan->flags & VXLAN_F_L3MISS)) || + nla_put_u8(skb, IFLA_VXLAN_FLOWBASED, + !!(vxlan->flags & VXLAN_F_FLOW_BASED)) || nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) || nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax) || nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->dst_port) || diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6bd96fe9416a..648a2c241993 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3469,5 +3469,6 @@ static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb) skb_network_header(skb); return hdr_len + skb_gso_transport_seglen(skb); } + #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 4f7694f3c7d0..e843937fb30a 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -8,6 +8,9 @@ struct metadata_dst { struct dst_entry dst; size_t opts_len; + union { + struct ip_tunnel_info tun_info; + } u; }; static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) @@ -20,6 +23,16 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) return NULL; } +static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb) +{ + struct metadata_dst *md_dst = skb_metadata_dst(skb); + + if (md_dst) + return &md_dst->u.tun_info; + + return NULL; +} + static inline bool skb_valid_dst(const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 6b9d559ce5f5..d11530f1c1e2 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -38,10 +38,19 @@ struct ip_tunnel_key { __be16 tp_dst; } __packed __aligned(4); /* Minimize padding. */ +/* Indicates whether the tunnel info structure represents receive + * or transmit tunnel parameters. + */ +enum { + IP_TUNNEL_INFO_RX, + IP_TUNNEL_INFO_TX, +}; + struct ip_tunnel_info { struct ip_tunnel_key key; const void *options; u8 options_len; + u8 mode; }; /* 6rd prefix/relay information */ @@ -284,6 +293,11 @@ static inline void iptunnel_xmit_stats(int err, } } +static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info, size_t n) +{ + return info + 1; +} + #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */ diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 0082b5d33d7d..80a2da29e088 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -7,6 +7,7 @@ #include #include #include +#include #define VNI_HASH_BITS 10 #define VNI_HASH_SIZE (1< Date: Tue, 21 Jul 2015 10:43:59 +0200 Subject: [PATCH 15/22] route: Extend flow representation with tunnel key Add a new flowi_tunnel structure which is a subset of ip_tunnel_key to allow routes to match on tunnel metadata. For now, the tunnel id is added to flowi_tunnel which allows for routes to be bound to specific virtual tunnels. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/flow.h | 8 ++++++++ net/ipv4/fib_frontend.c | 2 ++ net/ipv4/route.c | 8 ++++++++ 3 files changed, 18 insertions(+) diff --git a/include/net/flow.h b/include/net/flow.h index 8109a159d1b3..3098ae33a178 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -19,6 +19,10 @@ #define LOOPBACK_IFINDEX 1 +struct flowi_tunnel { + __be64 tun_id; +}; + struct flowi_common { int flowic_oif; int flowic_iif; @@ -30,6 +34,7 @@ struct flowi_common { #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 __u32 flowic_secid; + struct flowi_tunnel flowic_tun_key; }; union flowi_uli { @@ -66,6 +71,7 @@ struct flowi4 { #define flowi4_proto __fl_common.flowic_proto #define flowi4_flags __fl_common.flowic_flags #define flowi4_secid __fl_common.flowic_secid +#define flowi4_tun_key __fl_common.flowic_tun_key /* (saddr,daddr) must be grouped, same order as in IP header */ __be32 saddr; @@ -95,6 +101,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->flowi4_proto = proto; fl4->flowi4_flags = flags; fl4->flowi4_secid = 0; + fl4->flowi4_tun_key.tun_id = 0; fl4->daddr = daddr; fl4->saddr = saddr; fl4->fl4_dport = dport; @@ -165,6 +172,7 @@ struct flowi { #define flowi_proto u.__fl_common.flowic_proto #define flowi_flags u.__fl_common.flowic_flags #define flowi_secid u.__fl_common.flowic_secid +#define flowi_tun_key u.__fl_common.flowic_tun_key } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 9b2019cc3586..6b98de0d7949 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -280,6 +280,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_scope = scope; fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; + fl4.flowi4_tun_key.tun_id = 0; if (!fib_lookup(net, &fl4, &res, 0)) return FIB_RES_PREFSRC(net, res); } else { @@ -313,6 +314,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.saddr = dst; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.flowi4_tun_key.tun_id = 0; no_addr = idev->ifa_list == NULL; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4c8e84e75871..91da18be0a71 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -91,6 +91,7 @@ #include #include #include +#include #include #include #include @@ -110,6 +111,7 @@ #include #endif #include +#include #define RT_FL_TOS(oldflp4) \ ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) @@ -1673,6 +1675,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, { struct fib_result res; struct in_device *in_dev = __in_dev_get_rcu(dev); + struct ip_tunnel_info *tun_info; struct flowi4 fl4; unsigned int flags = 0; u32 itag = 0; @@ -1690,6 +1693,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, by fib_lookup. */ + tun_info = skb_tunnel_info(skb); + if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX) + fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id; + else + fl4.flowi4_tun_key.tun_id = 0; skb_dst_drop(skb); if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) From 3093fbe7ff4bc7d1571fc217dade1cf80330a714 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:44:00 +0200 Subject: [PATCH 16/22] route: Per route IP tunnel metadata via lightweight tunnel This introduces a new IP tunnel lightweight tunnel type which allows to specify IP tunnel instructions per route. Only IPv4 is supported at this point. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 10 ++- include/net/dst_metadata.h | 12 +++- include/net/ip_tunnels.h | 7 +- include/uapi/linux/lwtunnel.h | 1 + include/uapi/linux/rtnetlink.h | 15 +++++ net/ipv4/ip_tunnel_core.c | 114 +++++++++++++++++++++++++++++++++ net/ipv4/route.c | 2 +- net/openvswitch/vport.h | 1 + 8 files changed, 157 insertions(+), 5 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 06c092b05a51..9486d7ec128c 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1935,7 +1935,7 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, struct vxlan_rdst *rdst, bool did_rsc) { - struct ip_tunnel_info *info = skb_tunnel_info(skb); + struct ip_tunnel_info *info; struct vxlan_dev *vxlan = netdev_priv(dev); struct sock *sk = vxlan->vn_sock->sock->sk; struct rtable *rt = NULL; @@ -1952,6 +1952,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, int err; u32 flags = vxlan->flags; + /* FIXME: Support IPv6 */ + info = skb_tunnel_info(skb, AF_INET); + if (rdst) { dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port; vni = rdst->remote_vni; @@ -2141,12 +2144,15 @@ tx_free: static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); - const struct ip_tunnel_info *info = skb_tunnel_info(skb); + const struct ip_tunnel_info *info; struct ethhdr *eth; bool did_rsc = false; struct vxlan_rdst *rdst, *fdst = NULL; struct vxlan_fdb *f; + /* FIXME: Support IPv6 */ + info = skb_tunnel_info(skb, AF_INET); + skb_reset_mac_header(skb); eth = eth_hdr(skb); diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index e843937fb30a..7b0306894663 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -23,13 +23,23 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) return NULL; } -static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb) +static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb, + int family) { struct metadata_dst *md_dst = skb_metadata_dst(skb); + struct rtable *rt; if (md_dst) return &md_dst->u.tun_info; + switch (family) { + case AF_INET: + rt = (struct rtable *)skb_dst(skb); + if (rt && rt->rt_lwtstate) + return lwt_tun_info(rt->rt_lwtstate); + break; + } + return NULL; } diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index d11530f1c1e2..0b7e18cfa0b4 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -9,9 +9,9 @@ #include #include #include -#include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include @@ -298,6 +298,11 @@ static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info, size_t n) return info + 1; } +static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) +{ + return (struct ip_tunnel_info *)lwtstate->data; +} + #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */ diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index aa611d931a31..31377bbea3f8 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -6,6 +6,7 @@ enum lwtunnel_encap_types { LWTUNNEL_ENCAP_NONE, LWTUNNEL_ENCAP_MPLS, + LWTUNNEL_ENCAP_IP, __LWTUNNEL_ENCAP_MAX, }; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 0d3d3cc43356..47d24cb3fbc1 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -286,6 +286,21 @@ enum rt_class_t { /* Routing message attributes */ +enum ip_tunnel_t { + IP_TUN_UNSPEC, + IP_TUN_ID, + IP_TUN_DST, + IP_TUN_SRC, + IP_TUN_TTL, + IP_TUN_TOS, + IP_TUN_SPORT, + IP_TUN_DPORT, + IP_TUN_FLAGS, + __IP_TUN_MAX, +}; + +#define IP_TUN_MAX (__IP_TUN_MAX - 1) + enum rtattr_type_t { RTA_UNSPEC, RTA_DST, diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 6a51a71a6c67..025b76e803fd 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -190,3 +190,117 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, return tot; } EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); + +static const struct nla_policy ip_tun_policy[IP_TUN_MAX + 1] = { + [IP_TUN_ID] = { .type = NLA_U64 }, + [IP_TUN_DST] = { .type = NLA_U32 }, + [IP_TUN_SRC] = { .type = NLA_U32 }, + [IP_TUN_TTL] = { .type = NLA_U8 }, + [IP_TUN_TOS] = { .type = NLA_U8 }, + [IP_TUN_SPORT] = { .type = NLA_U16 }, + [IP_TUN_DPORT] = { .type = NLA_U16 }, + [IP_TUN_FLAGS] = { .type = NLA_U16 }, +}; + +static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, + struct lwtunnel_state **ts) +{ + struct ip_tunnel_info *tun_info; + struct lwtunnel_state *new_state; + struct nlattr *tb[IP_TUN_MAX + 1]; + int err; + + err = nla_parse_nested(tb, IP_TUN_MAX, attr, ip_tun_policy); + if (err < 0) + return err; + + new_state = lwtunnel_state_alloc(sizeof(*tun_info)); + if (!new_state) + return -ENOMEM; + + new_state->type = LWTUNNEL_ENCAP_IP; + + tun_info = lwt_tun_info(new_state); + + if (tb[IP_TUN_ID]) + tun_info->key.tun_id = nla_get_u64(tb[IP_TUN_ID]); + + if (tb[IP_TUN_DST]) + tun_info->key.ipv4_dst = nla_get_be32(tb[IP_TUN_DST]); + + if (tb[IP_TUN_SRC]) + tun_info->key.ipv4_src = nla_get_be32(tb[IP_TUN_SRC]); + + if (tb[IP_TUN_TTL]) + tun_info->key.ipv4_ttl = nla_get_u8(tb[IP_TUN_TTL]); + + if (tb[IP_TUN_TOS]) + tun_info->key.ipv4_tos = nla_get_u8(tb[IP_TUN_TOS]); + + if (tb[IP_TUN_SPORT]) + tun_info->key.tp_src = nla_get_be16(tb[IP_TUN_SPORT]); + + if (tb[IP_TUN_DPORT]) + tun_info->key.tp_dst = nla_get_be16(tb[IP_TUN_DPORT]); + + if (tb[IP_TUN_FLAGS]) + tun_info->key.tun_flags = nla_get_u16(tb[IP_TUN_FLAGS]); + + tun_info->mode = IP_TUNNEL_INFO_TX; + tun_info->options = NULL; + tun_info->options_len = 0; + + *ts = new_state; + + return 0; +} + +static int ip_tun_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); + + if (nla_put_u64(skb, IP_TUN_ID, tun_info->key.tun_id) || + nla_put_be32(skb, IP_TUN_DST, tun_info->key.ipv4_dst) || + nla_put_be32(skb, IP_TUN_SRC, tun_info->key.ipv4_src) || + nla_put_u8(skb, IP_TUN_TOS, tun_info->key.ipv4_tos) || + nla_put_u8(skb, IP_TUN_TTL, tun_info->key.ipv4_ttl) || + nla_put_u16(skb, IP_TUN_SPORT, tun_info->key.tp_src) || + nla_put_u16(skb, IP_TUN_DPORT, tun_info->key.tp_dst) || + nla_put_u16(skb, IP_TUN_FLAGS, tun_info->key.tun_flags)) + return -ENOMEM; + + return 0; +} + +static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + return nla_total_size(8) /* IP_TUN_ID */ + + nla_total_size(4) /* IP_TUN_DST */ + + nla_total_size(4) /* IP_TUN_SRC */ + + nla_total_size(1) /* IP_TUN_TOS */ + + nla_total_size(1) /* IP_TUN_TTL */ + + nla_total_size(2) /* IP_TUN_SPORT */ + + nla_total_size(2) /* IP_TUN_DPORT */ + + nla_total_size(2); /* IP_TUN_FLAGS */ +} + +static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { + .build_state = ip_tun_build_state, + .fill_encap = ip_tun_fill_encap_info, + .get_encap_size = ip_tun_encap_nlsize, +}; + +static int __init ip_tunnel_core_init(void) +{ + lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); + + return 0; +} +module_init(ip_tunnel_core_init); + +static void __exit ip_tunnel_core_exit(void) +{ + lwtunnel_encap_del_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); +} +module_exit(ip_tunnel_core_exit); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 91da18be0a71..519ec232818d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1693,7 +1693,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, by fib_lookup. */ - tun_info = skb_tunnel_info(skb); + tun_info = skb_tunnel_info(skb, AF_INET); if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX) fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id; else diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 4750fb673a9f..75d68248ba69 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -27,6 +27,7 @@ #include #include #include +#include #include "datapath.h" From e7030878fc8448492b6e5cecd574043f63271298 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:44:01 +0200 Subject: [PATCH 17/22] fib: Add fib rule match on tunnel id This add the ability to select a routing table based on the tunnel id which allows to maintain separate routing tables for each virtual tunnel network. ip rule add from all tunnel-id 100 lookup 100 ip rule add from all tunnel-id 200 lookup 200 A new static key controls the collection of metadata at tunnel level upon demand. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 3 ++- include/net/fib_rules.h | 1 + include/net/ip_tunnels.h | 11 +++++++++++ include/uapi/linux/fib_rules.h | 2 +- net/core/fib_rules.c | 24 ++++++++++++++++++++++-- net/ipv4/ip_tunnel_core.c | 16 ++++++++++++++++ 6 files changed, 53 insertions(+), 4 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 9486d7ec128c..2587ac84f71a 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -143,7 +143,8 @@ static struct workqueue_struct *vxlan_wq; static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) { - return vs->flags & VXLAN_F_COLLECT_METADATA; + return vs->flags & VXLAN_F_COLLECT_METADATA || + ip_tunnel_collect_metadata(); } #if IS_ENABLED(CONFIG_IPV6) diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 903a55efbffe..4e8f804f4589 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -19,6 +19,7 @@ struct fib_rule { u8 action; /* 3 bytes hole, try to use */ u32 target; + __be64 tun_id; struct fib_rule __rcu *ctarget; struct net *fr_net; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 0b7e18cfa0b4..0a5a7763eec2 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -303,6 +303,17 @@ static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstat return (struct ip_tunnel_info *)lwtstate->data; } +extern struct static_key ip_tunnel_metadata_cnt; + +/* Returns > 0 if metadata should be collected */ +static inline int ip_tunnel_collect_metadata(void) +{ + return static_key_false(&ip_tunnel_metadata_cnt); +} + +void ip_tunnel_need_metadata(void); +void ip_tunnel_unneed_metadata(void); + #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */ diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 2b82d7e30974..96161b8202b5 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -43,7 +43,7 @@ enum { FRA_UNUSED5, FRA_FWMARK, /* mark */ FRA_FLOW, /* flow/class id */ - FRA_UNUSED6, + FRA_TUN_ID, FRA_SUPPRESS_IFGROUP, FRA_SUPPRESS_PREFIXLEN, FRA_TABLE, /* Extended table id */ diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 9a12668f7d62..ae8306e7c56f 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -16,6 +16,7 @@ #include #include #include +#include int fib_default_rule_add(struct fib_rules_ops *ops, u32 pref, u32 table, u32 flags) @@ -186,6 +187,9 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) goto out; + if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id)) + goto out; + ret = ops->match(rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; @@ -330,6 +334,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) if (tb[FRA_FWMASK]) rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); + if (tb[FRA_TUN_ID]) + rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); + rule->action = frh->action; rule->flags = frh->flags; rule->table = frh_get_table(frh, tb); @@ -407,6 +414,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) if (unresolved) ops->unresolved_rules++; + if (rule->tun_id) + ip_tunnel_need_metadata(); + notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); flush_route_cache(ops); rules_ops_put(ops); @@ -473,6 +483,10 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK]))) continue; + if (tb[FRA_TUN_ID] && + (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID]))) + continue; + if (!ops->compare(rule, frh, tb)) continue; @@ -487,6 +501,9 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) goto errout; } + if (rule->tun_id) + ip_tunnel_unneed_metadata(); + list_del_rcu(&rule->list); if (rule->action == FR_ACT_GOTO) { @@ -535,7 +552,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */ + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ - + nla_total_size(4); /* FRA_FWMASK */ + + nla_total_size(4) /* FRA_FWMASK */ + + nla_total_size(8); /* FRA_TUN_ID */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -591,7 +609,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, ((rule->mark_mask || rule->mark) && nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) || (rule->target && - nla_put_u32(skb, FRA_GOTO, rule->target))) + nla_put_u32(skb, FRA_GOTO, rule->target)) || + (rule->tun_id && + nla_put_be64(skb, FRA_TUN_ID, rule->tun_id))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 025b76e803fd..630e6d5712e8 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -304,3 +305,18 @@ static void __exit ip_tunnel_core_exit(void) lwtunnel_encap_del_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); } module_exit(ip_tunnel_core_exit); + +struct static_key ip_tunnel_metadata_cnt = STATIC_KEY_INIT_FALSE; +EXPORT_SYMBOL(ip_tunnel_metadata_cnt); + +void ip_tunnel_need_metadata(void) +{ + static_key_slow_inc(&ip_tunnel_metadata_cnt); +} +EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata); + +void ip_tunnel_unneed_metadata(void) +{ + static_key_slow_dec(&ip_tunnel_metadata_cnt); +} +EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata); From 0dfbdf4102b9303d3ddf2177c0220098ff99f6de Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:44:02 +0200 Subject: [PATCH 18/22] vxlan: Factor out device configuration This factors out the device configuration out of the RTNL newlink API which allows for in-kernel creation of VXLAN net_devices. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 376 +++++++++++++++++++++++--------------------- include/net/vxlan.h | 59 +++++++ 2 files changed, 258 insertions(+), 177 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 2587ac84f71a..30e1f215af73 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -55,10 +55,6 @@ #define PORT_HASH_BITS 8 #define PORT_HASH_SIZE (1<remote_ip)) goto nla_put_failure; - if (rdst->remote_port && rdst->remote_port != vxlan->dst_port && + if (rdst->remote_port && rdst->remote_port != vxlan->cfg.dst_port && nla_put_be16(skb, NDA_PORT, rdst->remote_port)) goto nla_put_failure; if (rdst->remote_vni != vxlan->default_dst.remote_vni && @@ -756,7 +713,8 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, if (!(flags & NLM_F_CREATE)) return -ENOENT; - if (vxlan->addrmax && vxlan->addrcnt >= vxlan->addrmax) + if (vxlan->cfg.addrmax && + vxlan->addrcnt >= vxlan->cfg.addrmax) return -ENOSPC; /* Disallow replace to add a multicast entry */ @@ -842,7 +800,7 @@ static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, return -EINVAL; *port = nla_get_be16(tb[NDA_PORT]); } else { - *port = vxlan->dst_port; + *port = vxlan->cfg.dst_port; } if (tb[NDA_VNI]) { @@ -1028,7 +986,7 @@ static bool vxlan_snoop(struct net_device *dev, vxlan_fdb_create(vxlan, src_mac, src_ip, NUD_REACHABLE, NLM_F_EXCL|NLM_F_CREATE, - vxlan->dst_port, + vxlan->cfg.dst_port, vxlan->default_dst.remote_vni, 0, NTF_SELF); spin_unlock(&vxlan->hash_lock); @@ -1957,7 +1915,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, info = skb_tunnel_info(skb, AF_INET); if (rdst) { - dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port; + dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port; vni = rdst->remote_vni; dst = &rdst->remote_ip; } else { @@ -1967,7 +1925,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, goto drop; } - dst_port = info->key.tp_dst ? : vxlan->dst_port; + dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port; vni = be64_to_cpu(info->key.tun_id); remote_ip.sin.sin_family = AF_INET; remote_ip.sin.sin_addr.s_addr = info->key.ipv4_dst; @@ -1985,16 +1943,16 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, old_iph = ip_hdr(skb); - ttl = vxlan->ttl; + ttl = vxlan->cfg.ttl; if (!ttl && vxlan_addr_multicast(dst)) ttl = 1; - tos = vxlan->tos; + tos = vxlan->cfg.tos; if (tos == 1) tos = ip_tunnel_get_dsfield(old_iph, skb); - src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->port_min, - vxlan->port_max, true); + src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, + vxlan->cfg.port_max, true); if (dst->sa.sa_family == AF_INET) { if (info) { @@ -2020,7 +1978,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, fl4.flowi4_mark = skb->mark; fl4.flowi4_proto = IPPROTO_UDP; fl4.daddr = dst->sin.sin_addr.s_addr; - fl4.saddr = vxlan->saddr.sin.sin_addr.s_addr; + fl4.saddr = vxlan->cfg.saddr.sin.sin_addr.s_addr; rt = ip_route_output_key(vxlan->net, &fl4); if (IS_ERR(rt)) { @@ -2076,7 +2034,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = rdst ? rdst->remote_ifindex : 0; fl6.daddr = dst->sin6.sin6_addr; - fl6.saddr = vxlan->saddr.sin6.sin6_addr; + fl6.saddr = vxlan->cfg.saddr.sin6.sin6_addr; fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = IPPROTO_UDP; @@ -2247,7 +2205,7 @@ static void vxlan_cleanup(unsigned long arg) if (f->state & NUD_PERMANENT) continue; - timeout = f->used + vxlan->age_interval * HZ; + timeout = f->used + vxlan->cfg.age_interval * HZ; if (time_before_eq(timeout, jiffies)) { netdev_dbg(vxlan->dev, "garbage collect %pM\n", @@ -2311,8 +2269,8 @@ static int vxlan_open(struct net_device *dev) struct vxlan_sock *vs; int ret = 0; - vs = vxlan_sock_add(vxlan->net, vxlan->dst_port, vxlan_rcv, NULL, - false, vxlan->flags); + vs = vxlan_sock_add(vxlan->net, vxlan->cfg.dst_port, vxlan_rcv, + NULL, vxlan->cfg.no_share, vxlan->flags); if (IS_ERR(vs)) return PTR_ERR(vs); @@ -2326,7 +2284,7 @@ static int vxlan_open(struct net_device *dev) } } - if (vxlan->age_interval) + if (vxlan->cfg.age_interval) mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL); return ret; @@ -2484,7 +2442,7 @@ static void vxlan_setup(struct net_device *dev) vxlan->age_timer.function = vxlan_cleanup; vxlan->age_timer.data = (unsigned long) vxlan; - vxlan->dst_port = htons(vxlan_port); + vxlan->cfg.dst_port = htons(vxlan_port); vxlan->dev = dev; @@ -2684,54 +2642,35 @@ struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, } EXPORT_SYMBOL_GPL(vxlan_sock_add); -static int vxlan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) +static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, + struct vxlan_config *conf) { struct vxlan_net *vn = net_generic(src_net, vxlan_net_id); struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *dst = &vxlan->default_dst; - __u32 vni; int err; bool use_ipv6 = false; - - if (!data[IFLA_VXLAN_ID]) - return -EINVAL; + __be16 default_port = vxlan->cfg.dst_port; vxlan->net = src_net; - vni = nla_get_u32(data[IFLA_VXLAN_ID]); - dst->remote_vni = vni; + dst->remote_vni = conf->vni; + + memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip)); /* Unless IPv6 is explicitly requested, assume IPv4 */ - dst->remote_ip.sa.sa_family = AF_INET; - if (data[IFLA_VXLAN_GROUP]) { - dst->remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]); - } else if (data[IFLA_VXLAN_GROUP6]) { - if (!IS_ENABLED(CONFIG_IPV6)) - return -EPFNOSUPPORT; + if (!dst->remote_ip.sa.sa_family) + dst->remote_ip.sa.sa_family = AF_INET; - dst->remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]); - dst->remote_ip.sa.sa_family = AF_INET6; + if (dst->remote_ip.sa.sa_family == AF_INET6 || + vxlan->cfg.saddr.sa.sa_family == AF_INET6) use_ipv6 = true; - } - if (data[IFLA_VXLAN_LOCAL]) { - vxlan->saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]); - vxlan->saddr.sa.sa_family = AF_INET; - } else if (data[IFLA_VXLAN_LOCAL6]) { - if (!IS_ENABLED(CONFIG_IPV6)) - return -EPFNOSUPPORT; - - /* TODO: respect scope id */ - vxlan->saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]); - vxlan->saddr.sa.sa_family = AF_INET6; - use_ipv6 = true; - } - - if (data[IFLA_VXLAN_LINK] && - (dst->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]))) { + if (conf->remote_ifindex) { struct net_device *lowerdev - = __dev_get_by_index(src_net, dst->remote_ifindex); + = __dev_get_by_index(src_net, conf->remote_ifindex); + + dst->remote_ifindex = conf->remote_ifindex; if (!lowerdev) { pr_info("ifindex %d does not exist\n", dst->remote_ifindex); @@ -2749,7 +2688,7 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev, } #endif - if (!tb[IFLA_MTU]) + if (!conf->mtu) dev->mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); dev->needed_headroom = lowerdev->hard_header_len + @@ -2757,79 +2696,17 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev, } else if (use_ipv6) vxlan->flags |= VXLAN_F_IPV6; - if (data[IFLA_VXLAN_TOS]) - vxlan->tos = nla_get_u8(data[IFLA_VXLAN_TOS]); + memcpy(&vxlan->cfg, conf, sizeof(*conf)); + if (!vxlan->cfg.dst_port) + vxlan->cfg.dst_port = default_port; + vxlan->flags |= conf->flags; - if (data[IFLA_VXLAN_TTL]) - vxlan->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]); + if (!vxlan->cfg.age_interval) + vxlan->cfg.age_interval = FDB_AGE_DEFAULT; - if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING])) - vxlan->flags |= VXLAN_F_LEARN; - - if (data[IFLA_VXLAN_AGEING]) - vxlan->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]); - else - vxlan->age_interval = FDB_AGE_DEFAULT; - - if (data[IFLA_VXLAN_PROXY] && nla_get_u8(data[IFLA_VXLAN_PROXY])) - vxlan->flags |= VXLAN_F_PROXY; - - if (data[IFLA_VXLAN_RSC] && nla_get_u8(data[IFLA_VXLAN_RSC])) - vxlan->flags |= VXLAN_F_RSC; - - if (data[IFLA_VXLAN_L2MISS] && nla_get_u8(data[IFLA_VXLAN_L2MISS])) - vxlan->flags |= VXLAN_F_L2MISS; - - if (data[IFLA_VXLAN_L3MISS] && nla_get_u8(data[IFLA_VXLAN_L3MISS])) - vxlan->flags |= VXLAN_F_L3MISS; - - if (data[IFLA_VXLAN_LIMIT]) - vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]); - - if (data[IFLA_VXLAN_FLOWBASED] && - nla_get_u8(data[IFLA_VXLAN_FLOWBASED])) - vxlan->flags |= VXLAN_F_FLOW_BASED; - - if (data[IFLA_VXLAN_PORT_RANGE]) { - const struct ifla_vxlan_port_range *p - = nla_data(data[IFLA_VXLAN_PORT_RANGE]); - vxlan->port_min = ntohs(p->low); - vxlan->port_max = ntohs(p->high); - } - - if (data[IFLA_VXLAN_PORT]) - vxlan->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]); - - if (data[IFLA_VXLAN_UDP_CSUM] && nla_get_u8(data[IFLA_VXLAN_UDP_CSUM])) - vxlan->flags |= VXLAN_F_UDP_CSUM; - - if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX] && - nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX])) - vxlan->flags |= VXLAN_F_UDP_ZERO_CSUM6_TX; - - if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX] && - nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX])) - vxlan->flags |= VXLAN_F_UDP_ZERO_CSUM6_RX; - - if (data[IFLA_VXLAN_REMCSUM_TX] && - nla_get_u8(data[IFLA_VXLAN_REMCSUM_TX])) - vxlan->flags |= VXLAN_F_REMCSUM_TX; - - if (data[IFLA_VXLAN_REMCSUM_RX] && - nla_get_u8(data[IFLA_VXLAN_REMCSUM_RX])) - vxlan->flags |= VXLAN_F_REMCSUM_RX; - - if (data[IFLA_VXLAN_GBP]) - vxlan->flags |= VXLAN_F_GBP; - - if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) - vxlan->flags |= VXLAN_F_REMCSUM_NOPARTIAL; - - if (vxlan_find_vni(src_net, vni, use_ipv6 ? AF_INET6 : AF_INET, - vxlan->dst_port, vxlan->flags)) { - pr_info("duplicate VNI %u\n", vni); + if (vxlan_find_vni(src_net, conf->vni, use_ipv6 ? AF_INET6 : AF_INET, + vxlan->cfg.dst_port, vxlan->flags)) return -EEXIST; - } dev->ethtool_ops = &vxlan_ethtool_ops; @@ -2839,7 +2716,7 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev, &vxlan->default_dst.remote_ip, NUD_REACHABLE|NUD_PERMANENT, NLM_F_EXCL|NLM_F_CREATE, - vxlan->dst_port, + vxlan->cfg.dst_port, vxlan->default_dst.remote_vni, vxlan->default_dst.remote_ifindex, NTF_SELF); @@ -2858,6 +2735,151 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev, return 0; } +struct net_device *vxlan_dev_create(struct net *net, const char *name, + u8 name_assign_type, struct vxlan_config *conf) +{ + struct nlattr *tb[IFLA_MAX+1]; + struct net_device *dev; + int err; + + memset(&tb, 0, sizeof(tb)); + + dev = rtnl_create_link(net, name, name_assign_type, + &vxlan_link_ops, tb); + if (IS_ERR(dev)) + return dev; + + err = vxlan_dev_configure(net, dev, conf); + if (err < 0) { + free_netdev(dev); + return ERR_PTR(err); + } + + return dev; +} +EXPORT_SYMBOL_GPL(vxlan_dev_create); + +static int vxlan_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct vxlan_config conf; + int err; + + if (!data[IFLA_VXLAN_ID]) + return -EINVAL; + + memset(&conf, 0, sizeof(conf)); + conf.vni = nla_get_u32(data[IFLA_VXLAN_ID]); + + if (data[IFLA_VXLAN_GROUP]) { + conf.remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]); + } else if (data[IFLA_VXLAN_GROUP6]) { + if (!IS_ENABLED(CONFIG_IPV6)) + return -EPFNOSUPPORT; + + conf.remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]); + conf.remote_ip.sa.sa_family = AF_INET6; + } + + if (data[IFLA_VXLAN_LOCAL]) { + conf.saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]); + conf.saddr.sa.sa_family = AF_INET; + } else if (data[IFLA_VXLAN_LOCAL6]) { + if (!IS_ENABLED(CONFIG_IPV6)) + return -EPFNOSUPPORT; + + /* TODO: respect scope id */ + conf.saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]); + conf.saddr.sa.sa_family = AF_INET6; + } + + if (data[IFLA_VXLAN_LINK]) + conf.remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]); + + if (data[IFLA_VXLAN_TOS]) + conf.tos = nla_get_u8(data[IFLA_VXLAN_TOS]); + + if (data[IFLA_VXLAN_TTL]) + conf.ttl = nla_get_u8(data[IFLA_VXLAN_TTL]); + + if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING])) + conf.flags |= VXLAN_F_LEARN; + + if (data[IFLA_VXLAN_AGEING]) + conf.age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]); + + if (data[IFLA_VXLAN_PROXY] && nla_get_u8(data[IFLA_VXLAN_PROXY])) + conf.flags |= VXLAN_F_PROXY; + + if (data[IFLA_VXLAN_RSC] && nla_get_u8(data[IFLA_VXLAN_RSC])) + conf.flags |= VXLAN_F_RSC; + + if (data[IFLA_VXLAN_L2MISS] && nla_get_u8(data[IFLA_VXLAN_L2MISS])) + conf.flags |= VXLAN_F_L2MISS; + + if (data[IFLA_VXLAN_L3MISS] && nla_get_u8(data[IFLA_VXLAN_L3MISS])) + conf.flags |= VXLAN_F_L3MISS; + + if (data[IFLA_VXLAN_LIMIT]) + conf.addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]); + + if (data[IFLA_VXLAN_FLOWBASED] && + nla_get_u8(data[IFLA_VXLAN_FLOWBASED])) + conf.flags |= VXLAN_F_FLOW_BASED; + + if (data[IFLA_VXLAN_PORT_RANGE]) { + const struct ifla_vxlan_port_range *p + = nla_data(data[IFLA_VXLAN_PORT_RANGE]); + conf.port_min = ntohs(p->low); + conf.port_max = ntohs(p->high); + } + + if (data[IFLA_VXLAN_PORT]) + conf.dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]); + + if (data[IFLA_VXLAN_UDP_CSUM] && nla_get_u8(data[IFLA_VXLAN_UDP_CSUM])) + conf.flags |= VXLAN_F_UDP_CSUM; + + if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX] && + nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX])) + conf.flags |= VXLAN_F_UDP_ZERO_CSUM6_TX; + + if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX] && + nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX])) + conf.flags |= VXLAN_F_UDP_ZERO_CSUM6_RX; + + if (data[IFLA_VXLAN_REMCSUM_TX] && + nla_get_u8(data[IFLA_VXLAN_REMCSUM_TX])) + conf.flags |= VXLAN_F_REMCSUM_TX; + + if (data[IFLA_VXLAN_REMCSUM_RX] && + nla_get_u8(data[IFLA_VXLAN_REMCSUM_RX])) + conf.flags |= VXLAN_F_REMCSUM_RX; + + if (data[IFLA_VXLAN_GBP]) + conf.flags |= VXLAN_F_GBP; + + if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) + conf.flags |= VXLAN_F_REMCSUM_NOPARTIAL; + + err = vxlan_dev_configure(src_net, dev, &conf); + switch (err) { + case -ENODEV: + pr_info("ifindex %d does not exist\n", conf.remote_ifindex); + break; + + case -EPERM: + pr_info("IPv6 is disabled via sysctl\n"); + break; + + case -EEXIST: + pr_info("duplicate VNI %u\n", conf.vni); + break; + } + + return err; +} + static void vxlan_dellink(struct net_device *dev, struct list_head *head) { struct vxlan_dev *vxlan = netdev_priv(dev); @@ -2904,8 +2926,8 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) const struct vxlan_dev *vxlan = netdev_priv(dev); const struct vxlan_rdst *dst = &vxlan->default_dst; struct ifla_vxlan_port_range ports = { - .low = htons(vxlan->port_min), - .high = htons(vxlan->port_max), + .low = htons(vxlan->cfg.port_min), + .high = htons(vxlan->cfg.port_max), }; if (nla_put_u32(skb, IFLA_VXLAN_ID, dst->remote_vni)) @@ -2928,22 +2950,22 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex)) goto nla_put_failure; - if (!vxlan_addr_any(&vxlan->saddr)) { - if (vxlan->saddr.sa.sa_family == AF_INET) { + if (!vxlan_addr_any(&vxlan->cfg.saddr)) { + if (vxlan->cfg.saddr.sa.sa_family == AF_INET) { if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL, - vxlan->saddr.sin.sin_addr.s_addr)) + vxlan->cfg.saddr.sin.sin_addr.s_addr)) goto nla_put_failure; #if IS_ENABLED(CONFIG_IPV6) } else { if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6, - &vxlan->saddr.sin6.sin6_addr)) + &vxlan->cfg.saddr.sin6.sin6_addr)) goto nla_put_failure; #endif } } - if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) || - nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) || + if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) || + nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) || nla_put_u8(skb, IFLA_VXLAN_LEARNING, !!(vxlan->flags & VXLAN_F_LEARN)) || nla_put_u8(skb, IFLA_VXLAN_PROXY, @@ -2955,9 +2977,9 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) !!(vxlan->flags & VXLAN_F_L3MISS)) || nla_put_u8(skb, IFLA_VXLAN_FLOWBASED, !!(vxlan->flags & VXLAN_F_FLOW_BASED)) || - nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) || - nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax) || - nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->dst_port) || + nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) || + nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) || + nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) || nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM, !!(vxlan->flags & VXLAN_F_UDP_CSUM)) || nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 80a2da29e088..19535f85eb2c 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -95,6 +95,11 @@ struct vxlanhdr { #define VXLAN_VNI_MASK (VXLAN_VID_MASK << 8) #define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) +#define VNI_HASH_BITS 10 +#define VNI_HASH_SIZE (1< Date: Tue, 21 Jul 2015 10:44:03 +0200 Subject: [PATCH 19/22] openvswitch: Make tunnel set action attach a metadata dst Utilize the new metadata dst to attach encapsulation instructions to the skb. The existing egress_tun_info via the OVS_CB() is left in place until all tunnel vports have been converted to the new method. Signed-off-by: Thomas Graf Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/actions.c | 10 ++++- net/openvswitch/datapath.c | 8 ++-- net/openvswitch/flow.h | 5 +++ net/openvswitch/flow_netlink.c | 70 +++++++++++++++++++++++++++++----- net/openvswitch/flow_netlink.h | 1 + net/openvswitch/flow_table.c | 4 +- 6 files changed, 82 insertions(+), 16 deletions(-) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 27c1687cfd92..cf04c2f8b32a 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -733,7 +733,15 @@ static int execute_set_action(struct sk_buff *skb, { /* Only tunnel set execution is supported without a mask. */ if (nla_type(a) == OVS_KEY_ATTR_TUNNEL_INFO) { - OVS_CB(skb)->egress_tun_info = nla_data(a); + struct ovs_tunnel_info *tun = nla_data(a); + + skb_dst_drop(skb); + dst_hold((struct dst_entry *)tun->tun_dst); + skb_dst_set(skb, (struct dst_entry *)tun->tun_dst); + + /* FIXME: Remove when all vports have been converted */ + OVS_CB(skb)->egress_tun_info = &tun->tun_dst->u.tun_info; + return 0; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index ff8c4a4c1609..02082107c74c 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1018,7 +1018,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } ovs_unlock(); - ovs_nla_free_flow_actions(old_acts); + ovs_nla_free_flow_actions_rcu(old_acts); ovs_flow_free(new_flow, false); } @@ -1030,7 +1030,7 @@ err_unlock_ovs: ovs_unlock(); kfree_skb(reply); err_kfree_acts: - kfree(acts); + ovs_nla_free_flow_actions(acts); err_kfree_flow: ovs_flow_free(new_flow, false); error: @@ -1157,7 +1157,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) if (reply) ovs_notify(&dp_flow_genl_family, reply, info); if (old_acts) - ovs_nla_free_flow_actions(old_acts); + ovs_nla_free_flow_actions_rcu(old_acts); return 0; @@ -1165,7 +1165,7 @@ err_unlock_ovs: ovs_unlock(); kfree_skb(reply); err_kfree_acts: - kfree(acts); + ovs_nla_free_flow_actions(acts); error: return error; } diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index cadc6c5c3545..b62cdb3e3589 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -33,6 +33,7 @@ #include #include #include +#include struct sk_buff; @@ -45,6 +46,10 @@ struct sk_buff; #define TUN_METADATA_OPTS(flow_key, opt_len) \ ((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len))) +struct ovs_tunnel_info { + struct metadata_dst *tun_dst; +}; + #define OVS_SW_FLOW_KEY_METADATA_SIZE \ (offsetof(struct sw_flow_key, recirc_id) + \ FIELD_SIZEOF(struct sw_flow_key, recirc_id)) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index ecfa530d3461..e7906dfb8814 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1548,11 +1548,48 @@ static struct sw_flow_actions *nla_alloc_flow_actions(int size, bool log) return sfa; } -/* Schedules 'sf_acts' to be freed after the next RCU grace period. - * The caller must hold rcu_read_lock for this to be sensible. */ +static void ovs_nla_free_set_action(const struct nlattr *a) +{ + const struct nlattr *ovs_key = nla_data(a); + struct ovs_tunnel_info *ovs_tun; + + switch (nla_type(ovs_key)) { + case OVS_KEY_ATTR_TUNNEL_INFO: + ovs_tun = nla_data(ovs_key); + dst_release((struct dst_entry *)ovs_tun->tun_dst); + break; + } +} + void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) { - kfree_rcu(sf_acts, rcu); + const struct nlattr *a; + int rem; + + if (!sf_acts) + return; + + nla_for_each_attr(a, sf_acts->actions, sf_acts->actions_len, rem) { + switch (nla_type(a)) { + case OVS_ACTION_ATTR_SET: + ovs_nla_free_set_action(a); + break; + } + } + + kfree(sf_acts); +} + +static void __ovs_nla_free_flow_actions(struct rcu_head *head) +{ + ovs_nla_free_flow_actions(container_of(head, struct sw_flow_actions, rcu)); +} + +/* Schedules 'sf_acts' to be freed after the next RCU grace period. + * The caller must hold rcu_read_lock for this to be sensible. */ +void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *sf_acts) +{ + call_rcu(&sf_acts->rcu, __ovs_nla_free_flow_actions); } static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, @@ -1746,7 +1783,9 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, { struct sw_flow_match match; struct sw_flow_key key; + struct metadata_dst *tun_dst; struct ip_tunnel_info *tun_info; + struct ovs_tunnel_info *ovs_tun; struct nlattr *a; int err = 0, start, opts_type; @@ -1771,12 +1810,22 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (start < 0) return start; - a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, - sizeof(*tun_info) + key.tun_opts_len, log); - if (IS_ERR(a)) - return PTR_ERR(a); + tun_dst = metadata_dst_alloc(key.tun_opts_len, GFP_KERNEL); + if (!tun_dst) + return -ENOMEM; - tun_info = nla_data(a); + a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, + sizeof(*ovs_tun), log); + if (IS_ERR(a)) { + dst_release((struct dst_entry *)tun_dst); + return PTR_ERR(a); + } + + ovs_tun = nla_data(a); + ovs_tun->tun_dst = tun_dst; + + tun_info = &tun_dst->u.tun_info; + tun_info->mode = IP_TUNNEL_INFO_TX; tun_info->key = key.tun_key; tun_info->options_len = key.tun_opts_len; @@ -2177,7 +2226,7 @@ int ovs_nla_copy_actions(const struct nlattr *attr, err = __ovs_nla_copy_actions(attr, key, 0, sfa, key->eth.type, key->eth.tci, log); if (err) - kfree(*sfa); + ovs_nla_free_flow_actions(*sfa); return err; } @@ -2227,7 +2276,8 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) switch (key_type) { case OVS_KEY_ATTR_TUNNEL_INFO: { - struct ip_tunnel_info *tun_info = nla_data(ovs_key); + struct ovs_tunnel_info *ovs_tun = nla_data(ovs_key); + struct ip_tunnel_info *tun_info = &ovs_tun->tun_dst->u.tun_info; start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); if (!start) diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index ec53eb6e632b..acd074408f0a 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -69,5 +69,6 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb); void ovs_nla_free_flow_actions(struct sw_flow_actions *); +void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *); #endif /* flow_netlink.h */ diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 4613df8c8290..b70d845e7efb 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -18,6 +18,7 @@ #include "flow.h" #include "datapath.h" +#include "flow_netlink.h" #include #include #include @@ -143,7 +144,8 @@ static void flow_free(struct sw_flow *flow) if (ovs_identifier_is_key(&flow->id)) kfree(flow->id.unmasked_key); - kfree((struct sw_flow_actions __force *)flow->sf_acts); + if (flow->sf_acts) + ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts); for_each_node(node) if (flow->stats[node]) kmem_cache_free(flow_stats_cache, From be4ace6e6b1bc12e18b25fe764917e09a1f96d7b Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:44:04 +0200 Subject: [PATCH 20/22] openvswitch: Move dev pointer into vport itself This is the first step in representing all OVS vports as regular struct net_devices. Move the net_device pointer into the vport structure itself to get rid of struct vport_netdev. Signed-off-by: Thomas Graf Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 7 +-- net/openvswitch/dp_notify.c | 5 +- net/openvswitch/vport-internal_dev.c | 37 +++++------- net/openvswitch/vport-netdev.c | 86 +++++++++++++--------------- net/openvswitch/vport-netdev.h | 12 ---- net/openvswitch/vport.h | 3 +- 6 files changed, 59 insertions(+), 91 deletions(-) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 02082107c74c..19df28ee5094 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -188,7 +188,7 @@ static int get_dpifindex(const struct datapath *dp) local = ovs_vport_rcu(dp, OVSP_LOCAL); if (local) - ifindex = netdev_vport_priv(local)->dev->ifindex; + ifindex = local->dev->ifindex; else ifindex = 0; @@ -2219,13 +2219,10 @@ static void __net_exit list_vports_from_net(struct net *net, struct net *dnet, struct vport *vport; hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) { - struct netdev_vport *netdev_vport; - if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL) continue; - netdev_vport = netdev_vport_priv(vport); - if (dev_net(netdev_vport->dev) == dnet) + if (dev_net(vport->dev) == dnet) list_add(&vport->detach_list, head); } } diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c index 2c631fe76be1..a7a80a6b77b0 100644 --- a/net/openvswitch/dp_notify.c +++ b/net/openvswitch/dp_notify.c @@ -58,13 +58,10 @@ void ovs_dp_notify_wq(struct work_struct *work) struct hlist_node *n; hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) { - struct netdev_vport *netdev_vport; - if (vport->ops->type != OVS_VPORT_TYPE_NETDEV) continue; - netdev_vport = netdev_vport_priv(vport); - if (!(netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH)) + if (!(vport->dev->priv_flags & IFF_OVS_DATAPATH)) dp_detach_port_notify(vport); } } diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 6a55f7105505..a2c205d9a8d5 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -156,49 +156,44 @@ static void do_setup(struct net_device *netdev) static struct vport *internal_dev_create(const struct vport_parms *parms) { struct vport *vport; - struct netdev_vport *netdev_vport; struct internal_dev *internal_dev; int err; - vport = ovs_vport_alloc(sizeof(struct netdev_vport), - &ovs_internal_vport_ops, parms); + vport = ovs_vport_alloc(0, &ovs_internal_vport_ops, parms); if (IS_ERR(vport)) { err = PTR_ERR(vport); goto error; } - netdev_vport = netdev_vport_priv(vport); - - netdev_vport->dev = alloc_netdev(sizeof(struct internal_dev), - parms->name, NET_NAME_UNKNOWN, - do_setup); - if (!netdev_vport->dev) { + vport->dev = alloc_netdev(sizeof(struct internal_dev), + parms->name, NET_NAME_UNKNOWN, do_setup); + if (!vport->dev) { err = -ENOMEM; goto error_free_vport; } - dev_net_set(netdev_vport->dev, ovs_dp_get_net(vport->dp)); - internal_dev = internal_dev_priv(netdev_vport->dev); + dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); + internal_dev = internal_dev_priv(vport->dev); internal_dev->vport = vport; /* Restrict bridge port to current netns. */ if (vport->port_no == OVSP_LOCAL) - netdev_vport->dev->features |= NETIF_F_NETNS_LOCAL; + vport->dev->features |= NETIF_F_NETNS_LOCAL; rtnl_lock(); - err = register_netdevice(netdev_vport->dev); + err = register_netdevice(vport->dev); if (err) goto error_free_netdev; - dev_set_promiscuity(netdev_vport->dev, 1); + dev_set_promiscuity(vport->dev, 1); rtnl_unlock(); - netif_start_queue(netdev_vport->dev); + netif_start_queue(vport->dev); return vport; error_free_netdev: rtnl_unlock(); - free_netdev(netdev_vport->dev); + free_netdev(vport->dev); error_free_vport: ovs_vport_free(vport); error: @@ -207,21 +202,19 @@ error: static void internal_dev_destroy(struct vport *vport) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - - netif_stop_queue(netdev_vport->dev); + netif_stop_queue(vport->dev); rtnl_lock(); - dev_set_promiscuity(netdev_vport->dev, -1); + dev_set_promiscuity(vport->dev, -1); /* unregister_netdevice() waits for an RCU grace period. */ - unregister_netdevice(netdev_vport->dev); + unregister_netdevice(vport->dev); rtnl_unlock(); } static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) { - struct net_device *netdev = netdev_vport_priv(vport)->dev; + struct net_device *netdev = vport->dev; int len; if (unlikely(!(netdev->flags & IFF_UP))) { diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 33e6d6e2908f..1c9696693f66 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -83,104 +83,97 @@ static struct net_device *get_dpdev(const struct datapath *dp) local = ovs_vport_ovsl(dp, OVSP_LOCAL); BUG_ON(!local); - return netdev_vport_priv(local)->dev; + return local->dev; } -static struct vport *netdev_create(const struct vport_parms *parms) +static struct vport *netdev_link(struct vport *vport, const char *name) { - struct vport *vport; - struct netdev_vport *netdev_vport; int err; - vport = ovs_vport_alloc(sizeof(struct netdev_vport), - &ovs_netdev_vport_ops, parms); - if (IS_ERR(vport)) { - err = PTR_ERR(vport); - goto error; - } - - netdev_vport = netdev_vport_priv(vport); - - netdev_vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name); - if (!netdev_vport->dev) { + vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name); + if (!vport->dev) { err = -ENODEV; goto error_free_vport; } - if (netdev_vport->dev->flags & IFF_LOOPBACK || - netdev_vport->dev->type != ARPHRD_ETHER || - ovs_is_internal_dev(netdev_vport->dev)) { + if (vport->dev->flags & IFF_LOOPBACK || + vport->dev->type != ARPHRD_ETHER || + ovs_is_internal_dev(vport->dev)) { err = -EINVAL; goto error_put; } rtnl_lock(); - err = netdev_master_upper_dev_link(netdev_vport->dev, + err = netdev_master_upper_dev_link(vport->dev, get_dpdev(vport->dp)); if (err) goto error_unlock; - err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook, + err = netdev_rx_handler_register(vport->dev, netdev_frame_hook, vport); if (err) goto error_master_upper_dev_unlink; - dev_disable_lro(netdev_vport->dev); - dev_set_promiscuity(netdev_vport->dev, 1); - netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH; + dev_disable_lro(vport->dev); + dev_set_promiscuity(vport->dev, 1); + vport->dev->priv_flags |= IFF_OVS_DATAPATH; rtnl_unlock(); return vport; error_master_upper_dev_unlink: - netdev_upper_dev_unlink(netdev_vport->dev, get_dpdev(vport->dp)); + netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp)); error_unlock: rtnl_unlock(); error_put: - dev_put(netdev_vport->dev); + dev_put(vport->dev); error_free_vport: ovs_vport_free(vport); -error: return ERR_PTR(err); } +static struct vport *netdev_create(const struct vport_parms *parms) +{ + struct vport *vport; + + vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + return netdev_link(vport, parms->name); +} + static void free_port_rcu(struct rcu_head *rcu) { - struct netdev_vport *netdev_vport = container_of(rcu, - struct netdev_vport, rcu); + struct vport *vport = container_of(rcu, struct vport, rcu); - dev_put(netdev_vport->dev); - ovs_vport_free(vport_from_priv(netdev_vport)); + dev_put(vport->dev); + ovs_vport_free(vport); } void ovs_netdev_detach_dev(struct vport *vport) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - ASSERT_RTNL(); - netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; - netdev_rx_handler_unregister(netdev_vport->dev); - netdev_upper_dev_unlink(netdev_vport->dev, - netdev_master_upper_dev_get(netdev_vport->dev)); - dev_set_promiscuity(netdev_vport->dev, -1); + vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; + netdev_rx_handler_unregister(vport->dev); + netdev_upper_dev_unlink(vport->dev, + netdev_master_upper_dev_get(vport->dev)); + dev_set_promiscuity(vport->dev, -1); } static void netdev_destroy(struct vport *vport) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - rtnl_lock(); - if (netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH) + if (vport->dev->priv_flags & IFF_OVS_DATAPATH) ovs_netdev_detach_dev(vport); rtnl_unlock(); - call_rcu(&netdev_vport->rcu, free_port_rcu); + call_rcu(&vport->rcu, free_port_rcu); } const char *ovs_netdev_get_name(const struct vport *vport) { - const struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - return netdev_vport->dev->name; + return vport->dev->name; } static unsigned int packet_length(const struct sk_buff *skb) @@ -195,18 +188,17 @@ static unsigned int packet_length(const struct sk_buff *skb) static int netdev_send(struct vport *vport, struct sk_buff *skb) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - int mtu = netdev_vport->dev->mtu; + int mtu = vport->dev->mtu; int len; if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n", - netdev_vport->dev->name, + vport->dev->name, packet_length(skb), mtu); goto drop; } - skb->dev = netdev_vport->dev; + skb->dev = vport->dev; len = skb->len; dev_queue_xmit(skb); diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h index 6f7038e79c52..1c52aed255c5 100644 --- a/net/openvswitch/vport-netdev.h +++ b/net/openvswitch/vport-netdev.h @@ -26,18 +26,6 @@ struct vport *ovs_netdev_get_vport(struct net_device *dev); -struct netdev_vport { - struct rcu_head rcu; - - struct net_device *dev; -}; - -static inline struct netdev_vport * -netdev_vport_priv(const struct vport *vport) -{ - return vport_priv(vport); -} - const char *ovs_netdev_get_name(const struct vport *); void ovs_netdev_detach_dev(struct vport *); diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 75d68248ba69..e05ec68439d1 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -107,7 +107,7 @@ struct vport_portids { * @detach_list: list used for detaching vport in net-exit call. */ struct vport { - struct rcu_head rcu; + struct net_device *dev; struct datapath *dp; struct vport_portids __rcu *upcall_portids; u16 port_no; @@ -120,6 +120,7 @@ struct vport { struct vport_err_stats err_stats; struct list_head detach_list; + struct rcu_head rcu; }; /** From c9db965c524ea27451e60d5ddcd242f6c33a70fd Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:44:05 +0200 Subject: [PATCH 21/22] openvswitch: Abstract vport name through ovs_vport_name() This allows to get rid of the get_name() vport ops later on. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 4 ++-- net/openvswitch/vport-internal_dev.c | 1 - net/openvswitch/vport-netdev.c | 6 ------ net/openvswitch/vport-netdev.h | 1 - net/openvswitch/vport.c | 4 ++-- net/openvswitch/vport.h | 5 +++++ 6 files changed, 9 insertions(+), 12 deletions(-) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 19df28ee5094..ffe984f5b95c 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -176,7 +176,7 @@ static inline struct datapath *get_dp(struct net *net, int dp_ifindex) const char *ovs_dp_name(const struct datapath *dp) { struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL); - return vport->ops->get_name(vport); + return ovs_vport_name(vport); } static int get_dpifindex(const struct datapath *dp) @@ -1800,7 +1800,7 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) || nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) || nla_put_string(skb, OVS_VPORT_ATTR_NAME, - vport->ops->get_name(vport))) + ovs_vport_name(vport))) goto nla_put_failure; ovs_vport_get_stats(vport, &vport_stats); diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index a2c205d9a8d5..c058bbf876c3 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -242,7 +242,6 @@ static struct vport_ops ovs_internal_vport_ops = { .type = OVS_VPORT_TYPE_INTERNAL, .create = internal_dev_create, .destroy = internal_dev_destroy, - .get_name = ovs_netdev_get_name, .send = internal_dev_recv, }; diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 1c9696693f66..e682bdc34a5c 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -171,11 +171,6 @@ static void netdev_destroy(struct vport *vport) call_rcu(&vport->rcu, free_port_rcu); } -const char *ovs_netdev_get_name(const struct vport *vport) -{ - return vport->dev->name; -} - static unsigned int packet_length(const struct sk_buff *skb) { unsigned int length = skb->len - ETH_HLEN; @@ -223,7 +218,6 @@ static struct vport_ops ovs_netdev_vport_ops = { .type = OVS_VPORT_TYPE_NETDEV, .create = netdev_create, .destroy = netdev_destroy, - .get_name = ovs_netdev_get_name, .send = netdev_send, }; diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h index 1c52aed255c5..684fb88723a4 100644 --- a/net/openvswitch/vport-netdev.h +++ b/net/openvswitch/vport-netdev.h @@ -26,7 +26,6 @@ struct vport *ovs_netdev_get_vport(struct net_device *dev); -const char *ovs_netdev_get_name(const struct vport *); void ovs_netdev_detach_dev(struct vport *); int __init ovs_netdev_init(void); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index af23ba077836..d14f59403c5e 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -113,7 +113,7 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name) struct vport *vport; hlist_for_each_entry_rcu(vport, bucket, hash_node) - if (!strcmp(name, vport->ops->get_name(vport)) && + if (!strcmp(name, ovs_vport_name(vport)) && net_eq(ovs_dp_get_net(vport->dp), net)) return vport; @@ -226,7 +226,7 @@ struct vport *ovs_vport_add(const struct vport_parms *parms) } bucket = hash_bucket(ovs_dp_get_net(vport->dp), - vport->ops->get_name(vport)); + ovs_vport_name(vport)); hlist_add_head_rcu(&vport->hash_node, bucket); return vport; } diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index e05ec68439d1..1a689c28b5a6 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -237,6 +237,11 @@ static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, skb->csum = csum_add(skb->csum, csum_partial(start, len, 0)); } +static inline const char *ovs_vport_name(struct vport *vport) +{ + return vport->dev ? vport->dev->name : vport->ops->get_name(vport); +} + int ovs_vport_ops_register(struct vport_ops *ops); void ovs_vport_ops_unregister(struct vport_ops *ops); From 614732eaa12dd462c0ab274700bed14f36afea5e Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:44:06 +0200 Subject: [PATCH 22/22] openvswitch: Use regular VXLAN net_device device This gets rid of all OVS specific VXLAN code in the receive and transmit path by using a VXLAN net_device to represent the vport. Only a small shim layer remains which takes care of handling the VXLAN specific OVS Netlink configuration. Unexports vxlan_sock_add(), vxlan_sock_release(), vxlan_xmit_skb() since they are no longer needed. Signed-off-by: Thomas Graf Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 242 ++++++++++++------------- include/net/rtnetlink.h | 1 + include/net/vxlan.h | 24 +-- net/core/rtnetlink.c | 26 ++- net/openvswitch/Kconfig | 12 -- net/openvswitch/Makefile | 1 - net/openvswitch/flow_netlink.c | 6 +- net/openvswitch/vport-netdev.c | 201 +++++++++++++++++++- net/openvswitch/vport-vxlan.c | 322 --------------------------------- net/openvswitch/vport-vxlan.h | 11 -- 10 files changed, 339 insertions(+), 507 deletions(-) delete mode 100644 net/openvswitch/vport-vxlan.c delete mode 100644 net/openvswitch/vport-vxlan.h diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 30e1f215af73..e9feefb41f0b 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -75,6 +75,9 @@ static struct rtnl_link_ops vxlan_link_ops; static const u8 all_zeros_mac[ETH_ALEN]; +static struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, + bool no_share, u32 flags); + /* per-network namespace private data for this module */ struct vxlan_net { struct list_head vxlan_list; @@ -1027,7 +1030,7 @@ static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev) return false; } -void vxlan_sock_release(struct vxlan_sock *vs) +static void vxlan_sock_release(struct vxlan_sock *vs) { struct sock *sk = vs->sock->sk; struct net *net = sock_net(sk); @@ -1043,7 +1046,6 @@ void vxlan_sock_release(struct vxlan_sock *vs) queue_work(vxlan_wq, &vs->del_work); } -EXPORT_SYMBOL_GPL(vxlan_sock_release); /* Update multicast group membership when first VNI on * multicast address is brought up @@ -1126,6 +1128,102 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, return vh; } +static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, + struct vxlan_metadata *md, u32 vni, + struct metadata_dst *tun_dst) +{ + struct iphdr *oip = NULL; + struct ipv6hdr *oip6 = NULL; + struct vxlan_dev *vxlan; + struct pcpu_sw_netstats *stats; + union vxlan_addr saddr; + int err = 0; + union vxlan_addr *remote_ip; + + /* For flow based devices, map all packets to VNI 0 */ + if (vs->flags & VXLAN_F_FLOW_BASED) + vni = 0; + + /* Is this VNI defined? */ + vxlan = vxlan_vs_find_vni(vs, vni); + if (!vxlan) + goto drop; + + remote_ip = &vxlan->default_dst.remote_ip; + skb_reset_mac_header(skb); + skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev))); + skb->protocol = eth_type_trans(skb, vxlan->dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + + /* Ignore packet loops (and multicast echo) */ + if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr)) + goto drop; + + /* Re-examine inner Ethernet packet */ + if (remote_ip->sa.sa_family == AF_INET) { + oip = ip_hdr(skb); + saddr.sin.sin_addr.s_addr = oip->saddr; + saddr.sa.sa_family = AF_INET; +#if IS_ENABLED(CONFIG_IPV6) + } else { + oip6 = ipv6_hdr(skb); + saddr.sin6.sin6_addr = oip6->saddr; + saddr.sa.sa_family = AF_INET6; +#endif + } + + if (tun_dst) { + skb_dst_set(skb, (struct dst_entry *)tun_dst); + tun_dst = NULL; + } + + if ((vxlan->flags & VXLAN_F_LEARN) && + vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source)) + goto drop; + + skb_reset_network_header(skb); + /* In flow-based mode, GBP is carried in dst_metadata */ + if (!(vs->flags & VXLAN_F_FLOW_BASED)) + skb->mark = md->gbp; + + if (oip6) + err = IP6_ECN_decapsulate(oip6, skb); + if (oip) + err = IP_ECN_decapsulate(oip, skb); + + if (unlikely(err)) { + if (log_ecn_error) { + if (oip6) + net_info_ratelimited("non-ECT from %pI6\n", + &oip6->saddr); + if (oip) + net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", + &oip->saddr, oip->tos); + } + if (err > 1) { + ++vxlan->dev->stats.rx_frame_errors; + ++vxlan->dev->stats.rx_errors; + goto drop; + } + } + + stats = this_cpu_ptr(vxlan->dev->tstats); + u64_stats_update_begin(&stats->syncp); + stats->rx_packets++; + stats->rx_bytes += skb->len; + u64_stats_update_end(&stats->syncp); + + netif_rx(skb); + + return; +drop: + if (tun_dst) + dst_release((struct dst_entry *)tun_dst); + + /* Consume bad packet */ + kfree_skb(skb); +} + /* Callback from net/ipv4/udp.c to receive packets */ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { @@ -1192,7 +1290,6 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) info->key.tun_flags |= TUNNEL_CSUM; md = ip_tunnel_info_opts(info, sizeof(*md)); - md->tun_dst = tun_dst; } else { memset(md, 0, sizeof(*md)); } @@ -1231,8 +1328,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) goto bad_flags; } - md->vni = vxh->vx_vni; - vs->rcv(vs, skb, md); + vxlan_rcv(vs, skb, md, vni >> 8, tun_dst); return 0; drop: @@ -1252,104 +1348,6 @@ error: return 1; } -static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, - struct vxlan_metadata *md) -{ - struct iphdr *oip = NULL; - struct ipv6hdr *oip6 = NULL; - struct vxlan_dev *vxlan; - struct pcpu_sw_netstats *stats; - union vxlan_addr saddr; - __u32 vni; - int err = 0; - union vxlan_addr *remote_ip; - - /* For flow based devices, map all packets to VNI 0 */ - if (vs->flags & VXLAN_F_FLOW_BASED) - vni = 0; - else - vni = ntohl(md->vni) >> 8; - - /* Is this VNI defined? */ - vxlan = vxlan_vs_find_vni(vs, vni); - if (!vxlan) - goto drop; - - remote_ip = &vxlan->default_dst.remote_ip; - skb_reset_mac_header(skb); - skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev))); - skb->protocol = eth_type_trans(skb, vxlan->dev); - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - - /* Ignore packet loops (and multicast echo) */ - if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr)) - goto drop; - - /* Re-examine inner Ethernet packet */ - if (remote_ip->sa.sa_family == AF_INET) { - oip = ip_hdr(skb); - saddr.sin.sin_addr.s_addr = oip->saddr; - saddr.sa.sa_family = AF_INET; -#if IS_ENABLED(CONFIG_IPV6) - } else { - oip6 = ipv6_hdr(skb); - saddr.sin6.sin6_addr = oip6->saddr; - saddr.sa.sa_family = AF_INET6; -#endif - } - - if (md->tun_dst) { - skb_dst_set(skb, (struct dst_entry *)md->tun_dst); - md->tun_dst = NULL; - } - - if ((vxlan->flags & VXLAN_F_LEARN) && - vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source)) - goto drop; - - skb_reset_network_header(skb); - /* In flow-based mode, GBP is carried in dst_metadata */ - if (!(vs->flags & VXLAN_F_FLOW_BASED)) - skb->mark = md->gbp; - - if (oip6) - err = IP6_ECN_decapsulate(oip6, skb); - if (oip) - err = IP_ECN_decapsulate(oip, skb); - - if (unlikely(err)) { - if (log_ecn_error) { - if (oip6) - net_info_ratelimited("non-ECT from %pI6\n", - &oip6->saddr); - if (oip) - net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", - &oip->saddr, oip->tos); - } - if (err > 1) { - ++vxlan->dev->stats.rx_frame_errors; - ++vxlan->dev->stats.rx_errors; - goto drop; - } - } - - stats = this_cpu_ptr(vxlan->dev->tstats); - u64_stats_update_begin(&stats->syncp); - stats->rx_packets++; - stats->rx_bytes += skb->len; - u64_stats_update_end(&stats->syncp); - - netif_rx(skb); - - return; -drop: - if (md->tun_dst) - dst_release((struct dst_entry *)md->tun_dst); - - /* Consume bad packet */ - kfree_skb(skb); -} - static int arp_reduce(struct net_device *dev, struct sk_buff *skb) { struct vxlan_dev *vxlan = netdev_priv(dev); @@ -1688,7 +1686,7 @@ static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, struct net_device *dev, struct in6_addr *saddr, struct in6_addr *daddr, __u8 prio, __u8 ttl, - __be16 src_port, __be16 dst_port, + __be16 src_port, __be16 dst_port, __u32 vni, struct vxlan_metadata *md, bool xnet, u32 vxflags) { struct vxlanhdr *vxh; @@ -1738,7 +1736,7 @@ static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk, vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); vxh->vx_flags = htonl(VXLAN_HF_VNI); - vxh->vx_vni = md->vni; + vxh->vx_vni = vni; if (type & SKB_GSO_TUNNEL_REMCSUM) { u32 data = (skb_checksum_start_offset(skb) - hdrlen) >> @@ -1771,10 +1769,10 @@ err: } #endif -int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, - __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, - __be16 src_port, __be16 dst_port, - struct vxlan_metadata *md, bool xnet, u32 vxflags) +static int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, + __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, + __be16 src_port, __be16 dst_port, __u32 vni, + struct vxlan_metadata *md, bool xnet, u32 vxflags) { struct vxlanhdr *vxh; int min_headroom; @@ -1817,7 +1815,7 @@ int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); vxh->vx_flags = htonl(VXLAN_HF_VNI); - vxh->vx_vni = md->vni; + vxh->vx_vni = vni; if (type & SKB_GSO_TUNNEL_REMCSUM) { u32 data = (skb_checksum_start_offset(skb) - hdrlen) >> @@ -1844,7 +1842,6 @@ int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, ttl, df, src_port, dst_port, xnet, !(vxflags & VXLAN_F_UDP_CSUM)); } -EXPORT_SYMBOL_GPL(vxlan_xmit_skb); /* Bypass encapsulation if the destination is local */ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, @@ -2012,10 +2009,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); - md->vni = htonl(vni << 8); err = vxlan_xmit_skb(rt, sk, skb, fl4.saddr, dst->sin.sin_addr.s_addr, tos, ttl, df, - src_port, dst_port, md, + src_port, dst_port, htonl(vni << 8), md, !net_eq(vxlan->net, dev_net(vxlan->dev)), flags); if (err < 0) { @@ -2070,11 +2066,10 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, } ttl = ttl ? : ip6_dst_hoplimit(ndst); - md->vni = htonl(vni << 8); md->gbp = skb->mark; err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr, - 0, ttl, src_port, dst_port, md, + 0, ttl, src_port, dst_port, htonl(vni << 8), md, !net_eq(vxlan->net, dev_net(vxlan->dev)), vxlan->flags); #endif @@ -2269,8 +2264,8 @@ static int vxlan_open(struct net_device *dev) struct vxlan_sock *vs; int ret = 0; - vs = vxlan_sock_add(vxlan->net, vxlan->cfg.dst_port, vxlan_rcv, - NULL, vxlan->cfg.no_share, vxlan->flags); + vs = vxlan_sock_add(vxlan->net, vxlan->cfg.dst_port, + vxlan->cfg.no_share, vxlan->flags); if (IS_ERR(vs)) return PTR_ERR(vs); @@ -2563,7 +2558,6 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6, /* Create new listen socket if needed */ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, - vxlan_rcv_t *rcv, void *data, u32 flags) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); @@ -2592,8 +2586,6 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, vs->sock = sock; atomic_set(&vs->refcnt, 1); - vs->rcv = rcv; - vs->data = data; vs->flags = (flags & VXLAN_F_RCV_FLAGS); /* Initialize the vxlan udp offloads structure */ @@ -2617,9 +2609,8 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, return vs; } -struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, - vxlan_rcv_t *rcv, void *data, - bool no_share, u32 flags) +static struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, + bool no_share, u32 flags) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_sock *vs; @@ -2629,7 +2620,7 @@ struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, spin_lock(&vn->sock_lock); vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port, flags); - if (vs && vs->rcv == rcv) { + if (vs) { if (!atomic_add_unless(&vs->refcnt, 1, 0)) vs = ERR_PTR(-EBUSY); spin_unlock(&vn->sock_lock); @@ -2638,9 +2629,8 @@ struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, spin_unlock(&vn->sock_lock); } - return vxlan_socket_create(net, port, rcv, data, flags); + return vxlan_socket_create(net, port, flags); } -EXPORT_SYMBOL_GPL(vxlan_sock_add); static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, struct vxlan_config *conf) diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 343d922d15c2..18fdb98185ab 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -141,6 +141,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, unsigned char name_assign_type, const struct rtnl_link_ops *ops, struct nlattr *tb[]); +int rtnl_delete_link(struct net_device *dev); int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm); int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len); diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 19535f85eb2c..eb8d721cdb67 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -101,22 +101,12 @@ struct vxlanhdr { #define FDB_HASH_SIZE (1<vn_sock->sock->sk)->inet_sport; +} static inline netdev_features_t vxlan_features_check(struct sk_buff *skb, netdev_features_t features) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 03d61b54aac0..5fb4af20c6dd 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1960,16 +1960,30 @@ static int rtnl_group_dellink(const struct net *net, int group) return 0; } +int rtnl_delete_link(struct net_device *dev) +{ + const struct rtnl_link_ops *ops; + LIST_HEAD(list_kill); + + ops = dev->rtnl_link_ops; + if (!ops || !ops->dellink) + return -EOPNOTSUPP; + + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + + return 0; +} +EXPORT_SYMBOL_GPL(rtnl_delete_link); + static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); - const struct rtnl_link_ops *ops; struct net_device *dev; struct ifinfomsg *ifm; char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; int err; - LIST_HEAD(list_kill); err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); if (err < 0) @@ -1991,13 +2005,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) if (!dev) return -ENODEV; - ops = dev->rtnl_link_ops; - if (!ops || !ops->dellink) - return -EOPNOTSUPP; - - ops->dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - return 0; + return rtnl_delete_link(dev); } int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 15840401a2ce..1119f46b80b4 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -44,18 +44,6 @@ config OPENVSWITCH_GRE If unsure, say Y. -config OPENVSWITCH_VXLAN - tristate "Open vSwitch VXLAN tunneling support" - depends on OPENVSWITCH - depends on VXLAN - default OPENVSWITCH - ---help--- - If you say Y here, then the Open vSwitch will be able create vxlan vport. - - Say N to exclude this support and reduce the binary size. - - If unsure, say Y. - config OPENVSWITCH_GENEVE tristate "Open vSwitch Geneve tunneling support" depends on OPENVSWITCH diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 91b9478413ef..38e0e149c55e 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -16,5 +16,4 @@ openvswitch-y := \ vport-netdev.o obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o -obj-$(CONFIG_OPENVSWITCH_VXLAN) += vport-vxlan.o obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index e7906dfb8814..a6eb77ab1a64 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -47,9 +47,9 @@ #include #include #include +#include #include "flow_netlink.h" -#include "vport-vxlan.h" struct ovs_len_tbl { int len; @@ -475,7 +475,7 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *a, { struct nlattr *tb[OVS_VXLAN_EXT_MAX+1]; unsigned long opt_key_offset; - struct ovs_vxlan_opts opts; + struct vxlan_metadata opts; int err; BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); @@ -626,7 +626,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, static int vxlan_opt_to_nlattr(struct sk_buff *skb, const void *tun_opts, int swkey_tun_opts_len) { - const struct ovs_vxlan_opts *opts = tun_opts; + const struct vxlan_metadata *opts = tun_opts; struct nlattr *nla; nla = nla_nest_start(skb, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS); diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index e682bdc34a5c..68d0582fc001 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -27,9 +27,13 @@ #include #include -#include +#include +#include +#include +#include #include "datapath.h" +#include "vport.h" #include "vport-internal_dev.h" #include "vport-netdev.h" @@ -147,7 +151,8 @@ static void free_port_rcu(struct rcu_head *rcu) { struct vport *vport = container_of(rcu, struct vport, rcu); - dev_put(vport->dev); + if (vport->dev) + dev_put(vport->dev); ovs_vport_free(vport); } @@ -221,12 +226,202 @@ static struct vport_ops ovs_netdev_vport_ops = { .send = netdev_send, }; +/* Compat code for old userspace. */ +#if IS_ENABLED(CONFIG_VXLAN) +static struct vport_ops ovs_vxlan_netdev_vport_ops; + +static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) +{ + struct vxlan_dev *vxlan = netdev_priv(vport->dev); + __be16 dst_port = vxlan->cfg.dst_port; + + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port))) + return -EMSGSIZE; + + if (vxlan->flags & VXLAN_F_GBP) { + struct nlattr *exts; + + exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION); + if (!exts) + return -EMSGSIZE; + + if (vxlan->flags & VXLAN_F_GBP && + nla_put_flag(skb, OVS_VXLAN_EXT_GBP)) + return -EMSGSIZE; + + nla_nest_end(skb, exts); + } + + return 0; +} + +static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX + 1] = { + [OVS_VXLAN_EXT_GBP] = { .type = NLA_FLAG, }, +}; + +static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr, + struct vxlan_config *conf) +{ + struct nlattr *exts[OVS_VXLAN_EXT_MAX + 1]; + int err; + + if (nla_len(attr) < sizeof(struct nlattr)) + return -EINVAL; + + err = nla_parse_nested(exts, OVS_VXLAN_EXT_MAX, attr, exts_policy); + if (err < 0) + return err; + + if (exts[OVS_VXLAN_EXT_GBP]) + conf->flags |= VXLAN_F_GBP; + + return 0; +} + +static struct vport *vxlan_tnl_create(const struct vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct nlattr *options = parms->options; + struct net_device *dev; + struct vport *vport; + struct nlattr *a; + int err; + struct vxlan_config conf = { + .no_share = true, + .flags = VXLAN_F_FLOW_BASED | VXLAN_F_COLLECT_METADATA, + }; + + if (!options) { + err = -EINVAL; + goto error; + } + + a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); + if (a && nla_len(a) == sizeof(u16)) { + conf.dst_port = htons(nla_get_u16(a)); + } else { + /* Require destination port from userspace. */ + err = -EINVAL; + goto error; + } + + vport = ovs_vport_alloc(0, &ovs_vxlan_netdev_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION); + if (a) { + err = vxlan_configure_exts(vport, a, &conf); + if (err) { + ovs_vport_free(vport); + goto error; + } + } + + rtnl_lock(); + dev = vxlan_dev_create(net, parms->name, NET_NAME_USER, &conf); + if (IS_ERR(dev)) { + rtnl_unlock(); + ovs_vport_free(vport); + return ERR_CAST(dev); + } + + dev_change_flags(dev, dev->flags | IFF_UP); + rtnl_unlock(); + return vport; +error: + return ERR_PTR(err); +} + +static struct vport *vxlan_create(const struct vport_parms *parms) +{ + struct vport *vport; + + vport = vxlan_tnl_create(parms); + if (IS_ERR(vport)) + return vport; + + return netdev_link(vport, parms->name); +} + +static void vxlan_destroy(struct vport *vport) +{ + rtnl_lock(); + if (vport->dev->priv_flags & IFF_OVS_DATAPATH) + ovs_netdev_detach_dev(vport); + + /* Early release so we can unregister the device */ + dev_put(vport->dev); + rtnl_delete_link(vport->dev); + vport->dev = NULL; + rtnl_unlock(); + + call_rcu(&vport->rcu, free_port_rcu); +} + +static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct ip_tunnel_info *egress_tun_info) +{ + struct vxlan_dev *vxlan = netdev_priv(vport->dev); + struct net *net = ovs_dp_get_net(vport->dp); + __be16 dst_port = vxlan_dev_dst_port(vxlan); + __be16 src_port; + int port_min; + int port_max; + + inet_get_local_port_range(net, &port_min, &port_max); + src_port = udp_flow_src_port(net, skb, 0, 0, true); + + return ovs_tunnel_get_egress_info(egress_tun_info, net, + OVS_CB(skb)->egress_tun_info, + IPPROTO_UDP, skb->mark, + src_port, dst_port); +} + +static struct vport_ops ovs_vxlan_netdev_vport_ops = { + .type = OVS_VPORT_TYPE_VXLAN, + .create = vxlan_create, + .destroy = vxlan_destroy, + .get_options = vxlan_get_options, + .send = netdev_send, + .get_egress_tun_info = vxlan_get_egress_tun_info, +}; + +static int vxlan_compat_init(void) +{ + return ovs_vport_ops_register(&ovs_vxlan_netdev_vport_ops); +} + +static void vxlan_compat_exit(void) +{ + ovs_vport_ops_unregister(&ovs_vxlan_netdev_vport_ops); +} +#else +static int vxlan_compat_init(void) +{ + return 0; +} + +static void vxlan_compat_exit(void) +{ +} +#endif + int __init ovs_netdev_init(void) { - return ovs_vport_ops_register(&ovs_netdev_vport_ops); + int err; + + err = ovs_vport_ops_register(&ovs_netdev_vport_ops); + if (err) + return err; + err = vxlan_compat_init(); + if (err) + vxlan_compat_exit(); + return err; } void ovs_netdev_exit(void) { ovs_vport_ops_unregister(&ovs_netdev_vport_ops); + vxlan_compat_exit(); } diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c deleted file mode 100644 index 6f7986fabb70..000000000000 --- a/net/openvswitch/vport-vxlan.c +++ /dev/null @@ -1,322 +0,0 @@ -/* - * Copyright (c) 2014 Nicira, Inc. - * Copyright (c) 2013 Cisco Systems, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "datapath.h" -#include "vport.h" -#include "vport-vxlan.h" - -/** - * struct vxlan_port - Keeps track of open UDP ports - * @vs: vxlan_sock created for the port. - * @name: vport name. - */ -struct vxlan_port { - struct vxlan_sock *vs; - char name[IFNAMSIZ]; - u32 exts; /* VXLAN_F_* in */ -}; - -static struct vport_ops ovs_vxlan_vport_ops; - -static inline struct vxlan_port *vxlan_vport(const struct vport *vport) -{ - return vport_priv(vport); -} - -/* Called with rcu_read_lock and BH disabled. */ -static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, - struct vxlan_metadata *md) -{ - struct ip_tunnel_info tun_info; - struct vxlan_port *vxlan_port; - struct vport *vport = vs->data; - struct iphdr *iph; - struct ovs_vxlan_opts opts = { - .gbp = md->gbp, - }; - __be64 key; - __be16 flags; - - flags = TUNNEL_KEY | (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0); - vxlan_port = vxlan_vport(vport); - if (vxlan_port->exts & VXLAN_F_GBP && md->gbp) - flags |= TUNNEL_VXLAN_OPT; - - /* Save outer tunnel values */ - iph = ip_hdr(skb); - key = cpu_to_be64(ntohl(md->vni) >> 8); - ip_tunnel_info_init(&tun_info, iph, - udp_hdr(skb)->source, udp_hdr(skb)->dest, - key, flags, &opts, sizeof(opts)); - - ovs_vport_receive(vport, skb, &tun_info); -} - -static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) -{ - struct vxlan_port *vxlan_port = vxlan_vport(vport); - __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; - - if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port))) - return -EMSGSIZE; - - if (vxlan_port->exts) { - struct nlattr *exts; - - exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION); - if (!exts) - return -EMSGSIZE; - - if (vxlan_port->exts & VXLAN_F_GBP && - nla_put_flag(skb, OVS_VXLAN_EXT_GBP)) - return -EMSGSIZE; - - nla_nest_end(skb, exts); - } - - return 0; -} - -static void vxlan_tnl_destroy(struct vport *vport) -{ - struct vxlan_port *vxlan_port = vxlan_vport(vport); - - vxlan_sock_release(vxlan_port->vs); - - ovs_vport_deferred_free(vport); -} - -static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX+1] = { - [OVS_VXLAN_EXT_GBP] = { .type = NLA_FLAG, }, -}; - -static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr) -{ - struct nlattr *exts[OVS_VXLAN_EXT_MAX+1]; - struct vxlan_port *vxlan_port; - int err; - - if (nla_len(attr) < sizeof(struct nlattr)) - return -EINVAL; - - err = nla_parse_nested(exts, OVS_VXLAN_EXT_MAX, attr, exts_policy); - if (err < 0) - return err; - - vxlan_port = vxlan_vport(vport); - - if (exts[OVS_VXLAN_EXT_GBP]) - vxlan_port->exts |= VXLAN_F_GBP; - - return 0; -} - -static struct vport *vxlan_tnl_create(const struct vport_parms *parms) -{ - struct net *net = ovs_dp_get_net(parms->dp); - struct nlattr *options = parms->options; - struct vxlan_port *vxlan_port; - struct vxlan_sock *vs; - struct vport *vport; - struct nlattr *a; - u16 dst_port; - int err; - - if (!options) { - err = -EINVAL; - goto error; - } - a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); - if (a && nla_len(a) == sizeof(u16)) { - dst_port = nla_get_u16(a); - } else { - /* Require destination port from userspace. */ - err = -EINVAL; - goto error; - } - - vport = ovs_vport_alloc(sizeof(struct vxlan_port), - &ovs_vxlan_vport_ops, parms); - if (IS_ERR(vport)) - return vport; - - vxlan_port = vxlan_vport(vport); - strncpy(vxlan_port->name, parms->name, IFNAMSIZ); - - a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION); - if (a) { - err = vxlan_configure_exts(vport, a); - if (err) { - ovs_vport_free(vport); - goto error; - } - } - - vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, - vxlan_port->exts); - if (IS_ERR(vs)) { - ovs_vport_free(vport); - return (void *)vs; - } - vxlan_port->vs = vs; - - return vport; - -error: - return ERR_PTR(err); -} - -static int vxlan_ext_gbp(struct sk_buff *skb) -{ - const struct ip_tunnel_info *tun_info; - const struct ovs_vxlan_opts *opts; - - tun_info = OVS_CB(skb)->egress_tun_info; - opts = tun_info->options; - - if (tun_info->key.tun_flags & TUNNEL_VXLAN_OPT && - tun_info->options_len >= sizeof(*opts)) - return opts->gbp; - else - return 0; -} - -static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) -{ - struct net *net = ovs_dp_get_net(vport->dp); - struct vxlan_port *vxlan_port = vxlan_vport(vport); - struct sock *sk = vxlan_port->vs->sock->sk; - __be16 dst_port = inet_sk(sk)->inet_sport; - const struct ip_tunnel_key *tun_key; - struct vxlan_metadata md = {0}; - struct rtable *rt; - struct flowi4 fl; - __be16 src_port; - __be16 df; - int err; - u32 vxflags; - - if (unlikely(!OVS_CB(skb)->egress_tun_info)) { - err = -EINVAL; - goto error; - } - - tun_key = &OVS_CB(skb)->egress_tun_info->key; - rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto error; - } - - df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? - htons(IP_DF) : 0; - - skb->ignore_df = 1; - - src_port = udp_flow_src_port(net, skb, 0, 0, true); - md.vni = htonl(be64_to_cpu(tun_key->tun_id) << 8); - md.gbp = vxlan_ext_gbp(skb); - vxflags = vxlan_port->exts | - (tun_key->tun_flags & TUNNEL_CSUM ? VXLAN_F_UDP_CSUM : 0); - - err = vxlan_xmit_skb(rt, sk, skb, fl.saddr, tun_key->ipv4_dst, - tun_key->ipv4_tos, tun_key->ipv4_ttl, df, - src_port, dst_port, - &md, false, vxflags); - if (err < 0) - ip_rt_put(rt); - return err; -error: - kfree_skb(skb); - return err; -} - -static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ip_tunnel_info *egress_tun_info) -{ - struct net *net = ovs_dp_get_net(vport->dp); - struct vxlan_port *vxlan_port = vxlan_vport(vport); - __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; - __be16 src_port; - int port_min; - int port_max; - - inet_get_local_port_range(net, &port_min, &port_max); - src_port = udp_flow_src_port(net, skb, 0, 0, true); - - return ovs_tunnel_get_egress_info(egress_tun_info, net, - OVS_CB(skb)->egress_tun_info, - IPPROTO_UDP, skb->mark, - src_port, dst_port); -} - -static const char *vxlan_get_name(const struct vport *vport) -{ - struct vxlan_port *vxlan_port = vxlan_vport(vport); - return vxlan_port->name; -} - -static struct vport_ops ovs_vxlan_vport_ops = { - .type = OVS_VPORT_TYPE_VXLAN, - .create = vxlan_tnl_create, - .destroy = vxlan_tnl_destroy, - .get_name = vxlan_get_name, - .get_options = vxlan_get_options, - .send = vxlan_tnl_send, - .get_egress_tun_info = vxlan_get_egress_tun_info, - .owner = THIS_MODULE, -}; - -static int __init ovs_vxlan_tnl_init(void) -{ - return ovs_vport_ops_register(&ovs_vxlan_vport_ops); -} - -static void __exit ovs_vxlan_tnl_exit(void) -{ - ovs_vport_ops_unregister(&ovs_vxlan_vport_ops); -} - -module_init(ovs_vxlan_tnl_init); -module_exit(ovs_vxlan_tnl_exit); - -MODULE_DESCRIPTION("OVS: VXLAN switching port"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("vport-type-4"); diff --git a/net/openvswitch/vport-vxlan.h b/net/openvswitch/vport-vxlan.h deleted file mode 100644 index 4b08233e73d5..000000000000 --- a/net/openvswitch/vport-vxlan.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef VPORT_VXLAN_H -#define VPORT_VXLAN_H 1 - -#include -#include - -struct ovs_vxlan_opts { - __u32 gbp; -}; - -#endif