diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 88bb8cc3555b..afdf950617c3 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -187,6 +187,7 @@ struct tun_struct { #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \ NETIF_F_TSO6|NETIF_F_UFO) + int align; int vnet_hdr_sz; int sndbuf; struct tap_filter txflt; @@ -934,6 +935,17 @@ static void tun_poll_controller(struct net_device *dev) return; } #endif + +static void tun_set_headroom(struct net_device *dev, int new_hr) +{ + struct tun_struct *tun = netdev_priv(dev); + + if (new_hr < NET_SKB_PAD) + new_hr = NET_SKB_PAD; + + tun->align = new_hr; +} + static const struct net_device_ops tun_netdev_ops = { .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, @@ -945,6 +957,7 @@ static const struct net_device_ops tun_netdev_ops = { #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = tun_poll_controller, #endif + .ndo_set_rx_headroom = tun_set_headroom, }; static const struct net_device_ops tap_netdev_ops = { @@ -962,6 +975,7 @@ static const struct net_device_ops tap_netdev_ops = { .ndo_poll_controller = tun_poll_controller, #endif .ndo_features_check = passthru_features_check, + .ndo_set_rx_headroom = tun_set_headroom, }; static void tun_flow_init(struct tun_struct *tun) @@ -1086,7 +1100,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; struct sk_buff *skb; size_t total_len = iov_iter_count(from); - size_t len = total_len, align = NET_SKB_PAD, linear; + size_t len = total_len, align = tun->align, linear; struct virtio_net_hdr gso = { 0 }; int good_linear; int copylen; @@ -1694,6 +1708,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) tun->txflt.count = 0; tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr); + tun->align = NET_SKB_PAD; tun->filter_attached = false; tun->sndbuf = tfile->socket.sk->sk_sndbuf; diff --git a/drivers/net/veth.c b/drivers/net/veth.c index ba21d072be31..4f30a6ae50d0 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -35,6 +35,7 @@ struct pcpu_vstats { struct veth_priv { struct net_device __rcu *peer; atomic64_t dropped; + unsigned requested_headroom; }; /* @@ -271,6 +272,29 @@ static int veth_get_iflink(const struct net_device *dev) return iflink; } +static void veth_set_rx_headroom(struct net_device *dev, int new_hr) +{ + struct veth_priv *peer_priv, *priv = netdev_priv(dev); + struct net_device *peer; + + if (new_hr < 0) + new_hr = 0; + + rcu_read_lock(); + peer = rcu_dereference(priv->peer); + if (unlikely(!peer)) + goto out; + + peer_priv = netdev_priv(peer); + priv->requested_headroom = new_hr; + new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); + dev->needed_headroom = new_hr; + peer->needed_headroom = new_hr; + +out: + rcu_read_unlock(); +} + static const struct net_device_ops veth_netdev_ops = { .ndo_init = veth_dev_init, .ndo_open = veth_open, @@ -285,6 +309,7 @@ static const struct net_device_ops veth_netdev_ops = { #endif .ndo_get_iflink = veth_get_iflink, .ndo_features_check = passthru_features_check, + .ndo_set_rx_headroom = veth_set_rx_headroom, }; #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \ @@ -301,6 +326,7 @@ static void veth_setup(struct net_device *dev) dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; dev->priv_flags |= IFF_NO_QUEUE; + dev->priv_flags |= IFF_PHONY_HEADROOM; dev->netdev_ops = &veth_netdev_ops; dev->ethtool_ops = &veth_ethtool_ops; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e52077ffe5ed..efe7cec111fa 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1093,6 +1093,12 @@ struct tc_to_netdev { * This function is used to get egress tunnel information for given skb. * This is useful for retrieving outer tunnel header parameters while * sampling packet. + * void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom); + * This function is used to specify the headroom that the skb must + * consider when allocation skb during packet reception. Setting + * appropriate rx headroom value allows avoiding skb head copy on + * forward. Setting a negative value reset the rx headroom to the + * default value. * */ struct net_device_ops { @@ -1278,6 +1284,8 @@ struct net_device_ops { bool proto_down); int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb); + void (*ndo_set_rx_headroom)(struct net_device *dev, + int needed_headroom); }; /** @@ -1315,6 +1323,8 @@ struct net_device_ops { * @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device * @IFF_TEAM: device is a team device * @IFF_RXFH_CONFIGURED: device has had Rx Flow indirection table configured + * @IFF_PHONY_HEADROOM: the headroom value is controlled by an external + * entity (i.e. the master device for bridged veth) */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1343,6 +1353,7 @@ enum netdev_priv_flags { IFF_L3MDEV_SLAVE = 1<<23, IFF_TEAM = 1<<24, IFF_RXFH_CONFIGURED = 1<<25, + IFF_PHONY_HEADROOM = 1<<26, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1937,6 +1948,26 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, void *accel_priv); +/* returns the headroom that the master device needs to take in account + * when forwarding to this dev + */ +static inline unsigned netdev_get_fwd_headroom(struct net_device *dev) +{ + return dev->priv_flags & IFF_PHONY_HEADROOM ? 0 : dev->needed_headroom; +} + +static inline void netdev_set_rx_headroom(struct net_device *dev, int new_hr) +{ + if (dev->netdev_ops->ndo_set_rx_headroom) + dev->netdev_ops->ndo_set_rx_headroom(dev, new_hr); +} + +/* set the device rx headroom to the dev's default */ +static inline void netdev_reset_rx_headroom(struct net_device *dev) +{ + netdev_set_rx_headroom(dev, -1); +} + /* * Net namespace inlines */ diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index b37a1cc97d98..a73df3315df9 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -223,6 +223,31 @@ static void destroy_nbp_rcu(struct rcu_head *head) destroy_nbp(p); } +static unsigned get_max_headroom(struct net_bridge *br) +{ + unsigned max_headroom = 0; + struct net_bridge_port *p; + + list_for_each_entry(p, &br->port_list, list) { + unsigned dev_headroom = netdev_get_fwd_headroom(p->dev); + + if (dev_headroom > max_headroom) + max_headroom = dev_headroom; + } + + return max_headroom; +} + +static void update_headroom(struct net_bridge *br, int new_hr) +{ + struct net_bridge_port *p; + + list_for_each_entry(p, &br->port_list, list) + netdev_set_rx_headroom(p->dev, new_hr); + + br->dev->needed_headroom = new_hr; +} + /* Delete port(interface) from bridge is done in two steps. * via RCU. First step, marks device as down. That deletes * all the timers and stops new packets from flowing through. @@ -248,6 +273,9 @@ static void del_nbp(struct net_bridge_port *p) br_ifinfo_notify(RTM_DELLINK, p); list_del_rcu(&p->list); + if (netdev_get_fwd_headroom(dev) == br->dev->needed_headroom) + update_headroom(br, get_max_headroom(br)); + netdev_reset_rx_headroom(dev); nbp_vlan_flush(p); br_fdb_delete_by_port(br, p, 0, 1); @@ -438,6 +466,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) { struct net_bridge_port *p; int err = 0; + unsigned br_hr, dev_hr; bool changed_addr; /* Don't allow bridging non-ethernet like devices, or DSA-enabled @@ -505,8 +534,12 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) netdev_update_features(br->dev); - if (br->dev->needed_headroom < dev->needed_headroom) - br->dev->needed_headroom = dev->needed_headroom; + br_hr = br->dev->needed_headroom; + dev_hr = netdev_get_fwd_headroom(dev); + if (br_hr < dev_hr) + update_headroom(br, dev_hr); + else + netdev_set_rx_headroom(dev, br_hr); if (br_fdb_insert(br, p, dev->dev_addr, 0)) netdev_err(dev, "failed insert local address bridge forwarding table\n"); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index c4e8455d5d56..e6a7d494df24 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1908,6 +1908,29 @@ static struct vport *lookup_vport(struct net *net, return ERR_PTR(-EINVAL); } +/* Called with ovs_mutex */ +static void update_headroom(struct datapath *dp) +{ + unsigned dev_headroom, max_headroom = 0; + struct net_device *dev; + struct vport *vport; + int i; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { + hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) { + dev = vport->dev; + dev_headroom = netdev_get_fwd_headroom(dev); + if (dev_headroom > max_headroom) + max_headroom = dev_headroom; + } + } + + dp->max_headroom = max_headroom; + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) + hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) + netdev_set_rx_headroom(vport->dev, max_headroom); +} + static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; @@ -1973,6 +1996,12 @@ restart: err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_NEW); + + if (netdev_get_fwd_headroom(vport->dev) > dp->max_headroom) + update_headroom(dp); + else + netdev_set_rx_headroom(vport->dev, dp->max_headroom); + BUG_ON(err < 0); ovs_unlock(); @@ -2039,8 +2068,10 @@ exit_unlock_free: static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) { + bool must_update_headroom = false; struct nlattr **a = info->attrs; struct sk_buff *reply; + struct datapath *dp; struct vport *vport; int err; @@ -2062,7 +2093,16 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_DEL); BUG_ON(err < 0); + + /* the vport deletion may trigger dp headroom update */ + dp = vport->dp; + if (netdev_get_fwd_headroom(vport->dev) == dp->max_headroom) + must_update_headroom = true; + netdev_reset_rx_headroom(vport->dev); ovs_dp_detach_port(vport); + + if (must_update_headroom) + update_headroom(dp); ovs_unlock(); ovs_notify(&dp_vport_genl_family, reply, info); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 67bdecd9fdc1..427e39a045cf 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -68,6 +68,8 @@ struct dp_stats_percpu { * ovs_mutex and RCU. * @stats_percpu: Per-CPU datapath statistics. * @net: Reference to net namespace. + * @max_headroom: the maximum headroom of all vports in this datapath; it will + * be used by all the internal vports in this dp. * * Context: See the comment on locking at the top of datapath.c for additional * locking information. @@ -89,6 +91,8 @@ struct datapath { possible_net_t net; u32 user_features; + + u32 max_headroom; }; /** diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index ec76398a792f..83a5534abd31 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -138,6 +138,11 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) return stats; } +void internal_set_rx_headroom(struct net_device *dev, int new_hr) +{ + dev->needed_headroom = new_hr; +} + static const struct net_device_ops internal_dev_netdev_ops = { .ndo_open = internal_dev_open, .ndo_stop = internal_dev_stop, @@ -145,6 +150,7 @@ static const struct net_device_ops internal_dev_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_change_mtu = internal_dev_change_mtu, .ndo_get_stats64 = internal_get_stats, + .ndo_set_rx_headroom = internal_set_rx_headroom, }; static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { @@ -158,7 +164,8 @@ static void do_setup(struct net_device *netdev) netdev->netdev_ops = &internal_dev_netdev_ops; netdev->priv_flags &= ~IFF_TX_SKB_SHARING; - netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH; + netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH | + IFF_PHONY_HEADROOM; netdev->destructor = internal_dev_destructor; netdev->ethtool_ops = &internal_dev_ethtool_ops; netdev->rtnl_link_ops = &internal_dev_link_ops; @@ -199,6 +206,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) err = -ENOMEM; goto error_free_netdev; } + vport->dev->needed_headroom = vport->dp->max_headroom; dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); internal_dev = internal_dev_priv(vport->dev);