Merge tag 'net-next-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next

Pull networking updates from Jakub Kicinski:
 "Core:

   - BPF:
      - add syscall program type and libbpf support for generating
        instructions and bindings for in-kernel BPF loaders (BPF loaders
        for BPF); this is a stepping stone for signed BPF programs
      - infrastructure to migrate TCP child sockets from one listener to
        another in the same reuseport group/map to improve flexibility
        of service hand-off/restart
      - add broadcast support to XDP redirect

   - allow bypass of the lockless qdisc to improve performance (for
     pktgen: +23% with one thread, +44% with 2 threads)

   - add a simpler version of "DO_ONCE()" which does not require jump
     labels, intended for slow-path usage

   - virtio/vsock: introduce SOCK_SEQPACKET support

   - add getsockopt to retrieve the netns cookie (a usage sketch follows
     at the end of this section)

   - ip: treat the lowest address of an IPv4 subnet as an ordinary
     unicast address, allowing reclamation of precious IPv4 addresses

   - ipv6: use prandom_u32() for ID generation

   - ip: add support for more flexible field selection for hashing
     across multi-path routes (w/ offload to mlxsw)

   - icmp: add support for extended RFC 8335 PROBE (ping)

   - seg6: add support for SRv6 End.DT46 behavior

   - mptcp:
      - DSS checksum support (RFC 8684) to detect middlebox meddling
      - support Connection-time 'C' flag
      - time stamping support

   - sctp: Packetization Layer Path MTU Discovery (RFC 8899)

   - xfrm: speed up state addition with seq set

   - WiFi:
      - hidden AP discovery on 6 GHz and other HE 6 GHz improvements
      - aggregation handling improvements for some drivers
      - minstrel improvements for no-ack frames
      - deferred rate control for TXQs to improve reaction times
      - switch from round robin to virtual time-based airtime scheduler

   - add trace points:
      - tcp checksum errors
      - openvswitch - action execution, upcalls
      - socket errors via sk_error_report
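
   A minimal userspace sketch of the new netns-cookie getsockopt mentioned
   above (a hedged example, not from the tree; the fallback #define value
   matches include/uapi/asm-generic/socket.h, and any socket type works):

      #include <stdio.h>
      #include <stdint.h>
      #include <sys/socket.h>

      #ifndef SO_NETNS_COOKIE
      #define SO_NETNS_COOKIE 71   /* getsockopt-only, returns a u64 */
      #endif

      int main(void)
      {
              uint64_t cookie = 0;
              socklen_t len = sizeof(cookie);
              int fd = socket(AF_INET, SOCK_DGRAM, 0);

              if (fd < 0 || getsockopt(fd, SOL_SOCKET, SO_NETNS_COOKIE,
                                       &cookie, &len))
                      return 1;

              printf("netns cookie: %llu\n", (unsigned long long)cookie);
              return 0;
      }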

  Device APIs:

   - devlink: add rate API for hierarchical control of max egress rate
     of virtual devices (VFs, SFs etc.)

   - don't require RCU read lock to be held around BPF hooks in NAPI
     context

   - page_pool: generic buffer recycling

  New hardware/drivers:

   - mobile:
      - iosm: PCIe Driver for Intel M.2 Modem
      - support for Qualcomm MSM8998 (ipa)

   - WiFi: Qualcomm QCN9074 and WCN6855 PCI devices

   - sparx5: Microchip SparX-5 family of Enterprise Ethernet switches

   - Mellanox BlueField Gigabit Ethernet (control NIC of the DPU)

   - NXP SJA1110 Automotive Ethernet 10-port switch

   - Qualcomm QCA8327 switch support (qca8k)

   - Mikrotik 10/25G NIC (atl1c)

  Driver changes:

   - ACPI support for some MDIO, MAC and PHY devices from Marvell and
     NXP (our first foray into MAC/PHY description via ACPI)

   - HW timestamping (PTP) support: bnxt_en, ice, sja1105, hns3, tja11xx

   - Mellanox/Nvidia NIC (mlx5)
      - NIC VF offload of L2 bridging
      - support IRQ distribution to Sub-functions

   - Marvell (prestera):
      - add flower and match all
      - devlink trap
      - link aggregation

   - Netronome (nfp): connection tracking offload

   - Intel 1GE (igc): add AF_XDP support

   - Marvell DPU (octeontx2): ingress ratelimit offload

   - Google vNIC (gve): new ring/descriptor format support

   - Qualcomm mobile (rmnet & ipa): inline checksum offload support

   - MediaTek WiFi (mt76)
      - mt7915 MSI support
      - mt7915 Tx status reporting
      - mt7915 thermal sensors support
      - mt7921 decapsulation offload
      - mt7921 enable runtime pm and deep sleep

   - Realtek WiFi (rtw88)
      - beacon filter support
      - Tx antenna path diversity support
      - firmware crash information via devcoredump

   - Qualcomm WiFi (wcn36xx)
      - Wake-on-WLAN support with magic packets and GTK rekeying

   - Micrel PHY (ksz886x/ksz8081): add cable test support"

* tag 'net-next-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2168 commits)
  tcp: change ICSK_CA_PRIV_SIZE definition
  tcp_yeah: check struct yeah size at compile time
  gve: DQO: Fix off by one in gve_rx_dqo()
  stmmac: intel: set PCI_D3hot in suspend
  stmmac: intel: Enable PHY WOL option in EHL
  net: stmmac: option to enable PHY WOL with PMT enabled
  net: say "local" instead of "static" addresses in ndo_dflt_fdb_{add,del}
  net: use netdev_info in ndo_dflt_fdb_{add,del}
  ptp: Set lookup cookie when creating a PTP PPS source.
  net: sock: add trace for socket errors
  net: sock: introduce sk_error_report
  net: dsa: replay the local bridge FDB entries pointing to the bridge dev too
  net: dsa: ensure during dsa_fdb_offload_notify that dev_hold and dev_put are on the same dev
  net: dsa: include fdb entries pointing to bridge in the host fdb list
  net: dsa: include bridge addresses which are local in the host fdb list
  net: dsa: sync static FDB entries on foreign interfaces to hardware
  net: dsa: install the host MDB and FDB entries in the master's RX filter
  net: dsa: reference count the FDB addresses at the cross-chip notifier level
  net: dsa: introduce a separate cross-chip notifier type for host FDBs
  net: dsa: reference count the MDB entries at the cross-chip notifier level
  ...
Author: Linus Torvalds
Date:   2021-06-30 15:51:09 -07:00

1908 changed files with 108602 additions and 27721 deletions


@@ -524,8 +524,7 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)
nr_maps++;
}
diag = kzalloc(sizeof(*diag) + sizeof(diag->maps[0]) * nr_maps,
GFP_KERNEL);
diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL);
if (!diag)
return ERR_PTR(-ENOMEM);
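
The conversion above switches the open-coded allocation size to struct_size(),
which performs the same flexible-array computation but saturates instead of
wrapping on overflow. A small standalone sketch of the arithmetic it replaces
(the demo struct is illustrative, not from this file):

#include <stddef.h>

struct demo {
	int nr;
	void *maps[];	/* flexible array member, like diag->maps above */
};

/* Open-coded form of struct_size(p, maps, n): sizeof(*p) plus
 * n * sizeof(p->maps[0]). The kernel macro in include/linux/overflow.h
 * additionally saturates to SIZE_MAX if the multiply or add overflows,
 * so a huge nr_maps cannot yield a short allocation.
 */
static size_t demo_size(size_t n)
{
	return sizeof(struct demo) + n * sizeof(void *);
}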


@@ -148,6 +148,7 @@
#include <net/devlink.h>
#include <linux/pm_runtime.h>
#include <linux/prandom.h>
#include <linux/once_lite.h>
#include "net-sysfs.h"
@@ -3487,13 +3488,16 @@ EXPORT_SYMBOL(__skb_gso_segment);
/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
skb_dump(KERN_ERR, skb, true);
dump_stack();
}
void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
if (net_ratelimit()) {
pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
skb_dump(KERN_ERR, skb, true);
dump_stack();
}
DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
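
netdev_rx_csum_fault() above now uses DO_ONCE_LITE() from the new
include/linux/once_lite.h. A rough, illustrative sketch of the idea only (not
the kernel's exact definition): a static already-done flag guards the call,
without the jump-label patching DO_ONCE() needs, which is acceptable on a
slow error path like this one.

/* bool comes from linux/types.h in-kernel, or <stdbool.h> elsewhere. */
#define MY_DO_ONCE_LITE(func, ...)			\
	({						\
		static bool __already_done;		\
							\
		if (!__already_done) {			\
			__already_done = true;		\
			func(__VA_ARGS__);		\
		}					\
	})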
@@ -3852,10 +3856,33 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
qdisc_calculate_pkt_len(skb, q);
if (q->flags & TCQ_F_NOLOCK) {
rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
if (likely(!netif_xmit_frozen_or_stopped(txq)))
qdisc_run(q);
if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
qdisc_run_begin(q)) {
/* Retest nolock_qdisc_is_empty() within the protection
* of q->seqlock to protect from racing with requeuing.
*/
if (unlikely(!nolock_qdisc_is_empty(q))) {
rc = q->enqueue(skb, q, &to_free) &
NET_XMIT_MASK;
__qdisc_run(q);
qdisc_run_end(q);
goto no_lock_out;
}
qdisc_bstats_cpu_update(q, skb);
if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
!nolock_qdisc_is_empty(q))
__qdisc_run(q);
qdisc_run_end(q);
return NET_XMIT_SUCCESS;
}
rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
qdisc_run(q);
no_lock_out:
if (unlikely(to_free))
kfree_skb_list(to_free);
return rc;
@@ -5277,9 +5304,9 @@ another_round:
if (static_branch_unlikely(&generic_xdp_needed_key)) {
int ret2;
preempt_disable();
migrate_disable();
ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
preempt_enable();
migrate_enable();
if (ret2 != XDP_PASS) {
ret = NET_RX_DROP;
@@ -6520,11 +6547,18 @@ EXPORT_SYMBOL(napi_schedule_prep);
* __napi_schedule_irqoff - schedule for receive
* @n: entry to schedule
*
* Variant of __napi_schedule() assuming hard irqs are masked
* Variant of __napi_schedule() assuming hard irqs are masked.
*
* On PREEMPT_RT enabled kernels this maps to __napi_schedule()
* because the interrupt disabled assumption might not be true
* due to force-threaded interrupts and spinlock substitution.
*/
void __napi_schedule_irqoff(struct napi_struct *n)
{
____napi_schedule(this_cpu_ptr(&softnet_data), n);
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
____napi_schedule(this_cpu_ptr(&softnet_data), n);
else
__napi_schedule(n);
}
EXPORT_SYMBOL(__napi_schedule_irqoff);


@@ -190,6 +190,80 @@ static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink,
return devlink_port_get_from_attrs(devlink, info->attrs);
}
static inline bool
devlink_rate_is_leaf(struct devlink_rate *devlink_rate)
{
return devlink_rate->type == DEVLINK_RATE_TYPE_LEAF;
}
static inline bool
devlink_rate_is_node(struct devlink_rate *devlink_rate)
{
return devlink_rate->type == DEVLINK_RATE_TYPE_NODE;
}
static struct devlink_rate *
devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info)
{
struct devlink_rate *devlink_rate;
struct devlink_port *devlink_port;
devlink_port = devlink_port_get_from_attrs(devlink, info->attrs);
if (IS_ERR(devlink_port))
return ERR_CAST(devlink_port);
devlink_rate = devlink_port->devlink_rate;
return devlink_rate ?: ERR_PTR(-ENODEV);
}
static struct devlink_rate *
devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name)
{
static struct devlink_rate *devlink_rate;
list_for_each_entry(devlink_rate, &devlink->rate_list, list) {
if (devlink_rate_is_node(devlink_rate) &&
!strcmp(node_name, devlink_rate->name))
return devlink_rate;
}
return ERR_PTR(-ENODEV);
}
static struct devlink_rate *
devlink_rate_node_get_from_attrs(struct devlink *devlink, struct nlattr **attrs)
{
const char *rate_node_name;
size_t len;
if (!attrs[DEVLINK_ATTR_RATE_NODE_NAME])
return ERR_PTR(-EINVAL);
rate_node_name = nla_data(attrs[DEVLINK_ATTR_RATE_NODE_NAME]);
len = strlen(rate_node_name);
/* Name cannot be empty or decimal number */
if (!len || strspn(rate_node_name, "0123456789") == len)
return ERR_PTR(-EINVAL);
return devlink_rate_node_get_by_name(devlink, rate_node_name);
}
static struct devlink_rate *
devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info)
{
return devlink_rate_node_get_from_attrs(devlink, info->attrs);
}
static struct devlink_rate *
devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info)
{
struct nlattr **attrs = info->attrs;
if (attrs[DEVLINK_ATTR_PORT_INDEX])
return devlink_rate_leaf_get_from_info(devlink, info);
else if (attrs[DEVLINK_ATTR_RATE_NODE_NAME])
return devlink_rate_node_get_from_info(devlink, info);
else
return ERR_PTR(-EINVAL);
}
struct devlink_sb {
struct list_head list;
unsigned int index;
@@ -408,12 +482,14 @@ devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id)
#define DEVLINK_NL_FLAG_NEED_PORT BIT(0)
#define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1)
#define DEVLINK_NL_FLAG_NEED_RATE BIT(2)
#define DEVLINK_NL_FLAG_NEED_RATE_NODE BIT(3)
/* The per devlink instance lock is taken by default in the pre-doit
* operation, yet several commands do not require this. The global
* devlink lock is taken and protects from disruption by user-calls.
*/
#define DEVLINK_NL_FLAG_NO_LOCK BIT(2)
#define DEVLINK_NL_FLAG_NO_LOCK BIT(4)
static int devlink_nl_pre_doit(const struct genl_ops *ops,
struct sk_buff *skb, struct genl_info *info)
@@ -442,6 +518,24 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops,
devlink_port = devlink_port_get_from_info(devlink, info);
if (!IS_ERR(devlink_port))
info->user_ptr[1] = devlink_port;
} else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE) {
struct devlink_rate *devlink_rate;
devlink_rate = devlink_rate_get_from_info(devlink, info);
if (IS_ERR(devlink_rate)) {
err = PTR_ERR(devlink_rate);
goto unlock;
}
info->user_ptr[1] = devlink_rate;
} else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE_NODE) {
struct devlink_rate *rate_node;
rate_node = devlink_rate_node_get_from_info(devlink, info);
if (IS_ERR(rate_node)) {
err = PTR_ERR(rate_node);
goto unlock;
}
info->user_ptr[1] = rate_node;
}
return 0;
@@ -748,6 +842,56 @@ devlink_port_fn_hw_addr_fill(struct devlink *devlink, const struct devlink_ops *
return 0;
}
static int devlink_nl_rate_fill(struct sk_buff *msg,
struct devlink *devlink,
struct devlink_rate *devlink_rate,
enum devlink_command cmd, u32 portid,
u32 seq, int flags,
struct netlink_ext_ack *extack)
{
void *hdr;
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
if (!hdr)
return -EMSGSIZE;
if (devlink_nl_put_handle(msg, devlink))
goto nla_put_failure;
if (nla_put_u16(msg, DEVLINK_ATTR_RATE_TYPE, devlink_rate->type))
goto nla_put_failure;
if (devlink_rate_is_leaf(devlink_rate)) {
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX,
devlink_rate->devlink_port->index))
goto nla_put_failure;
} else if (devlink_rate_is_node(devlink_rate)) {
if (nla_put_string(msg, DEVLINK_ATTR_RATE_NODE_NAME,
devlink_rate->name))
goto nla_put_failure;
}
if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_SHARE,
devlink_rate->tx_share, DEVLINK_ATTR_PAD))
goto nla_put_failure;
if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_MAX,
devlink_rate->tx_max, DEVLINK_ATTR_PAD))
goto nla_put_failure;
if (devlink_rate->parent)
if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME,
devlink_rate->parent->name))
goto nla_put_failure;
genlmsg_end(msg, hdr);
return 0;
nla_put_failure:
genlmsg_cancel(msg, hdr);
return -EMSGSIZE;
}
static bool
devlink_port_fn_state_valid(enum devlink_port_fn_state state)
{
@@ -919,6 +1063,111 @@ static void devlink_port_notify(struct devlink_port *devlink_port,
msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
static void devlink_rate_notify(struct devlink_rate *devlink_rate,
enum devlink_command cmd)
{
struct devlink *devlink = devlink_rate->devlink;
struct sk_buff *msg;
int err;
WARN_ON(cmd != DEVLINK_CMD_RATE_NEW &&
cmd != DEVLINK_CMD_RATE_DEL);
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return;
err = devlink_nl_rate_fill(msg, devlink, devlink_rate,
cmd, 0, 0, 0, NULL);
if (err) {
nlmsg_free(msg);
return;
}
genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg,
struct netlink_callback *cb)
{
struct devlink_rate *devlink_rate;
struct devlink *devlink;
int start = cb->args[0];
int idx = 0;
int err = 0;
mutex_lock(&devlink_mutex);
list_for_each_entry(devlink, &devlink_list, list) {
if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
continue;
mutex_lock(&devlink->lock);
list_for_each_entry(devlink_rate, &devlink->rate_list, list) {
enum devlink_command cmd = DEVLINK_CMD_RATE_NEW;
u32 id = NETLINK_CB(cb->skb).portid;
if (idx < start) {
idx++;
continue;
}
err = devlink_nl_rate_fill(msg, devlink,
devlink_rate,
cmd, id,
cb->nlh->nlmsg_seq,
NLM_F_MULTI, NULL);
if (err) {
mutex_unlock(&devlink->lock);
goto out;
}
idx++;
}
mutex_unlock(&devlink->lock);
}
out:
mutex_unlock(&devlink_mutex);
if (err != -EMSGSIZE)
return err;
cb->args[0] = idx;
return msg->len;
}
static int devlink_nl_cmd_rate_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink_rate *devlink_rate = info->user_ptr[1];
struct devlink *devlink = devlink_rate->devlink;
struct sk_buff *msg;
int err;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
err = devlink_nl_rate_fill(msg, devlink, devlink_rate,
DEVLINK_CMD_RATE_NEW,
info->snd_portid, info->snd_seq, 0,
info->extack);
if (err) {
nlmsg_free(msg);
return err;
}
return genlmsg_reply(msg, info);
}
static bool
devlink_rate_is_parent_node(struct devlink_rate *devlink_rate,
struct devlink_rate *parent)
{
while (parent) {
if (parent == devlink_rate)
return true;
parent = parent->parent;
}
return false;
}
static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
@@ -1339,6 +1588,255 @@ static int devlink_nl_cmd_port_del_doit(struct sk_buff *skb,
return devlink->ops->port_del(devlink, port_index, extack);
}
static int
devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate,
struct genl_info *info,
struct nlattr *nla_parent)
{
struct devlink *devlink = devlink_rate->devlink;
const char *parent_name = nla_data(nla_parent);
const struct devlink_ops *ops = devlink->ops;
size_t len = strlen(parent_name);
struct devlink_rate *parent;
int err = -EOPNOTSUPP;
parent = devlink_rate->parent;
if (parent && len) {
NL_SET_ERR_MSG_MOD(info->extack, "Rate object already has parent.");
return -EBUSY;
} else if (parent && !len) {
if (devlink_rate_is_leaf(devlink_rate))
err = ops->rate_leaf_parent_set(devlink_rate, NULL,
devlink_rate->priv, NULL,
info->extack);
else if (devlink_rate_is_node(devlink_rate))
err = ops->rate_node_parent_set(devlink_rate, NULL,
devlink_rate->priv, NULL,
info->extack);
if (err)
return err;
refcount_dec(&parent->refcnt);
devlink_rate->parent = NULL;
} else if (!parent && len) {
parent = devlink_rate_node_get_by_name(devlink, parent_name);
if (IS_ERR(parent))
return -ENODEV;
if (parent == devlink_rate) {
NL_SET_ERR_MSG_MOD(info->extack, "Parent to self is not allowed");
return -EINVAL;
}
if (devlink_rate_is_node(devlink_rate) &&
devlink_rate_is_parent_node(devlink_rate, parent->parent)) {
NL_SET_ERR_MSG_MOD(info->extack, "Node is already a parent of parent node.");
return -EEXIST;
}
if (devlink_rate_is_leaf(devlink_rate))
err = ops->rate_leaf_parent_set(devlink_rate, parent,
devlink_rate->priv, parent->priv,
info->extack);
else if (devlink_rate_is_node(devlink_rate))
err = ops->rate_node_parent_set(devlink_rate, parent,
devlink_rate->priv, parent->priv,
info->extack);
if (err)
return err;
refcount_inc(&parent->refcnt);
devlink_rate->parent = parent;
}
return 0;
}
static int devlink_nl_rate_set(struct devlink_rate *devlink_rate,
const struct devlink_ops *ops,
struct genl_info *info)
{
struct nlattr *nla_parent, **attrs = info->attrs;
int err = -EOPNOTSUPP;
u64 rate;
if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) {
rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_SHARE]);
if (devlink_rate_is_leaf(devlink_rate))
err = ops->rate_leaf_tx_share_set(devlink_rate, devlink_rate->priv,
rate, info->extack);
else if (devlink_rate_is_node(devlink_rate))
err = ops->rate_node_tx_share_set(devlink_rate, devlink_rate->priv,
rate, info->extack);
if (err)
return err;
devlink_rate->tx_share = rate;
}
if (attrs[DEVLINK_ATTR_RATE_TX_MAX]) {
rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_MAX]);
if (devlink_rate_is_leaf(devlink_rate))
err = ops->rate_leaf_tx_max_set(devlink_rate, devlink_rate->priv,
rate, info->extack);
else if (devlink_rate_is_node(devlink_rate))
err = ops->rate_node_tx_max_set(devlink_rate, devlink_rate->priv,
rate, info->extack);
if (err)
return err;
devlink_rate->tx_max = rate;
}
nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME];
if (nla_parent) {
err = devlink_nl_rate_parent_node_set(devlink_rate, info,
nla_parent);
if (err)
return err;
}
return 0;
}
static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops,
struct genl_info *info,
enum devlink_rate_type type)
{
struct nlattr **attrs = info->attrs;
if (type == DEVLINK_RATE_TYPE_LEAF) {
if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_leaf_tx_share_set) {
NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the leafs");
return false;
}
if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_leaf_tx_max_set) {
NL_SET_ERR_MSG_MOD(info->extack, "TX max set isn't supported for the leafs");
return false;
}
if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] &&
!ops->rate_leaf_parent_set) {
NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the leafs");
return false;
}
} else if (type == DEVLINK_RATE_TYPE_NODE) {
if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) {
NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the nodes");
return false;
}
if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_node_tx_max_set) {
NL_SET_ERR_MSG_MOD(info->extack, "TX max set isn't supported for the nodes");
return false;
}
if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] &&
!ops->rate_node_parent_set) {
NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the nodes");
return false;
}
} else {
WARN(1, "Unknown type of rate object");
return false;
}
return true;
}
static int devlink_nl_cmd_rate_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink_rate *devlink_rate = info->user_ptr[1];
struct devlink *devlink = devlink_rate->devlink;
const struct devlink_ops *ops = devlink->ops;
int err;
if (!ops || !devlink_rate_set_ops_supported(ops, info, devlink_rate->type))
return -EOPNOTSUPP;
err = devlink_nl_rate_set(devlink_rate, ops, info);
if (!err)
devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW);
return err;
}
static int devlink_nl_cmd_rate_new_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
struct devlink_rate *rate_node;
const struct devlink_ops *ops;
int err;
ops = devlink->ops;
if (!ops || !ops->rate_node_new || !ops->rate_node_del) {
NL_SET_ERR_MSG_MOD(info->extack, "Rate nodes aren't supported");
return -EOPNOTSUPP;
}
if (!devlink_rate_set_ops_supported(ops, info, DEVLINK_RATE_TYPE_NODE))
return -EOPNOTSUPP;
rate_node = devlink_rate_node_get_from_attrs(devlink, info->attrs);
if (!IS_ERR(rate_node))
return -EEXIST;
else if (rate_node == ERR_PTR(-EINVAL))
return -EINVAL;
rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL);
if (!rate_node)
return -ENOMEM;
rate_node->devlink = devlink;
rate_node->type = DEVLINK_RATE_TYPE_NODE;
rate_node->name = nla_strdup(info->attrs[DEVLINK_ATTR_RATE_NODE_NAME], GFP_KERNEL);
if (!rate_node->name) {
err = -ENOMEM;
goto err_strdup;
}
err = ops->rate_node_new(rate_node, &rate_node->priv, info->extack);
if (err)
goto err_node_new;
err = devlink_nl_rate_set(rate_node, ops, info);
if (err)
goto err_rate_set;
refcount_set(&rate_node->refcnt, 1);
list_add(&rate_node->list, &devlink->rate_list);
devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW);
return 0;
err_rate_set:
ops->rate_node_del(rate_node, rate_node->priv, info->extack);
err_node_new:
kfree(rate_node->name);
err_strdup:
kfree(rate_node);
return err;
}
static int devlink_nl_cmd_rate_del_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink_rate *rate_node = info->user_ptr[1];
struct devlink *devlink = rate_node->devlink;
const struct devlink_ops *ops = devlink->ops;
int err;
if (refcount_read(&rate_node->refcnt) > 1) {
NL_SET_ERR_MSG_MOD(info->extack, "Node has children. Cannot delete node.");
return -EBUSY;
}
devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL);
err = ops->rate_node_del(rate_node, rate_node->priv, info->extack);
if (rate_node->parent)
refcount_dec(&rate_node->parent->refcnt);
list_del(&rate_node->list);
kfree(rate_node->name);
kfree(rate_node);
return err;
}
static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink,
struct devlink_sb *devlink_sb,
enum devlink_command cmd, u32 portid,
@@ -2207,6 +2705,23 @@ static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb,
return genlmsg_reply(msg, info);
}
static int devlink_rate_nodes_check(struct devlink *devlink, u16 mode,
struct netlink_ext_ack *extack)
{
struct devlink_rate *devlink_rate;
/* Take the lock to sync with devlink_rate_nodes_destroy() */
mutex_lock(&devlink->lock);
list_for_each_entry(devlink_rate, &devlink->rate_list, list)
if (devlink_rate_is_node(devlink_rate)) {
mutex_unlock(&devlink->lock);
NL_SET_ERR_MSG_MOD(extack, "Rate node(s) exists.");
return -EBUSY;
}
mutex_unlock(&devlink->lock);
return 0;
}
static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
@@ -2221,6 +2736,9 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
if (!ops->eswitch_mode_set)
return -EOPNOTSUPP;
mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
err = devlink_rate_nodes_check(devlink, mode, info->extack);
if (err)
return err;
err = ops->eswitch_mode_set(devlink, mode, info->extack);
if (err)
return err;
@@ -6994,8 +7512,9 @@ static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats,
}
}
static int devlink_trap_stats_put(struct sk_buff *msg,
struct devlink_stats __percpu *trap_stats)
static int
devlink_trap_group_stats_put(struct sk_buff *msg,
struct devlink_stats __percpu *trap_stats)
{
struct devlink_stats stats;
struct nlattr *attr;
@@ -7023,6 +7542,50 @@ nla_put_failure:
return -EMSGSIZE;
}
static int devlink_trap_stats_put(struct sk_buff *msg, struct devlink *devlink,
const struct devlink_trap_item *trap_item)
{
struct devlink_stats stats;
struct nlattr *attr;
u64 drops = 0;
int err;
if (devlink->ops->trap_drop_counter_get) {
err = devlink->ops->trap_drop_counter_get(devlink,
trap_item->trap,
&drops);
if (err)
return err;
}
devlink_trap_stats_read(trap_item->stats, &stats);
attr = nla_nest_start(msg, DEVLINK_ATTR_STATS);
if (!attr)
return -EMSGSIZE;
if (devlink->ops->trap_drop_counter_get &&
nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops,
DEVLINK_ATTR_PAD))
goto nla_put_failure;
if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_PACKETS,
stats.rx_packets, DEVLINK_ATTR_PAD))
goto nla_put_failure;
if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_BYTES,
stats.rx_bytes, DEVLINK_ATTR_PAD))
goto nla_put_failure;
nla_nest_end(msg, attr);
return 0;
nla_put_failure:
nla_nest_cancel(msg, attr);
return -EMSGSIZE;
}
static int devlink_nl_trap_fill(struct sk_buff *msg, struct devlink *devlink,
const struct devlink_trap_item *trap_item,
enum devlink_command cmd, u32 portid, u32 seq,
@@ -7060,7 +7623,7 @@ static int devlink_nl_trap_fill(struct sk_buff *msg, struct devlink *devlink,
if (err)
goto nla_put_failure;
err = devlink_trap_stats_put(msg, trap_item->stats);
err = devlink_trap_stats_put(msg, devlink, trap_item);
if (err)
goto nla_put_failure;
@@ -7277,7 +7840,7 @@ devlink_nl_trap_group_fill(struct sk_buff *msg, struct devlink *devlink,
group_item->policer_item->policer->id))
goto nla_put_failure;
err = devlink_trap_stats_put(msg, group_item->stats);
err = devlink_trap_group_stats_put(msg, group_item->stats);
if (err)
goto nla_put_failure;
@@ -7801,6 +8364,11 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_PORT_PCI_PF_NUMBER] = { .type = NLA_U16 },
[DEVLINK_ATTR_PORT_PCI_SF_NUMBER] = { .type = NLA_U32 },
[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER] = { .type = NLA_U32 },
[DEVLINK_ATTR_RATE_TYPE] = { .type = NLA_U16 },
[DEVLINK_ATTR_RATE_TX_SHARE] = { .type = NLA_U64 },
[DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64 },
[DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING },
};
static const struct genl_small_ops devlink_nl_ops[] = {
@@ -7826,6 +8394,30 @@ static const struct genl_small_ops devlink_nl_ops[] = {
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
},
{
.cmd = DEVLINK_CMD_RATE_GET,
.doit = devlink_nl_cmd_rate_get_doit,
.dumpit = devlink_nl_cmd_rate_get_dumpit,
.internal_flags = DEVLINK_NL_FLAG_NEED_RATE,
/* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_RATE_SET,
.doit = devlink_nl_cmd_rate_set_doit,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_RATE,
},
{
.cmd = DEVLINK_CMD_RATE_NEW,
.doit = devlink_nl_cmd_rate_new_doit,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = DEVLINK_CMD_RATE_DEL,
.doit = devlink_nl_cmd_rate_del_doit,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_RATE_NODE,
},
{
.cmd = DEVLINK_CMD_PORT_SPLIT,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
@@ -8201,6 +8793,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC);
__devlink_net_set(devlink, &init_net);
INIT_LIST_HEAD(&devlink->port_list);
INIT_LIST_HEAD(&devlink->rate_list);
INIT_LIST_HEAD(&devlink->sb_list);
INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
INIT_LIST_HEAD(&devlink->resource_list);
@@ -8303,6 +8896,7 @@ void devlink_free(struct devlink *devlink)
WARN_ON(!list_empty(&devlink->resource_list));
WARN_ON(!list_empty(&devlink->dpipe_table_list));
WARN_ON(!list_empty(&devlink->sb_list));
WARN_ON(!list_empty(&devlink->rate_list));
WARN_ON(!list_empty(&devlink->port_list));
xa_destroy(&devlink->snapshot_ids);
@@ -8619,6 +9213,110 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set);
/**
* devlink_rate_leaf_create - create devlink rate leaf
*
* @devlink_port: devlink port object to create rate object on
* @priv: driver private data
*
* Create devlink rate object of type leaf on provided @devlink_port.
* Throws call trace if @devlink_port already has a devlink rate object.
*
* Context: Takes and release devlink->lock <mutex>.
*
* Return: -ENOMEM if failed to allocate rate object, 0 otherwise.
*/
int
devlink_rate_leaf_create(struct devlink_port *devlink_port, void *priv)
{
struct devlink *devlink = devlink_port->devlink;
struct devlink_rate *devlink_rate;
devlink_rate = kzalloc(sizeof(*devlink_rate), GFP_KERNEL);
if (!devlink_rate)
return -ENOMEM;
mutex_lock(&devlink->lock);
WARN_ON(devlink_port->devlink_rate);
devlink_rate->type = DEVLINK_RATE_TYPE_LEAF;
devlink_rate->devlink = devlink;
devlink_rate->devlink_port = devlink_port;
devlink_rate->priv = priv;
list_add_tail(&devlink_rate->list, &devlink->rate_list);
devlink_port->devlink_rate = devlink_rate;
devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW);
mutex_unlock(&devlink->lock);
return 0;
}
EXPORT_SYMBOL_GPL(devlink_rate_leaf_create);
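
A hypothetical driver-side sketch of using the helper above; only
devlink_rate_leaf_create() and devlink_rate_leaf_destroy() come from this
patch, the my_vf names are illustrative:

#include <net/devlink.h>

struct my_vf {				/* illustrative driver private data */
	struct devlink_port dl_port;
};

static int my_vf_rate_init(struct my_vf *vf)
{
	/* vf is handed back as the 'priv' argument of the driver's
	 * rate_leaf_tx_share_set()/rate_leaf_tx_max_set() devlink ops
	 * when userspace changes this leaf over the devlink-rate API.
	 */
	return devlink_rate_leaf_create(&vf->dl_port, vf);
}

static void my_vf_rate_fini(struct my_vf *vf)
{
	devlink_rate_leaf_destroy(&vf->dl_port);
}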
/**
* devlink_rate_leaf_destroy - destroy devlink rate leaf
*
* @devlink_port: devlink port linked to the rate object
*
* Context: Takes and release devlink->lock <mutex>.
*/
void devlink_rate_leaf_destroy(struct devlink_port *devlink_port)
{
struct devlink_rate *devlink_rate = devlink_port->devlink_rate;
struct devlink *devlink = devlink_port->devlink;
if (!devlink_rate)
return;
mutex_lock(&devlink->lock);
devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL);
if (devlink_rate->parent)
refcount_dec(&devlink_rate->parent->refcnt);
list_del(&devlink_rate->list);
devlink_port->devlink_rate = NULL;
mutex_unlock(&devlink->lock);
kfree(devlink_rate);
}
EXPORT_SYMBOL_GPL(devlink_rate_leaf_destroy);
/**
* devlink_rate_nodes_destroy - destroy all devlink rate nodes on device
*
* @devlink: devlink instance
*
* Unset parent for all rate objects and destroy all rate nodes
* on specified device.
*
* Context: Takes and release devlink->lock <mutex>.
*/
void devlink_rate_nodes_destroy(struct devlink *devlink)
{
static struct devlink_rate *devlink_rate, *tmp;
const struct devlink_ops *ops = devlink->ops;
mutex_lock(&devlink->lock);
list_for_each_entry(devlink_rate, &devlink->rate_list, list) {
if (!devlink_rate->parent)
continue;
refcount_dec(&devlink_rate->parent->refcnt);
if (devlink_rate_is_leaf(devlink_rate))
ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv,
NULL, NULL);
else if (devlink_rate_is_node(devlink_rate))
ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv,
NULL, NULL);
}
list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) {
if (devlink_rate_is_node(devlink_rate)) {
ops->rate_node_del(devlink_rate, devlink_rate->priv, NULL);
list_del(&devlink_rate->list);
kfree(devlink_rate->name);
kfree(devlink_rate);
}
}
mutex_unlock(&devlink->lock);
}
EXPORT_SYMBOL_GPL(devlink_rate_nodes_destroy);
static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
char *name, size_t len)
{
@@ -8630,12 +9328,18 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
switch (attrs->flavour) {
case DEVLINK_PORT_FLAVOUR_PHYSICAL:
case DEVLINK_PORT_FLAVOUR_VIRTUAL:
n = snprintf(name, len, "p%u", attrs->phys.port_number);
if (n < len && attrs->split)
n += snprintf(name + n, len - n, "s%u",
attrs->phys.split_subport_number);
if (!attrs->split)
n = snprintf(name, len, "p%u", attrs->phys.port_number);
else
n = snprintf(name, len, "p%us%u",
attrs->phys.port_number,
attrs->phys.split_subport_number);
break;
case DEVLINK_PORT_FLAVOUR_CPU:
case DEVLINK_PORT_FLAVOUR_DSA:
@@ -8677,8 +9381,6 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf,
attrs->pci_sf.sf);
break;
case DEVLINK_PORT_FLAVOUR_VIRTUAL:
return -EOPNOTSUPP;
}
if (n >= len)


@@ -3241,9 +3241,6 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
u32 off = skb_mac_header_len(skb);
int ret;
if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
return -ENOTSUPP;
ret = skb_cow(skb, len_diff);
if (unlikely(ret < 0))
return ret;
@@ -3255,19 +3252,11 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
if (skb_is_gso(skb)) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
/* SKB_GSO_TCPV4 needs to be changed into
* SKB_GSO_TCPV6.
*/
/* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */
if (shinfo->gso_type & SKB_GSO_TCPV4) {
shinfo->gso_type &= ~SKB_GSO_TCPV4;
shinfo->gso_type |= SKB_GSO_TCPV6;
}
/* Due to IPv6 header, MSS needs to be downgraded. */
skb_decrease_gso_size(shinfo, len_diff);
/* Header must be checked, and gso_segs recomputed. */
shinfo->gso_type |= SKB_GSO_DODGY;
shinfo->gso_segs = 0;
}
skb->protocol = htons(ETH_P_IPV6);
@@ -3282,9 +3271,6 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
u32 off = skb_mac_header_len(skb);
int ret;
if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
return -ENOTSUPP;
ret = skb_unclone(skb, GFP_ATOMIC);
if (unlikely(ret < 0))
return ret;
@@ -3296,19 +3282,11 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
if (skb_is_gso(skb)) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
/* SKB_GSO_TCPV6 needs to be changed into
* SKB_GSO_TCPV4.
*/
/* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */
if (shinfo->gso_type & SKB_GSO_TCPV6) {
shinfo->gso_type &= ~SKB_GSO_TCPV6;
shinfo->gso_type |= SKB_GSO_TCPV4;
}
/* Due to IPv4 header, MSS can be upgraded. */
skb_increase_gso_size(shinfo, len_diff);
/* Header must be checked, and gso_segs recomputed. */
shinfo->gso_type |= SKB_GSO_DODGY;
shinfo->gso_segs = 0;
}
skb->protocol = htons(ETH_P_IP);
@@ -3919,6 +3897,34 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
.arg2_type = ARG_ANYTHING,
};
/* XDP_REDIRECT works by a three-step process, implemented in the functions
* below:
*
* 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target
* of the redirect and store it (along with some other metadata) in a per-CPU
* struct bpf_redirect_info.
*
* 2. When the program returns the XDP_REDIRECT return code, the driver will
* call xdp_do_redirect() which will use the information in struct
* bpf_redirect_info to actually enqueue the frame into a map type-specific
* bulk queue structure.
*
* 3. Before exiting its NAPI poll loop, the driver will call xdp_do_flush(),
* which will flush all the different bulk queues, thus completing the
* redirect.
*
* Pointers to the map entries will be kept around for this whole sequence of
* steps, protected by RCU. However, there is no top-level rcu_read_lock() in
* the core code; instead, the RCU protection relies on everything happening
* inside a single NAPI poll sequence, which means it's between a pair of calls
* to local_bh_disable()/local_bh_enable().
*
* The map entries are marked as __rcu and the map code makes sure to
* dereference those pointers with rcu_dereference_check() in a way that works
* for both sections that to hold an rcu_read_lock() and sections that are
* called from NAPI without a separate rcu_read_lock(). The code below does not
* use RCU annotations, but relies on those in the map code.
*/
void xdp_do_flush(void)
{
__dev_flush();
@@ -3927,6 +3933,23 @@ void xdp_do_flush(void)
}
EXPORT_SYMBOL_GPL(xdp_do_flush);
void bpf_clear_redirect_map(struct bpf_map *map)
{
struct bpf_redirect_info *ri;
int cpu;
for_each_possible_cpu(cpu) {
ri = per_cpu_ptr(&bpf_redirect_info, cpu);
/* Avoid polluting remote cacheline due to writes if
* not needed. Once we pass this test, we need the
* cmpxchg() to make sure it hasn't been changed in
* the meantime by remote CPU.
*/
if (unlikely(READ_ONCE(ri->map) == map))
cmpxchg(&ri->map, map, NULL);
}
}
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
@@ -3934,6 +3957,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
enum bpf_map_type map_type = ri->map_type;
void *fwd = ri->tgt_value;
u32 map_id = ri->map_id;
struct bpf_map *map;
int err;
ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
@@ -3943,7 +3967,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
case BPF_MAP_TYPE_DEVMAP:
fallthrough;
case BPF_MAP_TYPE_DEVMAP_HASH:
err = dev_map_enqueue(fwd, xdp, dev);
map = READ_ONCE(ri->map);
if (unlikely(map)) {
WRITE_ONCE(ri->map, NULL);
err = dev_map_enqueue_multi(xdp, dev, map,
ri->flags & BPF_F_EXCLUDE_INGRESS);
} else {
err = dev_map_enqueue(fwd, xdp, dev);
}
break;
case BPF_MAP_TYPE_CPUMAP:
err = cpu_map_enqueue(fwd, xdp, dev);
@@ -3985,13 +4016,21 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
enum bpf_map_type map_type, u32 map_id)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_map *map;
int err;
switch (map_type) {
case BPF_MAP_TYPE_DEVMAP:
fallthrough;
case BPF_MAP_TYPE_DEVMAP_HASH:
err = dev_map_generic_redirect(fwd, skb, xdp_prog);
map = READ_ONCE(ri->map);
if (unlikely(map)) {
WRITE_ONCE(ri->map, NULL);
err = dev_map_redirect_multi(dev, skb, xdp_prog, map,
ri->flags & BPF_F_EXCLUDE_INGRESS);
} else {
err = dev_map_generic_redirect(fwd, skb, xdp_prog);
}
if (unlikely(err))
goto err;
break;
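
The devmap broadcast path added above is driven from the BPF side by new
bpf_redirect_map() flags. A minimal, hypothetical XDP program sketch (the map
name and size are made up; it assumes a 5.14 uapi providing BPF_F_BROADCAST
and BPF_F_EXCLUDE_INGRESS plus libbpf's SEC()/BTF map conventions):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_DEVMAP);
	__uint(max_entries, 32);
	__type(key, __u32);
	__type(value, __u32);	/* ifindex of each egress device */
} tx_ports SEC(".maps");

SEC("xdp")
int xdp_bcast(struct xdp_md *ctx)
{
	/* With BPF_F_BROADCAST the key is ignored and the frame is cloned
	 * to every device in tx_ports; BPF_F_EXCLUDE_INGRESS skips the
	 * interface the packet arrived on (see dev_map_enqueue_multi()).
	 */
	return bpf_redirect_map(&tx_ports, 0,
				BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
}

char _license[] SEC("license") = "GPL";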
@@ -10008,11 +10047,13 @@ out:
static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
struct sock_reuseport *reuse,
struct sock *sk, struct sk_buff *skb,
struct sock *migrating_sk,
u32 hash)
{
reuse_kern->skb = skb;
reuse_kern->sk = sk;
reuse_kern->selected_sk = NULL;
reuse_kern->migrating_sk = migrating_sk;
reuse_kern->data_end = skb->data + skb_headlen(skb);
reuse_kern->hash = hash;
reuse_kern->reuseport_id = reuse->reuseport_id;
@@ -10021,12 +10062,13 @@ static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
struct bpf_prog *prog, struct sk_buff *skb,
struct sock *migrating_sk,
u32 hash)
{
struct sk_reuseport_kern reuse_kern;
enum sk_action action;
bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash);
bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash);
action = BPF_PROG_RUN(prog, &reuse_kern);
if (action == SK_PASS)
@@ -10136,6 +10178,8 @@ sk_reuseport_func_proto(enum bpf_func_id func_id,
return &sk_reuseport_load_bytes_proto;
case BPF_FUNC_skb_load_bytes_relative:
return &sk_reuseport_load_bytes_relative_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_ptr_cookie_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -10165,6 +10209,14 @@ sk_reuseport_is_valid_access(int off, int size,
case offsetof(struct sk_reuseport_md, hash):
return size == size_default;
case offsetof(struct sk_reuseport_md, sk):
info->reg_type = PTR_TO_SOCKET;
return size == sizeof(__u64);
case offsetof(struct sk_reuseport_md, migrating_sk):
info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
return size == sizeof(__u64);
/* Fields that allow narrowing */
case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
if (size < sizeof_field(struct sk_buff, protocol))
@@ -10237,6 +10289,14 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct sk_reuseport_md, bind_inany):
SK_REUSEPORT_LOAD_FIELD(bind_inany);
break;
case offsetof(struct sk_reuseport_md, sk):
SK_REUSEPORT_LOAD_FIELD(sk);
break;
case offsetof(struct sk_reuseport_md, migrating_sk):
SK_REUSEPORT_LOAD_FIELD(migrating_sk);
break;
}
return insn - insn_buf;
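
The new migrating_sk context field pairs with the
BPF_SK_REUSEPORT_SELECT_OR_MIGRATE attach type used for listener hand-off. A
hedged sketch of such a program (the map layout, names and the
"sk_reuseport/migrate" section spelling follow libbpf conventions and are
assumptions, not taken from this diff):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u64);
} target_socks SEC(".maps");

SEC("sk_reuseport/migrate")
int select_or_migrate(struct sk_reuseport_md *md)
{
	__u32 key = 0;

	/* migrating_sk is NULL for an ordinary SYN lookup and points to
	 * the child socket being handed off when its listener goes away;
	 * only the migration case is steered here, normal lookups fall
	 * back to the kernel's default selection.
	 */
	if (md->migrating_sk &&
	    bpf_sk_select_reuseport(md, &target_socks, &key, 0))
		return SK_DROP;

	return SK_PASS;
}

char _license[] SEC("license") = "GPL";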


@@ -943,8 +943,8 @@ bool __skb_flow_dissect(const struct net *net,
int offset = 0;
ops = skb->dev->dsa_ptr->tag_ops;
/* Tail taggers don't break flow dissection */
if (!ops->tail_tag) {
/* Only DSA header taggers break flow dissection */
if (ops->needed_headroom) {
if (ops->flow_dissect)
ops->flow_dissect(skb, &proto, &offset);
else


@@ -3142,7 +3142,7 @@ static struct pneigh_entry *pneigh_get_first(struct seq_file *seq)
struct net *net = seq_file_net(seq);
struct neigh_table *tbl = state->tbl;
struct pneigh_entry *pn = NULL;
int bucket = state->bucket;
int bucket;
state->flags |= NEIGH_SEQ_IS_PNEIGH;
for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) {


@@ -60,3 +60,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset);
EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_bad_csum);


@@ -36,6 +36,7 @@
#include <net/ip6_checksum.h>
#include <asm/unaligned.h>
#include <trace/events/napi.h>
#include <linux/kconfig.h>
/*
* We maintain a small pool of fully-sized skbs, to make sure the
@@ -389,7 +390,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
static atomic_t ip_ident;
struct ipv6hdr *ip6h;
WARN_ON_ONCE(!irqs_disabled());
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
WARN_ON_ONCE(!irqs_disabled());
udp_len = len + sizeof(*udph);
if (np->ipv6)


@@ -17,6 +17,7 @@
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for __put_page() */
#include <linux/poison.h>
#include <trace/events/page_pool.h>
@@ -221,6 +222,8 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
return NULL;
}
page->pp_magic |= PP_SIGNATURE;
/* Track how many pages are held 'in-flight' */
pool->pages_state_hold_cnt++;
trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
@@ -263,6 +266,7 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
put_page(page);
continue;
}
page->pp_magic |= PP_SIGNATURE;
pool->alloc.cache[pool->alloc.count++] = page;
/* Track how many pages are held 'in-flight' */
pool->pages_state_hold_cnt++;
@@ -341,6 +345,8 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
DMA_ATTR_SKIP_CPU_SYNC);
page_pool_set_dma_addr(page, 0);
skip_dma_unmap:
page->pp_magic = 0;
/* This may be the last page returned, releasing the pool, so
* it is not safe to reference pool afterwards.
*/
@@ -622,3 +628,25 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
}
}
EXPORT_SYMBOL(page_pool_update_nid);
bool page_pool_return_skb_page(struct page *page)
{
struct page_pool *pp;
page = compound_head(page);
if (unlikely(page->pp_magic != PP_SIGNATURE))
return false;
pp = page->pp;
/* Driver set this to memory recycling info. Reset it on recycle.
* This will *not* work for NIC using a split-page memory model.
* The page will be returned to the pool here regardless of the
* 'flipped' fragment being in use or not.
*/
page->pp = NULL;
page_pool_put_full_page(pp, page, false);
return true;
}
EXPORT_SYMBOL(page_pool_return_skb_page);
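
page_pool_return_skb_page() above is the release half of the new recycling
scheme; the driver-side half is a hint set on the skb at Rx time. A
hypothetical Rx-path sketch (my_rx_build_skb() is illustrative and assumes
the three-argument skb_mark_for_recycle() as merged in this cycle):

#include <linux/mm.h>
#include <linux/skbuff.h>
#include <net/page_pool.h>

static struct sk_buff *my_rx_build_skb(struct page_pool *pool,
				       struct page *page,
				       unsigned int headroom,
				       unsigned int len)
{
	struct sk_buff *skb;

	skb = build_skb(page_address(page), PAGE_SIZE);
	if (!skb)
		return NULL;

	skb_reserve(skb, headroom);
	skb_put(skb, len);

	/* Hint that the head page came from 'pool': sets skb->pp_recycle
	 * and stores the pool in page->pp, so freeing the skb lands in
	 * page_pool_return_skb_page() instead of a plain put_page().
	 */
	skb_mark_for_recycle(skb, page, pool);
	return skb;
}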


@@ -467,7 +467,7 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,
static int pktgen_device_event(struct notifier_block *, unsigned long, void *);
static void pktgen_run_all_threads(struct pktgen_net *pn);
static void pktgen_reset_all_threads(struct pktgen_net *pn);
static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn);
static void pktgen_stop_all_threads(struct pktgen_net *pn);
static void pktgen_stop(struct pktgen_thread *t);
static void pktgen_clear_counters(struct pktgen_dev *pkt_dev);
@@ -516,14 +516,11 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf,
data[count - 1] = 0; /* Strip trailing '\n' and terminate string */
if (!strcmp(data, "stop"))
pktgen_stop_all_threads_ifs(pn);
pktgen_stop_all_threads(pn);
else if (!strcmp(data, "start"))
pktgen_run_all_threads(pn);
else if (!strcmp(data, "reset"))
pktgen_reset_all_threads(pn);
else
return -EINVAL;
@@ -3027,20 +3024,25 @@ static void pktgen_run(struct pktgen_thread *t)
t->control &= ~(T_STOP);
}
static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn)
static void pktgen_handle_all_threads(struct pktgen_net *pn, u32 flags)
{
struct pktgen_thread *t;
func_enter();
mutex_lock(&pktgen_thread_lock);
list_for_each_entry(t, &pn->pktgen_threads, th_list)
t->control |= T_STOP;
t->control |= (flags);
mutex_unlock(&pktgen_thread_lock);
}
static void pktgen_stop_all_threads(struct pktgen_net *pn)
{
func_enter();
pktgen_handle_all_threads(pn, T_STOP);
}
static int thread_is_running(const struct pktgen_thread *t)
{
const struct pktgen_dev *pkt_dev;
@@ -3103,16 +3105,9 @@ static int pktgen_wait_all_threads_run(struct pktgen_net *pn)
static void pktgen_run_all_threads(struct pktgen_net *pn)
{
struct pktgen_thread *t;
func_enter();
mutex_lock(&pktgen_thread_lock);
list_for_each_entry(t, &pn->pktgen_threads, th_list)
t->control |= (T_RUN);
mutex_unlock(&pktgen_thread_lock);
pktgen_handle_all_threads(pn, T_RUN);
/* Propagate thread->control */
schedule_timeout_interruptible(msecs_to_jiffies(125));
@@ -3122,16 +3117,9 @@ static void pktgen_run_all_threads(struct pktgen_net *pn)
static void pktgen_reset_all_threads(struct pktgen_net *pn)
{
struct pktgen_thread *t;
func_enter();
mutex_lock(&pktgen_thread_lock);
list_for_each_entry(t, &pn->pktgen_threads, th_list)
t->control |= (T_REMDEVALL);
mutex_unlock(&pktgen_thread_lock);
pktgen_handle_all_threads(pn, T_REMDEVALL);
/* Propagate thread->control */
schedule_timeout_interruptible(msecs_to_jiffies(125));


@@ -9,7 +9,7 @@
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Fixes:
* Vitaly E. Lavrov RTA_OK arithmetics was wrong.
* Vitaly E. Lavrov RTA_OK arithmetic was wrong.
*/
#include <linux/bitops.h>
@@ -234,7 +234,7 @@ unlock:
* @msgtype: rtnetlink message type
* @doit: Function pointer called for each request message
* @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
* @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions
* @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions
*
* Like rtnl_register, but for use by removable modules.
*/
@@ -254,7 +254,7 @@ EXPORT_SYMBOL_GPL(rtnl_register_module);
* @msgtype: rtnetlink message type
* @doit: Function pointer called for each request message
* @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
* @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions
* @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions
*
* Registers the specified function pointers (at least one of them has
* to be non-NULL) to be called whenever a request message for the
@@ -376,12 +376,12 @@ int __rtnl_link_register(struct rtnl_link_ops *ops)
if (rtnl_link_ops_get(ops->kind))
return -EEXIST;
/* The check for setup is here because if ops
/* The check for alloc/setup is here because if ops
* does not have that filled up, it is not possible
* to use the ops for creating device. So do not
* fill up dellink as well. That disables rtnl_dellink.
*/
if (ops->setup && !ops->dellink)
if ((ops->alloc || ops->setup) && !ops->dellink)
ops->dellink = unregister_netdevice_queue;
list_add_tail(&ops->list, &link_ops);
@@ -543,7 +543,9 @@ static const struct rtnl_af_ops *rtnl_af_lookup(const int family)
{
const struct rtnl_af_ops *ops;
list_for_each_entry_rcu(ops, &rtnl_af_ops, list) {
ASSERT_RTNL();
list_for_each_entry(ops, &rtnl_af_ops, list) {
if (ops->family == family)
return ops;
}
@@ -1819,6 +1821,16 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
if (rtnl_fill_prop_list(skb, dev))
goto nla_put_failure;
if (dev->dev.parent &&
nla_put_string(skb, IFLA_PARENT_DEV_NAME,
dev_name(dev->dev.parent)))
goto nla_put_failure;
if (dev->dev.parent && dev->dev.parent->bus &&
nla_put_string(skb, IFLA_PARENT_DEV_BUS_NAME,
dev->dev.parent->bus->name))
goto nla_put_failure;
nlmsg_end(skb, nlh);
return 0;
@@ -1878,6 +1890,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_PERM_ADDRESS] = { .type = NLA_REJECT },
[IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED },
[IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1),
[IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -2274,27 +2287,18 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
const struct rtnl_af_ops *af_ops;
rcu_read_lock();
af_ops = rtnl_af_lookup(nla_type(af));
if (!af_ops) {
rcu_read_unlock();
if (!af_ops)
return -EAFNOSUPPORT;
}
if (!af_ops->set_link_af) {
rcu_read_unlock();
if (!af_ops->set_link_af)
return -EOPNOTSUPP;
}
if (af_ops->validate_link_af) {
err = af_ops->validate_link_af(dev, af);
if (err < 0) {
rcu_read_unlock();
if (err < 0)
return err;
}
}
rcu_read_unlock();
}
}
@@ -2574,7 +2578,7 @@ static int do_set_proto_down(struct net_device *dev,
if (nl_proto_down) {
proto_down = nla_get_u8(nl_proto_down);
/* Dont turn off protodown if there are active reasons */
/* Don't turn off protodown if there are active reasons */
if (!proto_down && dev->proto_down_reason) {
NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons");
return -EBUSY;
@@ -2868,17 +2872,12 @@ static int do_setlink(const struct sk_buff *skb,
nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
const struct rtnl_af_ops *af_ops;
rcu_read_lock();
BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af))));
err = af_ops->set_link_af(dev, af, extack);
if (err < 0) {
rcu_read_unlock();
if (err < 0)
goto errout;
}
rcu_read_unlock();
status |= DO_SETLINK_NOTIFY;
}
}
@@ -3177,8 +3176,17 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname,
return ERR_PTR(-EINVAL);
}
dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type,
ops->setup, num_tx_queues, num_rx_queues);
if (ops->alloc) {
dev = ops->alloc(tb, ifname, name_assign_type,
num_tx_queues, num_rx_queues);
if (IS_ERR(dev))
return dev;
} else {
dev = alloc_netdev_mqs(ops->priv_size, ifname,
name_assign_type, ops->setup,
num_tx_queues, num_rx_queues);
}
if (!dev)
return ERR_PTR(-ENOMEM);
@@ -3411,7 +3419,7 @@ replay:
return -EOPNOTSUPP;
}
if (!ops->setup)
if (!ops->alloc && !ops->setup)
return -EOPNOTSUPP;
if (!ifname[0]) {
@@ -3939,12 +3947,12 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm,
* implement its own handler for this.
*/
if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) {
pr_info("%s: FDB only supports static addresses\n", dev->name);
netdev_info(dev, "default FDB implementation only supports local addresses\n");
return err;
}
if (vid) {
pr_info("%s: vlans aren't supported yet for dev_uc|mc_add()\n", dev->name);
netdev_info(dev, "vlans aren't supported yet for dev_uc|mc_add()\n");
return err;
}
@@ -4078,7 +4086,7 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm,
* implement its own handler for this.
*/
if (!(ndm->ndm_state & NUD_PERMANENT)) {
pr_info("%s: FDB only supports static addresses\n", dev->name);
netdev_info(dev, "default FDB implementation only supports local addresses\n");
return err;
}


@@ -70,6 +70,7 @@
#include <net/xfrm.h>
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/page_pool.h>
#include <linux/uaccess.h>
#include <trace/events/skb.h>
@@ -645,10 +646,13 @@ static void skb_free_head(struct sk_buff *skb)
{
unsigned char *head = skb->head;
if (skb->head_frag)
if (skb->head_frag) {
if (skb_pp_recycle(skb, head))
return;
skb_free_frag(head);
else
} else {
kfree(head);
}
}
static void skb_release_data(struct sk_buff *skb)
@@ -664,7 +668,7 @@ static void skb_release_data(struct sk_buff *skb)
skb_zcopy_clear(skb, true);
for (i = 0; i < shinfo->nr_frags; i++)
__skb_frag_unref(&shinfo->frags[i]);
__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
if (shinfo->frag_list)
kfree_skb_list(shinfo->frag_list);
@@ -1046,6 +1050,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
n->nohdr = 0;
n->peeked = 0;
C(pfmemalloc);
C(pp_recycle);
n->destructor = NULL;
C(tail);
C(end);
@@ -1289,7 +1294,7 @@ static void __msg_zerocopy_callback(struct ubuf_info *uarg)
}
spin_unlock_irqrestore(&q->lock, flags);
sk->sk_error_report(sk);
sk_error_report(sk);
release:
consume_skb(skb);
@@ -3497,7 +3502,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
fragto = &skb_shinfo(tgt)->frags[merge];
skb_frag_size_add(fragto, skb_frag_size(fragfrom));
__skb_frag_unref(fragfrom);
__skb_frag_unref(fragfrom, skb->pp_recycle);
}
/* Reposition in the original skb */
@@ -4680,7 +4685,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
skb_queue_tail(&sk->sk_error_queue, skb);
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_error_report(sk);
sk_error_report(sk);
return 0;
}
EXPORT_SYMBOL(sock_queue_err_skb);
@@ -4711,7 +4716,7 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
sk->sk_err = 0;
if (skb_next)
sk->sk_error_report(sk);
sk_error_report(sk);
return skb;
}
@@ -5287,6 +5292,13 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
if (skb_cloned(to))
return false;
/* The page pool signature of struct page will eventually figure out
* which pages can be recycled or not but for now let's prohibit slab
* allocated and page_pool allocated SKBs from being coalesced.
*/
if (to->pp_recycle != from->pp_recycle)
return false;
if (len <= skb_tailroom(to)) {
if (len)
BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));


@@ -399,29 +399,6 @@ out:
}
EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags,
long timeo, int *err)
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
int ret = 0;
if (sk->sk_shutdown & RCV_SHUTDOWN)
return 1;
if (!timeo)
return ret;
add_wait_queue(sk_sleep(sk), &wait);
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
ret = sk_wait_event(sk, &timeo,
!list_empty(&psock->ingress_msg) ||
!skb_queue_empty(&sk->sk_receive_queue), &wait);
sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
remove_wait_queue(sk_sleep(sk), &wait);
return ret;
}
EXPORT_SYMBOL_GPL(sk_msg_wait_data);
/* Receive sk_msg from psock->ingress_msg to @msg. */
int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
int len, int flags)
@@ -601,6 +578,12 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
return sk_psock_skb_ingress(psock, skb);
}
static void sock_drop(struct sock *sk, struct sk_buff *skb)
{
sk_drops_add(sk, skb);
kfree_skb(skb);
}
static void sk_psock_backlog(struct work_struct *work)
{
struct sk_psock *psock = container_of(work, struct sk_psock, work);
@@ -640,7 +623,7 @@ start:
/* Hard errors break pipe and stop xmit. */
sk_psock_report_error(psock, ret ? -ret : EPIPE);
sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
kfree_skb(skb);
sock_drop(psock->sk, skb);
goto end;
}
off += ret;
@@ -731,7 +714,7 @@ static void __sk_psock_zap_ingress(struct sk_psock *psock)
while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) {
skb_bpf_redirect_clear(skb);
kfree_skb(skb);
sock_drop(psock->sk, skb);
}
__sk_psock_purge_ingress_msg(psock);
}
@@ -847,7 +830,7 @@ out:
}
EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
static void sk_psock_skb_redirect(struct sk_buff *skb)
static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb)
{
struct sk_psock *psock_other;
struct sock *sk_other;
@@ -857,8 +840,8 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
* return code, but then didn't set a redirect interface.
*/
if (unlikely(!sk_other)) {
kfree_skb(skb);
return;
sock_drop(from->sk, skb);
return -EIO;
}
psock_other = sk_psock(sk_other);
/* This error indicates the socket is being torn down or had another
@@ -866,26 +849,30 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
* a socket that is in this state so we drop the skb.
*/
if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) {
kfree_skb(skb);
return;
skb_bpf_redirect_clear(skb);
sock_drop(from->sk, skb);
return -EIO;
}
spin_lock_bh(&psock_other->ingress_lock);
if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
spin_unlock_bh(&psock_other->ingress_lock);
kfree_skb(skb);
return;
skb_bpf_redirect_clear(skb);
sock_drop(from->sk, skb);
return -EIO;
}
skb_queue_tail(&psock_other->ingress_skb, skb);
schedule_work(&psock_other->work);
spin_unlock_bh(&psock_other->ingress_lock);
return 0;
}
static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict)
static void sk_psock_tls_verdict_apply(struct sk_buff *skb,
struct sk_psock *from, int verdict)
{
switch (verdict) {
case __SK_REDIRECT:
sk_psock_skb_redirect(skb);
sk_psock_skb_redirect(from, skb);
break;
case __SK_PASS:
case __SK_DROP:
@@ -909,20 +896,21 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
skb->sk = NULL;
}
sk_psock_tls_verdict_apply(skb, psock->sk, ret);
sk_psock_tls_verdict_apply(skb, psock, ret);
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
static void sk_psock_verdict_apply(struct sk_psock *psock,
struct sk_buff *skb, int verdict)
static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
int verdict)
{
struct sock *sk_other;
int err = -EIO;
int err = 0;
switch (verdict) {
case __SK_PASS:
err = -EIO;
sk_other = psock->sk;
if (sock_flag(sk_other, SOCK_DEAD) ||
!sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
@@ -945,18 +933,25 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
skb_queue_tail(&psock->ingress_skb, skb);
schedule_work(&psock->work);
err = 0;
}
spin_unlock_bh(&psock->ingress_lock);
if (err < 0) {
skb_bpf_redirect_clear(skb);
goto out_free;
}
}
break;
case __SK_REDIRECT:
sk_psock_skb_redirect(skb);
err = sk_psock_skb_redirect(psock, skb);
break;
case __SK_DROP:
default:
out_free:
kfree_skb(skb);
sock_drop(psock->sk, skb);
}
return err;
}
static void sk_psock_write_space(struct sock *sk)
@@ -988,7 +983,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
sk = strp->sk;
psock = sk_psock(sk);
if (unlikely(!psock)) {
kfree_skb(skb);
sock_drop(sk, skb);
goto out;
}
prog = READ_ONCE(psock->progs.stream_verdict);
@@ -1109,7 +1104,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
psock = sk_psock(sk);
if (unlikely(!psock)) {
len = 0;
kfree_skb(skb);
sock_drop(sk, skb);
goto out;
}
prog = READ_ONCE(psock->progs.stream_verdict);
@@ -1123,7 +1118,8 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
skb->sk = NULL;
}
sk_psock_verdict_apply(psock, skb, ret);
if (sk_psock_verdict_apply(psock, skb, ret) < 0)
len = 0;
out:
rcu_read_unlock();
return len;
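The sk_psock hunks above (net/core/skmsg.c, by the look of the symbols) route every drop through sock_drop(), so psock-level drops are accounted in sk->sk_drops, and they let redirect failures propagate back to sk_psock_verdict_recv(), which then reports zero consumed bytes. As a rough illustration of the path being patched, here is a minimal sk_skb stream verdict program; the map and program names are illustrative, not taken from this series, and it assumes a libbpf build with clang -target bpf.

/* Illustrative sk_skb verdict program: redirect every skb to the socket
 * stored at key 0 of a SOCKMAP. If the peer is gone, the kernel side now
 * drops via sock_drop() and the read path sees "0 bytes consumed".
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_SOCKMAP);
	__uint(max_entries, 2);
	__type(key, __u32);
	__type(value, __u64);
} sock_map SEC(".maps");

SEC("sk_skb/stream_verdict")
int prog_stream_verdict(struct __sk_buff *skb)
{
	/* returns SK_PASS on success, SK_DROP otherwise */
	return bpf_sk_redirect_map(skb, &sock_map, 0, BPF_F_INGRESS);
}

char _license[] SEC("license") = "GPL";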


@@ -331,6 +331,22 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(__sk_backlog_rcv);
void sk_error_report(struct sock *sk)
{
sk->sk_error_report(sk);
switch (sk->sk_family) {
case AF_INET:
fallthrough;
case AF_INET6:
trace_inet_sk_error_report(sk);
break;
default:
break;
}
}
EXPORT_SYMBOL(sk_error_report);
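sk_error_report() gives callers a single entry point that both invokes the sk_error_report callback and fires the new inet_sk_error_report tracepoint for AF_INET/AF_INET6 sockets. A hedged sketch of what a call-site conversion would look like; the surrounding function is hypothetical, not a hunk from this series:

/* Hypothetical call site, for illustration only: setting sk_err and then
 * calling the wrapper instead of the callback means the error also becomes
 * visible through the inet_sk_error_report tracepoint.
 */
static void example_report_err(struct sock *sk, int err)
{
	sk->sk_err = err;
	sk_error_report(sk);	/* was: sk->sk_error_report(sk); */
}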
static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
struct __kernel_sock_timeval tv;
@@ -776,6 +792,58 @@ void sock_enable_timestamps(struct sock *sk)
}
EXPORT_SYMBOL(sock_enable_timestamps);
void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
{
switch (optname) {
case SO_TIMESTAMP_OLD:
__sock_set_timestamps(sk, valbool, false, false);
break;
case SO_TIMESTAMP_NEW:
__sock_set_timestamps(sk, valbool, true, false);
break;
case SO_TIMESTAMPNS_OLD:
__sock_set_timestamps(sk, valbool, false, true);
break;
case SO_TIMESTAMPNS_NEW:
__sock_set_timestamps(sk, valbool, true, true);
break;
}
}
int sock_set_timestamping(struct sock *sk, int optname, int val)
{
if (val & ~SOF_TIMESTAMPING_MASK)
return -EINVAL;
if (val & SOF_TIMESTAMPING_OPT_ID &&
!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
if (sk->sk_protocol == IPPROTO_TCP &&
sk->sk_type == SOCK_STREAM) {
if ((1 << sk->sk_state) &
(TCPF_CLOSE | TCPF_LISTEN))
return -EINVAL;
sk->sk_tskey = tcp_sk(sk)->snd_una;
} else {
sk->sk_tskey = 0;
}
}
if (val & SOF_TIMESTAMPING_OPT_STATS &&
!(val & SOF_TIMESTAMPING_OPT_TSONLY))
return -EINVAL;
sk->sk_tsflags = val;
sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
sock_enable_timestamp(sk,
SOCK_TIMESTAMPING_RX_SOFTWARE);
else
sock_disable_timestamp(sk,
(1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
return 0;
}
void sock_set_keepalive(struct sock *sk)
{
lock_sock(sk);
@@ -997,54 +1065,15 @@ set_sndbuf:
break;
case SO_TIMESTAMP_OLD:
__sock_set_timestamps(sk, valbool, false, false);
break;
case SO_TIMESTAMP_NEW:
__sock_set_timestamps(sk, valbool, true, false);
break;
case SO_TIMESTAMPNS_OLD:
__sock_set_timestamps(sk, valbool, false, true);
break;
case SO_TIMESTAMPNS_NEW:
__sock_set_timestamps(sk, valbool, true, true);
sock_set_timestamp(sk, optname, valbool);
break;
case SO_TIMESTAMPING_NEW:
case SO_TIMESTAMPING_OLD:
if (val & ~SOF_TIMESTAMPING_MASK) {
ret = -EINVAL;
break;
}
if (val & SOF_TIMESTAMPING_OPT_ID &&
!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
if (sk->sk_protocol == IPPROTO_TCP &&
sk->sk_type == SOCK_STREAM) {
if ((1 << sk->sk_state) &
(TCPF_CLOSE | TCPF_LISTEN)) {
ret = -EINVAL;
break;
}
sk->sk_tskey = tcp_sk(sk)->snd_una;
} else {
sk->sk_tskey = 0;
}
}
if (val & SOF_TIMESTAMPING_OPT_STATS &&
!(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
ret = -EINVAL;
break;
}
sk->sk_tsflags = val;
sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
sock_enable_timestamp(sk,
SOCK_TIMESTAMPING_RX_SOFTWARE);
else
sock_disable_timestamp(sk,
(1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
ret = sock_set_timestamping(sk, optname, val);
break;
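sock_set_timestamp() and sock_set_timestamping() factor the SO_TIMESTAMP*/SO_TIMESTAMPING handling out of sock_setsockopt(), presumably so callers such as MPTCP can reuse them, without changing the userspace contract. A minimal userspace sketch of that contract follows, assuming an already-connected TCP socket fd; error handling is trimmed:

#include <sys/socket.h>
#include <linux/net_tstamp.h>

static int enable_tx_timestamps(int fd)
{
	int val = SOF_TIMESTAMPING_TX_SOFTWARE |
		  SOF_TIMESTAMPING_SOFTWARE |
		  SOF_TIMESTAMPING_OPT_ID |
		  SOF_TIMESTAMPING_OPT_TSONLY;

	/* On a TCP socket this must run after connect(): in TCP_CLOSE or
	 * TCP_LISTEN the OPT_ID branch above returns -EINVAL.
	 */
	return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));
}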
case SO_RCVLOWAT:
@@ -1622,6 +1651,13 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = sk->sk_bound_dev_if;
break;
case SO_NETNS_COOKIE:
lv = sizeof(u64);
if (len != lv)
return -EINVAL;
v.val64 = sock_net(sk)->net_cookie;
break;
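The new SO_NETNS_COOKIE case only fills in the value when the caller passes an exactly 8-byte buffer. A small userspace sketch; the fallback define (71, the asm-generic value) is only an assumption for builds against older UAPI headers:

#include <stdint.h>
#include <stdio.h>
#include <sys/socket.h>

#ifndef SO_NETNS_COOKIE
#define SO_NETNS_COOKIE 71
#endif

static int print_netns_cookie(int fd)
{
	uint64_t cookie;
	socklen_t len = sizeof(cookie);	/* must be exactly sizeof(u64) */

	if (getsockopt(fd, SOL_SOCKET, SO_NETNS_COOKIE, &cookie, &len) < 0)
		return -1;
	printf("netns cookie: %llu\n", (unsigned long long)cookie);
	return 0;
}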
default:
/* We implement the SO_SNDLOWAT etc to not be settable
* (1003.1g 7).


@@ -48,7 +48,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
bpf_map_init_from_attr(&stab->map, attr);
raw_spin_lock_init(&stab->lock);
stab->sks = bpf_map_area_alloc(stab->map.max_entries *
stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries *
sizeof(struct sock *),
stab->map.numa_node);
if (!stab->sks) {


@@ -6,6 +6,7 @@
* selecting the socket index from the array of available sockets.
*/
#include <net/ip.h>
#include <net/sock_reuseport.h>
#include <linux/bpf.h>
#include <linux/idr.h>
@@ -17,6 +18,74 @@
DEFINE_SPINLOCK(reuseport_lock);
static DEFINE_IDA(reuseport_ida);
static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
struct sock_reuseport *reuse, bool bind_inany);
static int reuseport_sock_index(struct sock *sk,
const struct sock_reuseport *reuse,
bool closed)
{
int left, right;
if (!closed) {
left = 0;
right = reuse->num_socks;
} else {
left = reuse->max_socks - reuse->num_closed_socks;
right = reuse->max_socks;
}
for (; left < right; left++)
if (reuse->socks[left] == sk)
return left;
return -1;
}
static void __reuseport_add_sock(struct sock *sk,
struct sock_reuseport *reuse)
{
reuse->socks[reuse->num_socks] = sk;
/* paired with smp_rmb() in reuseport_(select|migrate)_sock() */
smp_wmb();
reuse->num_socks++;
}
static bool __reuseport_detach_sock(struct sock *sk,
struct sock_reuseport *reuse)
{
int i = reuseport_sock_index(sk, reuse, false);
if (i == -1)
return false;
reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
reuse->num_socks--;
return true;
}
static void __reuseport_add_closed_sock(struct sock *sk,
struct sock_reuseport *reuse)
{
reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk;
/* paired with READ_ONCE() in inet_csk_bind_conflict() */
WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1);
}
static bool __reuseport_detach_closed_sock(struct sock *sk,
struct sock_reuseport *reuse)
{
int i = reuseport_sock_index(sk, reuse, true);
if (i == -1)
return false;
reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
/* paired with READ_ONCE() in inet_csk_bind_conflict() */
WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1);
return true;
}
static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{
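The four helpers above maintain a two-ended socks[] array: live listeners fill the bottom of the array, shutdown()ed or close()d TCP listeners are parked at the top, and the group is full once num_socks + num_closed_socks reaches max_socks. The following standalone userspace model (not kernel code) only illustrates that bookkeeping:

/* Toy model of the two-ended socks[] layout; ints stand in for struct sock *. */
#include <assert.h>

#define MAX_SOCKS 4

struct group {
	int socks[MAX_SOCKS];
	int num_socks;		/* live section: [0, num_socks) */
	int num_closed_socks;	/* closed section: [MAX_SOCKS - num_closed_socks, MAX_SOCKS) */
};

static void add_live(struct group *g, int sk)
{
	g->socks[g->num_socks++] = sk;
}

static void move_to_closed(struct group *g, int idx)
{
	int sk = g->socks[idx];

	/* detach from the live section (swap with the last live entry) */
	g->socks[idx] = g->socks[--g->num_socks];
	/* append to the closed section, which grows downward */
	g->socks[MAX_SOCKS - ++g->num_closed_socks] = sk;
}

int main(void)
{
	struct group g = { .num_socks = 0, .num_closed_socks = 0 };

	add_live(&g, 101);
	add_live(&g, 102);
	move_to_closed(&g, 0);	/* 101 is now "shutdown()ed" */
	assert(g.num_socks == 1 && g.num_closed_socks == 1);
	assert(g.socks[MAX_SOCKS - 1] == 101);
	return 0;
}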
@@ -49,6 +118,12 @@ int reuseport_alloc(struct sock *sk, bool bind_inany)
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
if (reuse) {
if (reuse->num_closed_socks) {
/* sk was shutdown()ed before */
ret = reuseport_resurrect(sk, reuse, NULL, bind_inany);
goto out;
}
/* Only set reuse->bind_inany if the bind_inany is true.
* Otherwise, it will overwrite the reuse->bind_inany
* which was set by the bind/hash path.
@@ -72,9 +147,9 @@ int reuseport_alloc(struct sock *sk, bool bind_inany)
}
reuse->reuseport_id = id;
reuse->bind_inany = bind_inany;
reuse->socks[0] = sk;
reuse->num_socks = 1;
reuse->bind_inany = bind_inany;
rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
out:
@@ -90,14 +165,30 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
u32 more_socks_size, i;
more_socks_size = reuse->max_socks * 2U;
if (more_socks_size > U16_MAX)
if (more_socks_size > U16_MAX) {
if (reuse->num_closed_socks) {
/* Make room by removing a closed sk.
* The child has already been migrated.
* Only reqsk left at this point.
*/
struct sock *sk;
sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL);
__reuseport_detach_closed_sock(sk, reuse);
return reuse;
}
return NULL;
}
more_reuse = __reuseport_alloc(more_socks_size);
if (!more_reuse)
return NULL;
more_reuse->num_socks = reuse->num_socks;
more_reuse->num_closed_socks = reuse->num_closed_socks;
more_reuse->prog = reuse->prog;
more_reuse->reuseport_id = reuse->reuseport_id;
more_reuse->bind_inany = reuse->bind_inany;
@@ -105,9 +196,13 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
memcpy(more_reuse->socks, reuse->socks,
reuse->num_socks * sizeof(struct sock *));
memcpy(more_reuse->socks +
(more_reuse->max_socks - more_reuse->num_closed_socks),
reuse->socks + (reuse->max_socks - reuse->num_closed_socks),
reuse->num_closed_socks * sizeof(struct sock *));
more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
for (i = 0; i < reuse->num_socks; ++i)
for (i = 0; i < reuse->max_socks; ++i)
rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
more_reuse);
@@ -152,13 +247,21 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
lockdep_is_held(&reuseport_lock));
if (old_reuse && old_reuse->num_closed_socks) {
/* sk was shutdown()ed before */
int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany);
spin_unlock_bh(&reuseport_lock);
return err;
}
if (old_reuse && old_reuse->num_socks != 1) {
spin_unlock_bh(&reuseport_lock);
return -EBUSY;
}
if (reuse->num_socks == reuse->max_socks) {
if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
reuse = reuseport_grow(reuse);
if (!reuse) {
spin_unlock_bh(&reuseport_lock);
@@ -166,10 +269,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
}
}
reuse->socks[reuse->num_socks] = sk;
/* paired with smp_rmb() in reuseport_select_sock() */
smp_wmb();
reuse->num_socks++;
__reuseport_add_sock(sk, reuse);
rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
spin_unlock_bh(&reuseport_lock);
@@ -180,15 +280,77 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
}
EXPORT_SYMBOL(reuseport_add_sock);
static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
struct sock_reuseport *reuse, bool bind_inany)
{
if (old_reuse == reuse) {
/* If sk was in the same reuseport group, just pop sk out of
* the closed section and push sk into the listening section.
*/
__reuseport_detach_closed_sock(sk, old_reuse);
__reuseport_add_sock(sk, old_reuse);
return 0;
}
if (!reuse) {
/* In bind()/listen() path, we cannot carry over the eBPF prog
* for the shutdown()ed socket. In setsockopt() path, we should
* not change the eBPF prog of listening sockets by attaching a
* prog to the shutdown()ed socket. Thus, we will allocate a new
* reuseport group and detach sk from the old group.
*/
int id;
reuse = __reuseport_alloc(INIT_SOCKS);
if (!reuse)
return -ENOMEM;
id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
if (id < 0) {
kfree(reuse);
return id;
}
reuse->reuseport_id = id;
reuse->bind_inany = bind_inany;
} else {
/* Move sk from the old group to the new one if
* - all the other listeners in the old group were close()d or
* shutdown()ed, and then sk2 has listen()ed on the same port
* OR
* - sk listen()ed without bind() (or with autobind), was
* shutdown()ed, and then listen()s on another port which
* sk2 listen()s on.
*/
if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
reuse = reuseport_grow(reuse);
if (!reuse)
return -ENOMEM;
}
}
__reuseport_detach_closed_sock(sk, old_reuse);
__reuseport_add_sock(sk, reuse);
rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
if (old_reuse->num_socks + old_reuse->num_closed_socks == 0)
call_rcu(&old_reuse->rcu, reuseport_free_rcu);
return 0;
}
void reuseport_detach_sock(struct sock *sk)
{
struct sock_reuseport *reuse;
int i;
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
/* reuseport_grow() has detached a closed sk */
if (!reuse)
goto out;
/* Notify the bpf side. The sk may be added to a sockarray
* map. If so, sockarray logic will remove it from the map.
*
@@ -201,19 +363,52 @@ void reuseport_detach_sock(struct sock *sk)
rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
for (i = 0; i < reuse->num_socks; i++) {
if (reuse->socks[i] == sk) {
reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
reuse->num_socks--;
if (reuse->num_socks == 0)
call_rcu(&reuse->rcu, reuseport_free_rcu);
break;
}
}
if (!__reuseport_detach_closed_sock(sk, reuse))
__reuseport_detach_sock(sk, reuse);
if (reuse->num_socks + reuse->num_closed_socks == 0)
call_rcu(&reuse->rcu, reuseport_free_rcu);
out:
spin_unlock_bh(&reuseport_lock);
}
EXPORT_SYMBOL(reuseport_detach_sock);
void reuseport_stop_listen_sock(struct sock *sk)
{
if (sk->sk_protocol == IPPROTO_TCP) {
struct sock_reuseport *reuse;
struct bpf_prog *prog;
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
prog = rcu_dereference_protected(reuse->prog,
lockdep_is_held(&reuseport_lock));
if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req ||
(prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) {
/* Migration capable, move sk from the listening section
* to the closed section.
*/
bpf_sk_reuseport_detach(sk);
__reuseport_detach_sock(sk, reuse);
__reuseport_add_closed_sock(sk, reuse);
spin_unlock_bh(&reuseport_lock);
return;
}
spin_unlock_bh(&reuseport_lock);
}
/* Not capable to do migration, detach immediately */
reuseport_detach_sock(sk);
}
EXPORT_SYMBOL(reuseport_stop_listen_sock);
static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
struct bpf_prog *prog, struct sk_buff *skb,
int hdr_len)
@@ -244,6 +439,23 @@ static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
return reuse->socks[index];
}
static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse,
u32 hash, u16 num_socks)
{
int i, j;
i = j = reciprocal_scale(hash, num_socks);
while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
i++;
if (i >= num_socks)
i = 0;
if (i == j)
return NULL;
}
return reuse->socks[i];
}
/**
* reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
* @sk: First socket in the group.
@@ -274,32 +486,21 @@ struct sock *reuseport_select_sock(struct sock *sk,
prog = rcu_dereference(reuse->prog);
socks = READ_ONCE(reuse->num_socks);
if (likely(socks)) {
/* paired with smp_wmb() in reuseport_add_sock() */
/* paired with smp_wmb() in __reuseport_add_sock() */
smp_rmb();
if (!prog || !skb)
goto select_by_hash;
if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash);
sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash);
else
sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);
select_by_hash:
/* no bpf or invalid bpf result: fall back to hash usage */
if (!sk2) {
int i, j;
i = j = reciprocal_scale(hash, socks);
while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
i++;
if (i >= socks)
i = 0;
if (i == j)
goto out;
}
sk2 = reuse->socks[i];
}
if (!sk2)
sk2 = reuseport_select_sock_by_hash(reuse, hash, socks);
}
out:
@@ -308,14 +509,90 @@ out:
}
EXPORT_SYMBOL(reuseport_select_sock);
/**
* reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group.
* @sk: close()ed or shutdown()ed socket in the group.
* @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or
* NEW_SYN_RECV request socket during 3WHS.
* @skb: skb to run through BPF filter.
* Returns a socket (with sk_refcnt +1) that should accept the child socket
* (or NULL on error).
*/
struct sock *reuseport_migrate_sock(struct sock *sk,
struct sock *migrating_sk,
struct sk_buff *skb)
{
struct sock_reuseport *reuse;
struct sock *nsk = NULL;
bool allocated = false;
struct bpf_prog *prog;
u16 socks;
u32 hash;
rcu_read_lock();
reuse = rcu_dereference(sk->sk_reuseport_cb);
if (!reuse)
goto out;
socks = READ_ONCE(reuse->num_socks);
if (unlikely(!socks))
goto failure;
/* paired with smp_wmb() in __reuseport_add_sock() */
smp_rmb();
hash = migrating_sk->sk_hash;
prog = rcu_dereference(reuse->prog);
if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) {
if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req)
goto select_by_hash;
goto failure;
}
if (!skb) {
skb = alloc_skb(0, GFP_ATOMIC);
if (!skb)
goto failure;
allocated = true;
}
nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash);
if (allocated)
kfree_skb(skb);
select_by_hash:
if (!nsk)
nsk = reuseport_select_sock_by_hash(reuse, hash, socks);
if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) {
nsk = NULL;
goto failure;
}
out:
rcu_read_unlock();
return nsk;
failure:
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
goto out;
}
EXPORT_SYMBOL(reuseport_migrate_sock);
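reuseport_migrate_sock() picks a new listener for a request or child socket whose original listener is being closed or shutdown()ed; migration kicks in either when the net.ipv4.tcp_migrate_req sysctl is enabled or when a BPF_SK_REUSEPORT_SELECT_OR_MIGRATE program is attached to the group. A hedged BPF sketch follows: the section name assumes libbpf's sk_reuseport/migrate convention, and the map and program names are illustrative.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
	__uint(max_entries, 16);
	__type(key, __u32);
	__type(value, __u64);
} target_listeners SEC(".maps");

SEC("sk_reuseport/migrate")
int select_or_migrate(struct sk_reuseport_md *md)
{
	__u32 key = 0;	/* illustrative: always pick slot 0 */

	/* md->migrating_sk is NULL for ordinary SYN selection and non-NULL
	 * when a request/child socket is being migrated off a dying listener.
	 */
	if (bpf_sk_select_reuseport(md, &target_listeners, &key, 0) == 0)
		return SK_PASS;

	/* SK_PASS without a selected socket falls back to the hash-based
	 * pick above; SK_DROP would instead refuse and, on the migration
	 * path, be counted as LINUX_MIB_TCPMIGRATEREQFAILURE.
	 */
	return SK_PASS;
}

char _license[] SEC("license") = "GPL";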
int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
struct sock_reuseport *reuse;
struct bpf_prog *old_prog;
if (sk_unhashed(sk) && sk->sk_reuseport) {
int err = reuseport_alloc(sk, false);
if (sk_unhashed(sk)) {
int err;
if (!sk->sk_reuseport)
return -EINVAL;
err = reuseport_alloc(sk, false);
if (err)
return err;
} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
@@ -341,13 +618,24 @@ int reuseport_detach_prog(struct sock *sk)
struct sock_reuseport *reuse;
struct bpf_prog *old_prog;
if (!rcu_access_pointer(sk->sk_reuseport_cb))
return sk->sk_reuseport ? -ENOENT : -EINVAL;
old_prog = NULL;
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
/* reuse must be checked after acquiring the reuseport_lock
* because reuseport_grow() can detach a closed sk.
*/
if (!reuse) {
spin_unlock_bh(&reuseport_lock);
return sk->sk_reuseport ? -ENOENT : -EINVAL;
}
if (sk_unhashed(sk) && reuse->num_closed_socks) {
spin_unlock_bh(&reuseport_lock);
return -ENOENT;
}
old_prog = rcu_replace_pointer(reuse->prog, old_prog,
lockdep_is_held(&reuseport_lock));
spin_unlock_bh(&reuseport_lock);


@@ -113,8 +113,13 @@ static void mem_allocator_disconnect(void *allocator)
void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
{
struct xdp_mem_allocator *xa;
int type = xdp_rxq->mem.type;
int id = xdp_rxq->mem.id;
/* Reset mem info to defaults */
xdp_rxq->mem.id = 0;
xdp_rxq->mem.type = 0;
if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
WARN(1, "Missing register, driver bug");
return;
@@ -123,7 +128,7 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
if (id == 0)
return;
if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) {
if (type == MEM_TYPE_PAGE_POOL) {
rcu_read_lock();
xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
page_pool_destroy(xa->page_pool);
@@ -144,10 +149,6 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
xdp_rxq->dev = NULL;
/* Reset mem info to defaults */
xdp_rxq->mem.id = 0;
xdp_rxq->mem.type = 0;
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg);
@@ -584,3 +585,31 @@ struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
return __xdp_build_skb_from_frame(xdpf, skb, dev);
}
EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame);
struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf)
{
unsigned int headroom, totalsize;
struct xdp_frame *nxdpf;
struct page *page;
void *addr;
headroom = xdpf->headroom + sizeof(*xdpf);
totalsize = headroom + xdpf->len;
if (unlikely(totalsize > PAGE_SIZE))
return NULL;
page = dev_alloc_page();
if (!page)
return NULL;
addr = page_to_virt(page);
memcpy(addr, xdpf, totalsize);
nxdpf = addr;
nxdpf->data = addr + headroom;
nxdpf->frame_sz = PAGE_SIZE;
nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
nxdpf->mem.id = 0;
return nxdpf;
}
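xdpf_clone() copies an xdp_frame (headroom, metadata and packet data) into a fresh order-0 page, which the XDP broadcast redirect work in this pull needs so that each target device gets a frame it can consume and free independently. A hedged, kernel-side sketch of the caller pattern; every helper name here other than xdpf_clone() is hypothetical:

/* Hedged sketch of a broadcast-style caller: each extra target gets its own
 * private copy, and the last target can take the original frame as-is.
 */
static int broadcast_frame(struct xdp_frame *xdpf,
			   struct net_device *devs[], int n)	/* hypothetical */
{
	int i;

	for (i = 0; i < n - 1; i++) {
		struct xdp_frame *copy = xdpf_clone(xdpf);

		if (unlikely(!copy))
			return -ENOMEM;	/* frame larger than a page, or OOM */
		enqueue_to_dev(devs[i], copy);	/* hypothetical helper */
	}
	enqueue_to_dev(devs[n - 1], xdpf);	/* hypothetical helper */
	return 0;
}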