Merge branch 'xdp_xmit-bulking'
Jesper Dangaard Brouer says:

====================
This patchset changes the ndo_xdp_xmit API to take a bulk of xdp frames.

When the kernel is compiled with CONFIG_RETPOLINE, every indirect function
pointer (branch) call hurts performance. For XDP this has a huge negative
performance impact.

This patchset reduces the needed (indirect) calls to ndo_xdp_xmit, but also
prepares for further optimizations. The DMA API's use of indirect function
pointer calls is the primary source of the regression. Using bulked calls
towards the DMA API (via the scatter-gather calls) is left for a follow-up
patchset.

The other advantage of this API change is that drivers can more easily
amortize the cost of any sync/locking scheme over the bulk of packets.

The assumption of the current API is that the driver implementing the NDO
will also allocate a dedicated XDP TX queue for every CPU in the system,
which is not always possible or practical to configure. E.g. ixgbe cannot
load an XDP program on a machine with more than 96 CPUs, due to limited
hardware TX queues. E.g. virtio_net is hard to configure as it requires
manually increasing the queues. E.g. the tun driver chooses to take a
per-XDP-frame producer lock, selected modulo smp_processor_id over the
available queues.

I considered adding 'flags' to ndo_xdp_xmit, but it is not part of this
patchset. That will be a follow-up patchset, once we know whether it will
be needed (e.g. for a non-map xdp_redirect flush flag, and if AF_XDP
chooses to use ndo_xdp_xmit for TX).

---
V5: Fixed up issues spotted by Daniel and John

V4: Split the series out from 4 to 8 patches. I cannot split the driver
changes from the NDO change, but I've tried to isolate the NDO change
together with the driver change as much as possible.
====================

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
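To make the API change concrete, here is a minimal caller-side sketch (not taken from the patchset) of the bulking pattern the cover letter describes: one retpoline-afflicted indirect call is paid per bulk instead of per frame, and the return-value contract follows the updated ndo_xdp_xmit kernel-doc further down (negative errno: nothing was sent and the caller must free; otherwise the count of accepted frames, with failed frames already freed by the driver). The helper name xmit_frame_bulk() and BULK_SIZE are illustrative only; the in-tree equivalent of this logic is bq_xmit_all() added to kernel/bpf/devmap.c below.

#include <linux/netdevice.h>
#include <net/xdp.h>

#define BULK_SIZE 16    /* illustrative; devmap below uses DEV_MAP_BULK_SIZE 16 */

/* Hypothetical caller-side helper, for illustration only. */
static int xmit_frame_bulk(struct net_device *dev,
                           struct xdp_frame **frames, int count)
{
        int sent, i;

        /* One indirect ndo_xdp_xmit call covers the whole bulk. */
        sent = dev->netdev_ops->ndo_xdp_xmit(dev, count, frames);
        if (sent < 0) {
                /* General error: the driver sent nothing and freed nothing,
                 * so the caller must return every frame itself.
                 */
                for (i = 0; i < count; i++)
                        xdp_return_frame_rx_napi(frames[i]);
                return sent;
        }

        /* 0 <= sent <= count: frames that could not be sent were already
         * freed by the driver via the XDP return API.
         */
        return sent;
}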
Commit: 10f678683e
Changed areas: drivers/net, include, kernel/bpf, net/core, samples/bpf
drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -3664,14 +3664,19 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
  * @dev: netdev
  * @xdp: XDP buffer
  *
- * Returns Zero if sent, else an error code
+ * Returns number of frames successfully sent. Frames that fail are
+ * free'ed via XDP return API.
+ *
+ * For error cases, a negative errno code is returned and no-frames
+ * are transmitted (caller must handle freeing frames).
  **/
-int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
+int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
 {
         struct i40e_netdev_priv *np = netdev_priv(dev);
         unsigned int queue_index = smp_processor_id();
         struct i40e_vsi *vsi = np->vsi;
-        int err;
+        int drops = 0;
+        int i;
 
         if (test_bit(__I40E_VSI_DOWN, vsi->state))
                 return -ENETDOWN;
@@ -3679,11 +3684,18 @@ int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
         if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
                 return -ENXIO;
 
-        err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
-        if (err != I40E_XDP_TX)
-                return -ENOSPC;
+        for (i = 0; i < n; i++) {
+                struct xdp_frame *xdpf = frames[i];
+                int err;
 
-        return 0;
+                err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
+                if (err != I40E_XDP_TX) {
+                        xdp_return_frame_rx_napi(xdpf);
+                        drops++;
+                }
+        }
+
+        return n - drops;
 }
 
 /**
drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -487,7 +487,7 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw);
 void i40e_detect_recover_hung(struct i40e_vsi *vsi);
 int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
 bool __i40e_chk_linearize(struct sk_buff *skb);
-int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf);
+int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames);
 void i40e_xdp_flush(struct net_device *dev);
 
 /**
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10017,11 +10017,13 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
         }
 }
 
-static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
+static int ixgbe_xdp_xmit(struct net_device *dev, int n,
+                          struct xdp_frame **frames)
 {
         struct ixgbe_adapter *adapter = netdev_priv(dev);
         struct ixgbe_ring *ring;
-        int err;
+        int drops = 0;
+        int i;
 
         if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
                 return -ENETDOWN;
@@ -10033,11 +10035,18 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
         if (unlikely(!ring))
                 return -ENXIO;
 
-        err = ixgbe_xmit_xdp_ring(adapter, xdpf);
-        if (err != IXGBE_XDP_TX)
-                return -ENOSPC;
+        for (i = 0; i < n; i++) {
+                struct xdp_frame *xdpf = frames[i];
+                int err;
 
-        return 0;
+                err = ixgbe_xmit_xdp_ring(adapter, xdpf);
+                if (err != IXGBE_XDP_TX) {
+                        xdp_return_frame_rx_napi(xdpf);
+                        drops++;
+                }
+        }
+
+        return n - drops;
 }
 
 static void ixgbe_xdp_flush(struct net_device *dev)
drivers/net/tun.c
@@ -70,6 +70,7 @@
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
 #include <net/sock.h>
+#include <net/xdp.h>
 #include <linux/seq_file.h>
 #include <linux/uio.h>
 #include <linux/skb_array.h>
@@ -1290,34 +1291,44 @@ static const struct net_device_ops tun_netdev_ops = {
         .ndo_get_stats64        = tun_net_get_stats64,
 };
 
-static int tun_xdp_xmit(struct net_device *dev, struct xdp_frame *frame)
+static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
 {
         struct tun_struct *tun = netdev_priv(dev);
         struct tun_file *tfile;
         u32 numqueues;
-        int ret = 0;
+        int drops = 0;
+        int cnt = n;
+        int i;
 
         rcu_read_lock();
 
         numqueues = READ_ONCE(tun->numqueues);
         if (!numqueues) {
-                ret = -ENOSPC;
-                goto out;
+                rcu_read_unlock();
+                return -ENXIO; /* Caller will free/return all frames */
         }
 
         tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
                                             numqueues]);
-        /* Encode the XDP flag into lowest bit for consumer to differ
-         * XDP buffer from sk_buff.
-         */
-        if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(frame))) {
-                this_cpu_inc(tun->pcpu_stats->tx_dropped);
-                ret = -ENOSPC;
-        }
 
-out:
+        spin_lock(&tfile->tx_ring.producer_lock);
+        for (i = 0; i < n; i++) {
+                struct xdp_frame *xdp = frames[i];
+                /* Encode the XDP flag into lowest bit for consumer to differ
+                 * XDP buffer from sk_buff.
+                 */
+                void *frame = tun_xdp_to_ptr(xdp);
+
+                if (__ptr_ring_produce(&tfile->tx_ring, frame)) {
+                        this_cpu_inc(tun->pcpu_stats->tx_dropped);
+                        xdp_return_frame_rx_napi(xdp);
+                        drops++;
+                }
+        }
+        spin_unlock(&tfile->tx_ring.producer_lock);
+
         rcu_read_unlock();
-        return ret;
+        return cnt - drops;
 }
 
 static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
@@ -1327,7 +1338,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
         if (unlikely(!frame))
                 return -EOVERFLOW;
 
-        return tun_xdp_xmit(dev, frame);
+        return tun_xdp_xmit(dev, 1, &frame);
 }
 
 static void tun_xdp_flush(struct net_device *dev)
drivers/net/virtio_net.c
@@ -419,23 +419,13 @@ static void virtnet_xdp_flush(struct net_device *dev)
         virtqueue_kick(sq->vq);
 }
 
-static int __virtnet_xdp_xmit(struct virtnet_info *vi,
-                              struct xdp_frame *xdpf)
+static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
+                                  struct send_queue *sq,
+                                  struct xdp_frame *xdpf)
 {
         struct virtio_net_hdr_mrg_rxbuf *hdr;
-        struct xdp_frame *xdpf_sent;
-        struct send_queue *sq;
-        unsigned int len;
-        unsigned int qp;
         int err;
 
-        qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
-        sq = &vi->sq[qp];
-
-        /* Free up any pending old buffers before queueing new ones. */
-        while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
-                xdp_return_frame(xdpf_sent);
-
         /* virtqueue want to use data area in-front of packet */
         if (unlikely(xdpf->metasize > 0))
                 return -EOPNOTSUPP;
@@ -459,11 +449,40 @@ static int __virtnet_xdp_xmit(struct virtnet_info *vi,
         return 0;
 }
 
-static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
+static int __virtnet_xdp_tx_xmit(struct virtnet_info *vi,
+                                 struct xdp_frame *xdpf)
+{
+        struct xdp_frame *xdpf_sent;
+        struct send_queue *sq;
+        unsigned int len;
+        unsigned int qp;
+
+        qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
+        sq = &vi->sq[qp];
+
+        /* Free up any pending old buffers before queueing new ones. */
+        while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
+                xdp_return_frame(xdpf_sent);
+
+        return __virtnet_xdp_xmit_one(vi, sq, xdpf);
+}
+
+static int virtnet_xdp_xmit(struct net_device *dev,
+                            int n, struct xdp_frame **frames)
 {
         struct virtnet_info *vi = netdev_priv(dev);
         struct receive_queue *rq = vi->rq;
+        struct xdp_frame *xdpf_sent;
         struct bpf_prog *xdp_prog;
+        struct send_queue *sq;
+        unsigned int len;
+        unsigned int qp;
+        int drops = 0;
+        int err;
+        int i;
+
+        qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
+        sq = &vi->sq[qp];
 
         /* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
          * indicate XDP resources have been successfully allocated.
@@ -472,7 +491,20 @@ static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
         if (!xdp_prog)
                 return -ENXIO;
 
-        return __virtnet_xdp_xmit(vi, xdpf);
+        /* Free up any pending old buffers before queueing new ones. */
+        while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
+                xdp_return_frame(xdpf_sent);
+
+        for (i = 0; i < n; i++) {
+                struct xdp_frame *xdpf = frames[i];
+
+                err = __virtnet_xdp_xmit_one(vi, sq, xdpf);
+                if (err) {
+                        xdp_return_frame_rx_napi(xdpf);
+                        drops++;
+                }
+        }
+        return n - drops;
 }
 
 static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
@@ -616,7 +648,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
                         xdpf = convert_to_xdp_frame(&xdp);
                         if (unlikely(!xdpf))
                                 goto err_xdp;
-                        err = __virtnet_xdp_xmit(vi, xdpf);
+                        err = __virtnet_xdp_tx_xmit(vi, xdpf);
                         if (unlikely(err)) {
                                 trace_xdp_exception(vi->dev, xdp_prog, act);
                                 goto err_xdp;
@@ -779,7 +811,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
                         xdpf = convert_to_xdp_frame(&xdp);
                         if (unlikely(!xdpf))
                                 goto err_xdp;
-                        err = __virtnet_xdp_xmit(vi, xdpf);
+                        err = __virtnet_xdp_tx_xmit(vi, xdpf);
                         if (unlikely(err)) {
                                 trace_xdp_exception(vi->dev, xdp_prog, act);
                                 if (unlikely(xdp_page != page))
include/linux/bpf.h
@@ -487,14 +487,17 @@ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
 void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
 
 /* Map specifics */
-struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
+struct xdp_buff;
+
+struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
 void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
 void __dev_map_flush(struct bpf_map *map);
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+                    struct net_device *dev_rx);
 
 struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
 void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
 void __cpu_map_flush(struct bpf_map *map);
-struct xdp_buff;
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
                     struct net_device *dev_rx);
 
@@ -573,6 +576,16 @@ static inline void __dev_map_flush(struct bpf_map *map)
 {
 }
 
+struct xdp_buff;
+struct bpf_dtab_netdev;
+
+static inline
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+                    struct net_device *dev_rx)
+{
+        return 0;
+}
+
 static inline
 struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
 {
@@ -587,7 +600,6 @@ static inline void __cpu_map_flush(struct bpf_map *map)
 {
 }
 
-struct xdp_buff;
 static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
                                   struct xdp_buff *xdp,
                                   struct net_device *dev_rx)
include/linux/netdevice.h
@@ -1185,9 +1185,13 @@ struct dev_ifalias {
  *      This function is used to set or query state related to XDP on the
  *      netdevice and manage BPF offload. See definition of
  *      enum bpf_netdev_command for details.
- * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp);
- *      This function is used to submit a XDP packet for transmit on a
- *      netdevice.
+ * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp);
+ *      This function is used to submit @n XDP packets for transmit on a
+ *      netdevice. Returns number of frames successfully transmitted, frames
+ *      that got dropped are freed/returned via xdp_return_frame().
+ *      Returns negative number, means general error invoking ndo, meaning
+ *      no frames were xmit'ed and core-caller will free all frames.
+ *      TODO: Consider add flag to allow sending flush operation.
  * void (*ndo_xdp_flush)(struct net_device *dev);
  *      This function is used to inform the driver to flush a particular
  *      xdp tx queue. Must be called on same CPU as xdp_xmit.
@@ -1375,8 +1379,8 @@ struct net_device_ops {
                                                        int needed_headroom);
         int                     (*ndo_bpf)(struct net_device *dev,
                                            struct netdev_bpf *bpf);
-        int                     (*ndo_xdp_xmit)(struct net_device *dev,
-                                                struct xdp_frame *xdp);
+        int                     (*ndo_xdp_xmit)(struct net_device *dev, int n,
+                                                struct xdp_frame **xdp);
         void                    (*ndo_xdp_flush)(struct net_device *dev);
 };
 
include/net/page_pool.h
@@ -115,13 +115,14 @@ void page_pool_destroy(struct page_pool *pool);
 void __page_pool_put_page(struct page_pool *pool,
                           struct page *page, bool allow_direct);
 
-static inline void page_pool_put_page(struct page_pool *pool, struct page *page)
+static inline void page_pool_put_page(struct page_pool *pool,
+                                      struct page *page, bool allow_direct)
 {
         /* When page_pool isn't compiled-in, net/core/xdp.c doesn't
          * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
          */
 #ifdef CONFIG_PAGE_POOL
-        __page_pool_put_page(pool, page, false);
+        __page_pool_put_page(pool, page, allow_direct);
 #endif
 }
 /* Very limited use-cases allow recycle direct */
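As a side note on the page_pool change above, a hedged sketch of how the new allow_direct argument is meant to be used. The wrapper below is hypothetical; in this series the real caller is __xdp_return() in net/core/xdp.c, which passes its napi_direct flag straight through.

#include <net/page_pool.h>

/* Hypothetical wrapper, for illustration only.  allow_direct == true is
 * only safe while running under the RX/NAPI (softirq) protection of the
 * CPU that owns the pool, letting the page be recycled into the pool's
 * lockless direct cache; otherwise pass false and the page is returned
 * via the pool's regular ptr_ring path.
 */
static void recycle_rx_page(struct page_pool *pool, struct page *page,
                            bool napi_direct)
{
        page_pool_put_page(pool, page, napi_direct);
}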
include/net/xdp.h
@@ -104,6 +104,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
 }
 
 void xdp_return_frame(struct xdp_frame *xdpf);
+void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
 void xdp_return_buff(struct xdp_buff *xdp);
 
 int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
include/trace/events/xdp.h
@@ -138,11 +138,18 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err,
                   __entry->map_id, __entry->map_index)
 );
 
+#ifndef __DEVMAP_OBJ_TYPE
+#define __DEVMAP_OBJ_TYPE
+struct _bpf_dtab_netdev {
+        struct net_device *dev;
+};
+#endif /* __DEVMAP_OBJ_TYPE */
+
 #define devmap_ifindex(fwd, map)                                \
         (!fwd ? 0 :                                             \
          (!map ? 0 :                                            \
           ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?             \
-           ((struct net_device *)fwd)->ifindex : 0)))
+           ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)))
 
 #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)        \
          trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map),    \
@@ -222,6 +229,47 @@ TRACE_EVENT(xdp_cpumap_enqueue,
                   __entry->to_cpu)
 );
 
+TRACE_EVENT(xdp_devmap_xmit,
+
+        TP_PROTO(const struct bpf_map *map, u32 map_index,
+                 int sent, int drops,
+                 const struct net_device *from_dev,
+                 const struct net_device *to_dev, int err),
+
+        TP_ARGS(map, map_index, sent, drops, from_dev, to_dev, err),
+
+        TP_STRUCT__entry(
+                __field(int, map_id)
+                __field(u32, act)
+                __field(u32, map_index)
+                __field(int, drops)
+                __field(int, sent)
+                __field(int, from_ifindex)
+                __field(int, to_ifindex)
+                __field(int, err)
+        ),
+
+        TP_fast_assign(
+                __entry->map_id         = map->id;
+                __entry->act            = XDP_REDIRECT;
+                __entry->map_index      = map_index;
+                __entry->drops          = drops;
+                __entry->sent           = sent;
+                __entry->from_ifindex   = from_dev->ifindex;
+                __entry->to_ifindex     = to_dev->ifindex;
+                __entry->err            = err;
+        ),
+
+        TP_printk("ndo_xdp_xmit"
+                  " map_id=%d map_index=%d action=%s"
+                  " sent=%d drops=%d"
+                  " from_ifindex=%d to_ifindex=%d err=%d",
+                  __entry->map_id, __entry->map_index,
+                  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
+                  __entry->sent, __entry->drops,
+                  __entry->from_ifindex, __entry->to_ifindex, __entry->err)
+);
+
 #endif /* _TRACE_XDP_H */
 
 #include <trace/define_trace.h>
kernel/bpf/cpumap.c
@@ -578,7 +578,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
                 err = __ptr_ring_produce(q, xdpf);
                 if (err) {
                         drops++;
-                        xdp_return_frame(xdpf);
+                        xdp_return_frame_rx_napi(xdpf);
                 }
                 processed++;
         }
kernel/bpf/devmap.c
@@ -48,15 +48,25 @@
  * calls will fail at this point.
  */
 #include <linux/bpf.h>
+#include <net/xdp.h>
 #include <linux/filter.h>
+#include <trace/events/xdp.h>
 
 #define DEV_CREATE_FLAG_MASK \
         (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
+#define DEV_MAP_BULK_SIZE 16
+struct xdp_bulk_queue {
+        struct xdp_frame *q[DEV_MAP_BULK_SIZE];
+        struct net_device *dev_rx;
+        unsigned int count;
+};
+
 struct bpf_dtab_netdev {
-        struct net_device *dev;
+        struct net_device *dev; /* must be first member, due to tracepoint */
         struct bpf_dtab *dtab;
         unsigned int bit;
+        struct xdp_bulk_queue __percpu *bulkq;
         struct rcu_head rcu;
 };
 
@@ -206,6 +216,50 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
         __set_bit(bit, bitmap);
 }
 
+static int bq_xmit_all(struct bpf_dtab_netdev *obj,
+                       struct xdp_bulk_queue *bq)
+{
+        struct net_device *dev = obj->dev;
+        int sent = 0, drops = 0, err = 0;
+        int i;
+
+        if (unlikely(!bq->count))
+                return 0;
+
+        for (i = 0; i < bq->count; i++) {
+                struct xdp_frame *xdpf = bq->q[i];
+
+                prefetch(xdpf);
+        }
+
+        sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q);
+        if (sent < 0) {
+                err = sent;
+                sent = 0;
+                goto error;
+        }
+        drops = bq->count - sent;
+out:
+        bq->count = 0;
+
+        trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
+                              sent, drops, bq->dev_rx, dev, err);
+        bq->dev_rx = NULL;
+        return 0;
+error:
+        /* If ndo_xdp_xmit fails with an errno, no frames have been
+         * xmit'ed and it's our responsibility to them free all.
+         */
+        for (i = 0; i < bq->count; i++) {
+                struct xdp_frame *xdpf = bq->q[i];
+
+                /* RX path under NAPI protection, can return frames faster */
+                xdp_return_frame_rx_napi(xdpf);
+                drops++;
+        }
+        goto out;
+}
+
 /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
  * from the driver before returning from its napi->poll() routine. The poll()
  * routine is called either from busy_poll context or net_rx_action signaled
@@ -221,6 +275,7 @@ void __dev_map_flush(struct bpf_map *map)
 
         for_each_set_bit(bit, bitmap, map->max_entries) {
                 struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
+                struct xdp_bulk_queue *bq;
                 struct net_device *netdev;
 
                 /* This is possible if the dev entry is removed by user space
@@ -230,6 +285,9 @@ void __dev_map_flush(struct bpf_map *map)
                         continue;
 
                 __clear_bit(bit, bitmap);
+
+                bq = this_cpu_ptr(dev->bulkq);
+                bq_xmit_all(dev, bq);
                 netdev = dev->dev;
                 if (likely(netdev->netdev_ops->ndo_xdp_flush))
                         netdev->netdev_ops->ndo_xdp_flush(netdev);
@@ -240,21 +298,61 @@ void __dev_map_flush(struct bpf_map *map)
  * update happens in parallel here a dev_put wont happen until after reading the
  * ifindex.
  */
-struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
+struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 {
         struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-        struct bpf_dtab_netdev *dev;
+        struct bpf_dtab_netdev *obj;
 
         if (key >= map->max_entries)
                 return NULL;
 
-        dev = READ_ONCE(dtab->netdev_map[key]);
-        return dev ? dev->dev : NULL;
+        obj = READ_ONCE(dtab->netdev_map[key]);
+        return obj;
+}
+
+/* Runs under RCU-read-side, plus in softirq under NAPI protection.
+ * Thus, safe percpu variable access.
+ */
+static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
+                      struct net_device *dev_rx)
+
+{
+        struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
+
+        if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
+                bq_xmit_all(obj, bq);
+
+        /* Ingress dev_rx will be the same for all xdp_frame's in
+         * bulk_queue, because bq stored per-CPU and must be flushed
+         * from net_device drivers NAPI func end.
+         */
+        if (!bq->dev_rx)
+                bq->dev_rx = dev_rx;
+
+        bq->q[bq->count++] = xdpf;
+        return 0;
+}
+
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+                    struct net_device *dev_rx)
+{
+        struct net_device *dev = dst->dev;
+        struct xdp_frame *xdpf;
+
+        if (!dev->netdev_ops->ndo_xdp_xmit)
+                return -EOPNOTSUPP;
+
+        xdpf = convert_to_xdp_frame(xdp);
+        if (unlikely(!xdpf))
+                return -EOVERFLOW;
+
+        return bq_enqueue(dst, xdpf, dev_rx);
 }
 
 static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 {
-        struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key);
+        struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
+        struct net_device *dev = dev = obj ? obj->dev : NULL;
 
         return dev ? &dev->ifindex : NULL;
 }
@@ -263,13 +361,18 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
 {
         if (dev->dev->netdev_ops->ndo_xdp_flush) {
                 struct net_device *fl = dev->dev;
+                struct xdp_bulk_queue *bq;
                 unsigned long *bitmap;
+
                 int cpu;
 
                 for_each_online_cpu(cpu) {
                         bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
                         __clear_bit(dev->bit, bitmap);
 
+                        bq = per_cpu_ptr(dev->bulkq, cpu);
+                        bq_xmit_all(dev, bq);
+
                         fl->netdev_ops->ndo_xdp_flush(dev->dev);
                 }
         }
@@ -281,6 +384,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
 
         dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
         dev_map_flush_old(dev);
+        free_percpu(dev->bulkq);
         dev_put(dev->dev);
         kfree(dev);
 }
@@ -313,6 +417,7 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 {
         struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
         struct net *net = current->nsproxy->net_ns;
+        gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
         struct bpf_dtab_netdev *dev, *old_dev;
         u32 i = *(u32 *)key;
         u32 ifindex = *(u32 *)value;
@@ -327,13 +432,20 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
         if (!ifindex) {
                 dev = NULL;
         } else {
-                dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
-                                   map->numa_node);
+                dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node);
                 if (!dev)
                         return -ENOMEM;
 
+                dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
+                                                sizeof(void *), gfp);
+                if (!dev->bulkq) {
+                        kfree(dev);
+                        return -ENOMEM;
+                }
+
                 dev->dev = dev_get_by_index(net, ifindex);
                 if (!dev->dev) {
+                        free_percpu(dev->bulkq);
                         kfree(dev);
                         return -EINVAL;
                 }
@@ -405,6 +517,9 @@ static struct notifier_block dev_map_notifier = {
 
 static int __init dev_map_init(void)
 {
+        /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
+        BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
+                     offsetof(struct _bpf_dtab_netdev, dev));
         register_netdevice_notifier(&dev_map_notifier);
         return 0;
 }
net/core/filter.c
@@ -3039,7 +3039,7 @@ static int __bpf_tx_xdp(struct net_device *dev,
                         u32 index)
 {
         struct xdp_frame *xdpf;
-        int err;
+        int sent;
 
         if (!dev->netdev_ops->ndo_xdp_xmit) {
                 return -EOPNOTSUPP;
@@ -3049,9 +3049,9 @@ static int __bpf_tx_xdp(struct net_device *dev,
         if (unlikely(!xdpf))
                 return -EOVERFLOW;
 
-        err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
-        if (err)
-                return err;
+        sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf);
+        if (sent <= 0)
+                return sent;
         dev->netdev_ops->ndo_xdp_flush(dev);
         return 0;
 }
@@ -3065,20 +3065,9 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 
         switch (map->map_type) {
         case BPF_MAP_TYPE_DEVMAP: {
-                struct net_device *dev = fwd;
-                struct xdp_frame *xdpf;
+                struct bpf_dtab_netdev *dst = fwd;
 
-                if (!dev->netdev_ops->ndo_xdp_xmit)
-                        return -EOPNOTSUPP;
-
-                xdpf = convert_to_xdp_frame(xdp);
-                if (unlikely(!xdpf))
-                        return -EOVERFLOW;
-
-                /* TODO: move to inside map code instead, for bulk support
-                 * err = dev_map_enqueue(dev, xdp);
-                 */
-                err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
+                err = dev_map_enqueue(dst, xdp, dev_rx);
                 if (err)
                         return err;
                 __dev_map_insert_ctx(map, index);
net/core/xdp.c
@@ -308,7 +308,13 @@ err:
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
 
-static void xdp_return(void *data, struct xdp_mem_info *mem)
+/* XDP RX runs under NAPI protection, and in different delivery error
+ * scenarios (e.g. queue full), it is possible to return the xdp_frame
+ * while still leveraging this protection. The @napi_direct boolian
+ * is used for those calls sites. Thus, allowing for faster recycling
+ * of xdp_frames/pages in those cases.
+ */
+static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
 {
         struct xdp_mem_allocator *xa;
         struct page *page;
@@ -320,7 +326,7 @@ static void xdp_return(void *data, struct xdp_mem_info *mem)
                 xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
                 page = virt_to_head_page(data);
                 if (xa)
-                        page_pool_put_page(xa->page_pool, page);
+                        page_pool_put_page(xa->page_pool, page, napi_direct);
                 else
                         put_page(page);
                 rcu_read_unlock();
@@ -340,12 +346,18 @@ static void xdp_return(void *data, struct xdp_mem_info *mem)
 
 void xdp_return_frame(struct xdp_frame *xdpf)
 {
-        xdp_return(xdpf->data, &xdpf->mem);
+        __xdp_return(xdpf->data, &xdpf->mem, false);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame);
 
+void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
+{
+        __xdp_return(xdpf->data, &xdpf->mem, true);
+}
+EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
+
 void xdp_return_buff(struct xdp_buff *xdp)
 {
-        xdp_return(xdp->data, &xdp->rxq->mem);
+        __xdp_return(xdp->data, &xdp->rxq->mem, true);
 }
 EXPORT_SYMBOL_GPL(xdp_return_buff);
samples/bpf/xdp_monitor_kern.c
@@ -125,6 +125,7 @@ struct datarec {
         u64 processed;
         u64 dropped;
         u64 info;
+        u64 err;
 };
 #define MAX_CPUS 64
 
@@ -208,3 +209,51 @@ int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
 
         return 0;
 }
+
+struct bpf_map_def SEC("maps") devmap_xmit_cnt = {
+        .type           = BPF_MAP_TYPE_PERCPU_ARRAY,
+        .key_size       = sizeof(u32),
+        .value_size     = sizeof(struct datarec),
+        .max_entries    = 1,
+};
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format
+ * Code in:         kernel/include/trace/events/xdp.h
+ */
+struct devmap_xmit_ctx {
+        u64 __pad;              // First 8 bytes are not accessible by bpf code
+        int map_id;             // offset:8;  size:4; signed:1;
+        u32 act;                // offset:12; size:4; signed:0;
+        u32 map_index;          // offset:16; size:4; signed:0;
+        int drops;              // offset:20; size:4; signed:1;
+        int sent;               // offset:24; size:4; signed:1;
+        int from_ifindex;       // offset:28; size:4; signed:1;
+        int to_ifindex;         // offset:32; size:4; signed:1;
+        int err;                // offset:36; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_devmap_xmit")
+int trace_xdp_devmap_xmit(struct devmap_xmit_ctx *ctx)
+{
+        struct datarec *rec;
+        u32 key = 0;
+
+        rec = bpf_map_lookup_elem(&devmap_xmit_cnt, &key);
+        if (!rec)
+                return 0;
+        rec->processed += ctx->sent;
+        rec->dropped   += ctx->drops;
+
+        /* Record bulk events, then userspace can calc average bulk size */
+        rec->info += 1;
+
+        /* Record error cases, where no frame were sent */
+        if (ctx->err)
+                rec->err++;
+
+        /* Catch API error of drv ndo_xdp_xmit sent more than count */
+        if (ctx->drops < 0)
+                rec->err++;
+
+        return 1;
+}
samples/bpf/xdp_monitor_user.c
@@ -117,6 +117,7 @@ struct datarec {
         __u64 processed;
         __u64 dropped;
         __u64 info;
+        __u64 err;
 };
 #define MAX_CPUS 64
 
@@ -141,6 +142,7 @@ struct stats_record {
         struct record_u64 xdp_exception[XDP_ACTION_MAX];
         struct record xdp_cpumap_kthread;
         struct record xdp_cpumap_enqueue[MAX_CPUS];
+        struct record xdp_devmap_xmit;
 };
 
 static bool map_collect_record(int fd, __u32 key, struct record *rec)
@@ -151,6 +153,7 @@ static bool map_collect_record(int fd, __u32 key, struct record *rec)
         __u64 sum_processed = 0;
         __u64 sum_dropped = 0;
         __u64 sum_info = 0;
+        __u64 sum_err = 0;
         int i;
 
         if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
@@ -169,10 +172,13 @@ static bool map_collect_record(int fd, __u32 key, struct record *rec)
                 sum_dropped     += values[i].dropped;
                 rec->cpu[i].info = values[i].info;
                 sum_info        += values[i].info;
+                rec->cpu[i].err = values[i].err;
+                sum_err         += values[i].err;
         }
         rec->total.processed = sum_processed;
         rec->total.dropped   = sum_dropped;
         rec->total.info      = sum_info;
+        rec->total.err       = sum_err;
         return true;
 }
 
@@ -273,6 +279,18 @@ static double calc_info(struct datarec *r, struct datarec *p, double period)
         return pps;
 }
 
+static double calc_err(struct datarec *r, struct datarec *p, double period)
+{
+        __u64 packets = 0;
+        double pps = 0;
+
+        if (period > 0) {
+                packets = r->err - p->err;
+                pps = packets / period;
+        }
+        return pps;
+}
+
 static void stats_print(struct stats_record *stats_rec,
                         struct stats_record *stats_prev,
                         bool err_only)
@@ -397,7 +415,7 @@ static void stats_print(struct stats_record *stats_rec,
                         info = calc_info(r, p, t);
                         if (info > 0)
                                 i_str = "sched";
-                        if (pps > 0)
+                        if (pps > 0 || drop > 0)
                                 printf(fmt1, "cpumap-kthread",
                                        i, pps, drop, info, i_str);
                 }
@@ -409,6 +427,50 @@ static void stats_print(struct stats_record *stats_rec,
                 printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str);
         }
 
+        /* devmap ndo_xdp_xmit stats */
+        {
+                char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s %s\n";
+                char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s %s\n";
+                struct record *rec, *prev;
+                double drop, info, err;
+                char *i_str = "";
+                char *err_str = "";
+
+                rec  = &stats_rec->xdp_devmap_xmit;
+                prev = &stats_prev->xdp_devmap_xmit;
+                t = calc_period(rec, prev);
+                for (i = 0; i < nr_cpus; i++) {
+                        struct datarec *r = &rec->cpu[i];
+                        struct datarec *p = &prev->cpu[i];
+
+                        pps  = calc_pps(r, p, t);
+                        drop = calc_drop(r, p, t);
+                        info = calc_info(r, p, t);
+                        err  = calc_err(r, p, t);
+                        if (info > 0) {
+                                i_str = "bulk-average";
+                                info = (pps+drop) / info; /* calc avg bulk */
+                        }
+                        if (err > 0)
+                                err_str = "drv-err";
+                        if (pps > 0 || drop > 0)
+                                printf(fmt1, "devmap-xmit",
+                                       i, pps, drop, info, i_str, err_str);
+                }
+                pps  = calc_pps(&rec->total, &prev->total, t);
+                drop = calc_drop(&rec->total, &prev->total, t);
+                info = calc_info(&rec->total, &prev->total, t);
+                err  = calc_err(&rec->total, &prev->total, t);
+                if (info > 0) {
+                        i_str = "bulk-average";
+                        info = (pps+drop) / info; /* calc avg bulk */
+                }
+                if (err > 0)
+                        err_str = "drv-err";
+                printf(fmt2, "devmap-xmit", "total", pps, drop,
+                       info, i_str, err_str);
+        }
+
         printf("\n");
 }
 
@@ -437,6 +499,9 @@ static bool stats_collect(struct stats_record *rec)
         fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */
         map_collect_record(fd, 0, &rec->xdp_cpumap_kthread);
 
+        fd = map_data[4].fd; /* map4: devmap_xmit_cnt */
+        map_collect_record(fd, 0, &rec->xdp_devmap_xmit);
+
         return true;
 }
 
@@ -480,6 +545,7 @@ static struct stats_record *alloc_stats_record(void)
 
         rec_sz = sizeof(struct datarec);
         rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz);
+        rec->xdp_devmap_xmit.cpu    = alloc_rec_per_cpu(rec_sz);
 
         for (i = 0; i < MAX_CPUS; i++)
                 rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz);
@@ -498,6 +564,7 @@ static void free_stats_record(struct stats_record *r)
                 free(r->xdp_exception[i].cpu);
 
         free(r->xdp_cpumap_kthread.cpu);
+        free(r->xdp_devmap_xmit.cpu);
 
         for (i = 0; i < MAX_CPUS; i++)
                 free(r->xdp_cpumap_enqueue[i].cpu);