2018-05-02 14:01:23 +03:00
// SPDX-License-Identifier: GPL-2.0
/* XDP user-space packet buffer
 * Copyright(c) 2018 Intel Corporation.
 */
# include <linux/init.h>
# include <linux/sched/mm.h>
# include <linux/sched/signal.h>
# include <linux/sched/task.h>
# include <linux/uaccess.h>
# include <linux/slab.h>
# include <linux/bpf.h>
# include <linux/mm.h>
2018-07-31 06:43:53 +03:00
# include <linux/netdevice.h>
# include <linux/rtnetlink.h>
2019-01-24 21:59:38 +03:00
# include <linux/idr.h>
2019-08-15 15:13:55 +03:00
# include <linux/vmalloc.h>
2018-05-02 14:01:23 +03:00
# include "xdp_umem.h"
2018-06-04 15:05:51 +03:00
# include "xsk_queue.h"
2018-05-02 14:01:23 +03:00
xsk: new descriptor addressing scheme
Currently, AF_XDP only supports a fixed frame-size memory scheme where
each frame is referenced via an index (idx). A user passes the frame
index to the kernel, and the kernel acts upon the data. Some NICs,
however, do not have a fixed frame-size model, instead they have a
model where a memory window is passed to the hardware and multiple
frames are filled into that window (referred to as the "type-writer"
model).
By changing the descriptor format from the current frame index
addressing scheme, AF_XDP can in the future be extended to support
these kinds of NICs.
In the index-based model, an idx refers to a frame of size
frame_size. Addressing a frame in the UMEM is done by offseting the
UMEM starting address by a global offset, idx * frame_size + offset.
Communicating via the fill- and completion-rings are done by means of
idx.
In this commit, the idx is removed in favor of an address (addr),
which is a relative address ranging over the UMEM. To convert an
idx-based address to the new addr is simply: addr = idx * frame_size +
offset.
We also stop referring to the UMEM "frame" as a frame. Instead it is
simply called a chunk.
To transfer ownership of a chunk to the kernel, the addr of the chunk
is passed in the fill-ring. Note, that the kernel will mask addr to
make it chunk aligned, so there is no need for userspace to do
that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or
3000 to the fill-ring will refer to the same chunk.
On the completion-ring, the addr will match that of the Tx descriptor,
passed to the kernel.
Changing the descriptor format to use chunks/addr will allow for
future changes to move to a type-writer based model, where multiple
frames can reside in one chunk. In this model passing one single chunk
into the fill-ring, would potentially result in multiple Rx
descriptors.
This commit changes the uapi of AF_XDP sockets, and updates the
documentation.
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-06-04 14:57:13 +03:00
# define XDP_UMEM_MIN_CHUNK_SIZE 2048
2018-05-02 14:01:23 +03:00
2019-01-24 21:59:38 +03:00
static DEFINE_IDA ( umem_ida ) ;
2018-06-04 15:05:57 +03:00
void xdp_add_sk_umem ( struct xdp_umem * umem , struct xdp_sock * xs )
{
unsigned long flags ;
2019-10-21 11:16:58 +03:00
if ( ! xs - > tx )
return ;
2018-06-04 15:05:57 +03:00
spin_lock_irqsave ( & umem - > xsk_list_lock , flags ) ;
list_add_rcu ( & xs - > list , & umem - > xsk_list ) ;
spin_unlock_irqrestore ( & umem - > xsk_list_lock , flags ) ;
}
void xdp_del_sk_umem ( struct xdp_umem * umem , struct xdp_sock * xs )
{
unsigned long flags ;
2019-10-21 11:16:58 +03:00
if ( ! xs - > tx )
return ;
2018-10-05 14:25:15 +03:00
spin_lock_irqsave ( & umem - > xsk_list_lock , flags ) ;
list_del_rcu ( & xs - > list ) ;
spin_unlock_irqrestore ( & umem - > xsk_list_lock , flags ) ;
2018-06-04 15:05:57 +03:00
}
2018-10-01 15:51:34 +03:00
/* The umem is stored both in the _rx struct and the _tx struct as we do
* not know if the device has more tx queues than rx , or the opposite .
* This might also change during run time .
*/
2019-01-10 22:29:02 +03:00
static int xdp_reg_umem_at_qid ( struct net_device * dev , struct xdp_umem * umem ,
u16 queue_id )
2018-07-31 06:43:53 +03:00
{
2019-01-10 22:29:02 +03:00
if ( queue_id > = max_t ( unsigned int ,
dev - > real_num_rx_queues ,
dev - > real_num_tx_queues ) )
return - EINVAL ;
2018-10-01 15:51:34 +03:00
if ( queue_id < dev - > real_num_rx_queues )
dev - > _rx [ queue_id ] . umem = umem ;
if ( queue_id < dev - > real_num_tx_queues )
dev - > _tx [ queue_id ] . umem = umem ;
2019-01-10 22:29:02 +03:00
return 0 ;
2018-10-01 15:51:34 +03:00
}
2018-07-31 06:43:53 +03:00
2018-10-01 15:51:36 +03:00
/* Look up the umem registered at @queue_id on @dev, or NULL if none.
 * Checks the rx side first; a umem registered on both sides is the same
 * pointer, so either hit is equivalent. Exported for driver use.
 */
struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
				       u16 queue_id)
{
	if (queue_id < dev->real_num_rx_queues)
		return dev->_rx[queue_id].umem;
	if (queue_id < dev->real_num_tx_queues)
		return dev->_tx[queue_id].umem;

	return NULL;
}
EXPORT_SYMBOL(xdp_get_umem_from_qid);
2018-07-31 06:43:53 +03:00
2018-10-01 15:51:34 +03:00
/* Undo xdp_reg_umem_at_qid(): clear the umem pointer from both the rx
 * and tx queue structs for @queue_id, whichever of them are in range.
 */
static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id)
{
	if (queue_id < dev->real_num_rx_queues)
		dev->_rx[queue_id].umem = NULL;
	if (queue_id < dev->real_num_tx_queues)
		dev->_tx[queue_id].umem = NULL;
}
2018-06-04 15:05:55 +03:00
/* Bind @umem to @dev/@queue_id, honouring the bind @flags.
 *
 * XDP_ZEROCOPY and XDP_COPY are mutually exclusive. Zero-copy is
 * attempted unless XDP_COPY is given; if the driver lacks the required
 * ndo_bpf/ndo_xsk_wakeup ops or rejects XDP_SETUP_XSK_UMEM, we fall
 * back to copy mode unless zero-copy was explicitly forced, in which
 * case the registration is rolled back and the error returned.
 *
 * Caller must hold the RTNL lock. Takes a reference on @dev that is
 * dropped in xdp_umem_clear_dev(). Returns 0 on success or a negative
 * errno (-EINVAL, -EBUSY, -EOPNOTSUPP, or the driver's error).
 */
int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
			u16 queue_id, u16 flags)
{
	bool force_zc, force_copy;
	struct netdev_bpf bpf;
	int err = 0;

	ASSERT_RTNL();

	force_zc = flags & XDP_ZEROCOPY;
	force_copy = flags & XDP_COPY;

	if (force_zc && force_copy)
		return -EINVAL;

	/* Only one umem may be bound per queue. */
	if (xdp_get_umem_from_qid(dev, queue_id))
		return -EBUSY;

	err = xdp_reg_umem_at_qid(dev, umem, queue_id);
	if (err)
		return err;

	umem->dev = dev;
	umem->queue_id = queue_id;

	if (flags & XDP_USE_NEED_WAKEUP) {
		umem->flags |= XDP_UMEM_USES_NEED_WAKEUP;
		/* Tx needs to be explicitly woken up the first time.
		 * Also for supporting drivers that do not implement this
		 * feature. They will always have to call sendto().
		 */
		xsk_set_tx_need_wakeup(umem);
	}

	dev_hold(dev);

	if (force_copy)
		/* For copy-mode, we are done. */
		return 0;

	if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_wakeup) {
		err = -EOPNOTSUPP;
		goto err_unreg_umem;
	}

	bpf.command = XDP_SETUP_XSK_UMEM;
	bpf.xsk.umem = umem;
	bpf.xsk.queue_id = queue_id;

	err = dev->netdev_ops->ndo_bpf(dev, &bpf);
	if (err)
		goto err_unreg_umem;

	umem->zc = true;
	return 0;

err_unreg_umem:
	if (!force_zc)
		err = 0; /* fallback to copy mode */
	if (err)
		xdp_clear_umem_at_qid(dev, queue_id);
	return err;
}
2019-06-28 11:04:07 +03:00
void xdp_umem_clear_dev ( struct xdp_umem * umem )
2018-06-04 15:05:55 +03:00
{
struct netdev_bpf bpf ;
int err ;
2019-06-28 11:04:07 +03:00
ASSERT_RTNL ( ) ;
2019-06-07 20:27:32 +03:00
if ( ! umem - > dev )
return ;
2018-10-01 15:51:34 +03:00
if ( umem - > zc ) {
2018-06-04 15:05:55 +03:00
bpf . command = XDP_SETUP_XSK_UMEM ;
bpf . xsk . umem = NULL ;
bpf . xsk . queue_id = umem - > queue_id ;
err = umem - > dev - > netdev_ops - > ndo_bpf ( umem - > dev , & bpf ) ;
if ( err )
WARN ( 1 , " failed to disable umem! \n " ) ;
2018-10-01 15:51:34 +03:00
}
2019-06-07 20:27:32 +03:00
xdp_clear_umem_at_qid ( umem - > dev , umem - > queue_id ) ;
2018-06-04 15:05:55 +03:00
2019-06-28 11:04:06 +03:00
dev_put ( umem - > dev ) ;
umem - > dev = NULL ;
umem - > zc = false ;
2018-06-04 15:05:55 +03:00
}
2019-08-08 12:38:03 +03:00
static void xdp_umem_unmap_pages ( struct xdp_umem * umem )
{
unsigned int i ;
for ( i = 0 ; i < umem - > npgs ; i + + )
2019-08-15 15:13:55 +03:00
if ( PageHighMem ( umem - > pgs [ i ] ) )
vunmap ( umem - > pages [ i ] . addr ) ;
}
static int xdp_umem_map_pages ( struct xdp_umem * umem )
{
unsigned int i ;
void * addr ;
for ( i = 0 ; i < umem - > npgs ; i + + ) {
if ( PageHighMem ( umem - > pgs [ i ] ) )
addr = vmap ( & umem - > pgs [ i ] , 1 , VM_MAP , PAGE_KERNEL ) ;
else
addr = page_address ( umem - > pgs [ i ] ) ;
if ( ! addr ) {
xdp_umem_unmap_pages ( umem ) ;
return - ENOMEM ;
}
umem - > pages [ i ] . addr = addr ;
}
return 0 ;
2019-08-08 12:38:03 +03:00
}
2018-05-02 14:01:23 +03:00
static void xdp_umem_unpin_pages ( struct xdp_umem * umem )
{
2020-01-31 09:13:35 +03:00
unpin_user_pages_dirty_lock ( umem - > pgs , umem - > npgs , true ) ;
2018-05-22 10:35:02 +03:00
kfree ( umem - > pgs ) ;
umem - > pgs = NULL ;
2018-05-02 14:01:23 +03:00
}
static void xdp_umem_unaccount_pages ( struct xdp_umem * umem )
{
2018-06-08 01:06:01 +03:00
if ( umem - > user ) {
atomic_long_sub ( umem - > npgs , & umem - > user - > locked_vm ) ;
free_uid ( umem - > user ) ;
}
2018-05-02 14:01:23 +03:00
}
static void xdp_umem_release ( struct xdp_umem * umem )
{
2019-06-28 11:04:07 +03:00
rtnl_lock ( ) ;
2018-06-04 15:05:55 +03:00
xdp_umem_clear_dev ( umem ) ;
2019-06-28 11:04:07 +03:00
rtnl_unlock ( ) ;
2018-06-04 15:05:55 +03:00
2019-01-24 21:59:38 +03:00
ida_simple_remove ( & umem_ida , umem - > id ) ;
2018-05-02 14:01:24 +03:00
if ( umem - > fq ) {
xskq_destroy ( umem - > fq ) ;
umem - > fq = NULL ;
}
2018-05-02 14:01:31 +03:00
if ( umem - > cq ) {
xskq_destroy ( umem - > cq ) ;
umem - > cq = NULL ;
}
2018-09-07 11:18:46 +03:00
xsk_reuseq_destroy ( umem ) ;
2019-08-08 12:38:03 +03:00
xdp_umem_unmap_pages ( umem ) ;
2018-05-22 10:35:02 +03:00
xdp_umem_unpin_pages ( umem ) ;
2018-05-02 14:01:23 +03:00
2020-01-14 12:49:25 +03:00
kvfree ( umem - > pages ) ;
2018-06-04 15:05:52 +03:00
umem - > pages = NULL ;
2018-05-02 14:01:23 +03:00
xdp_umem_unaccount_pages ( umem ) ;
kfree ( umem ) ;
}
/* Workqueue trampoline: xdp_put_umem() defers the release to process
 * context because xdp_umem_release() takes rtnl_lock() and may sleep.
 */
static void xdp_umem_release_deferred(struct work_struct *work)
{
	struct xdp_umem *umem = container_of(work, struct xdp_umem, work);

	xdp_umem_release(umem);
}
void xdp_get_umem ( struct xdp_umem * umem )
{
2018-05-22 10:35:03 +03:00
refcount_inc ( & umem - > users ) ;
2018-05-02 14:01:23 +03:00
}
void xdp_put_umem ( struct xdp_umem * umem )
{
if ( ! umem )
return ;
2018-05-22 10:35:03 +03:00
if ( refcount_dec_and_test ( & umem - > users ) ) {
2018-05-02 14:01:23 +03:00
INIT_WORK ( & umem - > work , xdp_umem_release_deferred ) ;
schedule_work ( & umem - > work ) ;
}
}
static int xdp_umem_pin_pages ( struct xdp_umem * umem )
{
unsigned int gup_flags = FOLL_WRITE ;
long npgs ;
int err ;
2018-06-11 14:57:12 +03:00
umem - > pgs = kcalloc ( umem - > npgs , sizeof ( * umem - > pgs ) ,
GFP_KERNEL | __GFP_NOWARN ) ;
2018-05-02 14:01:23 +03:00
if ( ! umem - > pgs )
return - ENOMEM ;
2019-02-11 19:15:29 +03:00
down_read ( & current - > mm - > mmap_sem ) ;
2020-01-31 09:13:17 +03:00
npgs = pin_user_pages ( umem - > address , umem - > npgs ,
mm/gup: replace get_user_pages_longterm() with FOLL_LONGTERM
Pach series "Add FOLL_LONGTERM to GUP fast and use it".
HFI1, qib, and mthca, use get_user_pages_fast() due to its performance
advantages. These pages can be held for a significant time. But
get_user_pages_fast() does not protect against mapping FS DAX pages.
Introduce FOLL_LONGTERM and use this flag in get_user_pages_fast() which
retains the performance while also adding the FS DAX checks. XDP has also
shown interest in using this functionality.[1]
In addition we change get_user_pages() to use the new FOLL_LONGTERM flag
and remove the specialized get_user_pages_longterm call.
[1] https://lkml.org/lkml/2019/3/19/939
"longterm" is a relative thing and at this point is probably a misnomer.
This is really flagging a pin which is going to be given to hardware and
can't move. I've thought of a couple of alternative names but I think we
have to settle on if we are going to use FL_LAYOUT or something else to
solve the "longterm" problem. Then I think we can change the flag to a
better name.
Secondly, it depends on how often you are registering memory. I have
spoken with some RDMA users who consider MR in the performance path...
For the overall application performance. I don't have the numbers as the
tests for HFI1 were done a long time ago. But there was a significant
advantage. Some of which is probably due to the fact that you don't have
to hold mmap_sem.
Finally, architecturally I think it would be good for everyone to use
*_fast. There are patches submitted to the RDMA list which would allow
the use of *_fast (they reworking the use of mmap_sem) and as soon as they
are accepted I'll submit a patch to convert the RDMA core as well. Also
to this point others are looking to use *_fast.
As an aside, Jasons pointed out in my previous submission that *_fast and
*_unlocked look very much the same. I agree and I think further cleanup
will be coming. But I'm focused on getting the final solution for DAX at
the moment.
This patch (of 7):
This patch starts a series which aims to support FOLL_LONGTERM in
get_user_pages_fast(). Some callers who would like to do a longterm (user
controlled pin) of pages with the fast variant of GUP for performance
purposes.
Rather than have a separate get_user_pages_longterm() call, introduce
FOLL_LONGTERM and change the longterm callers to use it.
This patch does not change any functionality. In the short term
"longterm" or user controlled pins are unsafe for Filesystems and FS DAX
in particular has been blocked. However, callers of get_user_pages_fast()
were not "protected".
FOLL_LONGTERM can _only_ be supported with get_user_pages[_fast]() as it
requires vmas to determine if DAX is in use.
NOTE: In merging with the CMA changes we opt to change the
get_user_pages() call in check_and_migrate_cma_pages() to a call of
__get_user_pages_locked() on the newly migrated pages. This makes the
code read better in that we are calling __get_user_pages_locked() on the
pages before and after a potential migration.
As a side affect some of the interfaces are cleaned up but this is not the
primary purpose of the series.
In review[1] it was asked:
<quote>
> This I don't get - if you do lock down long term mappings performance
> of the actual get_user_pages call shouldn't matter to start with.
>
> What do I miss?
A couple of points.
First "longterm" is a relative thing and at this point is probably a
misnomer. This is really flagging a pin which is going to be given to
hardware and can't move. I've thought of a couple of alternative names
but I think we have to settle on if we are going to use FL_LAYOUT or
something else to solve the "longterm" problem. Then I think we can
change the flag to a better name.
Second, It depends on how often you are registering memory. I have spoken
with some RDMA users who consider MR in the performance path... For the
overall application performance. I don't have the numbers as the tests
for HFI1 were done a long time ago. But there was a significant
advantage. Some of which is probably due to the fact that you don't have
to hold mmap_sem.
Finally, architecturally I think it would be good for everyone to use
*_fast. There are patches submitted to the RDMA list which would allow
the use of *_fast (they reworking the use of mmap_sem) and as soon as they
are accepted I'll submit a patch to convert the RDMA core as well. Also
to this point others are looking to use *_fast.
As an asside, Jasons pointed out in my previous submission that *_fast and
*_unlocked look very much the same. I agree and I think further cleanup
will be coming. But I'm focused on getting the final solution for DAX at
the moment.
</quote>
[1] https://lore.kernel.org/lkml/20190220180255.GA12020@iweiny-DESK2.sc.intel.com/T/#md6abad2569f3bf6c1f03686c8097ab6563e94965
[ira.weiny@intel.com: v3]
Link: http://lkml.kernel.org/r/20190328084422.29911-2-ira.weiny@intel.com
Link: http://lkml.kernel.org/r/20190328084422.29911-2-ira.weiny@intel.com
Link: http://lkml.kernel.org/r/20190317183438.2057-2-ira.weiny@intel.com
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Rich Felker <dalias@libc.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: James Hogan <jhogan@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Mike Marshall <hubcap@omnibond.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-14 03:17:03 +03:00
gup_flags | FOLL_LONGTERM , & umem - > pgs [ 0 ] , NULL ) ;
2019-02-11 19:15:29 +03:00
up_read ( & current - > mm - > mmap_sem ) ;
2018-05-02 14:01:23 +03:00
if ( npgs ! = umem - > npgs ) {
if ( npgs > = 0 ) {
umem - > npgs = npgs ;
err = - ENOMEM ;
goto out_pin ;
}
err = npgs ;
goto out_pgs ;
}
return 0 ;
out_pin :
xdp_umem_unpin_pages ( umem ) ;
out_pgs :
kfree ( umem - > pgs ) ;
umem - > pgs = NULL ;
return err ;
}
static int xdp_umem_account_pages ( struct xdp_umem * umem )
{
unsigned long lock_limit , new_npgs , old_npgs ;
if ( capable ( CAP_IPC_LOCK ) )
return 0 ;
lock_limit = rlimit ( RLIMIT_MEMLOCK ) > > PAGE_SHIFT ;
umem - > user = get_uid ( current_user ( ) ) ;
do {
old_npgs = atomic_long_read ( & umem - > user - > locked_vm ) ;
new_npgs = old_npgs + umem - > npgs ;
if ( new_npgs > lock_limit ) {
free_uid ( umem - > user ) ;
umem - > user = NULL ;
return - ENOBUFS ;
}
} while ( atomic_long_cmpxchg ( & umem - > user - > locked_vm , old_npgs ,
new_npgs ) ! = old_npgs ) ;
return 0 ;
}
2018-05-22 10:35:02 +03:00
static int xdp_umem_reg ( struct xdp_umem * umem , struct xdp_umem_reg * mr )
2018-05-02 14:01:23 +03:00
{
xsk: add support to allow unaligned chunk placement
Currently, addresses are chunk size aligned. This means, we are very
restricted in terms of where we can place chunk within the umem. For
example, if we have a chunk size of 2k, then our chunks can only be placed
at 0,2k,4k,6k,8k... and so on (ie. every 2k starting from 0).
This patch introduces the ability to use unaligned chunks. With these
changes, we are no longer bound to having to place chunks at a 2k (or
whatever your chunk size is) interval. Since we are no longer dealing with
aligned chunks, they can now cross page boundaries. Checks for page
contiguity have been added in order to keep track of which pages are
followed by a physically contiguous page.
Signed-off-by: Kevin Laatz <kevin.laatz@intel.com>
Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-08-27 05:25:22 +03:00
bool unaligned_chunks = mr - > flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG ;
xsk: new descriptor addressing scheme
Currently, AF_XDP only supports a fixed frame-size memory scheme where
each frame is referenced via an index (idx). A user passes the frame
index to the kernel, and the kernel acts upon the data. Some NICs,
however, do not have a fixed frame-size model, instead they have a
model where a memory window is passed to the hardware and multiple
frames are filled into that window (referred to as the "type-writer"
model).
By changing the descriptor format from the current frame index
addressing scheme, AF_XDP can in the future be extended to support
these kinds of NICs.
In the index-based model, an idx refers to a frame of size
frame_size. Addressing a frame in the UMEM is done by offseting the
UMEM starting address by a global offset, idx * frame_size + offset.
Communicating via the fill- and completion-rings are done by means of
idx.
In this commit, the idx is removed in favor of an address (addr),
which is a relative address ranging over the UMEM. To convert an
idx-based address to the new addr is simply: addr = idx * frame_size +
offset.
We also stop referring to the UMEM "frame" as a frame. Instead it is
simply called a chunk.
To transfer ownership of a chunk to the kernel, the addr of the chunk
is passed in the fill-ring. Note, that the kernel will mask addr to
make it chunk aligned, so there is no need for userspace to do
that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or
3000 to the fill-ring will refer to the same chunk.
On the completion-ring, the addr will match that of the Tx descriptor,
passed to the kernel.
Changing the descriptor format to use chunks/addr will allow for
future changes to move to a type-writer based model, where multiple
frames can reside in one chunk. In this model passing one single chunk
into the fill-ring, would potentially result in multiple Rx
descriptors.
This commit changes the uapi of AF_XDP sockets, and updates the
documentation.
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-06-04 14:57:13 +03:00
u32 chunk_size = mr - > chunk_size , headroom = mr - > headroom ;
unsigned int chunks , chunks_per_page ;
2018-05-02 14:01:23 +03:00
u64 addr = mr - > addr , size = mr - > len ;
2020-04-14 10:35:15 +03:00
int err ;
2018-05-02 14:01:23 +03:00
xsk: new descriptor addressing scheme
Currently, AF_XDP only supports a fixed frame-size memory scheme where
each frame is referenced via an index (idx). A user passes the frame
index to the kernel, and the kernel acts upon the data. Some NICs,
however, do not have a fixed frame-size model, instead they have a
model where a memory window is passed to the hardware and multiple
frames are filled into that window (referred to as the "type-writer"
model).
By changing the descriptor format from the current frame index
addressing scheme, AF_XDP can in the future be extended to support
these kinds of NICs.
In the index-based model, an idx refers to a frame of size
frame_size. Addressing a frame in the UMEM is done by offseting the
UMEM starting address by a global offset, idx * frame_size + offset.
Communicating via the fill- and completion-rings are done by means of
idx.
In this commit, the idx is removed in favor of an address (addr),
which is a relative address ranging over the UMEM. To convert an
idx-based address to the new addr is simply: addr = idx * frame_size +
offset.
We also stop referring to the UMEM "frame" as a frame. Instead it is
simply called a chunk.
To transfer ownership of a chunk to the kernel, the addr of the chunk
is passed in the fill-ring. Note, that the kernel will mask addr to
make it chunk aligned, so there is no need for userspace to do
that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or
3000 to the fill-ring will refer to the same chunk.
On the completion-ring, the addr will match that of the Tx descriptor,
passed to the kernel.
Changing the descriptor format to use chunks/addr will allow for
future changes to move to a type-writer based model, where multiple
frames can reside in one chunk. In this model passing one single chunk
into the fill-ring, would potentially result in multiple Rx
descriptors.
This commit changes the uapi of AF_XDP sockets, and updates the
documentation.
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-06-04 14:57:13 +03:00
if ( chunk_size < XDP_UMEM_MIN_CHUNK_SIZE | | chunk_size > PAGE_SIZE ) {
2018-05-02 14:01:23 +03:00
/* Strictly speaking we could support this, if:
* - huge pages , or *
* - using an IOMMU , or
* - making sure the memory area is consecutive
* but for now , we simply say " computer says no " .
*/
return - EINVAL ;
}
xsk: add support to allow unaligned chunk placement
Currently, addresses are chunk size aligned. This means, we are very
restricted in terms of where we can place chunk within the umem. For
example, if we have a chunk size of 2k, then our chunks can only be placed
at 0,2k,4k,6k,8k... and so on (ie. every 2k starting from 0).
This patch introduces the ability to use unaligned chunks. With these
changes, we are no longer bound to having to place chunks at a 2k (or
whatever your chunk size is) interval. Since we are no longer dealing with
aligned chunks, they can now cross page boundaries. Checks for page
contiguity have been added in order to keep track of which pages are
followed by a physically contiguous page.
Signed-off-by: Kevin Laatz <kevin.laatz@intel.com>
Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-08-27 05:25:22 +03:00
if ( mr - > flags & ~ ( XDP_UMEM_UNALIGNED_CHUNK_FLAG |
XDP_UMEM_USES_NEED_WAKEUP ) )
return - EINVAL ;
if ( ! unaligned_chunks & & ! is_power_of_2 ( chunk_size ) )
2018-05-02 14:01:23 +03:00
return - EINVAL ;
if ( ! PAGE_ALIGNED ( addr ) ) {
/* Memory area has to be page size aligned. For
* simplicity , this might change .
*/
return - EINVAL ;
}
if ( ( addr + size ) < addr )
return - EINVAL ;
xsk: new descriptor addressing scheme
Currently, AF_XDP only supports a fixed frame-size memory scheme where
each frame is referenced via an index (idx). A user passes the frame
index to the kernel, and the kernel acts upon the data. Some NICs,
however, do not have a fixed frame-size model, instead they have a
model where a memory window is passed to the hardware and multiple
frames are filled into that window (referred to as the "type-writer"
model).
By changing the descriptor format from the current frame index
addressing scheme, AF_XDP can in the future be extended to support
these kinds of NICs.
In the index-based model, an idx refers to a frame of size
frame_size. Addressing a frame in the UMEM is done by offseting the
UMEM starting address by a global offset, idx * frame_size + offset.
Communicating via the fill- and completion-rings are done by means of
idx.
In this commit, the idx is removed in favor of an address (addr),
which is a relative address ranging over the UMEM. To convert an
idx-based address to the new addr is simply: addr = idx * frame_size +
offset.
We also stop referring to the UMEM "frame" as a frame. Instead it is
simply called a chunk.
To transfer ownership of a chunk to the kernel, the addr of the chunk
is passed in the fill-ring. Note, that the kernel will mask addr to
make it chunk aligned, so there is no need for userspace to do
that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or
3000 to the fill-ring will refer to the same chunk.
On the completion-ring, the addr will match that of the Tx descriptor,
passed to the kernel.
Changing the descriptor format to use chunks/addr will allow for
future changes to move to a type-writer based model, where multiple
frames can reside in one chunk. In this model passing one single chunk
into the fill-ring, would potentially result in multiple Rx
descriptors.
This commit changes the uapi of AF_XDP sockets, and updates the
documentation.
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-06-04 14:57:13 +03:00
chunks = ( unsigned int ) div_u64 ( size , chunk_size ) ;
if ( chunks = = 0 )
2018-05-02 14:01:23 +03:00
return - EINVAL ;
xsk: add support to allow unaligned chunk placement
Currently, addresses are chunk size aligned. This means, we are very
restricted in terms of where we can place chunk within the umem. For
example, if we have a chunk size of 2k, then our chunks can only be placed
at 0,2k,4k,6k,8k... and so on (ie. every 2k starting from 0).
This patch introduces the ability to use unaligned chunks. With these
changes, we are no longer bound to having to place chunks at a 2k (or
whatever your chunk size is) interval. Since we are no longer dealing with
aligned chunks, they can now cross page boundaries. Checks for page
contiguity have been added in order to keep track of which pages are
followed by a physically contiguous page.
Signed-off-by: Kevin Laatz <kevin.laatz@intel.com>
Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-08-27 05:25:22 +03:00
if ( ! unaligned_chunks ) {
chunks_per_page = PAGE_SIZE / chunk_size ;
if ( chunks < chunks_per_page | | chunks % chunks_per_page )
return - EINVAL ;
}
2018-05-02 14:01:23 +03:00
2020-04-14 10:35:15 +03:00
if ( headroom > = chunk_size - XDP_PACKET_HEADROOM )
2018-05-02 14:01:23 +03:00
return - EINVAL ;
umem - > address = ( unsigned long ) addr ;
xsk: add support to allow unaligned chunk placement
Currently, addresses are chunk size aligned. This means, we are very
restricted in terms of where we can place chunks within the umem. For
example, if we have a chunk size of 2k, then our chunks can only be placed
at 0,2k,4k,6k,8k... and so on (ie. every 2k starting from 0).
This patch introduces the ability to use unaligned chunks. With these
changes, we are no longer bound to having to place chunks at a 2k (or
whatever your chunk size is) interval. Since we are no longer dealing with
aligned chunks, they can now cross page boundaries. Checks for page
contiguity have been added in order to keep track of which pages are
followed by a physically contiguous page.
Signed-off-by: Kevin Laatz <kevin.laatz@intel.com>
Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-08-27 05:25:22 +03:00
umem - > chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK
: ~ ( ( u64 ) chunk_size - 1 ) ;
2018-08-31 14:40:02 +03:00
umem - > size = size ;
xsk: new descriptor addressing scheme
Currently, AF_XDP only supports a fixed frame-size memory scheme where
each frame is referenced via an index (idx). A user passes the frame
index to the kernel, and the kernel acts upon the data. Some NICs,
however, do not have a fixed frame-size model, instead they have a
model where a memory window is passed to the hardware and multiple
frames are filled into that window (referred to as the "type-writer"
model).
By changing the descriptor format from the current frame index
addressing scheme, AF_XDP can in the future be extended to support
these kinds of NICs.
In the index-based model, an idx refers to a frame of size
frame_size. Addressing a frame in the UMEM is done by offsetting the
UMEM starting address by a global offset, idx * frame_size + offset.
Communicating via the fill- and completion-rings are done by means of
idx.
In this commit, the idx is removed in favor of an address (addr),
which is a relative address ranging over the UMEM. To convert an
idx-based address to the new addr is simply: addr = idx * frame_size +
offset.
We also stop referring to the UMEM "frame" as a frame. Instead it is
simply called a chunk.
To transfer ownership of a chunk to the kernel, the addr of the chunk
is passed in the fill-ring. Note that the kernel will mask addr to
make it chunk aligned, so there is no need for userspace to do
that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or
3000 to the fill-ring will refer to the same chunk.
On the completion-ring, the addr will match that of the Tx descriptor,
passed to the kernel.
Changing the descriptor format to use chunks/addr will allow for
future changes to move to a type-writer based model, where multiple
frames can reside in one chunk. In this model passing one single chunk
into the fill-ring, would potentially result in multiple Rx
descriptors.
This commit changes the uapi of AF_XDP sockets, and updates the
documentation.
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-06-04 14:57:13 +03:00
umem - > headroom = headroom ;
umem - > chunk_size_nohr = chunk_size - headroom ;
2018-05-02 14:01:23 +03:00
umem - > npgs = size / PAGE_SIZE ;
umem - > pgs = NULL ;
umem - > user = NULL ;
xsk: add support to allow unaligned chunk placement
Currently, addresses are chunk size aligned. This means, we are very
restricted in terms of where we can place chunks within the umem. For
example, if we have a chunk size of 2k, then our chunks can only be placed
at 0,2k,4k,6k,8k... and so on (ie. every 2k starting from 0).
This patch introduces the ability to use unaligned chunks. With these
changes, we are no longer bound to having to place chunks at a 2k (or
whatever your chunk size is) interval. Since we are no longer dealing with
aligned chunks, they can now cross page boundaries. Checks for page
contiguity have been added in order to keep track of which pages are
followed by a physically contiguous page.
Signed-off-by: Kevin Laatz <kevin.laatz@intel.com>
Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-08-27 05:25:22 +03:00
umem - > flags = mr - > flags ;
2018-06-04 15:05:57 +03:00
INIT_LIST_HEAD ( & umem - > xsk_list ) ;
spin_lock_init ( & umem - > xsk_list_lock ) ;
2018-05-02 14:01:23 +03:00
2018-05-22 10:35:03 +03:00
refcount_set ( & umem - > users , 1 ) ;
2018-05-02 14:01:23 +03:00
err = xdp_umem_account_pages ( umem ) ;
if ( err )
2019-03-13 17:15:49 +03:00
return err ;
2018-05-02 14:01:23 +03:00
err = xdp_umem_pin_pages ( umem ) ;
if ( err )
goto out_account ;
2018-06-04 15:05:52 +03:00
2020-01-14 12:49:25 +03:00
umem - > pages = kvcalloc ( umem - > npgs , sizeof ( * umem - > pages ) ,
GFP_KERNEL_ACCOUNT ) ;
2018-06-04 15:05:52 +03:00
if ( ! umem - > pages ) {
err = - ENOMEM ;
2019-08-15 23:56:35 +03:00
goto out_pin ;
2018-06-04 15:05:52 +03:00
}
2019-08-15 15:13:55 +03:00
err = xdp_umem_map_pages ( umem ) ;
if ( ! err )
return 0 ;
2018-06-04 15:05:52 +03:00
2020-01-14 12:49:25 +03:00
kvfree ( umem - > pages ) ;
2018-05-02 14:01:23 +03:00
2019-08-15 23:56:35 +03:00
out_pin :
xdp_umem_unpin_pages ( umem ) ;
2018-05-02 14:01:23 +03:00
out_account :
xdp_umem_unaccount_pages ( umem ) ;
return err ;
}
2018-05-02 14:01:26 +03:00
2018-05-22 10:35:02 +03:00
struct xdp_umem * xdp_umem_create ( struct xdp_umem_reg * mr )
{
struct xdp_umem * umem ;
int err ;
umem = kzalloc ( sizeof ( * umem ) , GFP_KERNEL ) ;
if ( ! umem )
return ERR_PTR ( - ENOMEM ) ;
2019-01-24 21:59:38 +03:00
err = ida_simple_get ( & umem_ida , 0 , 0 , GFP_KERNEL ) ;
if ( err < 0 ) {
kfree ( umem ) ;
return ERR_PTR ( err ) ;
}
umem - > id = err ;
2018-05-22 10:35:02 +03:00
err = xdp_umem_reg ( umem , mr ) ;
if ( err ) {
2019-01-24 21:59:38 +03:00
ida_simple_remove ( & umem_ida , umem - > id ) ;
2018-05-22 10:35:02 +03:00
kfree ( umem ) ;
return ERR_PTR ( err ) ;
}
return umem ;
}
2018-05-02 14:01:26 +03:00
bool xdp_umem_validate_queues ( struct xdp_umem * umem )
{
2018-05-18 15:00:23 +03:00
return umem - > fq & & umem - > cq ;
2018-05-02 14:01:26 +03:00
}