2018-05-02 13:01:23 +02:00
// SPDX-License-Identifier: GPL-2.0
/* XDP user-space packet buffer
 * Copyright(c) 2018 Intel Corporation.
 */
# include <linux/init.h>
# include <linux/sched/mm.h>
# include <linux/sched/signal.h>
# include <linux/sched/task.h>
# include <linux/uaccess.h>
# include <linux/slab.h>
# include <linux/bpf.h>
# include <linux/mm.h>
2018-07-30 20:43:53 -07:00
# include <linux/netdevice.h>
# include <linux/rtnetlink.h>
2019-01-24 19:59:38 +01:00
# include <linux/idr.h>
2019-08-15 15:13:55 +03:00
# include <linux/vmalloc.h>
2018-05-02 13:01:23 +02:00
# include "xdp_umem.h"
2018-06-04 14:05:51 +02:00
# include "xsk_queue.h"
2018-05-02 13:01:23 +02:00
xsk: new descriptor addressing scheme
Currently, AF_XDP only supports a fixed frame-size memory scheme where
each frame is referenced via an index (idx). A user passes the frame
index to the kernel, and the kernel acts upon the data. Some NICs,
however, do not have a fixed frame-size model, instead they have a
model where a memory window is passed to the hardware and multiple
frames are filled into that window (referred to as the "type-writer"
model).
By changing the descriptor format from the current frame index
addressing scheme, AF_XDP can in the future be extended to support
these kinds of NICs.
In the index-based model, an idx refers to a frame of size
frame_size. Addressing a frame in the UMEM is done by offsetting the
UMEM starting address by a global offset, idx * frame_size + offset.
Communicating via the fill- and completion-rings is done by means of
idx.
In this commit, the idx is removed in favor of an address (addr),
which is a relative address ranging over the UMEM. To convert an
idx-based address to the new addr is simply: addr = idx * frame_size +
offset.
We also stop referring to the UMEM "frame" as a frame. Instead it is
simply called a chunk.
To transfer ownership of a chunk to the kernel, the addr of the chunk
is passed in the fill-ring. Note, that the kernel will mask addr to
make it chunk aligned, so there is no need for userspace to do
that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or
3000 to the fill-ring will refer to the same chunk.
On the completion-ring, the addr will match that of the Tx descriptor,
passed to the kernel.
Changing the descriptor format to use chunks/addr will allow for
future changes to move to a type-writer based model, where multiple
frames can reside in one chunk. In this model passing one single chunk
into the fill-ring, would potentially result in multiple Rx
descriptors.
This commit changes the uapi of AF_XDP sockets, and updates the
documentation.
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-06-04 13:57:13 +02:00
/* Smallest chunk size, in bytes, that xdp_umem_reg() accepts. */
#define XDP_UMEM_MIN_CHUNK_SIZE 2048

/* Allocator for the ids stored in umem->id. */
static DEFINE_IDA(umem_ida);
2020-08-28 10:26:17 +02:00
static void xdp_umem_unpin_pages ( struct xdp_umem * umem )
2018-10-01 14:51:34 +02:00
{
2020-08-28 10:26:17 +02:00
unpin_user_pages_dirty_lock ( umem - > pgs , umem - > npgs , true ) ;
2018-07-30 20:43:53 -07:00
2021-05-21 10:33:01 +02:00
kvfree ( umem - > pgs ) ;
2020-08-28 10:26:17 +02:00
umem - > pgs = NULL ;
2018-10-01 14:51:34 +02:00
}
2018-07-30 20:43:53 -07:00
2020-08-28 10:26:17 +02:00
static void xdp_umem_unaccount_pages ( struct xdp_umem * umem )
2018-10-01 14:51:34 +02:00
{
2020-08-28 10:26:17 +02:00
if ( umem - > user ) {
atomic_long_sub ( umem - > npgs , & umem - > user - > locked_vm ) ;
free_uid ( umem - > user ) ;
}
2018-07-30 20:43:53 -07:00
}
2020-08-28 10:26:21 +02:00
static void xdp_umem_addr_unmap ( struct xdp_umem * umem )
{
vunmap ( umem - > addrs ) ;
umem - > addrs = NULL ;
}
/* Map @nr_pages pages from @pages into one kernel virtual range and store
 * the result in umem->addrs.
 *
 * Returns 0 on success, -ENOMEM when vmap() fails (umem->addrs is then
 * NULL).
 */
static int xdp_umem_addr_map(struct xdp_umem *umem, struct page **pages,
			     u32 nr_pages)
{
	void *kaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);

	umem->addrs = kaddr;

	return kaddr ? 0 : -ENOMEM;
}
2018-05-02 13:01:23 +02:00
static void xdp_umem_release ( struct xdp_umem * umem )
{
2020-08-28 10:26:19 +02:00
umem - > zc = false ;
2019-01-24 19:59:38 +01:00
ida_simple_remove ( & umem_ida , umem - > id ) ;
2020-08-28 10:26:21 +02:00
xdp_umem_addr_unmap ( umem ) ;
2018-05-22 09:35:02 +02:00
xdp_umem_unpin_pages ( umem ) ;
2018-05-02 13:01:23 +02:00
xdp_umem_unaccount_pages ( umem ) ;
kfree ( umem ) ;
}
xsk: Fix umem cleanup bug at socket destruct
Fix a bug that is triggered when a partially setup socket is
destroyed. For a fully setup socket, a socket that has been bound to a
device, the cleanup of the umem is performed at the end of the buffer
pool's cleanup work queue item. This has to be performed in a work
queue, and not in RCU cleanup, as it is doing a vunmap that cannot
execute in interrupt context. However, when a socket has only been
partially set up so that a umem has been created but the buffer pool
has not, the code erroneously directly calls the umem cleanup function
instead of using a work queue, and this leads to a BUG_ON() in
vunmap().
As there is no buffer pool in this case, we cannot use its work queue,
so we need to introduce a work queue for the umem and schedule the
cleanup on it. So in the case where there is no pool, we are going to use
the umem's own work queue to schedule the cleanup. But if there is a
pool, the cleanup of the umem is still being performed by the pool's
work queue, as it is important that the umem is cleaned up after the
pool.
Fixes: e5e1a4bc916d ("xsk: Fix possible memory leak at socket close")
Reported-by: Marek Majtyka <marekx.majtyka@intel.com>
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Marek Majtyka <marekx.majtyka@intel.com>
Link: https://lore.kernel.org/bpf/1605873219-21629-1-git-send-email-magnus.karlsson@gmail.com
2020-11-20 12:53:39 +01:00
/* Work item callback: recover the umem that embeds @work and release it. */
static void xdp_umem_release_deferred(struct work_struct *work)
{
	struct xdp_umem *owner;

	owner = container_of(work, struct xdp_umem, work);
	xdp_umem_release(owner);
}
2018-05-02 13:01:23 +02:00
void xdp_get_umem ( struct xdp_umem * umem )
{
2018-05-22 09:35:03 +02:00
refcount_inc ( & umem - > users ) ;
2018-05-02 13:01:23 +02:00
}
xsk: Fix umem cleanup bug at socket destruct
Fix a bug that is triggered when a partially setup socket is
destroyed. For a fully setup socket, a socket that has been bound to a
device, the cleanup of the umem is performed at the end of the buffer
pool's cleanup work queue item. This has to be performed in a work
queue, and not in RCU cleanup, as it is doing a vunmap that cannot
execute in interrupt context. However, when a socket has only been
partially set up so that a umem has been created but the buffer pool
has not, the code erroneously directly calls the umem cleanup function
instead of using a work queue, and this leads to a BUG_ON() in
vunmap().
As there is no buffer pool in this case, we cannot use its work queue,
so we need to introduce a work queue for the umem and schedule the
cleanup on it. So in the case where there is no pool, we are going to use
the umem's own work queue to schedule the cleanup. But if there is a
pool, the cleanup of the umem is still being performed by the pool's
work queue, as it is important that the umem is cleaned up after the
pool.
Fixes: e5e1a4bc916d ("xsk: Fix possible memory leak at socket close")
Reported-by: Marek Majtyka <marekx.majtyka@intel.com>
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Marek Majtyka <marekx.majtyka@intel.com>
Link: https://lore.kernel.org/bpf/1605873219-21629-1-git-send-email-magnus.karlsson@gmail.com
2020-11-20 12:53:39 +01:00
/* Drop a reference on @umem and release it when the last one goes away.
 *
 * @umem:          umem to put; NULL is accepted and ignored
 * @defer_cleanup: when true, the release is handed to a work item instead
 *                 of running in the caller's context. The release path
 *                 performs a vunmap(), which cannot execute in interrupt
 *                 context.
 */
void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup)
{
	if (!umem)
		return;

	if (!refcount_dec_and_test(&umem->users))
		return;

	if (!defer_cleanup) {
		xdp_umem_release(umem);
		return;
	}

	INIT_WORK(&umem->work, xdp_umem_release_deferred);
	schedule_work(&umem->work);
}
2020-05-04 15:33:52 +02:00
static int xdp_umem_pin_pages ( struct xdp_umem * umem , unsigned long address )
2018-05-02 13:01:23 +02:00
{
unsigned int gup_flags = FOLL_WRITE ;
long npgs ;
int err ;
2021-05-21 10:33:01 +02:00
umem - > pgs = kvcalloc ( umem - > npgs , sizeof ( * umem - > pgs ) , GFP_KERNEL | __GFP_NOWARN ) ;
2018-05-02 13:01:23 +02:00
if ( ! umem - > pgs )
return - ENOMEM ;
2020-06-08 21:33:25 -07:00
mmap_read_lock ( current - > mm ) ;
2020-05-04 15:33:52 +02:00
npgs = pin_user_pages ( address , umem - > npgs ,
mm/gup: replace get_user_pages_longterm() with FOLL_LONGTERM
Pach series "Add FOLL_LONGTERM to GUP fast and use it".
HFI1, qib, and mthca, use get_user_pages_fast() due to its performance
advantages. These pages can be held for a significant time. But
get_user_pages_fast() does not protect against mapping FS DAX pages.
Introduce FOLL_LONGTERM and use this flag in get_user_pages_fast() which
retains the performance while also adding the FS DAX checks. XDP has also
shown interest in using this functionality.[1]
In addition we change get_user_pages() to use the new FOLL_LONGTERM flag
and remove the specialized get_user_pages_longterm call.
[1] https://lkml.org/lkml/2019/3/19/939
"longterm" is a relative thing and at this point is probably a misnomer.
This is really flagging a pin which is going to be given to hardware and
can't move. I've thought of a couple of alternative names but I think we
have to settle on if we are going to use FL_LAYOUT or something else to
solve the "longterm" problem. Then I think we can change the flag to a
better name.
Secondly, it depends on how often you are registering memory. I have
spoken with some RDMA users who consider MR in the performance path...
For the overall application performance. I don't have the numbers as the
tests for HFI1 were done a long time ago. But there was a significant
advantage. Some of which is probably due to the fact that you don't have
to hold mmap_sem.
Finally, architecturally I think it would be good for everyone to use
*_fast. There are patches submitted to the RDMA list which would allow
the use of *_fast (they reworking the use of mmap_sem) and as soon as they
are accepted I'll submit a patch to convert the RDMA core as well. Also
to this point others are looking to use *_fast.
As an aside, Jasons pointed out in my previous submission that *_fast and
*_unlocked look very much the same. I agree and I think further cleanup
will be coming. But I'm focused on getting the final solution for DAX at
the moment.
This patch (of 7):
This patch starts a series which aims to support FOLL_LONGTERM in
get_user_pages_fast(). Some callers who would like to do a longterm (user
controlled pin) of pages with the fast variant of GUP for performance
purposes.
Rather than have a separate get_user_pages_longterm() call, introduce
FOLL_LONGTERM and change the longterm callers to use it.
This patch does not change any functionality. In the short term
"longterm" or user controlled pins are unsafe for Filesystems and FS DAX
in particular has been blocked. However, callers of get_user_pages_fast()
were not "protected".
FOLL_LONGTERM can _only_ be supported with get_user_pages[_fast]() as it
requires vmas to determine if DAX is in use.
NOTE: In merging with the CMA changes we opt to change the
get_user_pages() call in check_and_migrate_cma_pages() to a call of
__get_user_pages_locked() on the newly migrated pages. This makes the
code read better in that we are calling __get_user_pages_locked() on the
pages before and after a potential migration.
As a side affect some of the interfaces are cleaned up but this is not the
primary purpose of the series.
In review[1] it was asked:
<quote>
> This I don't get - if you do lock down long term mappings performance
> of the actual get_user_pages call shouldn't matter to start with.
>
> What do I miss?
A couple of points.
First "longterm" is a relative thing and at this point is probably a
misnomer. This is really flagging a pin which is going to be given to
hardware and can't move. I've thought of a couple of alternative names
but I think we have to settle on if we are going to use FL_LAYOUT or
something else to solve the "longterm" problem. Then I think we can
change the flag to a better name.
Second, It depends on how often you are registering memory. I have spoken
with some RDMA users who consider MR in the performance path... For the
overall application performance. I don't have the numbers as the tests
for HFI1 were done a long time ago. But there was a significant
advantage. Some of which is probably due to the fact that you don't have
to hold mmap_sem.
Finally, architecturally I think it would be good for everyone to use
*_fast. There are patches submitted to the RDMA list which would allow
the use of *_fast (they reworking the use of mmap_sem) and as soon as they
are accepted I'll submit a patch to convert the RDMA core as well. Also
to this point others are looking to use *_fast.
As an asside, Jasons pointed out in my previous submission that *_fast and
*_unlocked look very much the same. I agree and I think further cleanup
will be coming. But I'm focused on getting the final solution for DAX at
the moment.
</quote>
[1] https://lore.kernel.org/lkml/20190220180255.GA12020@iweiny-DESK2.sc.intel.com/T/#md6abad2569f3bf6c1f03686c8097ab6563e94965
[ira.weiny@intel.com: v3]
Link: http://lkml.kernel.org/r/20190328084422.29911-2-ira.weiny@intel.com
Link: http://lkml.kernel.org/r/20190328084422.29911-2-ira.weiny@intel.com
Link: http://lkml.kernel.org/r/20190317183438.2057-2-ira.weiny@intel.com
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Rich Felker <dalias@libc.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: James Hogan <jhogan@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Mike Marshall <hubcap@omnibond.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-13 17:17:03 -07:00
gup_flags | FOLL_LONGTERM , & umem - > pgs [ 0 ] , NULL ) ;
2020-06-08 21:33:25 -07:00
mmap_read_unlock ( current - > mm ) ;
2018-05-02 13:01:23 +02:00
if ( npgs ! = umem - > npgs ) {
if ( npgs > = 0 ) {
umem - > npgs = npgs ;
err = - ENOMEM ;
goto out_pin ;
}
err = npgs ;
goto out_pgs ;
}
return 0 ;
out_pin :
xdp_umem_unpin_pages ( umem ) ;
out_pgs :
2021-05-21 10:33:01 +02:00
kvfree ( umem - > pgs ) ;
2018-05-02 13:01:23 +02:00
umem - > pgs = NULL ;
return err ;
}
static int xdp_umem_account_pages ( struct xdp_umem * umem )
{
unsigned long lock_limit , new_npgs , old_npgs ;
if ( capable ( CAP_IPC_LOCK ) )
return 0 ;
lock_limit = rlimit ( RLIMIT_MEMLOCK ) > > PAGE_SHIFT ;
umem - > user = get_uid ( current_user ( ) ) ;
do {
old_npgs = atomic_long_read ( & umem - > user - > locked_vm ) ;
new_npgs = old_npgs + umem - > npgs ;
if ( new_npgs > lock_limit ) {
free_uid ( umem - > user ) ;
umem - > user = NULL ;
return - ENOBUFS ;
}
} while ( atomic_long_cmpxchg ( & umem - > user - > locked_vm , old_npgs ,
new_npgs ) ! = old_npgs ) ;
return 0 ;
}
2018-05-22 09:35:02 +02:00
static int xdp_umem_reg ( struct xdp_umem * umem , struct xdp_umem_reg * mr )
2018-05-02 13:01:23 +02:00
{
2020-09-10 09:56:09 +02:00
u32 npgs_rem , chunk_size = mr - > chunk_size , headroom = mr - > headroom ;
xsk: add support to allow unaligned chunk placement
Currently, addresses are chunk size aligned. This means, we are very
restricted in terms of where we can place chunk within the umem. For
example, if we have a chunk size of 2k, then our chunks can only be placed
at 0,2k,4k,6k,8k... and so on (ie. every 2k starting from 0).
This patch introduces the ability to use unaligned chunks. With these
changes, we are no longer bound to having to place chunks at a 2k (or
whatever your chunk size is) interval. Since we are no longer dealing with
aligned chunks, they can now cross page boundaries. Checks for page
contiguity have been added in order to keep track of which pages are
followed by a physically contiguous page.
Signed-off-by: Kevin Laatz <kevin.laatz@intel.com>
Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-08-27 02:25:22 +00:00
bool unaligned_chunks = mr - > flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG ;
2020-05-25 10:03:59 +02:00
u64 npgs , addr = mr - > addr , size = mr - > len ;
2020-09-10 09:56:09 +02:00
unsigned int chunks , chunks_rem ;
2020-04-14 09:35:15 +02:00
int err ;
2018-05-02 13:01:23 +02:00
xsk: new descriptor addressing scheme
Currently, AF_XDP only supports a fixed frame-size memory scheme where
each frame is referenced via an index (idx). A user passes the frame
index to the kernel, and the kernel acts upon the data. Some NICs,
however, do not have a fixed frame-size model, instead they have a
model where a memory window is passed to the hardware and multiple
frames are filled into that window (referred to as the "type-writer"
model).
By changing the descriptor format from the current frame index
addressing scheme, AF_XDP can in the future be extended to support
these kinds of NICs.
In the index-based model, an idx refers to a frame of size
frame_size. Addressing a frame in the UMEM is done by offseting the
UMEM starting address by a global offset, idx * frame_size + offset.
Communicating via the fill- and completion-rings are done by means of
idx.
In this commit, the idx is removed in favor of an address (addr),
which is a relative address ranging over the UMEM. To convert an
idx-based address to the new addr is simply: addr = idx * frame_size +
offset.
We also stop referring to the UMEM "frame" as a frame. Instead it is
simply called a chunk.
To transfer ownership of a chunk to the kernel, the addr of the chunk
is passed in the fill-ring. Note, that the kernel will mask addr to
make it chunk aligned, so there is no need for userspace to do
that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or
3000 to the fill-ring will refer to the same chunk.
On the completion-ring, the addr will match that of the Tx descriptor,
passed to the kernel.
Changing the descriptor format to use chunks/addr will allow for
future changes to move to a type-writer based model, where multiple
frames can reside in one chunk. In this model passing one single chunk
into the fill-ring, would potentially result in multiple Rx
descriptors.
This commit changes the uapi of AF_XDP sockets, and updates the
documentation.
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-06-04 13:57:13 +02:00
if ( chunk_size < XDP_UMEM_MIN_CHUNK_SIZE | | chunk_size > PAGE_SIZE ) {
2018-05-02 13:01:23 +02:00
/* Strictly speaking we could support this, if:
* - huge pages , or *
* - using an IOMMU , or
* - making sure the memory area is consecutive
* but for now , we simply say " computer says no " .
*/
return - EINVAL ;
}
2020-08-28 10:26:19 +02:00
if ( mr - > flags & ~ XDP_UMEM_UNALIGNED_CHUNK_FLAG )
xsk: add support to allow unaligned chunk placement
Currently, addresses are chunk size aligned. This means, we are very
restricted in terms of where we can place chunk within the umem. For
example, if we have a chunk size of 2k, then our chunks can only be placed
at 0,2k,4k,6k,8k... and so on (ie. every 2k starting from 0).
This patch introduces the ability to use unaligned chunks. With these
changes, we are no longer bound to having to place chunks at a 2k (or
whatever your chunk size is) interval. Since we are no longer dealing with
aligned chunks, they can now cross page boundaries. Checks for page
contiguity have been added in order to keep track of which pages are
followed by a physically contiguous page.
Signed-off-by: Kevin Laatz <kevin.laatz@intel.com>
Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-08-27 02:25:22 +00:00
return - EINVAL ;
if ( ! unaligned_chunks & & ! is_power_of_2 ( chunk_size ) )
2018-05-02 13:01:23 +02:00
return - EINVAL ;
if ( ! PAGE_ALIGNED ( addr ) ) {
/* Memory area has to be page size aligned. For
* simplicity , this might change .
*/
return - EINVAL ;
}
if ( ( addr + size ) < addr )
return - EINVAL ;
2020-09-10 09:56:09 +02:00
npgs = div_u64_rem ( size , PAGE_SIZE , & npgs_rem ) ;
if ( npgs_rem )
npgs + + ;
2020-05-25 10:03:59 +02:00
if ( npgs > U32_MAX )
return - EINVAL ;
2020-09-10 09:56:09 +02:00
chunks = ( unsigned int ) div_u64_rem ( size , chunk_size , & chunks_rem ) ;
xsk: new descriptor addressing scheme
Currently, AF_XDP only supports a fixed frame-size memory scheme where
each frame is referenced via an index (idx). A user passes the frame
index to the kernel, and the kernel acts upon the data. Some NICs,
however, do not have a fixed frame-size model, instead they have a
model where a memory window is passed to the hardware and multiple
frames are filled into that window (referred to as the "type-writer"
model).
By changing the descriptor format from the current frame index
addressing scheme, AF_XDP can in the future be extended to support
these kinds of NICs.
In the index-based model, an idx refers to a frame of size
frame_size. Addressing a frame in the UMEM is done by offseting the
UMEM starting address by a global offset, idx * frame_size + offset.
Communicating via the fill- and completion-rings are done by means of
idx.
In this commit, the idx is removed in favor of an address (addr),
which is a relative address ranging over the UMEM. To convert an
idx-based address to the new addr is simply: addr = idx * frame_size +
offset.
We also stop referring to the UMEM "frame" as a frame. Instead it is
simply called a chunk.
To transfer ownership of a chunk to the kernel, the addr of the chunk
is passed in the fill-ring. Note, that the kernel will mask addr to
make it chunk aligned, so there is no need for userspace to do
that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or
3000 to the fill-ring will refer to the same chunk.
On the completion-ring, the addr will match that of the Tx descriptor,
passed to the kernel.
Changing the descriptor format to use chunks/addr will allow for
future changes to move to a type-writer based model, where multiple
frames can reside in one chunk. In this model passing one single chunk
into the fill-ring, would potentially result in multiple Rx
descriptors.
This commit changes the uapi of AF_XDP sockets, and updates the
documentation.
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-06-04 13:57:13 +02:00
if ( chunks = = 0 )
2018-05-02 13:01:23 +02:00
return - EINVAL ;
2020-09-10 09:56:09 +02:00
if ( ! unaligned_chunks & & chunks_rem )
return - EINVAL ;
2018-05-02 13:01:23 +02:00
2020-04-14 09:35:15 +02:00
if ( headroom > = chunk_size - XDP_PACKET_HEADROOM )
2018-05-02 13:01:23 +02:00
return - EINVAL ;
2018-08-31 13:40:02 +02:00
umem - > size = size ;
xsk: new descriptor addressing scheme
Currently, AF_XDP only supports a fixed frame-size memory scheme where
each frame is referenced via an index (idx). A user passes the frame
index to the kernel, and the kernel acts upon the data. Some NICs,
however, do not have a fixed frame-size model, instead they have a
model where a memory window is passed to the hardware and multiple
frames are filled into that window (referred to as the "type-writer"
model).
By changing the descriptor format from the current frame index
addressing scheme, AF_XDP can in the future be extended to support
these kinds of NICs.
In the index-based model, an idx refers to a frame of size
frame_size. Addressing a frame in the UMEM is done by offseting the
UMEM starting address by a global offset, idx * frame_size + offset.
Communicating via the fill- and completion-rings are done by means of
idx.
In this commit, the idx is removed in favor of an address (addr),
which is a relative address ranging over the UMEM. To convert an
idx-based address to the new addr is simply: addr = idx * frame_size +
offset.
We also stop referring to the UMEM "frame" as a frame. Instead it is
simply called a chunk.
To transfer ownership of a chunk to the kernel, the addr of the chunk
is passed in the fill-ring. Note, that the kernel will mask addr to
make it chunk aligned, so there is no need for userspace to do
that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or
3000 to the fill-ring will refer to the same chunk.
On the completion-ring, the addr will match that of the Tx descriptor,
passed to the kernel.
Changing the descriptor format to use chunks/addr will allow for
future changes to move to a type-writer based model, where multiple
frames can reside in one chunk. In this model passing one single chunk
into the fill-ring, would potentially result in multiple Rx
descriptors.
This commit changes the uapi of AF_XDP sockets, and updates the
documentation.
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-06-04 13:57:13 +02:00
umem - > headroom = headroom ;
2020-05-20 21:20:53 +02:00
umem - > chunk_size = chunk_size ;
2020-08-28 10:26:17 +02:00
umem - > chunks = chunks ;
2020-05-25 10:03:59 +02:00
umem - > npgs = ( u32 ) npgs ;
2018-05-02 13:01:23 +02:00
umem - > pgs = NULL ;
umem - > user = NULL ;
xsk: add support to allow unaligned chunk placement
Currently, addresses are chunk size aligned. This means, we are very
restricted in terms of where we can place chunk within the umem. For
example, if we have a chunk size of 2k, then our chunks can only be placed
at 0,2k,4k,6k,8k... and so on (ie. every 2k starting from 0).
This patch introduces the ability to use unaligned chunks. With these
changes, we are no longer bound to having to place chunks at a 2k (or
whatever your chunk size is) interval. Since we are no longer dealing with
aligned chunks, they can now cross page boundaries. Checks for page
contiguity have been added in order to keep track of which pages are
followed by a physically contiguous page.
Signed-off-by: Kevin Laatz <kevin.laatz@intel.com>
Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-08-27 02:25:22 +00:00
umem - > flags = mr - > flags ;
2018-05-02 13:01:23 +02:00
2020-08-28 10:26:22 +02:00
INIT_LIST_HEAD ( & umem - > xsk_dma_list ) ;
2018-05-22 09:35:03 +02:00
refcount_set ( & umem - > users , 1 ) ;
2018-05-02 13:01:23 +02:00
err = xdp_umem_account_pages ( umem ) ;
if ( err )
2019-03-13 15:15:49 +01:00
return err ;
2018-05-02 13:01:23 +02:00
2020-05-04 15:33:52 +02:00
err = xdp_umem_pin_pages ( umem , ( unsigned long ) addr ) ;
2018-05-02 13:01:23 +02:00
if ( err )
goto out_account ;
2018-06-04 14:05:52 +02:00
2020-08-28 10:26:21 +02:00
err = xdp_umem_addr_map ( umem , umem - > pgs , umem - > npgs ) ;
if ( err )
goto out_unpin ;
2020-05-20 21:20:53 +02:00
return 0 ;
2018-05-02 13:01:23 +02:00
2020-08-28 10:26:21 +02:00
out_unpin :
xdp_umem_unpin_pages ( umem ) ;
2018-05-02 13:01:23 +02:00
out_account :
xdp_umem_unaccount_pages ( umem ) ;
return err ;
}
2018-05-02 13:01:26 +02:00
2018-05-22 09:35:02 +02:00
struct xdp_umem * xdp_umem_create ( struct xdp_umem_reg * mr )
{
struct xdp_umem * umem ;
int err ;
umem = kzalloc ( sizeof ( * umem ) , GFP_KERNEL ) ;
if ( ! umem )
return ERR_PTR ( - ENOMEM ) ;
2019-01-24 19:59:38 +01:00
err = ida_simple_get ( & umem_ida , 0 , 0 , GFP_KERNEL ) ;
if ( err < 0 ) {
kfree ( umem ) ;
return ERR_PTR ( err ) ;
}
umem - > id = err ;
2018-05-22 09:35:02 +02:00
err = xdp_umem_reg ( umem , mr ) ;
if ( err ) {
2019-01-24 19:59:38 +01:00
ida_simple_remove ( & umem_ida , umem - > id ) ;
2018-05-22 09:35:02 +02:00
kfree ( umem ) ;
return ERR_PTR ( err ) ;
}
return umem ;
}