2018-05-02 13:01:28 +02:00
// SPDX-License-Identifier: GPL-2.0
/* XSKMAP used for AF_XDP sockets
 * Copyright (c) 2018 Intel Corporation.
 */
# include <linux/bpf.h>
# include <linux/capability.h>
# include <net/xdp_sock.h>
# include <linux/slab.h>
# include <linux/sched.h>
2020-05-20 21:20:50 +02:00
# include "xsk.h"
2019-08-15 11:30:13 +02:00
/*
 * xsk_map_inc - take a reference on the BPF map embedded in @map.
 *
 * Calls bpf_map_inc() on the embedded struct bpf_map and always returns 0.
 *
 * NOTE(review): the text between the opening brace and the bpf_map_inc()
 * call, as well as the bare timestamp lines, looks like interleaved
 * version-control annotation rather than C source - confirm and strip it
 * before building.
 */
int xsk_map_inc ( struct xsk_map * map )
{
bpf: Switch bpf_map ref counter to atomic64_t so bpf_map_inc() never fails
92117d8443bc ("bpf: fix refcnt overflow") turned refcounting of bpf_map into
potentially failing operation, when refcount reaches BPF_MAX_REFCNT limit
(32k). Due to using 32-bit counter, it's possible in practice to overflow
refcounter and make it wrap around to 0, causing erroneous map free, while
there are still references to it, causing use-after-free problems.
But having a failing refcounting operations are problematic in some cases. One
example is mmap() interface. After establishing initial memory-mapping, user
is allowed to arbitrarily map/remap/unmap parts of mapped memory, arbitrarily
splitting it into multiple non-contiguous regions. All this happening without
any control from the users of mmap subsystem. Rather mmap subsystem sends
notifications to original creator of memory mapping through open/close
callbacks, which are optionally specified during initial memory mapping
creation. These callbacks are used to maintain accurate refcount for bpf_map
(see next patch in this series). The problem is that open() callback is not
supposed to fail, because memory-mapped resource is set up and properly
referenced. This is posing a problem for using memory-mapping with BPF maps.
One solution to this is to maintain separate refcount for just memory-mappings
and do single bpf_map_inc/bpf_map_put when it goes from/to zero, respectively.
There are similar use cases in current work on tcp-bpf, necessitating extra
counter as well. This seems like a rather unfortunate and ugly solution that
doesn't scale well to various new use cases.
Another approach to solve this is to use non-failing refcount_t type, which
uses 32-bit counter internally, but, once reaching overflow state at UINT_MAX,
stays there. This utlimately causes memory leak, but prevents use after free.
But given refcounting is not the most performance-critical operation with BPF
maps (it's not used from running BPF program code), we can also just switch to
64-bit counter that can't overflow in practice, potentially disadvantaging
32-bit platforms a tiny bit. This simplifies semantics and allows above
described scenarios to not worry about failing refcount increment operation.
In terms of struct bpf_map size, we are still good and use the same amount of
space:
BEFORE (3 cache lines, 8 bytes of padding at the end):
struct bpf_map {
const struct bpf_map_ops * ops __attribute__((__aligned__(64))); /* 0 8 */
struct bpf_map * inner_map_meta; /* 8 8 */
void * security; /* 16 8 */
enum bpf_map_type map_type; /* 24 4 */
u32 key_size; /* 28 4 */
u32 value_size; /* 32 4 */
u32 max_entries; /* 36 4 */
u32 map_flags; /* 40 4 */
int spin_lock_off; /* 44 4 */
u32 id; /* 48 4 */
int numa_node; /* 52 4 */
u32 btf_key_type_id; /* 56 4 */
u32 btf_value_type_id; /* 60 4 */
/* --- cacheline 1 boundary (64 bytes) --- */
struct btf * btf; /* 64 8 */
struct bpf_map_memory memory; /* 72 16 */
bool unpriv_array; /* 88 1 */
bool frozen; /* 89 1 */
/* XXX 38 bytes hole, try to pack */
/* --- cacheline 2 boundary (128 bytes) --- */
atomic_t refcnt __attribute__((__aligned__(64))); /* 128 4 */
atomic_t usercnt; /* 132 4 */
struct work_struct work; /* 136 32 */
char name[16]; /* 168 16 */
/* size: 192, cachelines: 3, members: 21 */
/* sum members: 146, holes: 1, sum holes: 38 */
/* padding: 8 */
/* forced alignments: 2, forced holes: 1, sum forced holes: 38 */
} __attribute__((__aligned__(64)));
AFTER (same 3 cache lines, no extra padding now):
struct bpf_map {
const struct bpf_map_ops * ops __attribute__((__aligned__(64))); /* 0 8 */
struct bpf_map * inner_map_meta; /* 8 8 */
void * security; /* 16 8 */
enum bpf_map_type map_type; /* 24 4 */
u32 key_size; /* 28 4 */
u32 value_size; /* 32 4 */
u32 max_entries; /* 36 4 */
u32 map_flags; /* 40 4 */
int spin_lock_off; /* 44 4 */
u32 id; /* 48 4 */
int numa_node; /* 52 4 */
u32 btf_key_type_id; /* 56 4 */
u32 btf_value_type_id; /* 60 4 */
/* --- cacheline 1 boundary (64 bytes) --- */
struct btf * btf; /* 64 8 */
struct bpf_map_memory memory; /* 72 16 */
bool unpriv_array; /* 88 1 */
bool frozen; /* 89 1 */
/* XXX 38 bytes hole, try to pack */
/* --- cacheline 2 boundary (128 bytes) --- */
atomic64_t refcnt __attribute__((__aligned__(64))); /* 128 8 */
atomic64_t usercnt; /* 136 8 */
struct work_struct work; /* 144 32 */
char name[16]; /* 176 16 */
/* size: 192, cachelines: 3, members: 21 */
/* sum members: 154, holes: 1, sum holes: 38 */
/* forced alignments: 2, forced holes: 1, sum forced holes: 38 */
} __attribute__((__aligned__(64)));
This patch, while modifying all users of bpf_map_inc, also cleans up its
interface to match bpf_map_put with separate operations for bpf_map_inc and
bpf_map_inc_with_uref (to match bpf_map_put and bpf_map_put_with_uref,
respectively). Also, given there are no users of bpf_map_inc_not_zero
specifying uref=true, remove uref flag and default to uref=false internally.
Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20191117172806.2195367-2-andriin@fb.com
2019-11-17 09:28:02 -08:00
bpf_map_inc ( & map - > map ) ;
return 0 ;
2019-08-15 11:30:13 +02:00
}
void xsk_map_put ( struct xsk_map * map )
{
bpf_map_put ( & map - > map ) ;
}
static struct xsk_map_node * xsk_map_node_alloc ( struct xsk_map * map ,
struct xdp_sock * * map_entry )
{
struct xsk_map_node * node ;
int err ;
node = kzalloc ( sizeof ( * node ) , GFP_ATOMIC | __GFP_NOWARN ) ;
if ( ! node )
2019-09-24 09:25:21 -07:00
return ERR_PTR ( - ENOMEM ) ;
2019-08-15 11:30:13 +02:00
err = xsk_map_inc ( map ) ;
if ( err ) {
kfree ( node ) ;
return ERR_PTR ( err ) ;
}
node - > map = map ;
node - > map_entry = map_entry ;
return node ;
}
static void xsk_map_node_free ( struct xsk_map_node * node )
{
xsk_map_put ( node - > map ) ;
kfree ( node ) ;
}
static void xsk_map_sock_add ( struct xdp_sock * xs , struct xsk_map_node * node )
{
spin_lock_bh ( & xs - > map_list_lock ) ;
list_add_tail ( & node - > node , & xs - > map_list ) ;
spin_unlock_bh ( & xs - > map_list_lock ) ;
}
static void xsk_map_sock_delete ( struct xdp_sock * xs ,
struct xdp_sock * * map_entry )
{
struct xsk_map_node * n , * tmp ;
spin_lock_bh ( & xs - > map_list_lock ) ;
list_for_each_entry_safe ( n , tmp , & xs - > map_list , node ) {
if ( map_entry = = n - > map_entry ) {
list_del ( & n - > node ) ;
xsk_map_node_free ( n ) ;
}
}
spin_unlock_bh ( & xs - > map_list_lock ) ;
}
2018-05-02 13:01:28 +02:00
static struct bpf_map * xsk_map_alloc ( union bpf_attr * attr )
{
2019-11-01 12:03:44 +01:00
struct bpf_map_memory mem ;
2019-12-19 07:10:02 +01:00
int err , numa_node ;
2018-05-02 13:01:28 +02:00
struct xsk_map * m ;
2019-12-19 07:10:02 +01:00
u64 size ;
2018-05-02 13:01:28 +02:00
if ( ! capable ( CAP_NET_ADMIN ) )
return ERR_PTR ( - EPERM ) ;
if ( attr - > max_entries = = 0 | | attr - > key_size ! = 4 | |
attr - > value_size ! = 4 | |
attr - > map_flags & ~ ( BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY ) )
return ERR_PTR ( - EINVAL ) ;
2019-11-01 12:03:44 +01:00
numa_node = bpf_map_attr_numa_node ( attr ) ;
size = struct_size ( m , xsk_map , attr - > max_entries ) ;
2019-12-19 07:10:02 +01:00
err = bpf_map_charge_init ( & mem , size ) ;
2019-11-01 12:03:44 +01:00
if ( err < 0 )
return ERR_PTR ( err ) ;
m = bpf_map_area_alloc ( size , numa_node ) ;
if ( ! m ) {
bpf_map_charge_finish ( & mem ) ;
2018-05-02 13:01:28 +02:00
return ERR_PTR ( - ENOMEM ) ;
2019-11-01 12:03:44 +01:00
}
2018-05-02 13:01:28 +02:00
bpf_map_init_from_attr ( & m - > map , attr ) ;
2019-11-01 12:03:44 +01:00
bpf_map_charge_move ( & m - > map . memory , & mem ) ;
2019-08-15 11:30:13 +02:00
spin_lock_init ( & m - > lock ) ;
2018-05-02 13:01:28 +02:00
return & m - > map ;
}
/*
 * xsk_map_free - tear down an XSKMAP.
 *
 * Calls bpf_clear_redirect_map() on the map, waits in synchronize_net(),
 * and only then releases the containing struct xsk_map with
 * bpf_map_area_free().
 */
static void xsk_map_free(struct bpf_map *map)
{
	struct xsk_map *xmap;

	bpf_clear_redirect_map(map);
	synchronize_net();

	xmap = container_of(map, struct xsk_map, map);
	bpf_map_area_free(xmap);
}
static int xsk_map_get_next_key ( struct bpf_map * map , void * key , void * next_key )
{
struct xsk_map * m = container_of ( map , struct xsk_map , map ) ;
u32 index = key ? * ( u32 * ) key : U32_MAX ;
u32 * next = next_key ;
if ( index > = m - > map . max_entries ) {
* next = 0 ;
return 0 ;
}
if ( index = = m - > map . max_entries - 1 )
return - ENOENT ;
* next = index + 1 ;
return 0 ;
}
2019-11-01 12:03:45 +01:00
static u32 xsk_map_gen_lookup ( struct bpf_map * map , struct bpf_insn * insn_buf )
{
const int ret = BPF_REG_0 , mp = BPF_REG_1 , index = BPF_REG_2 ;
struct bpf_insn * insn = insn_buf ;
* insn + + = BPF_LDX_MEM ( BPF_W , ret , index , 0 ) ;
* insn + + = BPF_JMP_IMM ( BPF_JGE , ret , map - > max_entries , 5 ) ;
* insn + + = BPF_ALU64_IMM ( BPF_LSH , ret , ilog2 ( sizeof ( struct xsk_sock * ) ) ) ;
* insn + + = BPF_ALU64_IMM ( BPF_ADD , mp , offsetof ( struct xsk_map , xsk_map ) ) ;
* insn + + = BPF_ALU64_REG ( BPF_ADD , ret , mp ) ;
* insn + + = BPF_LDX_MEM ( BPF_SIZEOF ( struct xsk_sock * ) , ret , ret , 0 ) ;
* insn + + = BPF_JMP_IMM ( BPF_JA , 0 , 0 , 1 ) ;
* insn + + = BPF_MOV64_IMM ( ret , 0 ) ;
return insn - insn_buf ;
}
2018-05-02 13:01:28 +02:00
static void * xsk_map_lookup_elem ( struct bpf_map * map , void * key )
2019-06-06 13:59:40 -07:00
{
WARN_ON_ONCE ( ! rcu_read_lock_held ( ) ) ;
return __xsk_map_lookup_elem ( map , * ( u32 * ) key ) ;
}
static void * xsk_map_lookup_elem_sys_only ( struct bpf_map * map , void * key )
2018-05-02 13:01:28 +02:00
{
2018-10-09 10:04:50 +09:00
return ERR_PTR ( - EOPNOTSUPP ) ;
2018-05-02 13:01:28 +02:00
}
/*
 * xsk_map_update_elem - install the socket behind a file descriptor
 * into a map slot.
 * @map:       the XSKMAP
 * @key:       u32 slot index
 * @value:     u32 file descriptor of the socket to install
 * @map_flags: update flags; values above BPF_EXIST are rejected
 *
 * Returns 0 on success (including when the slot already holds the same
 * socket), or a negative errno:
 *   -EINVAL      @map_flags is greater than BPF_EXIST
 *   -E2BIG       the index is not below max_entries
 *   -EOPNOTSUPP  the socket's family is not PF_XDP
 *   -EEXIST      BPF_NOEXIST was given but the slot is occupied
 *   -ENOENT      BPF_EXIST was given but the slot is empty
 * plus whatever sockfd_lookup() or xsk_map_node_alloc() report.
 */
static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
			       u64 map_flags)
{
	struct xsk_map *m = container_of(map, struct xsk_map, map);
	struct xdp_sock *new_xs, *cur_xs, **slot;
	u32 idx = *(u32 *)key, fd = *(u32 *)value;
	struct xsk_map_node *node;
	struct socket *sock;
	int err;

	if (unlikely(map_flags > BPF_EXIST))
		return -EINVAL;
	if (unlikely(idx >= m->map.max_entries))
		return -E2BIG;

	sock = sockfd_lookup(fd, &err);
	if (!sock)
		return err;

	if (sock->sk->sk_family != PF_XDP) {
		err = -EOPNOTSUPP;
		goto out_put_sock;
	}

	new_xs = (struct xdp_sock *)sock->sk;
	slot = &m->xsk_map[idx];

	/* Allocate before taking the map lock; freed again on any failure. */
	node = xsk_map_node_alloc(m, slot);
	if (IS_ERR(node)) {
		err = PTR_ERR(node);
		goto out_put_sock;
	}

	spin_lock_bh(&m->lock);
	cur_xs = READ_ONCE(*slot);

	if (cur_xs == new_xs) {
		/* Already installed: nothing to do, the spare node is dropped. */
		err = 0;
		goto out_unlock;
	}
	if (cur_xs && map_flags == BPF_NOEXIST) {
		err = -EEXIST;
		goto out_unlock;
	}
	if (!cur_xs && map_flags == BPF_EXIST) {
		err = -ENOENT;
		goto out_unlock;
	}

	xsk_map_sock_add(new_xs, node);
	WRITE_ONCE(*slot, new_xs);
	if (cur_xs)
		xsk_map_sock_delete(cur_xs, slot);

	spin_unlock_bh(&m->lock);
	sockfd_put(sock);
	return 0;

out_unlock:
	spin_unlock_bh(&m->lock);
	sockfd_put(sock);
	xsk_map_node_free(node);
	return err;

out_put_sock:
	sockfd_put(sock);
	return err;
}
static int xsk_map_delete_elem ( struct bpf_map * map , void * key )
{
struct xsk_map * m = container_of ( map , struct xsk_map , map ) ;
2019-08-15 11:30:13 +02:00
struct xdp_sock * old_xs , * * map_entry ;
2018-05-02 13:01:28 +02:00
int k = * ( u32 * ) key ;
if ( k > = map - > max_entries )
return - EINVAL ;
2019-08-15 11:30:13 +02:00
spin_lock_bh ( & m - > lock ) ;
map_entry = & m - > xsk_map [ k ] ;
old_xs = xchg ( map_entry , NULL ) ;
2018-10-08 19:40:16 +02:00
if ( old_xs )
2019-08-15 11:30:13 +02:00
xsk_map_sock_delete ( old_xs , map_entry ) ;
spin_unlock_bh ( & m - > lock ) ;
2018-05-02 13:01:28 +02:00
return 0 ;
}
2019-08-15 11:30:13 +02:00
void xsk_map_try_sock_delete ( struct xsk_map * map , struct xdp_sock * xs ,
struct xdp_sock * * map_entry )
{
spin_lock_bh ( & map - > lock ) ;
if ( READ_ONCE ( * map_entry ) = = xs ) {
WRITE_ONCE ( * map_entry , NULL ) ;
xsk_map_sock_delete ( xs , map_entry ) ;
}
spin_unlock_bh ( & map - > lock ) ;
}
2020-08-27 18:18:13 -07:00
static bool xsk_map_meta_equal ( const struct bpf_map * meta0 ,
const struct bpf_map * meta1 )
{
return meta0 - > max_entries = = meta1 - > max_entries & &
bpf_map_meta_equal ( meta0 , meta1 ) ;
}
2020-06-19 14:11:44 -07:00
/*
 * Storage whose address is published through .map_btf_id below.
 * NOTE(review): nothing in this file assigns it - presumably it is
 * filled in elsewhere; confirm.
 */
static int xsk_map_btf_id ;
2018-05-02 13:01:28 +02:00
/*
 * Operations table for the XSKMAP map type: wires the handlers defined
 * in this file into struct bpf_map_ops. map_check_no_btf is not defined
 * in this file.
 */
const struct bpf_map_ops xsk_map_ops = {
2020-08-27 18:18:13 -07:00
. map_meta_equal = xsk_map_meta_equal ,
2018-05-02 13:01:28 +02:00
. map_alloc = xsk_map_alloc ,
. map_free = xsk_map_free ,
. map_get_next_key = xsk_map_get_next_key ,
. map_lookup_elem = xsk_map_lookup_elem ,
2019-11-01 12:03:45 +01:00
. map_gen_lookup = xsk_map_gen_lookup ,
2019-06-06 13:59:40 -07:00
. map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only ,
2018-05-02 13:01:28 +02:00
. map_update_elem = xsk_map_update_elem ,
. map_delete_elem = xsk_map_delete_elem ,
2018-08-12 01:59:17 +02:00
. map_check_btf = map_check_no_btf ,
2020-06-19 14:11:44 -07:00
. map_btf_name = " xsk_map " ,
. map_btf_id = & xsk_map_btf_id ,
2018-05-02 13:01:28 +02:00
} ;