2019-05-28 20:10:09 +03:00
// SPDX-License-Identifier: GPL-2.0-only
2017-03-22 20:00:33 +03:00
/* Copyright (c) 2017 Facebook
*/
# include <linux/slab.h>
# include <linux/bpf.h>
2021-07-15 03:54:12 +03:00
# include <linux/btf.h>
2017-03-22 20:00:33 +03:00
# include "map_in_map.h"
struct bpf_map * bpf_map_meta_alloc ( int inner_map_ufd )
{
struct bpf_map * inner_map , * inner_map_meta ;
2019-01-17 18:34:45 +03:00
u32 inner_map_meta_size ;
2017-03-22 20:00:33 +03:00
struct fd f ;
2022-11-18 04:55:54 +03:00
int ret ;
2017-03-22 20:00:33 +03:00
f = fdget ( inner_map_ufd ) ;
inner_map = __bpf_map_get ( f ) ;
if ( IS_ERR ( inner_map ) )
return inner_map ;
/* Does not support >1 level map-in-map */
if ( inner_map - > inner_map_meta ) {
2022-11-18 04:55:54 +03:00
ret = - EINVAL ;
goto put ;
2017-03-22 20:00:33 +03:00
}
2020-08-28 04:18:06 +03:00
if ( ! inner_map - > ops - > map_meta_equal ) {
2022-11-18 04:55:54 +03:00
ret = - ENOTSUPP ;
goto put ;
2020-08-28 04:18:06 +03:00
}
2019-01-17 18:34:45 +03:00
inner_map_meta_size = sizeof ( * inner_map_meta ) ;
/* In some cases verifier needs to access beyond just base map. */
if ( inner_map - > ops = = & array_map_ops )
inner_map_meta_size = sizeof ( struct bpf_array ) ;
inner_map_meta = kzalloc ( inner_map_meta_size , GFP_USER ) ;
2017-03-22 20:00:33 +03:00
if ( ! inner_map_meta ) {
2022-11-18 04:55:54 +03:00
ret = - ENOMEM ;
goto put ;
2017-03-22 20:00:33 +03:00
}
inner_map_meta - > map_type = inner_map - > map_type ;
inner_map_meta - > key_size = inner_map - > key_size ;
inner_map_meta - > value_size = inner_map - > value_size ;
inner_map_meta - > map_flags = inner_map - > map_flags ;
inner_map_meta - > max_entries = inner_map - > max_entries ;
2022-11-18 04:55:54 +03:00
bpf: Refactor kptr_off_tab into btf_record
To prepare the BPF verifier to handle special fields in both map values
and program allocated types coming from program BTF, we need to refactor
the kptr_off_tab handling code into something more generic and reusable
across both cases to avoid code duplication.
Later patches also require passing this data to helpers at runtime, so
that they can work on user defined types, initialize them, destruct
them, etc.
The main observation is that both map values and such allocated types
point to a type in program BTF, hence they can be handled similarly. We
can prepare a field metadata table for both cases and store them in
struct bpf_map or struct btf depending on the use case.
Hence, refactor the code into generic btf_record and btf_field member
structs. The btf_record represents the fields of a specific btf_type in
user BTF. The cnt indicates the number of special fields we successfully
recognized, and field_mask is a bitmask of fields that were found, to
enable quick determination of availability of a certain field.
Subsequently, refactor the rest of the code to work with these generic
types, remove assumptions about kptr and kptr_off_tab, rename variables
to more meaningful names, etc.
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20221103191013.1236066-7-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2022-11-03 22:09:55 +03:00
inner_map_meta - > record = btf_record_dup ( inner_map - > record ) ;
if ( IS_ERR ( inner_map_meta - > record ) ) {
/* btf_record_dup returns NULL or valid pointer in case of
* invalid / empty / valid , but ERR_PTR in case of errors . During
* equality NULL or IS_ERR is equivalent .
*/
2022-11-18 04:55:54 +03:00
ret = PTR_ERR ( inner_map_meta - > record ) ;
goto free ;
}
bpf: Add comments for map BTF matching requirement for bpf_list_head
The old behavior of bpf_map_meta_equal was that it compared timer_off
to be equal (but not spin_lock_off, because that was not allowed), and
did memcmp of kptr_off_tab.
Now, we memcmp the btf_record of two bpf_map structs, which has all
fields.
We preserve backwards compat as we kzalloc the array, so if only spin
lock and timer exist in map, we only compare offset while the rest of
unused members in the btf_field struct are zeroed out.
In case of kptr, btf and everything else is of vmlinux or module, so as
long type is same it will match, since kernel btf, module, dtor pointer
will be same across maps.
Now with list_head in the mix, things are a bit complicated. We
implicitly add a requirement that both BTFs are same, because struct
btf_field_list_head has btf and value_rec members.
We obviously shouldn't force BTFs to be equal by default, as that breaks
backwards compatibility.
Currently it is only implicitly required due to list_head matching
struct btf and value_rec member. value_rec points back into a btf_record
stashed in the map BTF (btf member of btf_field_list_head). So that
pointer and btf member has to match exactly.
Document all these subtle details so that things don't break in the
future when touching this code.
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20221118015614.2013203-19-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2022-11-18 04:56:08 +03:00
/* Note: We must use the same BTF, as we also used btf_record_dup above
* which relies on BTF being same for both maps , as some members like
* record - > fields . list_head have pointers like value_rec pointing into
* inner_map - > btf .
*/
2021-07-15 03:54:12 +03:00
if ( inner_map - > btf ) {
btf_get ( inner_map - > btf ) ;
inner_map_meta - > btf = inner_map - > btf ;
}
2017-03-22 20:00:33 +03:00
2019-01-17 18:34:45 +03:00
/* Misc members not needed in bpf_map_meta_equal() check. */
inner_map_meta - > ops = inner_map - > ops ;
if ( inner_map - > ops = = & array_map_ops ) {
2023-06-02 22:02:02 +03:00
struct bpf_array * inner_array_meta =
container_of ( inner_map_meta , struct bpf_array , map ) ;
struct bpf_array * inner_array = container_of ( inner_map , struct bpf_array , map ) ;
inner_array_meta - > index_mask = inner_array - > index_mask ;
inner_array_meta - > elem_size = inner_array - > elem_size ;
2020-05-14 02:03:54 +03:00
inner_map_meta - > bypass_spec_v1 = inner_map - > bypass_spec_v1 ;
2019-01-17 18:34:45 +03:00
}
2017-03-22 20:00:33 +03:00
fdput ( f ) ;
return inner_map_meta ;
2022-11-18 04:55:54 +03:00
free :
kfree ( inner_map_meta ) ;
put :
fdput ( f ) ;
return ERR_PTR ( ret ) ;
2017-03-22 20:00:33 +03:00
}
void bpf_map_meta_free ( struct bpf_map * map_meta )
{
bpf: Refactor kptr_off_tab into btf_record
To prepare the BPF verifier to handle special fields in both map values
and program allocated types coming from program BTF, we need to refactor
the kptr_off_tab handling code into something more generic and reusable
across both cases to avoid code duplication.
Later patches also require passing this data to helpers at runtime, so
that they can work on user defined types, initialize them, destruct
them, etc.
The main observation is that both map values and such allocated types
point to a type in program BTF, hence they can be handled similarly. We
can prepare a field metadata table for both cases and store them in
struct bpf_map or struct btf depending on the use case.
Hence, refactor the code into generic btf_record and btf_field member
structs. The btf_record represents the fields of a specific btf_type in
user BTF. The cnt indicates the number of special fields we successfully
recognized, and field_mask is a bitmask of fields that were found, to
enable quick determination of availability of a certain field.
Subsequently, refactor the rest of the code to work with these generic
types, remove assumptions about kptr and kptr_off_tab, rename variables
to more meaningful names, etc.
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20221103191013.1236066-7-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2022-11-03 22:09:55 +03:00
bpf_map_free_record ( map_meta ) ;
2021-07-15 03:54:12 +03:00
btf_put ( map_meta - > btf ) ;
2017-03-22 20:00:33 +03:00
kfree ( map_meta ) ;
}
bool bpf_map_meta_equal ( const struct bpf_map * meta0 ,
const struct bpf_map * meta1 )
{
/* No need to compare ops because it is covered by map_type */
return meta0 - > map_type = = meta1 - > map_type & &
meta0 - > key_size = = meta1 - > key_size & &
meta0 - > value_size = = meta1 - > value_size & &
bpf: Allow storing unreferenced kptr in map
This commit introduces a new pointer type 'kptr' which can be embedded
in a map value to hold a PTR_TO_BTF_ID stored by a BPF program during
its invocation. When storing such a kptr, BPF program's PTR_TO_BTF_ID
register must have the same type as in the map value's BTF, and loading
a kptr marks the destination register as PTR_TO_BTF_ID with the correct
kernel BTF and BTF ID.
Such kptr are unreferenced, i.e. by the time another invocation of the
BPF program loads this pointer, the object which the pointer points to
may not longer exist. Since PTR_TO_BTF_ID loads (using BPF_LDX) are
patched to PROBE_MEM loads by the verifier, it would safe to allow user
to still access such invalid pointer, but passing such pointers into
BPF helpers and kfuncs should not be permitted. A future patch in this
series will close this gap.
The flexibility offered by allowing programs to dereference such invalid
pointers while being safe at runtime frees the verifier from doing
complex lifetime tracking. As long as the user may ensure that the
object remains valid, it can ensure data read by it from the kernel
object is valid.
The user indicates that a certain pointer must be treated as kptr
capable of accepting stores of PTR_TO_BTF_ID of a certain type, by using
a BTF type tag 'kptr' on the pointed to type of the pointer. Then, this
information is recorded in the object BTF which will be passed into the
kernel by way of map's BTF information. The name and kind from the map
value BTF is used to look up the in-kernel type, and the actual BTF and
BTF ID is recorded in the map struct in a new kptr_off_tab member. For
now, only storing pointers to structs is permitted.
An example of this specification is shown below:
#define __kptr __attribute__((btf_type_tag("kptr")))
struct map_value {
...
struct task_struct __kptr *task;
...
};
Then, in a BPF program, user may store PTR_TO_BTF_ID with the type
task_struct into the map, and then load it later.
Note that the destination register is marked PTR_TO_BTF_ID_OR_NULL, as
the verifier cannot know whether the value is NULL or not statically, it
must treat all potential loads at that map value offset as loading a
possibly NULL pointer.
Only BPF_LDX, BPF_STX, and BPF_ST (with insn->imm = 0 to denote NULL)
are allowed instructions that can access such a pointer. On BPF_LDX, the
destination register is updated to be a PTR_TO_BTF_ID, and on BPF_STX,
it is checked whether the source register type is a PTR_TO_BTF_ID with
same BTF type as specified in the map BTF. The access size must always
be BPF_DW.
For the map in map support, the kptr_off_tab for outer map is copied
from the inner map's kptr_off_tab. It was chosen to do a deep copy
instead of introducing a refcount to kptr_off_tab, because the copy only
needs to be done when paramterizing using inner_map_fd in the map in map
case, hence would be unnecessary for all other users.
It is not permitted to use MAP_FREEZE command and mmap for BPF map
having kptrs, similar to the bpf_timer case. A kptr also requires that
BPF program has both read and write access to the map (hence both
BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG are disallowed).
Note that check_map_access must be called from both
check_helper_mem_access and for the BPF instructions, hence the kptr
check must distinguish between ACCESS_DIRECT and ACCESS_HELPER, and
reject ACCESS_HELPER cases. We rename stack_access_src to bpf_access_src
and reuse it for this purpose.
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220424214901.2743946-2-memxor@gmail.com
2022-04-25 00:48:49 +03:00
meta0 - > map_flags = = meta1 - > map_flags & &
bpf: Refactor kptr_off_tab into btf_record
To prepare the BPF verifier to handle special fields in both map values
and program allocated types coming from program BTF, we need to refactor
the kptr_off_tab handling code into something more generic and reusable
across both cases to avoid code duplication.
Later patches also require passing this data to helpers at runtime, so
that they can work on user defined types, initialize them, destruct
them, etc.
The main observation is that both map values and such allocated types
point to a type in program BTF, hence they can be handled similarly. We
can prepare a field metadata table for both cases and store them in
struct bpf_map or struct btf depending on the use case.
Hence, refactor the code into generic btf_record and btf_field member
structs. The btf_record represents the fields of a specific btf_type in
user BTF. The cnt indicates the number of special fields we successfully
recognized, and field_mask is a bitmask of fields that were found, to
enable quick determination of availability of a certain field.
Subsequently, refactor the rest of the code to work with these generic
types, remove assumptions about kptr and kptr_off_tab, rename variables
to more meaningful names, etc.
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20221103191013.1236066-7-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2022-11-03 22:09:55 +03:00
btf_record_equal ( meta0 - > record , meta1 - > record ) ;
2017-03-22 20:00:33 +03:00
}
void * bpf_map_fd_get_ptr ( struct bpf_map * map ,
struct file * map_file /* not used */ ,
int ufd )
{
2020-08-28 04:18:06 +03:00
struct bpf_map * inner_map , * inner_map_meta ;
2017-03-22 20:00:33 +03:00
struct fd f ;
f = fdget ( ufd ) ;
inner_map = __bpf_map_get ( f ) ;
if ( IS_ERR ( inner_map ) )
return inner_map ;
2020-08-28 04:18:06 +03:00
inner_map_meta = map - > inner_map_meta ;
if ( inner_map_meta - > ops - > map_meta_equal ( inner_map_meta , inner_map ) )
bpf: Switch bpf_map ref counter to atomic64_t so bpf_map_inc() never fails
92117d8443bc ("bpf: fix refcnt overflow") turned refcounting of bpf_map into
potentially failing operation, when refcount reaches BPF_MAX_REFCNT limit
(32k). Due to using 32-bit counter, it's possible in practice to overflow
refcounter and make it wrap around to 0, causing erroneous map free, while
there are still references to it, causing use-after-free problems.
But having a failing refcounting operations are problematic in some cases. One
example is mmap() interface. After establishing initial memory-mapping, user
is allowed to arbitrarily map/remap/unmap parts of mapped memory, arbitrarily
splitting it into multiple non-contiguous regions. All this happening without
any control from the users of mmap subsystem. Rather mmap subsystem sends
notifications to original creator of memory mapping through open/close
callbacks, which are optionally specified during initial memory mapping
creation. These callbacks are used to maintain accurate refcount for bpf_map
(see next patch in this series). The problem is that open() callback is not
supposed to fail, because memory-mapped resource is set up and properly
referenced. This is posing a problem for using memory-mapping with BPF maps.
One solution to this is to maintain separate refcount for just memory-mappings
and do single bpf_map_inc/bpf_map_put when it goes from/to zero, respectively.
There are similar use cases in current work on tcp-bpf, necessitating extra
counter as well. This seems like a rather unfortunate and ugly solution that
doesn't scale well to various new use cases.
Another approach to solve this is to use non-failing refcount_t type, which
uses 32-bit counter internally, but, once reaching overflow state at UINT_MAX,
stays there. This utlimately causes memory leak, but prevents use after free.
But given refcounting is not the most performance-critical operation with BPF
maps (it's not used from running BPF program code), we can also just switch to
64-bit counter that can't overflow in practice, potentially disadvantaging
32-bit platforms a tiny bit. This simplifies semantics and allows above
described scenarios to not worry about failing refcount increment operation.
In terms of struct bpf_map size, we are still good and use the same amount of
space:
BEFORE (3 cache lines, 8 bytes of padding at the end):
struct bpf_map {
const struct bpf_map_ops * ops __attribute__((__aligned__(64))); /* 0 8 */
struct bpf_map * inner_map_meta; /* 8 8 */
void * security; /* 16 8 */
enum bpf_map_type map_type; /* 24 4 */
u32 key_size; /* 28 4 */
u32 value_size; /* 32 4 */
u32 max_entries; /* 36 4 */
u32 map_flags; /* 40 4 */
int spin_lock_off; /* 44 4 */
u32 id; /* 48 4 */
int numa_node; /* 52 4 */
u32 btf_key_type_id; /* 56 4 */
u32 btf_value_type_id; /* 60 4 */
/* --- cacheline 1 boundary (64 bytes) --- */
struct btf * btf; /* 64 8 */
struct bpf_map_memory memory; /* 72 16 */
bool unpriv_array; /* 88 1 */
bool frozen; /* 89 1 */
/* XXX 38 bytes hole, try to pack */
/* --- cacheline 2 boundary (128 bytes) --- */
atomic_t refcnt __attribute__((__aligned__(64))); /* 128 4 */
atomic_t usercnt; /* 132 4 */
struct work_struct work; /* 136 32 */
char name[16]; /* 168 16 */
/* size: 192, cachelines: 3, members: 21 */
/* sum members: 146, holes: 1, sum holes: 38 */
/* padding: 8 */
/* forced alignments: 2, forced holes: 1, sum forced holes: 38 */
} __attribute__((__aligned__(64)));
AFTER (same 3 cache lines, no extra padding now):
struct bpf_map {
const struct bpf_map_ops * ops __attribute__((__aligned__(64))); /* 0 8 */
struct bpf_map * inner_map_meta; /* 8 8 */
void * security; /* 16 8 */
enum bpf_map_type map_type; /* 24 4 */
u32 key_size; /* 28 4 */
u32 value_size; /* 32 4 */
u32 max_entries; /* 36 4 */
u32 map_flags; /* 40 4 */
int spin_lock_off; /* 44 4 */
u32 id; /* 48 4 */
int numa_node; /* 52 4 */
u32 btf_key_type_id; /* 56 4 */
u32 btf_value_type_id; /* 60 4 */
/* --- cacheline 1 boundary (64 bytes) --- */
struct btf * btf; /* 64 8 */
struct bpf_map_memory memory; /* 72 16 */
bool unpriv_array; /* 88 1 */
bool frozen; /* 89 1 */
/* XXX 38 bytes hole, try to pack */
/* --- cacheline 2 boundary (128 bytes) --- */
atomic64_t refcnt __attribute__((__aligned__(64))); /* 128 8 */
atomic64_t usercnt; /* 136 8 */
struct work_struct work; /* 144 32 */
char name[16]; /* 176 16 */
/* size: 192, cachelines: 3, members: 21 */
/* sum members: 154, holes: 1, sum holes: 38 */
/* forced alignments: 2, forced holes: 1, sum forced holes: 38 */
} __attribute__((__aligned__(64)));
This patch, while modifying all users of bpf_map_inc, also cleans up its
interface to match bpf_map_put with separate operations for bpf_map_inc and
bpf_map_inc_with_uref (to match bpf_map_put and bpf_map_put_with_uref,
respectively). Also, given there are no users of bpf_map_inc_not_zero
specifying uref=true, remove uref flag and default to uref=false internally.
Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20191117172806.2195367-2-andriin@fb.com
2019-11-17 20:28:02 +03:00
bpf_map_inc ( inner_map ) ;
2017-03-22 20:00:33 +03:00
else
inner_map = ERR_PTR ( - EINVAL ) ;
fdput ( f ) ;
return inner_map ;
}
2023-12-04 17:04:20 +03:00
void bpf_map_fd_put_ptr ( struct bpf_map * map , void * ptr , bool need_defer )
2017-03-22 20:00:33 +03:00
{
2023-12-04 17:04:22 +03:00
struct bpf_map * inner_map = ptr ;
2023-12-04 17:04:23 +03:00
/* Defer the freeing of inner map according to the sleepable attribute
* of bpf program which owns the outer map , so unnecessary waiting for
* RCU tasks trace grace period can be avoided .
2017-03-22 20:00:33 +03:00
*/
2023-12-04 17:04:23 +03:00
if ( need_defer ) {
if ( atomic64_read ( & map - > sleepable_refcnt ) )
WRITE_ONCE ( inner_map - > free_after_mult_rcu_gp , true ) ;
else
WRITE_ONCE ( inner_map - > free_after_rcu_gp , true ) ;
}
2023-12-04 17:04:22 +03:00
bpf_map_put ( inner_map ) ;
2017-03-22 20:00:33 +03:00
}
2017-06-28 09:08:34 +03:00
u32 bpf_map_fd_sys_lookup_elem ( void * ptr )
{
return ( ( struct bpf_map * ) ptr ) - > id ;
}