2022-09-02 14:10:43 -07:00
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
# include <linux/mm.h>
# include <linux/llist.h>
# include <linux/bpf.h>
# include <linux/irq_work.h>
# include <linux/bpf_mem_alloc.h>
# include <linux/memcontrol.h>
# include <asm/local.h>
/* Any context (including NMI) BPF specific memory allocator.
*
* Tracing BPF programs can attach to kprobe and fentry. Hence they
* run in unknown context where calling plain kmalloc() might not be safe.
*
* Front-end kmalloc() with per-cpu per-bucket cache of free elements.
* Refill this cache asynchronously from irq_work.
*
* CPU_0 buckets
* 16 32 64 96 128 192 256 512 1024 2048 4096
* ...
* CPU_N buckets
* 16 32 64 96 128 192 256 512 1024 2048 4096
*
* The buckets are prefilled at the start.
* BPF programs always run with migration disabled.
* It's safe to allocate from cache of the current cpu with irqs disabled.
* Freeing is always done into bucket of the current cpu as well.
* irq_work trims extra free elements from buckets with kfree
* and refills them with kmalloc, so global kmalloc logic takes care
* of freeing objects allocated by one cpu and freed on another.
*
* Every allocated object is padded with extra 8 bytes that contain
* struct llist_node.
*/
# define LLIST_NODE_SZ sizeof(struct llist_node)
/* similar to kmalloc, but sizeof == 8 bucket is gone */
/*
 * Lookup table for allocation sizes 1..192, indexed by (size - 1) / 8.
 * Each entry is a 1-based bucket number; bpf_mem_cache_idx() subtracts one
 * to obtain the cache index. Rows are grouped by the size range that
 * shares a bucket.
 */
static u8 size_index[24] __ro_after_init = {
	3, 3,				/*   1 ..  16 */
	4, 4,				/*  17 ..  32 */
	5, 5, 5, 5,			/*  33 ..  64 */
	1, 1, 1, 1,			/*  65 ..  96 */
	6, 6, 6, 6,			/*  97 .. 128 */
	2, 2, 2, 2, 2, 2, 2, 2,		/* 129 .. 192 */
};
/*
 * Translate an allocation size in bytes into a cache index.
 *
 * Returns -1 when @size is zero or larger than 4096, otherwise the index
 * of the bucket that serves @size.
 */
static int bpf_mem_cache_idx(size_t size)
{
	if (size == 0 || size > 4096)
		return -1;

	/* Above 192 bytes the index follows from the highest set bit. */
	if (size > 192)
		return fls(size - 1) - 2;

	/* Small sizes use the table, which stores index + 1. */
	return size_index[(size - 1) / 8] - 1;
}
# define NUM_CACHES 11
struct bpf_mem_cache {
/* per-cpu list of free objects of size 'unit_size'.
* All accesses are done with interrupts disabled and ' active ' counter
* protection with __llist_add ( ) and __llist_del_first ( ) .
*/
struct llist_head free_llist ;
local_t active ;
/* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
* are sequenced by per - cpu ' active ' counter . But unit_free ( ) cannot
* fail . When ' active ' is busy the unit_free ( ) will add an object to
* free_llist_extra .
*/
struct llist_head free_llist_extra ;
struct irq_work refill_work ;
struct obj_cgroup * objcg ;
int unit_size ;
/* count of objects in free_llist */
int free_cnt ;
2022-09-02 14:10:50 -07:00
int low_watermark , high_watermark , batch ;
2022-09-02 14:10:57 -07:00
int percpu_size ;
2023-07-05 20:34:40 -07:00
bool draining ;
2023-07-05 20:34:41 -07:00
struct bpf_mem_cache * tgt ;
2022-09-02 14:10:51 -07:00
2023-07-05 20:34:45 -07:00
/* list of objects to be freed after RCU GP */
struct llist_head free_by_rcu ;
struct llist_node * free_by_rcu_tail ;
struct llist_head waiting_for_gp ;
struct llist_node * waiting_for_gp_tail ;
struct rcu_head rcu ;
atomic_t call_rcu_in_progress ;
struct llist_head free_llist_extra_rcu ;
2023-07-05 20:34:34 -07:00
/* list of objects to be freed after RCU tasks trace GP */
struct llist_head free_by_rcu_ttrace ;
struct llist_head waiting_for_gp_ttrace ;
struct rcu_head rcu_ttrace ;
atomic_t call_rcu_ttrace_in_progress ;
2022-09-02 14:10:43 -07:00
} ;
/* One bpf_mem_cache per size bucket (NUM_CACHES of them). */
struct bpf_mem_caches {
	struct bpf_mem_cache cache[NUM_CACHES];
};
/*
 * Non-atomic pop of the first node of @head.
 * Returns the removed node, or NULL when the list is empty.
 */
static struct llist_node notrace *__llist_del_first(struct llist_head *head)
{
	struct llist_node *first = head->first;

	if (first)
		head->first = first->next;

	return first;
}
2023-03-22 14:52:42 -07:00
/*
 * Allocate one backing object for cache @c on NUMA node @node with @flags.
 *
 * For a per-cpu cache (c->percpu_size != 0) the result is a kmalloc-ed
 * header of c->percpu_size bytes whose second pointer slot holds a per-cpu
 * area of c->unit_size bytes. Otherwise it is a zeroed kmalloc object of
 * c->unit_size bytes. Returns NULL on allocation failure.
 */
static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
{
	void **hdr;
	void *pcpu;

	if (!c->percpu_size) {
		/* A freed element may be reused immediately while a lookup
		 * still accesses special fields of the map value (e.g.
		 * bpf_spin_lock), so those fields must not be re-initialized
		 * on reuse. Zero the object once at allocation time instead.
		 * The per-cpu path needs no __GFP_ZERO because
		 * __alloc_percpu_gfp() zeroes implicitly.
		 */
		return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node);
	}

	hdr = kmalloc_node(c->percpu_size, flags, node);
	pcpu = __alloc_percpu_gfp(c->unit_size, 8, flags);
	if (hdr && pcpu) {
		hdr[1] = pcpu;
		return hdr;
	}

	/* At least one allocation failed: release whichever one succeeded. */
	free_percpu(pcpu);
	kfree(hdr);
	return NULL;
}
/*
 * Pick the memory cgroup that allocations for @c are charged to: the one
 * derived from c->objcg when CONFIG_MEMCG_KMEM is enabled and objcg is set,
 * else root_mem_cgroup under CONFIG_MEMCG, else NULL.
 */
static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
{
#if defined(CONFIG_MEMCG_KMEM)
	if (c->objcg)
		return get_mem_cgroup_from_objcg(c->objcg);
#endif

#if defined(CONFIG_MEMCG)
	return root_mem_cgroup;
#else
	return NULL;
#endif
}
2023-07-05 20:34:38 -07:00
static void inc_active ( struct bpf_mem_cache * c , unsigned long * flags )
2023-07-05 20:34:37 -07:00
{
if ( IS_ENABLED ( CONFIG_PREEMPT_RT ) )
/* In RT irq_work runs in per-cpu kthread, so disable
* interrupts to avoid preemption and interrupts and
* reduce the chance of bpf prog executing on this cpu
* when active counter is busy .
*/
2023-07-05 20:34:38 -07:00
local_irq_save ( * flags ) ;
2023-07-05 20:34:37 -07:00
/* alloc_bulk runs from irq_work which will not preempt a bpf
* program that does unit_alloc / unit_free since IRQs are
* disabled there . There is no race to increment ' active '
* counter . It protects free_llist from corruption in case NMI
* bpf prog preempted this loop .
*/
WARN_ON_ONCE ( local_inc_return ( & c - > active ) ! = 1 ) ;
2023-07-05 20:34:38 -07:00
}
2023-07-25 22:26:40 +02:00
static void dec_active ( struct bpf_mem_cache * c , unsigned long * flags )
2023-07-05 20:34:38 -07:00
{
2023-07-05 20:34:37 -07:00
local_dec ( & c - > active ) ;
if ( IS_ENABLED ( CONFIG_PREEMPT_RT ) )
2023-07-25 22:26:40 +02:00
local_irq_restore ( * flags ) ;
2023-07-05 20:34:37 -07:00
}
2023-07-05 20:34:38 -07:00
static void add_obj_to_free_list ( struct bpf_mem_cache * c , void * obj )
{
unsigned long flags ;
inc_active ( c , & flags ) ;
__llist_add ( obj , & c - > free_llist ) ;
c - > free_cnt + + ;
2023-07-25 22:26:40 +02:00
dec_active ( c , & flags ) ;
2023-07-05 20:34:38 -07:00
}
2022-09-02 14:10:43 -07:00
/* Mostly runs from irq_work except __init phase. */
bpf: Non-atomically allocate freelist during prefill
In internal testing of test_maps, we sometimes observed failures like:
test_maps: test_maps.c:173: void test_hashmap_percpu(unsigned int, void *):
Assertion `bpf_map_update_elem(fd, &key, value, BPF_ANY) == 0' failed.
where the errno is ENOMEM. After some troubleshooting and enabling
the warnings, we saw:
[ 91.304708] percpu: allocation failed, size=8 align=8 atomic=1, atomic alloc failed, no space left
[ 91.304716] CPU: 51 PID: 24145 Comm: test_maps Kdump: loaded Tainted: G N 6.1.38-smp-DEV #7
[ 91.304719] Hardware name: Google Astoria/astoria, BIOS 0.20230627.0-0 06/27/2023
[ 91.304721] Call Trace:
[ 91.304724] <TASK>
[ 91.304730] [<ffffffffa7ef83b9>] dump_stack_lvl+0x59/0x88
[ 91.304737] [<ffffffffa7ef83f8>] dump_stack+0x10/0x18
[ 91.304738] [<ffffffffa75caa0c>] pcpu_alloc+0x6fc/0x870
[ 91.304741] [<ffffffffa75ca302>] __alloc_percpu_gfp+0x12/0x20
[ 91.304743] [<ffffffffa756785e>] alloc_bulk+0xde/0x1e0
[ 91.304746] [<ffffffffa7566c02>] bpf_mem_alloc_init+0xd2/0x2f0
[ 91.304747] [<ffffffffa7547c69>] htab_map_alloc+0x479/0x650
[ 91.304750] [<ffffffffa751d6e0>] map_create+0x140/0x2e0
[ 91.304752] [<ffffffffa751d413>] __sys_bpf+0x5a3/0x6c0
[ 91.304753] [<ffffffffa751c3ec>] __x64_sys_bpf+0x1c/0x30
[ 91.304754] [<ffffffffa7ef847a>] do_syscall_64+0x5a/0x80
[ 91.304756] [<ffffffffa800009b>] entry_SYSCALL_64_after_hwframe+0x63/0xcd
This makes sense, because in atomic context, percpu allocation would
not create new chunks; it would only create in non-atomic contexts.
And if during prefill all precpu chunks are full, -ENOMEM would
happen immediately upon next unit_alloc.
Prefill phase does not actually run in atomic context, so we can
use this fact to allocate non-atomically with GFP_KERNEL instead
of GFP_NOWAIT. This avoids the immediate -ENOMEM.
GFP_NOWAIT has to be used in unit_alloc when bpf program runs
in atomic context. Even if bpf program runs in non-atomic context,
in most cases, rcu read lock is enabled for the program so
GFP_NOWAIT is still needed. This is often also the case for
BPF_MAP_UPDATE_ELEM syscalls.
Signed-off-by: YiFei Zhu <zhuyifei@google.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20230728043359.3324347-1-zhuyifei@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2023-07-28 04:33:59 +00:00
/*
 * Add up to @cnt objects to c->free_llist.
 *
 * Objects are taken, in this order, from c->free_by_rcu_ttrace, then from
 * c->waiting_for_gp_ttrace, and finally from fresh __alloc() calls on
 * @node. The function stops early as soon as a fresh allocation fails.
 *
 * @atomic selects GFP_NOWAIT (true) or GFP_KERNEL (false) for the fresh
 * allocations. The prefill phase does not run in atomic context and passes
 * false: atomic per-cpu allocation does not create new chunks, so it could
 * fail with -ENOMEM right away when the existing chunks are full.
 */
static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
{
	struct mem_cgroup *memcg, *old_memcg;
	int filled = 0;
	void *obj;
	gfp_t gfp;

	gfp = (atomic ? GFP_NOWAIT : GFP_KERNEL) | __GFP_NOWARN | __GFP_ACCOUNT;

	/* For every 'c', llist_del_first(&c->free_by_rcu_ttrace) is done by
	 * one CPU only (the current one). Other CPUs might llist_add() and
	 * llist_del_all() in parallel.
	 */
	while (filled < cnt) {
		obj = llist_del_first(&c->free_by_rcu_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
		filled++;
	}
	if (filled >= cnt)
		return;

	/* Next, reuse objects still waiting for a tasks trace grace period. */
	while (filled < cnt) {
		obj = llist_del_first(&c->waiting_for_gp_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
		filled++;
	}
	if (filled >= cnt)
		return;

	memcg = get_memcg(c);
	old_memcg = set_active_memcg(memcg);
	while (filled < cnt) {
		/* Allocate, but don't deplete atomic reserves the way a
		 * typical GFP_ATOMIC would. irq_work runs on this cpu and
		 * kmalloc will allocate from the current numa node, which
		 * is what we want here.
		 */
		obj = __alloc(c, node, gfp);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
		filled++;
	}
	set_active_memcg(old_memcg);
	mem_cgroup_put(memcg);
}
2023-06-06 11:53:08 +08:00
/*
 * Release one object. When @percpu is true the object is a header whose
 * second pointer slot holds a per-cpu area, which is released first.
 */
static void free_one(void *obj, bool percpu)
{
	if (percpu) {
		void **hdr = obj;

		free_percpu(hdr[1]);
	}

	kfree(obj);
}
2023-07-05 20:34:36 -07:00
/*
 * Free every object on the chain starting at @llnode via free_one().
 * Returns the number of objects freed (0 for an empty chain).
 */
static int free_all(struct llist_node *llnode, bool percpu)
{
	struct llist_node *cur, *next;
	int freed = 0;

	llist_for_each_safe(cur, next, llnode) {
		free_one(cur, percpu);
		freed += 1;
	}

	return freed;
}
static void __free_rcu ( struct rcu_head * head )
{
2023-07-05 20:34:34 -07:00
struct bpf_mem_cache * c = container_of ( head , struct bpf_mem_cache , rcu_ttrace ) ;
2023-06-06 11:53:08 +08:00
2023-07-05 20:34:34 -07:00
free_all ( llist_del_all ( & c - > waiting_for_gp_ttrace ) , ! ! c - > percpu_size ) ;
atomic_set ( & c - > call_rcu_ttrace_in_progress , 0 ) ;
2022-09-02 14:10:51 -07:00
}
2022-09-02 14:10:55 -07:00
/*
 * Callback passed to call_rcu_tasks_trace(). Chains into a regular RCU
 * grace period before freeing, unless the tasks trace grace period already
 * implies one.
 */
static void __free_rcu_tasks_trace(struct rcu_head *head)
{
	if (!rcu_trace_implies_rcu_gp()) {
		call_rcu(head, __free_rcu);
		return;
	}

	/* The RCU Tasks Trace grace period implies an RCU grace period, so
	 * there is no need to invoke call_rcu().
	 */
	__free_rcu(head);
}
2022-09-02 14:10:51 -07:00
static void enque_to_free ( struct bpf_mem_cache * c , void * obj )
{
struct llist_node * llnode = obj ;
/* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
2023-07-05 20:34:34 -07:00
* Nothing races to add to free_by_rcu_ttrace list .
2022-09-02 14:10:51 -07:00
*/
2023-07-05 20:34:41 -07:00
llist_add ( llnode , & c - > free_by_rcu_ttrace ) ;
2022-09-02 14:10:51 -07:00
}
2023-07-05 20:34:34 -07:00
static void do_call_rcu_ttrace ( struct bpf_mem_cache * c )
2022-09-02 14:10:51 -07:00
{
struct llist_node * llnode , * t ;
2023-07-05 20:34:41 -07:00
if ( atomic_xchg ( & c - > call_rcu_ttrace_in_progress , 1 ) ) {
if ( unlikely ( READ_ONCE ( c - > draining ) ) ) {
llnode = llist_del_all ( & c - > free_by_rcu_ttrace ) ;
free_all ( llnode , ! ! c - > percpu_size ) ;
}
2022-09-02 14:10:51 -07:00
return ;
2023-07-05 20:34:41 -07:00
}
2022-09-02 14:10:51 -07:00
2023-07-05 20:34:34 -07:00
WARN_ON_ONCE ( ! llist_empty ( & c - > waiting_for_gp_ttrace ) ) ;
2023-07-05 20:34:41 -07:00
llist_for_each_safe ( llnode , t , llist_del_all ( & c - > free_by_rcu_ttrace ) )
2023-07-05 20:34:42 -07:00
llist_add ( llnode , & c - > waiting_for_gp_ttrace ) ;
2023-07-05 20:34:40 -07:00
if ( unlikely ( READ_ONCE ( c - > draining ) ) ) {
__free_rcu ( & c - > rcu_ttrace ) ;
return ;
}
2022-09-02 14:10:55 -07:00
/* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
2022-10-14 19:39:44 +08:00
* If RCU Tasks Trace grace period implies RCU grace period , free
* these elements directly , else use call_rcu ( ) to wait for normal
* progs to finish and finally do free_one ( ) on each element .
2022-09-02 14:10:55 -07:00
*/
2023-07-05 20:34:34 -07:00
call_rcu_tasks_trace ( & c - > rcu_ttrace , __free_rcu_tasks_trace ) ;
2022-09-02 14:10:51 -07:00
}
2022-09-02 14:10:43 -07:00
static void free_bulk ( struct bpf_mem_cache * c )
{
2023-07-05 20:34:41 -07:00
struct bpf_mem_cache * tgt = c - > tgt ;
2022-09-02 14:10:43 -07:00
struct llist_node * llnode , * t ;
unsigned long flags ;
int cnt ;
2023-07-05 20:34:41 -07:00
WARN_ON_ONCE ( tgt - > unit_size ! = c - > unit_size ) ;
2022-09-02 14:10:43 -07:00
do {
2023-07-05 20:34:38 -07:00
inc_active ( c , & flags ) ;
2022-09-02 14:10:43 -07:00
llnode = __llist_del_first ( & c - > free_llist ) ;
if ( llnode )
cnt = - - c - > free_cnt ;
else
cnt = 0 ;
2023-07-25 22:26:40 +02:00
dec_active ( c , & flags ) ;
2022-09-19 22:48:11 +08:00
if ( llnode )
2023-07-05 20:34:41 -07:00
enque_to_free ( tgt , llnode ) ;
2022-09-02 14:10:50 -07:00
} while ( cnt > ( c - > high_watermark + c - > low_watermark ) / 2 ) ;
2022-09-02 14:10:43 -07:00
/* and drain free_llist_extra */
llist_for_each_safe ( llnode , t , llist_del_all ( & c - > free_llist_extra ) )
2023-07-05 20:34:41 -07:00
enque_to_free ( tgt , llnode ) ;
do_call_rcu_ttrace ( tgt ) ;
2022-09-02 14:10:43 -07:00
}
2023-07-05 20:34:45 -07:00
static void __free_by_rcu ( struct rcu_head * head )
{
struct bpf_mem_cache * c = container_of ( head , struct bpf_mem_cache , rcu ) ;
struct bpf_mem_cache * tgt = c - > tgt ;
struct llist_node * llnode ;
llnode = llist_del_all ( & c - > waiting_for_gp ) ;
if ( ! llnode )
goto out ;
llist_add_batch ( llnode , c - > waiting_for_gp_tail , & tgt - > free_by_rcu_ttrace ) ;
/* Objects went through regular RCU GP. Send them to RCU tasks trace */
do_call_rcu_ttrace ( tgt ) ;
out :
atomic_set ( & c - > call_rcu_in_progress , 0 ) ;
}
static void check_free_by_rcu ( struct bpf_mem_cache * c )
{
struct llist_node * llnode , * t ;
unsigned long flags ;
/* drain free_llist_extra_rcu */
if ( unlikely ( ! llist_empty ( & c - > free_llist_extra_rcu ) ) ) {
inc_active ( c , & flags ) ;
llist_for_each_safe ( llnode , t , llist_del_all ( & c - > free_llist_extra_rcu ) )
if ( __llist_add ( llnode , & c - > free_by_rcu ) )
c - > free_by_rcu_tail = llnode ;
2023-07-25 22:26:40 +02:00
dec_active ( c , & flags ) ;
2023-07-05 20:34:45 -07:00
}
if ( llist_empty ( & c - > free_by_rcu ) )
return ;
if ( atomic_xchg ( & c - > call_rcu_in_progress , 1 ) ) {
/*
* Instead of kmalloc - ing new rcu_head and triggering 10 k
* call_rcu ( ) to hit rcutree . qhimark and force RCU to notice
* the overload just ask RCU to hurry up . There could be many
* objects in free_by_rcu list .
* This hint reduces memory consumption for an artificial
* benchmark from 2 Gbyte to 150 Mbyte .
*/
rcu_request_urgent_qs_task ( current ) ;
return ;
}
WARN_ON_ONCE ( ! llist_empty ( & c - > waiting_for_gp ) ) ;
inc_active ( c , & flags ) ;
WRITE_ONCE ( c - > waiting_for_gp . first , __llist_del_all ( & c - > free_by_rcu ) ) ;
c - > waiting_for_gp_tail = c - > free_by_rcu_tail ;
2023-07-25 22:26:40 +02:00
dec_active ( c , & flags ) ;
2023-07-05 20:34:45 -07:00
if ( unlikely ( READ_ONCE ( c - > draining ) ) ) {
free_all ( llist_del_all ( & c - > waiting_for_gp ) , ! ! c - > percpu_size ) ;
atomic_set ( & c - > call_rcu_in_progress , 0 ) ;
} else {
call_rcu_hurry ( & c - > rcu , __free_by_rcu ) ;
}
}
2022-09-02 14:10:43 -07:00
static void bpf_mem_refill ( struct irq_work * work )
{
struct bpf_mem_cache * c = container_of ( work , struct bpf_mem_cache , refill_work ) ;
int cnt ;
/* Racy access to free_cnt. It doesn't need to be 100% accurate */
cnt = c - > free_cnt ;
2022-09-02 14:10:50 -07:00
if ( cnt < c - > low_watermark )
2022-09-02 14:10:43 -07:00
/* irq_work runs on this cpu and kmalloc will allocate
* from the current numa node which is what we want here .
*/
bpf: Non-atomically allocate freelist during prefill
In internal testing of test_maps, we sometimes observed failures like:
test_maps: test_maps.c:173: void test_hashmap_percpu(unsigned int, void *):
Assertion `bpf_map_update_elem(fd, &key, value, BPF_ANY) == 0' failed.
where the errno is ENOMEM. After some troubleshooting and enabling
the warnings, we saw:
[ 91.304708] percpu: allocation failed, size=8 align=8 atomic=1, atomic alloc failed, no space left
[ 91.304716] CPU: 51 PID: 24145 Comm: test_maps Kdump: loaded Tainted: G N 6.1.38-smp-DEV #7
[ 91.304719] Hardware name: Google Astoria/astoria, BIOS 0.20230627.0-0 06/27/2023
[ 91.304721] Call Trace:
[ 91.304724] <TASK>
[ 91.304730] [<ffffffffa7ef83b9>] dump_stack_lvl+0x59/0x88
[ 91.304737] [<ffffffffa7ef83f8>] dump_stack+0x10/0x18
[ 91.304738] [<ffffffffa75caa0c>] pcpu_alloc+0x6fc/0x870
[ 91.304741] [<ffffffffa75ca302>] __alloc_percpu_gfp+0x12/0x20
[ 91.304743] [<ffffffffa756785e>] alloc_bulk+0xde/0x1e0
[ 91.304746] [<ffffffffa7566c02>] bpf_mem_alloc_init+0xd2/0x2f0
[ 91.304747] [<ffffffffa7547c69>] htab_map_alloc+0x479/0x650
[ 91.304750] [<ffffffffa751d6e0>] map_create+0x140/0x2e0
[ 91.304752] [<ffffffffa751d413>] __sys_bpf+0x5a3/0x6c0
[ 91.304753] [<ffffffffa751c3ec>] __x64_sys_bpf+0x1c/0x30
[ 91.304754] [<ffffffffa7ef847a>] do_syscall_64+0x5a/0x80
[ 91.304756] [<ffffffffa800009b>] entry_SYSCALL_64_after_hwframe+0x63/0xcd
This makes sense, because in atomic context, percpu allocation would
not create new chunks; it would only create in non-atomic contexts.
And if during prefill all precpu chunks are full, -ENOMEM would
happen immediately upon next unit_alloc.
Prefill phase does not actually run in atomic context, so we can
use this fact to allocate non-atomically with GFP_KERNEL instead
of GFP_NOWAIT. This avoids the immediate -ENOMEM.
GFP_NOWAIT has to be used in unit_alloc when bpf program runs
in atomic context. Even if bpf program runs in non-atomic context,
in most cases, rcu read lock is enabled for the program so
GFP_NOWAIT is still needed. This is often also the case for
BPF_MAP_UPDATE_ELEM syscalls.
Signed-off-by: YiFei Zhu <zhuyifei@google.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20230728043359.3324347-1-zhuyifei@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2023-07-28 04:33:59 +00:00
alloc_bulk ( c , c - > batch , NUMA_NO_NODE , true ) ;
2022-09-02 14:10:50 -07:00
else if ( cnt > c - > high_watermark )
2022-09-02 14:10:43 -07:00
free_bulk ( c ) ;
2023-07-05 20:34:45 -07:00
check_free_by_rcu ( c ) ;
2022-09-02 14:10:43 -07:00
}
/* Queue the refill irq_work (bpf_mem_refill) of cache @c. */
static void notrace irq_work_raise(struct bpf_mem_cache *c)
{
	irq_work_queue(&c->refill_work);
}
2022-09-02 14:10:50 -07:00
/* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket
* the freelist cache will be elem_size * 64 ( or less ) on each cpu .
*
* For bpf programs that don't have statically known allocation sizes and
* assuming (low_mark + high_mark) / 2 as an average number of elements per
* bucket and all buckets are used the total amount of memory in freelists
* on each cpu will be:
* 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*192 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
* == ~ 116 Kbyte using below heuristic.
* Initialized, but unused bpf allocator (not bpf map specific one) will
* consume ~ 11 Kbyte per cpu.
* Typical case will be between 11K and 116K closer to 11K.
* bpf progs can and should share bpf_mem_cache when possible .
*/
2023-09-08 21:39:21 +08:00
static void init_refill_work ( struct bpf_mem_cache * c )
2022-09-02 14:10:43 -07:00
{
init_irq_work ( & c - > refill_work , bpf_mem_refill ) ;
2022-09-02 14:10:50 -07:00
if ( c - > unit_size < = 256 ) {
c - > low_watermark = 32 ;
c - > high_watermark = 96 ;
} else {
/* When page_size == 4k, order-0 cache will have low_mark == 2
* and high_mark = = 6 with batch alloc of 3 individual pages at
* a time .
* 8 k allocs and above low = = 1 , high = = 3 , batch = = 1.
*/
c - > low_watermark = max ( 32 * 256 / c - > unit_size , 1 ) ;
c - > high_watermark = max ( 96 * 256 / c - > unit_size , 3 ) ;
}
c - > batch = max ( ( c - > high_watermark - c - > low_watermark ) / 4 * 3 , 1 ) ;
2023-09-08 21:39:21 +08:00
}
2022-09-02 14:10:50 -07:00
2023-09-08 21:39:21 +08:00
static void prefill_mem_cache ( struct bpf_mem_cache * c , int cpu )
{
2022-09-02 14:10:43 -07:00
/* To avoid consuming memory assume that 1st run of bpf
* prog won ' t be doing more than 4 map_update_elem from
* irq disabled region
*/
bpf: Non-atomically allocate freelist during prefill
In internal testing of test_maps, we sometimes observed failures like:
test_maps: test_maps.c:173: void test_hashmap_percpu(unsigned int, void *):
Assertion `bpf_map_update_elem(fd, &key, value, BPF_ANY) == 0' failed.
where the errno is ENOMEM. After some troubleshooting and enabling
the warnings, we saw:
[ 91.304708] percpu: allocation failed, size=8 align=8 atomic=1, atomic alloc failed, no space left
[ 91.304716] CPU: 51 PID: 24145 Comm: test_maps Kdump: loaded Tainted: G N 6.1.38-smp-DEV #7
[ 91.304719] Hardware name: Google Astoria/astoria, BIOS 0.20230627.0-0 06/27/2023
[ 91.304721] Call Trace:
[ 91.304724] <TASK>
[ 91.304730] [<ffffffffa7ef83b9>] dump_stack_lvl+0x59/0x88
[ 91.304737] [<ffffffffa7ef83f8>] dump_stack+0x10/0x18
[ 91.304738] [<ffffffffa75caa0c>] pcpu_alloc+0x6fc/0x870
[ 91.304741] [<ffffffffa75ca302>] __alloc_percpu_gfp+0x12/0x20
[ 91.304743] [<ffffffffa756785e>] alloc_bulk+0xde/0x1e0
[ 91.304746] [<ffffffffa7566c02>] bpf_mem_alloc_init+0xd2/0x2f0
[ 91.304747] [<ffffffffa7547c69>] htab_map_alloc+0x479/0x650
[ 91.304750] [<ffffffffa751d6e0>] map_create+0x140/0x2e0
[ 91.304752] [<ffffffffa751d413>] __sys_bpf+0x5a3/0x6c0
[ 91.304753] [<ffffffffa751c3ec>] __x64_sys_bpf+0x1c/0x30
[ 91.304754] [<ffffffffa7ef847a>] do_syscall_64+0x5a/0x80
[ 91.304756] [<ffffffffa800009b>] entry_SYSCALL_64_after_hwframe+0x63/0xcd
This makes sense, because in atomic context, percpu allocation would
not create new chunks; it would only create in non-atomic contexts.
And if during prefill all precpu chunks are full, -ENOMEM would
happen immediately upon next unit_alloc.
Prefill phase does not actually run in atomic context, so we can
use this fact to allocate non-atomically with GFP_KERNEL instead
of GFP_NOWAIT. This avoids the immediate -ENOMEM.
GFP_NOWAIT has to be used in unit_alloc when bpf program runs
in atomic context. Even if bpf program runs in non-atomic context,
in most cases, rcu read lock is enabled for the program so
GFP_NOWAIT is still needed. This is often also the case for
BPF_MAP_UPDATE_ELEM syscalls.
Signed-off-by: YiFei Zhu <zhuyifei@google.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20230728043359.3324347-1-zhuyifei@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2023-07-28 04:33:59 +00:00
alloc_bulk ( c , c - > unit_size < = 256 ? 4 : 1 , cpu_to_node ( cpu ) , false ) ;
2022-09-02 14:10:43 -07:00
}
2023-09-08 21:39:22 +08:00
static int check_obj_size ( struct bpf_mem_cache * c , unsigned int idx )
{
struct llist_node * first ;
unsigned int obj_size ;
2023-09-13 21:59:43 +08:00
/* For per-cpu allocator, the size of free objects in free list doesn't
* match with unit_size and now there is no way to get the size of
* per - cpu pointer saved in free object , so just skip the checking .
*/
if ( c - > percpu_size )
return 0 ;
2023-09-08 21:39:22 +08:00
first = c - > free_llist . first ;
if ( ! first )
return 0 ;
obj_size = ksize ( first ) ;
if ( obj_size ! = c - > unit_size ) {
WARN_ONCE ( 1 , " bpf_mem_cache[%u]: unexpected object size %u, expect %u \n " ,
idx , obj_size , c - > unit_size ) ;
return - EINVAL ;
}
return 0 ;
}
2022-09-02 14:10:57 -07:00
/* When size != 0 bpf_mem_cache for each cpu.
2022-09-02 14:10:43 -07:00
* This is typical bpf hash map use case when all elements have equal size .
*
* When size = = 0 allocate 11 bpf_mem_cache - s for each cpu , then rely on
* kmalloc / kfree . Max allocation size is 4096 in this case .
* This is bpf_dynptr and bpf_kptr use case .
*/
2022-09-02 14:10:52 -07:00
int bpf_mem_alloc_init ( struct bpf_mem_alloc * ma , int size , bool percpu )
2022-09-02 14:10:43 -07:00
{
static u16 sizes [ NUM_CACHES ] = { 96 , 192 , 16 , 32 , 64 , 128 , 256 , 512 , 1024 , 2048 , 4096 } ;
2023-09-08 21:39:22 +08:00
int cpu , i , err , unit_size , percpu_size = 0 ;
2022-09-02 14:10:43 -07:00
struct bpf_mem_caches * cc , __percpu * pcc ;
struct bpf_mem_cache * c , __percpu * pc ;
struct obj_cgroup * objcg = NULL ;
if ( size ) {
pc = __alloc_percpu_gfp ( sizeof ( * pc ) , 8 , GFP_KERNEL ) ;
if ( ! pc )
return - ENOMEM ;
2022-09-02 14:10:52 -07:00
2022-09-02 14:10:57 -07:00
if ( percpu )
2022-09-02 14:10:52 -07:00
/* room for llist_node and per-cpu pointer */
2022-09-02 14:10:57 -07:00
percpu_size = LLIST_NODE_SZ + sizeof ( void * ) ;
else
2022-09-02 14:10:52 -07:00
size + = LLIST_NODE_SZ ; /* room for llist_node */
2022-09-02 14:10:57 -07:00
unit_size = size ;
2022-09-02 14:10:52 -07:00
2022-09-02 14:10:43 -07:00
# ifdef CONFIG_MEMCG_KMEM
2023-02-10 15:47:33 +00:00
if ( memcg_bpf_enabled ( ) )
objcg = get_obj_cgroup_from_current ( ) ;
2022-09-02 14:10:43 -07:00
# endif
for_each_possible_cpu ( cpu ) {
c = per_cpu_ptr ( pc , cpu ) ;
2022-09-02 14:10:52 -07:00
c - > unit_size = unit_size ;
2022-09-02 14:10:43 -07:00
c - > objcg = objcg ;
2022-09-02 14:10:57 -07:00
c - > percpu_size = percpu_size ;
2023-07-05 20:34:41 -07:00
c - > tgt = c ;
2023-09-08 21:39:21 +08:00
init_refill_work ( c ) ;
2022-09-02 14:10:43 -07:00
prefill_mem_cache ( c , cpu ) ;
}
ma - > cache = pc ;
return 0 ;
}
2022-09-02 14:10:52 -07:00
/* size == 0 && percpu is an invalid combination */
if ( WARN_ON_ONCE ( percpu ) )
return - EINVAL ;
2022-09-02 14:10:43 -07:00
pcc = __alloc_percpu_gfp ( sizeof ( * cc ) , 8 , GFP_KERNEL ) ;
if ( ! pcc )
return - ENOMEM ;
2023-09-08 21:39:22 +08:00
err = 0 ;
2022-09-02 14:10:43 -07:00
# ifdef CONFIG_MEMCG_KMEM
objcg = get_obj_cgroup_from_current ( ) ;
# endif
for_each_possible_cpu ( cpu ) {
cc = per_cpu_ptr ( pcc , cpu ) ;
for ( i = 0 ; i < NUM_CACHES ; i + + ) {
c = & cc - > cache [ i ] ;
c - > unit_size = sizes [ i ] ;
c - > objcg = objcg ;
2023-07-05 20:34:41 -07:00
c - > tgt = c ;
2023-09-08 21:39:21 +08:00
init_refill_work ( c ) ;
/* Another bpf_mem_cache will be used when allocating
* c - > unit_size in bpf_mem_alloc ( ) , so doesn ' t prefill
* for the bpf_mem_cache because these free objects will
* never be used .
*/
if ( i ! = bpf_mem_cache_idx ( c - > unit_size ) )
continue ;
2022-09-02 14:10:43 -07:00
prefill_mem_cache ( c , cpu ) ;
2023-09-08 21:39:22 +08:00
err = check_obj_size ( c , i ) ;
if ( err )
goto out ;
2022-09-02 14:10:43 -07:00
}
}
2023-09-08 21:39:22 +08:00
out :
2022-09-02 14:10:43 -07:00
ma - > caches = pcc ;
2023-09-08 21:39:22 +08:00
/* refill_work is either zeroed or initialized, so it is safe to
* call irq_work_sync ( ) .
*/
if ( err )
bpf_mem_alloc_destroy ( ma ) ;
return err ;
2022-09-02 14:10:43 -07:00
}
static void drain_mem_cache ( struct bpf_mem_cache * c )
{
2023-06-06 11:53:08 +08:00
bool percpu = ! ! c - > percpu_size ;
2022-09-02 14:10:43 -07:00
2022-09-02 14:10:58 -07:00
/* No progs are using this bpf_mem_cache, but htab_map_free() called
* bpf_mem_cache_free ( ) for all remaining elements and they can be in
2023-07-05 20:34:34 -07:00
* free_by_rcu_ttrace or in waiting_for_gp_ttrace lists , so drain those lists now .
2022-10-21 19:49:13 +08:00
*
2023-07-05 20:34:34 -07:00
* Except for waiting_for_gp_ttrace list , there are no concurrent operations
2022-10-21 19:49:13 +08:00
* on these lists , so it is safe to use __llist_del_all ( ) .
2022-09-02 14:10:51 -07:00
*/
2023-07-05 20:34:41 -07:00
free_all ( llist_del_all ( & c - > free_by_rcu_ttrace ) , percpu ) ;
2023-07-05 20:34:34 -07:00
free_all ( llist_del_all ( & c - > waiting_for_gp_ttrace ) , percpu ) ;
2023-06-06 11:53:08 +08:00
free_all ( __llist_del_all ( & c - > free_llist ) , percpu ) ;
free_all ( __llist_del_all ( & c - > free_llist_extra ) , percpu ) ;
2023-07-05 20:34:45 -07:00
free_all ( __llist_del_all ( & c - > free_by_rcu ) , percpu ) ;
free_all ( __llist_del_all ( & c - > free_llist_extra_rcu ) , percpu ) ;
free_all ( llist_del_all ( & c - > waiting_for_gp ) , percpu ) ;
2022-09-02 14:10:43 -07:00
}
2023-07-05 20:34:47 -07:00
static void check_mem_cache ( struct bpf_mem_cache * c )
{
WARN_ON_ONCE ( ! llist_empty ( & c - > free_by_rcu_ttrace ) ) ;
WARN_ON_ONCE ( ! llist_empty ( & c - > waiting_for_gp_ttrace ) ) ;
WARN_ON_ONCE ( ! llist_empty ( & c - > free_llist ) ) ;
WARN_ON_ONCE ( ! llist_empty ( & c - > free_llist_extra ) ) ;
WARN_ON_ONCE ( ! llist_empty ( & c - > free_by_rcu ) ) ;
WARN_ON_ONCE ( ! llist_empty ( & c - > free_llist_extra_rcu ) ) ;
WARN_ON_ONCE ( ! llist_empty ( & c - > waiting_for_gp ) ) ;
}
static void check_leaked_objs ( struct bpf_mem_alloc * ma )
{
struct bpf_mem_caches * cc ;
struct bpf_mem_cache * c ;
int cpu , i ;
if ( ma - > cache ) {
for_each_possible_cpu ( cpu ) {
c = per_cpu_ptr ( ma - > cache , cpu ) ;
check_mem_cache ( c ) ;
}
}
if ( ma - > caches ) {
for_each_possible_cpu ( cpu ) {
cc = per_cpu_ptr ( ma - > caches , cpu ) ;
for ( i = 0 ; i < NUM_CACHES ; i + + ) {
c = & cc - > cache [ i ] ;
check_mem_cache ( c ) ;
}
}
}
}
2022-09-02 14:10:58 -07:00
static void free_mem_alloc_no_barrier ( struct bpf_mem_alloc * ma )
{
2023-07-05 20:34:47 -07:00
check_leaked_objs ( ma ) ;
2022-09-02 14:10:58 -07:00
free_percpu ( ma - > cache ) ;
free_percpu ( ma - > caches ) ;
ma - > cache = NULL ;
ma - > caches = NULL ;
}
/* Wait for the pending RCU callbacks of @ma, then release its per-cpu
 * storage.
 */
static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
	/* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
	 * might still execute. Wait for them.
	 *
	 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
	 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
	 * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
	 * so if call_rcu(head, __free_rcu) is skipped due to
	 * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
	 * using rcu_trace_implies_rcu_gp() as well.
	 */
	rcu_barrier(); /* wait for __free_by_rcu */
	rcu_barrier_tasks_trace(); /* wait for __free_rcu */
	if (!rcu_trace_implies_rcu_gp())
		rcu_barrier();

	free_mem_alloc_no_barrier(ma);
}
/* Work item handler: finish tearing down the bpf_mem_alloc copy that embeds
 * @work and then free the copy itself.
 */
static void free_mem_alloc_deferred(struct work_struct *work)
{
	struct bpf_mem_alloc *copy;

	copy = container_of(work, struct bpf_mem_alloc, work);
	free_mem_alloc(copy);
	kfree(copy);
}
static void destroy_mem_alloc ( struct bpf_mem_alloc * ma , int rcu_in_progress )
{
struct bpf_mem_alloc * copy ;
if ( ! rcu_in_progress ) {
/* Fast path. No callbacks are pending, hence no need to do
* rcu_barrier - s .
*/
free_mem_alloc_no_barrier ( ma ) ;
return ;
}
2023-07-05 20:34:35 -07:00
copy = kmemdup ( ma , sizeof ( * ma ) , GFP_KERNEL ) ;
2022-09-02 14:10:58 -07:00
if ( ! copy ) {
/* Slow path with inline barrier-s */
free_mem_alloc ( ma ) ;
return ;
}
/* Defer barriers into worker to let the rest of map memory to be freed */
2023-07-05 20:34:35 -07:00
memset ( ma , 0 , sizeof ( * ma ) ) ;
2022-09-02 14:10:58 -07:00
INIT_WORK ( & copy - > work , free_mem_alloc_deferred ) ;
queue_work ( system_unbound_wq , & copy - > work ) ;
}
2022-09-02 14:10:43 -07:00
void bpf_mem_alloc_destroy ( struct bpf_mem_alloc * ma )
{
struct bpf_mem_caches * cc ;
struct bpf_mem_cache * c ;
2022-09-02 14:10:58 -07:00
int cpu , i , rcu_in_progress ;
2022-09-02 14:10:43 -07:00
if ( ma - > cache ) {
2022-09-02 14:10:58 -07:00
rcu_in_progress = 0 ;
2022-09-02 14:10:43 -07:00
for_each_possible_cpu ( cpu ) {
c = per_cpu_ptr ( ma - > cache , cpu ) ;
2023-07-05 20:34:40 -07:00
WRITE_ONCE ( c - > draining , true ) ;
2022-10-21 19:49:12 +08:00
irq_work_sync ( & c - > refill_work ) ;
2022-09-02 14:10:43 -07:00
drain_mem_cache ( c ) ;
2023-07-05 20:34:34 -07:00
rcu_in_progress + = atomic_read ( & c - > call_rcu_ttrace_in_progress ) ;
2023-07-05 20:34:45 -07:00
rcu_in_progress + = atomic_read ( & c - > call_rcu_in_progress ) ;
2022-09-02 14:10:43 -07:00
}
2022-09-02 14:10:57 -07:00
/* objcg is the same across cpus */
2022-09-02 14:10:43 -07:00
if ( c - > objcg )
obj_cgroup_put ( c - > objcg ) ;
2022-09-02 14:10:58 -07:00
destroy_mem_alloc ( ma , rcu_in_progress ) ;
2022-09-02 14:10:43 -07:00
}
if ( ma - > caches ) {
2022-09-02 14:10:58 -07:00
rcu_in_progress = 0 ;
2022-09-02 14:10:43 -07:00
for_each_possible_cpu ( cpu ) {
cc = per_cpu_ptr ( ma - > caches , cpu ) ;
for ( i = 0 ; i < NUM_CACHES ; i + + ) {
c = & cc - > cache [ i ] ;
2023-07-05 20:34:40 -07:00
WRITE_ONCE ( c - > draining , true ) ;
2022-10-21 19:49:12 +08:00
irq_work_sync ( & c - > refill_work ) ;
2022-09-02 14:10:43 -07:00
drain_mem_cache ( c ) ;
2023-07-05 20:34:34 -07:00
rcu_in_progress + = atomic_read ( & c - > call_rcu_ttrace_in_progress ) ;
2023-07-05 20:34:45 -07:00
rcu_in_progress + = atomic_read ( & c - > call_rcu_in_progress ) ;
2022-09-02 14:10:43 -07:00
}
}
if ( c - > objcg )
obj_cgroup_put ( c - > objcg ) ;
2022-09-02 14:10:58 -07:00
destroy_mem_alloc ( ma , rcu_in_progress ) ;
2022-09-02 14:10:43 -07:00
}
}
/* notrace is necessary here and in other functions to make sure
 * bpf programs cannot attach to them and cause llist corruptions.
 *
 * Pop one object from the free list of @c. Returns the llist_node at the
 * start of the object, or NULL when the list is empty or currently in use
 * by an interrupted context.
 */
static void notrace *unit_alloc(struct bpf_mem_cache *c)
{
	struct llist_node *node = NULL;
	unsigned long irq_flags;
	int left = 0;

	/* Disable irqs to prevent the following race for majority of prog types:
	 * prog_A
	 *   bpf_mem_alloc
	 *      preemption or irq -> prog_B
	 *        bpf_mem_alloc
	 *
	 * but prog_B could be a perf_event NMI prog.
	 * Use per-cpu 'active' counter to order free_list access between
	 * unit_alloc/unit_free/bpf_mem_refill.
	 */
	local_irq_save(irq_flags);
	if (local_inc_return(&c->active) == 1) {
		node = __llist_del_first(&c->free_llist);
		if (node) {
			left = --c->free_cnt;
			/* Record the cache the object was taken from in the
			 * space used by the llist_node; unit_free() reads it
			 * back as a hint.
			 */
			*(struct bpf_mem_cache **)node = c;
		}
	}
	local_dec(&c->active);
	local_irq_restore(irq_flags);

	WARN_ON(left < 0);

	/* Running low: ask irq_work to refill this cache. */
	if (left < c->low_watermark)
		irq_work_raise(c);
	return node;
}
/* Though 'ptr' object could have been allocated on a different cpu
 * add it to the free_llist of the current cpu.
 * Let kfree() logic deal with it when it's later called from irq_work.
 *
 * @ptr is the pointer handed out to the user, i.e. LLIST_NODE_SZ bytes past
 * the start of the object.
 */
static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
{
	struct llist_node *node = ptr - LLIST_NODE_SZ;
	unsigned long irq_flags;
	int held = 0;

	BUILD_BUG_ON(LLIST_NODE_SZ > 8);

	/*
	 * Remember bpf_mem_cache that allocated this object.
	 * The hint is not accurate.
	 */
	c->tgt = *(struct bpf_mem_cache **)node;

	local_irq_save(irq_flags);
	if (local_inc_return(&c->active) == 1) {
		__llist_add(node, &c->free_llist);
		held = ++c->free_cnt;
	} else {
		/* unit_free() cannot fail. Therefore add an object to atomic
		 * llist. free_bulk() will drain it. Though free_llist_extra is
		 * a per-cpu list we have to use atomic llist_add here, since
		 * it also can be interrupted by bpf nmi prog that does another
		 * unit_free() into the same free_llist_extra.
		 */
		llist_add(node, &c->free_llist_extra);
	}
	local_dec(&c->active);
	local_irq_restore(irq_flags);

	if (held > c->high_watermark)
		/* free few objects from current cpu into global kmalloc pool */
		irq_work_raise(c);
}
2023-07-05 20:34:45 -07:00
/* Queue the object behind @ptr on the free_by_rcu list of @c. When the
 * 'active' counter shows the cache is already being accessed, the object
 * goes to the atomic free_llist_extra_rcu list instead. irq_work is raised
 * unless call_rcu_in_progress is already set.
 */
static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
{
	struct llist_node *node = ptr - LLIST_NODE_SZ;
	unsigned long irq_flags;

	/* Same inaccurate "allocating cache" hint as in unit_free(). */
	c->tgt = *(struct bpf_mem_cache **)node;

	local_irq_save(irq_flags);
	if (local_inc_return(&c->active) == 1) {
		/* Track the tail when __llist_add() returns true. */
		if (__llist_add(node, &c->free_by_rcu))
			c->free_by_rcu_tail = node;
	} else {
		llist_add(node, &c->free_llist_extra_rcu);
	}
	local_dec(&c->active);
	local_irq_restore(irq_flags);

	if (!atomic_read(&c->call_rcu_in_progress))
		irq_work_raise(c);
}
2022-09-02 14:10:43 -07:00
/* Called from BPF program or from sys_bpf syscall.
 * In both cases migration is disabled.
 *
 * Allocate @size bytes from the per-cpu bucket matching
 * @size + LLIST_NODE_SZ. Returns ZERO_SIZE_PTR for @size == 0 and NULL when
 * no bucket fits or the bucket has no free object available.
 */
void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
{
	void *obj;
	int bucket;

	if (!size)
		return ZERO_SIZE_PTR;

	bucket = bpf_mem_cache_idx(size + LLIST_NODE_SZ);
	if (bucket < 0)
		return NULL;

	obj = unit_alloc(this_cpu_ptr(ma->caches)->cache + bucket);
	if (!obj)
		return NULL;

	/* Skip the llist_node header that precedes the user data. */
	return obj + LLIST_NODE_SZ;
}
/* Return @ptr, obtained from bpf_mem_alloc(), to the current cpu's bucket.
 * The bucket is chosen from ksize() of the underlying object. NULL and
 * objects with no matching bucket are silently ignored.
 */
void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
{
	int bucket;

	if (!ptr)
		return;

	bucket = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
	if (bucket >= 0)
		unit_free(this_cpu_ptr(ma->caches)->cache + bucket, ptr);
}
2023-07-05 20:34:45 -07:00
/* Same as bpf_mem_free(), except that the object is handed to
 * unit_free_rcu() rather than unit_free().
 */
void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
	int bucket;

	if (!ptr)
		return;

	bucket = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
	if (bucket >= 0)
		unit_free_rcu(this_cpu_ptr(ma->caches)->cache + bucket, ptr);
}
2022-09-02 14:10:43 -07:00
/* Allocate one object from the single-size cache of the current cpu.
 * Returns a pointer just past the llist_node header, or NULL when no free
 * object is available.
 */
void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
{
	void *obj = unit_alloc(this_cpu_ptr(ma->cache));

	if (!obj)
		return NULL;
	return obj + LLIST_NODE_SZ;
}
/* Give @ptr back to the single-size cache of the current cpu.
 * A NULL @ptr is ignored.
 */
void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
{
	if (ptr)
		unit_free(this_cpu_ptr(ma->cache), ptr);
}
2023-03-22 14:52:42 -07:00
2023-07-05 20:34:45 -07:00
/* Like bpf_mem_cache_free(), but the object goes through unit_free_rcu().
 * A NULL @ptr is ignored.
 */
void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
	if (ptr)
		unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
}
2023-03-22 14:52:42 -07:00
/* Directly does a kfree() without putting 'ptr' back to the free_llist
 * for reuse and without waiting for a rcu_tasks_trace gp.
 * The caller must first go through the rcu_tasks_trace gp for 'ptr'
 * before calling bpf_mem_cache_raw_free().
 * It could be used when the rcu_tasks_trace callback does not have
 * a hold on the original bpf_mem_alloc object that allocated the
 * 'ptr'. This should only be used in the uncommon code path.
 * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
 * and may affect performance.
 */
void bpf_mem_cache_raw_free(void *ptr)
{
	/* The kmalloc'ed object starts LLIST_NODE_SZ bytes before 'ptr'. */
	if (ptr)
		kfree(ptr - LLIST_NODE_SZ);
}
/* When flags == GFP_KERNEL, it signals that the caller will not cause
 * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
 * kmalloc if the free_llist is empty.
 *
 * Returns a pointer just past the llist_node header, or NULL on failure.
 */
void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
{
	struct bpf_mem_cache *c = this_cpu_ptr(ma->cache);
	void *obj = unit_alloc(c);

	if (!obj && flags == GFP_KERNEL) {
		struct mem_cgroup *memcg, *old_memcg;

		/* Allocate with the cache's memcg set as the active one. */
		memcg = get_memcg(c);
		old_memcg = set_active_memcg(memcg);
		obj = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
		set_active_memcg(old_memcg);
		mem_cgroup_put(memcg);
	}

	if (!obj)
		return NULL;
	return obj + LLIST_NODE_SZ;
}
2023-09-08 21:39:20 +08:00
static __init int bpf_mem_cache_adjust_size ( void )
{
2023-09-28 18:15:58 +08:00
unsigned int size ;
2023-09-08 21:39:20 +08:00
2023-09-28 18:15:58 +08:00
/* Adjusting the indexes in size_index() according to the object_size
* of underlying slab cache , so bpf_mem_alloc ( ) will select a
* bpf_mem_cache with unit_size equal to the object_size of
* the underlying slab cache .
*
* The maximal value of KMALLOC_MIN_SIZE and __kmalloc_minalign ( ) is
* 256 - bytes , so only do adjustment for [ 8 - bytes , 192 - bytes ] .
2023-09-08 21:39:20 +08:00
*/
2023-09-28 18:15:58 +08:00
for ( size = 192 ; size > = 8 ; size - = 8 ) {
unsigned int kmalloc_size , index ;
2023-09-08 21:39:20 +08:00
2023-09-28 18:15:58 +08:00
kmalloc_size = kmalloc_size_roundup ( size ) ;
if ( kmalloc_size = = size )
continue ;
2023-09-08 21:39:20 +08:00
2023-09-28 18:15:58 +08:00
if ( kmalloc_size < = 192 )
index = size_index [ ( kmalloc_size - 1 ) / 8 ] ;
else
index = fls ( kmalloc_size - 1 ) - 1 ;
/* Only overwrite if necessary */
if ( size_index [ ( size - 1 ) / 8 ] ! = index )
2023-09-08 21:39:20 +08:00
size_index [ ( size - 1 ) / 8 ] = index ;
}
return 0 ;
}
subsys_initcall ( bpf_mem_cache_adjust_size ) ;