2022-09-03 00:10:43 +03:00
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
# include <linux/mm.h>
# include <linux/llist.h>
# include <linux/bpf.h>
# include <linux/irq_work.h>
# include <linux/bpf_mem_alloc.h>
# include <linux/memcontrol.h>
# include <asm/local.h>
/* Any context (including NMI) BPF specific memory allocator.
*
* Tracing BPF programs can attach to kprobe and fentry . Hence they
* run in unknown context where calling plain kmalloc ( ) might not be safe .
*
* Front - end kmalloc ( ) with per - cpu per - bucket cache of free elements .
* Refill this cache asynchronously from irq_work .
*
* CPU_0 buckets
* 16 32 64 96 128 192 256 512 1024 2048 4096
* . . .
* CPU_N buckets
* 16 32 64 96 128 192 256 512 1024 2048 4096
*
* The buckets are prefilled at the start .
* BPF programs always run with migration disabled .
* It ' s safe to allocate from cache of the current cpu with irqs disabled .
* Free - ing is always done into bucket of the current cpu as well .
* irq_work trims extra free elements from buckets with kfree
* and refills them with kmalloc , so global kmalloc logic takes care
* of freeing objects allocated by one cpu and freed on another .
*
* Every allocated object is padded with an extra 8 bytes that contain
* struct llist_node.
*/
/* Bytes taken by the struct llist_node that accompanies every allocated object. */
#define LLIST_NODE_SZ sizeof(struct llist_node)
/*
 * Size-to-bucket lookup for requests of up to 192 bytes, one entry per
 * 8-byte step: entry (size - 1) / 8 holds the 1-based bucket number that
 * bpf_mem_cache_idx() turns into an index by subtracting one.
 *
 * Modelled on kmalloc's table, but there is no 8-byte bucket here, so the
 * first entry points at the same bucket as the 16-byte one.
 */
static u8 size_index[24] __ro_after_init = {
	3, 3, 4, 4,	/*   8 ..  32 */
	5, 5, 5, 5,	/*  40 ..  64 */
	1, 1, 1, 1,	/*  72 ..  96 */
	6, 6, 6, 6,	/* 104 .. 128 */
	2, 2, 2, 2,	/* 136 .. 160 */
	2, 2, 2, 2,	/* 168 .. 192 */
};
/*
 * Translate an allocation size into a bucket index.
 *
 * Returns -1 when @size is zero or larger than 4096. Sizes up to 192 are
 * looked up in size_index[]; anything larger lands in a power-of-two bucket
 * whose index is derived from the highest set bit of (size - 1).
 */
static int bpf_mem_cache_idx(size_t size)
{
	int idx;

	if (size == 0 || size > 4096)
		return -1;

	if (size > 192)
		idx = fls(size - 1) - 2;
	else
		idx = size_index[(size - 1) / 8] - 1;

	return idx;
}
/* Number of size buckets; dimensions both sizes[] and bpf_mem_caches::cache[]. */
#define NUM_CACHES 11
struct bpf_mem_cache {
/* per-cpu list of free objects of size 'unit_size'.
* All accesses are done with interrupts disabled and ' active ' counter
* protection with __llist_add ( ) and __llist_del_first ( ) .
*/
struct llist_head free_llist ;
local_t active ;
/* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
* are sequenced by per - cpu ' active ' counter . But unit_free ( ) cannot
* fail . When ' active ' is busy the unit_free ( ) will add an object to
* free_llist_extra .
*/
struct llist_head free_llist_extra ;
struct irq_work refill_work ;
struct obj_cgroup * objcg ;
int unit_size ;
/* count of objects in free_llist */
int free_cnt ;
2022-09-03 00:10:50 +03:00
int low_watermark , high_watermark , batch ;
2022-09-03 00:10:57 +03:00
int percpu_size ;
2023-07-06 06:34:40 +03:00
bool draining ;
2023-07-06 06:34:41 +03:00
struct bpf_mem_cache * tgt ;
2022-09-03 00:10:51 +03:00
2023-07-06 06:34:45 +03:00
/* list of objects to be freed after RCU GP */
struct llist_head free_by_rcu ;
struct llist_node * free_by_rcu_tail ;
struct llist_head waiting_for_gp ;
struct llist_node * waiting_for_gp_tail ;
struct rcu_head rcu ;
atomic_t call_rcu_in_progress ;
struct llist_head free_llist_extra_rcu ;
2023-07-06 06:34:34 +03:00
/* list of objects to be freed after RCU tasks trace GP */
struct llist_head free_by_rcu_ttrace ;
struct llist_head waiting_for_gp_ttrace ;
struct rcu_head rcu_ttrace ;
atomic_t call_rcu_ttrace_in_progress ;
2022-09-03 00:10:43 +03:00
} ;
/* The full set of size buckets: one bpf_mem_cache per supported size. */
struct bpf_mem_caches {
	struct bpf_mem_cache cache[NUM_CACHES];
};
bpf: Allow per unit prefill for non-fix-size percpu memory allocator
Commit 41a5db8d8161 ("Add support for non-fix-size percpu mem allocation")
added support for non-fix-size percpu memory allocation.
Such allocation will allocate percpu memory for all buckets on all
cpus and the memory consumption is on the order of quadratic.
For example, let us say, 4 cpus, unit size 16 bytes, so each
cpu has 16 * 4 = 64 bytes, with 4 cpus, total will be 64 * 4 = 256 bytes.
Then let us say, 8 cpus with the same unit size, each cpu
has 16 * 8 = 128 bytes, with 8 cpus, total will be 128 * 8 = 1024 bytes.
So if the number of cpus doubles, the number of memory consumption
will be 4 times. So for a system with large number of cpus, the
memory consumption goes up quickly with quadratic order.
For example, for 4KB percpu allocation, 128 cpus. The total memory
consumption will be 4KB * 128 * 128 = 64MB. Things will become
worse if the number of cpus is bigger (e.g., 512, 1024, etc.)
In Commit 41a5db8d8161, the non-fix-size percpu memory allocation is
done in boot time, so for system with large number of cpus, the initial
percpu memory consumption is very visible. For example, for 128 cpu
system, the total percpu memory allocation will be at least
(16 + 32 + 64 + 96 + 128 + 196 + 256 + 512 + 1024 + 2048 + 4096)
* 128 * 128 = ~138MB.
which is pretty big. It will be even bigger for larger number of cpus.
Note that the current prefill also allocates 4 entries if the unit size
is less than 256. So on top of 138MB memory consumption, this will
add more consumption with
3 * (16 + 32 + 64 + 96 + 128 + 196 + 256) * 128 * 128 = ~38MB.
Next patch will try to reduce this memory consumption.
Later on, Commit 1fda5bb66ad8 ("bpf: Do not allocate percpu memory
at init stage") moved the non-fix-size percpu memory allocation
to bpf verificaiton stage. Once a particular bpf_percpu_obj_new()
is called by bpf program, the memory allocator will try to fill in
the cache with all sizes, causing the same amount of percpu memory
consumption as in the boot stage.
To reduce the initial percpu memory consumption for non-fix-size
percpu memory allocation, instead of filling the cache with all
supported allocation sizes, this patch intends to fill the cache
only for the requested size. As typically users will not use large
percpu data structure, this can save memory significantly.
For example, the allocation size is 64 bytes with 128 cpus.
Then total percpu memory amount will be 64 * 128 * 128 = 1MB,
much less than previous 138MB.
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231222031745.1289082-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2023-12-22 06:17:45 +03:00
/*
 * Object size of each bucket, indexed by the value bpf_mem_cache_idx()
 * returns. The 96 and 192 byte buckets come first because size_index[]
 * numbers them 1 and 2; the remaining entries are the power-of-two sizes
 * in ascending order.
 */
static const u16 sizes[NUM_CACHES] = {
	96, 192,
	16, 32, 64, 128, 256, 512, 1024, 2048, 4096,
};
2022-09-03 00:10:43 +03:00
/*
 * Detach and return the first node of @head, or NULL if the list is empty.
 * Uses plain loads and stores, so the caller must already have exclusive
 * access to the list.
 */
static struct llist_node notrace *__llist_del_first(struct llist_head *head)
{
	struct llist_node *first = head->first;

	if (first)
		head->first = first->next;

	return first;
}
2023-03-23 00:52:42 +03:00
/*
 * Allocate one backing object for cache @c on NUMA node @node with @flags.
 *
 * Regular caches get 'unit_size' zeroed bytes from kmalloc_node(). Per-cpu
 * caches (c->percpu_size != 0) get a 'percpu_size' header from
 * kmalloc_node() plus a per-cpu area of 'unit_size' bytes whose pointer is
 * stored in slot 1 of the header.
 *
 * Returns the object, or NULL if any allocation failed.
 */
static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
{
	void **hdr;
	void *pcpu;

	if (!c->percpu_size) {
		/*
		 * A freed element may be reused immediately while a lookup
		 * still accesses special fields of the old map value (e.g.
		 * bpf_spin_lock). Zero the object here so users such as htab
		 * do not have to re-initialize those fields on reuse. The
		 * per-cpu path needs no __GFP_ZERO because
		 * __alloc_percpu_gfp() zeroes implicitly.
		 */
		return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node);
	}

	hdr = kmalloc_node(c->percpu_size, flags, node);
	pcpu = __alloc_percpu_gfp(c->unit_size, 8, flags);
	if (hdr && pcpu) {
		hdr[1] = pcpu;
		return hdr;
	}

	/* Either half may have succeeded; release whichever did. */
	free_percpu(pcpu);
	kfree(hdr);
	return NULL;
}
/*
 * Pick the memory cgroup that allocations for @c are charged to: the one
 * behind c->objcg when CONFIG_MEMCG_KMEM is enabled and an objcg is set,
 * otherwise root_mem_cgroup (or NULL when CONFIG_MEMCG is disabled).
 *
 * NOTE(review): get_mem_cgroup_from_objcg() presumably returns a reference
 * that the caller drops with mem_cgroup_put() -- confirm.
 */
static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
{
	struct mem_cgroup *memcg = NULL;

#ifdef CONFIG_MEMCG
	memcg = root_mem_cgroup;
#endif
#ifdef CONFIG_MEMCG_KMEM
	if (c->objcg)
		memcg = get_mem_cgroup_from_objcg(c->objcg);
#endif
	return memcg;
}
2023-07-06 06:34:38 +03:00
static void inc_active ( struct bpf_mem_cache * c , unsigned long * flags )
2023-07-06 06:34:37 +03:00
{
if ( IS_ENABLED ( CONFIG_PREEMPT_RT ) )
/* In RT irq_work runs in per-cpu kthread, so disable
* interrupts to avoid preemption and interrupts and
* reduce the chance of bpf prog executing on this cpu
* when active counter is busy .
*/
2023-07-06 06:34:38 +03:00
local_irq_save ( * flags ) ;
2023-07-06 06:34:37 +03:00
/* alloc_bulk runs from irq_work which will not preempt a bpf
* program that does unit_alloc / unit_free since IRQs are
* disabled there . There is no race to increment ' active '
* counter . It protects free_llist from corruption in case NMI
* bpf prog preempted this loop .
*/
WARN_ON_ONCE ( local_inc_return ( & c - > active ) ! = 1 ) ;
2023-07-06 06:34:38 +03:00
}
2023-07-25 23:26:40 +03:00
static void dec_active ( struct bpf_mem_cache * c , unsigned long * flags )
2023-07-06 06:34:38 +03:00
{
2023-07-06 06:34:37 +03:00
local_dec ( & c - > active ) ;
if ( IS_ENABLED ( CONFIG_PREEMPT_RT ) )
2023-07-25 23:26:40 +03:00
local_irq_restore ( * flags ) ;
2023-07-06 06:34:37 +03:00
}
2023-07-06 06:34:38 +03:00
static void add_obj_to_free_list ( struct bpf_mem_cache * c , void * obj )
{
unsigned long flags ;
inc_active ( c , & flags ) ;
__llist_add ( obj , & c - > free_llist ) ;
c - > free_cnt + + ;
2023-07-25 23:26:40 +03:00
dec_active ( c , & flags ) ;
2023-07-06 06:34:38 +03:00
}
2022-09-03 00:10:43 +03:00
/* Mostly runs from irq_work except __init phase. */
bpf: Non-atomically allocate freelist during prefill
In internal testing of test_maps, we sometimes observed failures like:
test_maps: test_maps.c:173: void test_hashmap_percpu(unsigned int, void *):
Assertion `bpf_map_update_elem(fd, &key, value, BPF_ANY) == 0' failed.
where the errno is ENOMEM. After some troubleshooting and enabling
the warnings, we saw:
[ 91.304708] percpu: allocation failed, size=8 align=8 atomic=1, atomic alloc failed, no space left
[ 91.304716] CPU: 51 PID: 24145 Comm: test_maps Kdump: loaded Tainted: G N 6.1.38-smp-DEV #7
[ 91.304719] Hardware name: Google Astoria/astoria, BIOS 0.20230627.0-0 06/27/2023
[ 91.304721] Call Trace:
[ 91.304724] <TASK>
[ 91.304730] [<ffffffffa7ef83b9>] dump_stack_lvl+0x59/0x88
[ 91.304737] [<ffffffffa7ef83f8>] dump_stack+0x10/0x18
[ 91.304738] [<ffffffffa75caa0c>] pcpu_alloc+0x6fc/0x870
[ 91.304741] [<ffffffffa75ca302>] __alloc_percpu_gfp+0x12/0x20
[ 91.304743] [<ffffffffa756785e>] alloc_bulk+0xde/0x1e0
[ 91.304746] [<ffffffffa7566c02>] bpf_mem_alloc_init+0xd2/0x2f0
[ 91.304747] [<ffffffffa7547c69>] htab_map_alloc+0x479/0x650
[ 91.304750] [<ffffffffa751d6e0>] map_create+0x140/0x2e0
[ 91.304752] [<ffffffffa751d413>] __sys_bpf+0x5a3/0x6c0
[ 91.304753] [<ffffffffa751c3ec>] __x64_sys_bpf+0x1c/0x30
[ 91.304754] [<ffffffffa7ef847a>] do_syscall_64+0x5a/0x80
[ 91.304756] [<ffffffffa800009b>] entry_SYSCALL_64_after_hwframe+0x63/0xcd
This makes sense, because in atomic context, percpu allocation would
not create new chunks; it would only create in non-atomic contexts.
And if during prefill all percpu chunks are full, -ENOMEM would
happen immediately upon next unit_alloc.
Prefill phase does not actually run in atomic context, so we can
use this fact to allocate non-atomically with GFP_KERNEL instead
of GFP_NOWAIT. This avoids the immediate -ENOMEM.
GFP_NOWAIT has to be used in unit_alloc when bpf program runs
in atomic context. Even if bpf program runs in non-atomic context,
in most cases, rcu read lock is enabled for the program so
GFP_NOWAIT is still needed. This is often also the case for
BPF_MAP_UPDATE_ELEM syscalls.
Signed-off-by: YiFei Zhu <zhuyifei@google.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20230728043359.3324347-1-zhuyifei@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2023-07-28 07:33:59 +03:00
/*
 * Add up to @cnt objects to c->free_llist.
 *
 * Objects are taken, in this order, from c->free_by_rcu_ttrace, from
 * c->waiting_for_gp_ttrace and finally from fresh allocations on @node.
 * The function stops early when a fresh allocation fails.
 *
 * @atomic selects GFP_NOWAIT; otherwise GFP_KERNEL is used. The prefill
 * phase does not run in atomic context, and allocating non-atomically
 * there lets the percpu allocator create new chunks instead of failing
 * with -ENOMEM when the existing ones are full.
 */
static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
{
	struct mem_cgroup *memcg, *old_memcg;
	gfp_t gfp = __GFP_NOWARN | __GFP_ACCOUNT;
	void *obj;
	int i = 0;

	gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL;

	/*
	 * For every 'c', llist_del_first(&c->free_by_rcu_ttrace) is done by
	 * one CPU only, the current one. Other CPUs might llist_add() and
	 * llist_del_all() in parallel.
	 */
	while (i < cnt) {
		obj = llist_del_first(&c->free_by_rcu_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
		i++;
	}
	if (i >= cnt)
		return;

	/* Next, take back objects that are still waiting for a grace period. */
	while (i < cnt) {
		obj = llist_del_first(&c->waiting_for_gp_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
		i++;
	}
	if (i >= cnt)
		return;

	memcg = get_memcg(c);
	old_memcg = set_active_memcg(memcg);
	while (i < cnt) {
		/*
		 * Allocate without depleting the atomic reserves the way a
		 * typical GFP_ATOMIC would. irq_work runs on this cpu, so
		 * kmalloc allocates from the current numa node, which is
		 * what we want here.
		 */
		obj = __alloc(c, node, gfp);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
		i++;
	}
	set_active_memcg(old_memcg);
	mem_cgroup_put(memcg);
}
2023-06-06 06:53:08 +03:00
/*
 * Release a single object. For a per-cpu object (@percpu) the per-cpu area
 * referenced from slot 1 of the header is freed first, then the header.
 */
static void free_one(void *obj, bool percpu)
{
	if (percpu)
		free_percpu(((void **)obj)[1]);

	kfree(obj);
}
2023-07-06 06:34:36 +03:00
/*
 * Free every object on the detached chain starting at @llnode.
 * Returns the number of objects freed.
 */
static int free_all(struct llist_node *llnode, bool percpu)
{
	int freed = 0;

	while (llnode) {
		/* Fetch the successor before the node's memory is released. */
		struct llist_node *next = llnode->next;

		free_one(llnode, percpu);
		freed++;
		llnode = next;
	}

	return freed;
}
static void __free_rcu ( struct rcu_head * head )
{
2023-07-06 06:34:34 +03:00
struct bpf_mem_cache * c = container_of ( head , struct bpf_mem_cache , rcu_ttrace ) ;
2023-06-06 06:53:08 +03:00
2023-07-06 06:34:34 +03:00
free_all ( llist_del_all ( & c - > waiting_for_gp_ttrace ) , ! ! c - > percpu_size ) ;
atomic_set ( & c - > call_rcu_ttrace_in_progress , 0 ) ;
2022-09-03 00:10:51 +03:00
}
2022-09-03 00:10:55 +03:00
/*
 * RCU tasks trace callback. When such a grace period does not also imply a
 * regular RCU grace period, chain a call_rcu() before freeing; otherwise
 * free right away.
 */
static void __free_rcu_tasks_trace(struct rcu_head *head)
{
	if (!rcu_trace_implies_rcu_gp()) {
		call_rcu(head, __free_rcu);
		return;
	}

	__free_rcu(head);
}
2022-09-03 00:10:51 +03:00
static void enque_to_free ( struct bpf_mem_cache * c , void * obj )
{
struct llist_node * llnode = obj ;
/* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
2023-07-06 06:34:34 +03:00
* Nothing races to add to free_by_rcu_ttrace list .
2022-09-03 00:10:51 +03:00
*/
2023-07-06 06:34:41 +03:00
llist_add ( llnode , & c - > free_by_rcu_ttrace ) ;
2022-09-03 00:10:51 +03:00
}
2023-07-06 06:34:34 +03:00
static void do_call_rcu_ttrace ( struct bpf_mem_cache * c )
2022-09-03 00:10:51 +03:00
{
struct llist_node * llnode , * t ;
2023-07-06 06:34:41 +03:00
if ( atomic_xchg ( & c - > call_rcu_ttrace_in_progress , 1 ) ) {
if ( unlikely ( READ_ONCE ( c - > draining ) ) ) {
llnode = llist_del_all ( & c - > free_by_rcu_ttrace ) ;
free_all ( llnode , ! ! c - > percpu_size ) ;
}
2022-09-03 00:10:51 +03:00
return ;
2023-07-06 06:34:41 +03:00
}
2022-09-03 00:10:51 +03:00
2023-07-06 06:34:34 +03:00
WARN_ON_ONCE ( ! llist_empty ( & c - > waiting_for_gp_ttrace ) ) ;
2023-07-06 06:34:41 +03:00
llist_for_each_safe ( llnode , t , llist_del_all ( & c - > free_by_rcu_ttrace ) )
2023-07-06 06:34:42 +03:00
llist_add ( llnode , & c - > waiting_for_gp_ttrace ) ;
2023-07-06 06:34:40 +03:00
if ( unlikely ( READ_ONCE ( c - > draining ) ) ) {
__free_rcu ( & c - > rcu_ttrace ) ;
return ;
}
2022-09-03 00:10:55 +03:00
/* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
2022-10-14 14:39:44 +03:00
* If RCU Tasks Trace grace period implies RCU grace period , free
* these elements directly , else use call_rcu ( ) to wait for normal
* progs to finish and finally do free_one ( ) on each element .
2022-09-03 00:10:55 +03:00
*/
2023-07-06 06:34:34 +03:00
call_rcu_tasks_trace ( & c - > rcu_ttrace , __free_rcu_tasks_trace ) ;
2022-09-03 00:10:51 +03:00
}
2022-09-03 00:10:43 +03:00
static void free_bulk ( struct bpf_mem_cache * c )
{
2023-07-06 06:34:41 +03:00
struct bpf_mem_cache * tgt = c - > tgt ;
2022-09-03 00:10:43 +03:00
struct llist_node * llnode , * t ;
unsigned long flags ;
int cnt ;
2023-07-06 06:34:41 +03:00
WARN_ON_ONCE ( tgt - > unit_size ! = c - > unit_size ) ;
2023-10-21 04:49:59 +03:00
WARN_ON_ONCE ( tgt - > percpu_size ! = c - > percpu_size ) ;
2023-07-06 06:34:41 +03:00
2022-09-03 00:10:43 +03:00
do {
2023-07-06 06:34:38 +03:00
inc_active ( c , & flags ) ;
2022-09-03 00:10:43 +03:00
llnode = __llist_del_first ( & c - > free_llist ) ;
if ( llnode )
cnt = - - c - > free_cnt ;
else
cnt = 0 ;
2023-07-25 23:26:40 +03:00
dec_active ( c , & flags ) ;
2022-09-19 17:48:11 +03:00
if ( llnode )
2023-07-06 06:34:41 +03:00
enque_to_free ( tgt , llnode ) ;
2022-09-03 00:10:50 +03:00
} while ( cnt > ( c - > high_watermark + c - > low_watermark ) / 2 ) ;
2022-09-03 00:10:43 +03:00
/* and drain free_llist_extra */
llist_for_each_safe ( llnode , t , llist_del_all ( & c - > free_llist_extra ) )
2023-07-06 06:34:41 +03:00
enque_to_free ( tgt , llnode ) ;
do_call_rcu_ttrace ( tgt ) ;
2022-09-03 00:10:43 +03:00
}
2023-07-06 06:34:45 +03:00
static void __free_by_rcu ( struct rcu_head * head )
{
struct bpf_mem_cache * c = container_of ( head , struct bpf_mem_cache , rcu ) ;
struct bpf_mem_cache * tgt = c - > tgt ;
struct llist_node * llnode ;
2023-10-21 04:49:59 +03:00
WARN_ON_ONCE ( tgt - > unit_size ! = c - > unit_size ) ;
WARN_ON_ONCE ( tgt - > percpu_size ! = c - > percpu_size ) ;
2023-07-06 06:34:45 +03:00
llnode = llist_del_all ( & c - > waiting_for_gp ) ;
if ( ! llnode )
goto out ;
llist_add_batch ( llnode , c - > waiting_for_gp_tail , & tgt - > free_by_rcu_ttrace ) ;
/* Objects went through regular RCU GP. Send them to RCU tasks trace */
do_call_rcu_ttrace ( tgt ) ;
out :
atomic_set ( & c - > call_rcu_in_progress , 0 ) ;
}
static void check_free_by_rcu ( struct bpf_mem_cache * c )
{
struct llist_node * llnode , * t ;
unsigned long flags ;
/* drain free_llist_extra_rcu */
if ( unlikely ( ! llist_empty ( & c - > free_llist_extra_rcu ) ) ) {
inc_active ( c , & flags ) ;
llist_for_each_safe ( llnode , t , llist_del_all ( & c - > free_llist_extra_rcu ) )
if ( __llist_add ( llnode , & c - > free_by_rcu ) )
c - > free_by_rcu_tail = llnode ;
2023-07-25 23:26:40 +03:00
dec_active ( c , & flags ) ;
2023-07-06 06:34:45 +03:00
}
if ( llist_empty ( & c - > free_by_rcu ) )
return ;
if ( atomic_xchg ( & c - > call_rcu_in_progress , 1 ) ) {
/*
* Instead of kmalloc - ing new rcu_head and triggering 10 k
* call_rcu ( ) to hit rcutree . qhimark and force RCU to notice
* the overload just ask RCU to hurry up . There could be many
* objects in free_by_rcu list .
* This hint reduces memory consumption for an artificial
* benchmark from 2 Gbyte to 150 Mbyte .
*/
rcu_request_urgent_qs_task ( current ) ;
return ;
}
WARN_ON_ONCE ( ! llist_empty ( & c - > waiting_for_gp ) ) ;
inc_active ( c , & flags ) ;
WRITE_ONCE ( c - > waiting_for_gp . first , __llist_del_all ( & c - > free_by_rcu ) ) ;
c - > waiting_for_gp_tail = c - > free_by_rcu_tail ;
2023-07-25 23:26:40 +03:00
dec_active ( c , & flags ) ;
2023-07-06 06:34:45 +03:00
if ( unlikely ( READ_ONCE ( c - > draining ) ) ) {
free_all ( llist_del_all ( & c - > waiting_for_gp ) , ! ! c - > percpu_size ) ;
atomic_set ( & c - > call_rcu_in_progress , 0 ) ;
} else {
call_rcu_hurry ( & c - > rcu , __free_by_rcu ) ;
}
}
2022-09-03 00:10:43 +03:00
static void bpf_mem_refill ( struct irq_work * work )
{
struct bpf_mem_cache * c = container_of ( work , struct bpf_mem_cache , refill_work ) ;
int cnt ;
/* Racy access to free_cnt. It doesn't need to be 100% accurate */
cnt = c - > free_cnt ;
2022-09-03 00:10:50 +03:00
if ( cnt < c - > low_watermark )
2022-09-03 00:10:43 +03:00
/* irq_work runs on this cpu and kmalloc will allocate
* from the current numa node which is what we want here .
*/
bpf: Non-atomically allocate freelist during prefill
In internal testing of test_maps, we sometimes observed failures like:
test_maps: test_maps.c:173: void test_hashmap_percpu(unsigned int, void *):
Assertion `bpf_map_update_elem(fd, &key, value, BPF_ANY) == 0' failed.
where the errno is ENOMEM. After some troubleshooting and enabling
the warnings, we saw:
[ 91.304708] percpu: allocation failed, size=8 align=8 atomic=1, atomic alloc failed, no space left
[ 91.304716] CPU: 51 PID: 24145 Comm: test_maps Kdump: loaded Tainted: G N 6.1.38-smp-DEV #7
[ 91.304719] Hardware name: Google Astoria/astoria, BIOS 0.20230627.0-0 06/27/2023
[ 91.304721] Call Trace:
[ 91.304724] <TASK>
[ 91.304730] [<ffffffffa7ef83b9>] dump_stack_lvl+0x59/0x88
[ 91.304737] [<ffffffffa7ef83f8>] dump_stack+0x10/0x18
[ 91.304738] [<ffffffffa75caa0c>] pcpu_alloc+0x6fc/0x870
[ 91.304741] [<ffffffffa75ca302>] __alloc_percpu_gfp+0x12/0x20
[ 91.304743] [<ffffffffa756785e>] alloc_bulk+0xde/0x1e0
[ 91.304746] [<ffffffffa7566c02>] bpf_mem_alloc_init+0xd2/0x2f0
[ 91.304747] [<ffffffffa7547c69>] htab_map_alloc+0x479/0x650
[ 91.304750] [<ffffffffa751d6e0>] map_create+0x140/0x2e0
[ 91.304752] [<ffffffffa751d413>] __sys_bpf+0x5a3/0x6c0
[ 91.304753] [<ffffffffa751c3ec>] __x64_sys_bpf+0x1c/0x30
[ 91.304754] [<ffffffffa7ef847a>] do_syscall_64+0x5a/0x80
[ 91.304756] [<ffffffffa800009b>] entry_SYSCALL_64_after_hwframe+0x63/0xcd
This makes sense, because in atomic context, percpu allocation would
not create new chunks; it would only create in non-atomic contexts.
And if during prefill all precpu chunks are full, -ENOMEM would
happen immediately upon next unit_alloc.
Prefill phase does not actually run in atomic context, so we can
use this fact to allocate non-atomically with GFP_KERNEL instead
of GFP_NOWAIT. This avoids the immediate -ENOMEM.
GFP_NOWAIT has to be used in unit_alloc when bpf program runs
in atomic context. Even if bpf program runs in non-atomic context,
in most cases, rcu read lock is enabled for the program so
GFP_NOWAIT is still needed. This is often also the case for
BPF_MAP_UPDATE_ELEM syscalls.
Signed-off-by: YiFei Zhu <zhuyifei@google.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20230728043359.3324347-1-zhuyifei@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2023-07-28 07:33:59 +03:00
alloc_bulk ( c , c - > batch , NUMA_NO_NODE , true ) ;
2022-09-03 00:10:50 +03:00
else if ( cnt > c - > high_watermark )
2022-09-03 00:10:43 +03:00
free_bulk ( c ) ;
2023-07-06 06:34:45 +03:00
check_free_by_rcu ( c ) ;
2022-09-03 00:10:43 +03:00
}
/* Queue the refill irq_work that belongs to @c.
 *
 * Kept notrace, like the other allocator helpers in this file, so that bpf
 * programs cannot attach to it.
 */
static void notrace irq_work_raise(struct bpf_mem_cache *c)
{
	struct irq_work *refill = &c->refill_work;

	irq_work_queue(refill);
}
2022-09-03 00:10:50 +03:00
/* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket
* the freelist cache will be elem_size * 64 ( or less ) on each cpu .
*
* For bpf programs that don ' t have statically known allocation sizes and
* assuming ( low_mark + high_mark ) / 2 as an average number of elements per
* bucket and all buckets are used the total amount of memory in freelists
* on each cpu will be :
* 64 * 16 + 64 * 32 + 64 * 64 + 64 * 96 + 64 * 128 + 64 * 192 + 64 * 256 + 32 * 512 + 16 * 1024 + 8 * 2048 + 4 * 4096
* = = ~ 116 Kbyte using below heuristic .
* Initialized , but unused bpf allocator ( not bpf map specific one ) will
* consume ~ 11 Kbyte per cpu .
* Typical case will be between 11 K and 116 K closer to 11 K .
* bpf progs can and should share bpf_mem_cache when possible .
2023-12-22 06:17:55 +03:00
*
* Percpu allocation is typically rare . To avoid potential unnecessary large
* memory consumption , set low_mark = 1 and high_mark = 3 , resulting in c - > batch = 1.
2022-09-03 00:10:50 +03:00
*/
2023-09-08 16:39:21 +03:00
static void init_refill_work ( struct bpf_mem_cache * c )
2022-09-03 00:10:43 +03:00
{
init_irq_work ( & c - > refill_work , bpf_mem_refill ) ;
2023-12-22 06:17:55 +03:00
if ( c - > percpu_size ) {
c - > low_watermark = 1 ;
c - > high_watermark = 3 ;
} else if ( c - > unit_size < = 256 ) {
2022-09-03 00:10:50 +03:00
c - > low_watermark = 32 ;
c - > high_watermark = 96 ;
} else {
/* When page_size == 4k, order-0 cache will have low_mark == 2
* and high_mark = = 6 with batch alloc of 3 individual pages at
* a time .
* 8 k allocs and above low = = 1 , high = = 3 , batch = = 1.
*/
c - > low_watermark = max ( 32 * 256 / c - > unit_size , 1 ) ;
c - > high_watermark = max ( 96 * 256 / c - > unit_size , 3 ) ;
}
c - > batch = max ( ( c - > high_watermark - c - > low_watermark ) / 4 * 3 , 1 ) ;
2023-09-08 16:39:21 +03:00
}
2022-09-03 00:10:50 +03:00
2023-09-08 16:39:21 +03:00
static void prefill_mem_cache ( struct bpf_mem_cache * c , int cpu )
{
2023-12-22 06:17:50 +03:00
int cnt = 1 ;
/* To avoid consuming memory, for non-percpu allocation, assume that
* 1 st run of bpf prog won ' t be doing more than 4 map_update_elem from
* irq disabled region if unit size is less than or equal to 256.
* For all other cases , let us just do one allocation .
2022-09-03 00:10:43 +03:00
*/
2023-12-22 06:17:50 +03:00
if ( ! c - > percpu_size & & c - > unit_size < = 256 )
cnt = 4 ;
alloc_bulk ( c , cnt , cpu_to_node ( cpu ) , false ) ;
2022-09-03 00:10:43 +03:00
}
2022-09-03 00:10:57 +03:00
/* When size != 0, allocate one bpf_mem_cache for each cpu.
2022-09-03 00:10:43 +03:00
* This is typical bpf hash map use case when all elements have equal size .
*
* When size = = 0 allocate 11 bpf_mem_cache - s for each cpu , then rely on
* kmalloc / kfree . Max allocation size is 4096 in this case .
* This is bpf_dynptr and bpf_kptr use case .
*/
2022-09-03 00:10:52 +03:00
/* Set up @ma either as a single fixed-size cache per cpu (size != 0) or as
 * NUM_CACHES size buckets per cpu (size == 0).
 *
 * @ma:     allocator to initialise
 * @size:   object size for the fixed-size mode, 0 for the bucketed mode
 * @percpu: objects are per-cpu allocations; requires a non-zero @size
 *
 * Every cache is prefilled before returning.
 *
 * Return: 0 on success, -EINVAL for @percpu with @size == 0, -ENOMEM when
 * the per-cpu cache descriptors cannot be allocated.
 */
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
{
	struct bpf_mem_caches *cc, __percpu *pcc;
	struct bpf_mem_cache *c, __percpu *pc;
	struct obj_cgroup *objcg = NULL;
	int cpu, i, unit_size, percpu_size = 0;

	if (percpu && size == 0)
		return -EINVAL;

	/* room for llist_node and per-cpu pointer */
	if (percpu)
		percpu_size = LLIST_NODE_SZ + sizeof(void *);
	ma->percpu = percpu;

	if (!size) {
		/* Bucketed mode: one bpf_mem_caches (NUM_CACHES buckets)
		 * per cpu, bucket i serving objects of sizes[i] bytes.
		 */
		pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
		if (!pcc)
			return -ENOMEM;
#ifdef CONFIG_MEMCG_KMEM
		objcg = get_obj_cgroup_from_current();
#endif
		ma->objcg = objcg;
		for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(pcc, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
				c = &cc->cache[i];
				c->unit_size = sizes[i];
				c->objcg = objcg;
				c->percpu_size = percpu_size;
				c->tgt = c;

				init_refill_work(c);
				prefill_mem_cache(c, cpu);
			}
		}

		ma->caches = pcc;
		return 0;
	}

	/* Fixed-size mode: a single bpf_mem_cache per cpu. */
	pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
	if (!pc)
		return -ENOMEM;

	unit_size = size;
	if (!percpu)
		unit_size += LLIST_NODE_SZ; /* room for llist_node */

#ifdef CONFIG_MEMCG_KMEM
	if (memcg_bpf_enabled())
		objcg = get_obj_cgroup_from_current();
#endif
	ma->objcg = objcg;

	for_each_possible_cpu(cpu) {
		c = per_cpu_ptr(pc, cpu);
		c->unit_size = unit_size;
		c->objcg = objcg;
		c->percpu_size = percpu_size;
		c->tgt = c;

		init_refill_work(c);
		prefill_mem_cache(c, cpu);
	}

	ma->cache = pc;
	return 0;
}
bpf: Allow per unit prefill for non-fix-size percpu memory allocator
Commit 41a5db8d8161 ("Add support for non-fix-size percpu mem allocation")
added support for non-fix-size percpu memory allocation.
Such allocation will allocate percpu memory for all buckets on all
cpus, so the memory consumption grows quadratically with the number of cpus.
For example, let us say, 4 cpus, unit size 16 bytes, so each
cpu has 16 * 4 = 64 bytes, with 4 cpus, total will be 64 * 4 = 256 bytes.
Then let us say, 8 cpus with the same unit size, each cpu
has 16 * 8 = 128 bytes, with 8 cpus, total will be 128 * 8 = 1024 bytes.
So if the number of cpus doubles, the memory consumption
becomes 4 times larger. So for a system with a large number of cpus, the
memory consumption grows quickly, in quadratic order.
For example, for 4KB percpu allocation, 128 cpus. The total memory
consumption will be 4KB * 128 * 128 = 64MB. Things will become
worse if the number of cpus is bigger (e.g., 512, 1024, etc.)
In Commit 41a5db8d8161, the non-fix-size percpu memory allocation is
done in boot time, so for system with large number of cpus, the initial
percpu memory consumption is very visible. For example, for 128 cpu
system, the total percpu memory allocation will be at least
(16 + 32 + 64 + 96 + 128 + 196 + 256 + 512 + 1024 + 2048 + 4096)
* 128 * 128 = ~138MB.
which is pretty big. It will be even bigger for larger number of cpus.
Note that the current prefill also allocates 4 entries if the unit size
is less than 256. So on top of 138MB memory consumption, this will
add more consumption with
3 * (16 + 32 + 64 + 96 + 128 + 196 + 256) * 128 * 128 = ~38MB.
Next patch will try to reduce this memory consumption.
Later on, Commit 1fda5bb66ad8 ("bpf: Do not allocate percpu memory
at init stage") moved the non-fix-size percpu memory allocation
to the bpf verification stage. Once a particular bpf_percpu_obj_new()
is called by bpf program, the memory allocator will try to fill in
the cache with all sizes, causing the same amount of percpu memory
consumption as in the boot stage.
To reduce the initial percpu memory consumption for non-fix-size
percpu memory allocation, instead of filling the cache with all
supported allocation sizes, this patch intends to fill the cache
only for the requested size. As typically users will not use large
percpu data structure, this can save memory significantly.
For example, the allocation size is 64 bytes with 128 cpus.
Then total percpu memory amount will be 64 * 128 * 128 = 1MB,
much less than previous 138MB.
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231222031745.1289082-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2023-12-22 06:17:45 +03:00
int bpf_mem_alloc_percpu_init ( struct bpf_mem_alloc * ma , struct obj_cgroup * objcg )
{
struct bpf_mem_caches __percpu * pcc ;
pcc = __alloc_percpu_gfp ( sizeof ( struct bpf_mem_caches ) , 8 , GFP_KERNEL ) ;
if ( ! pcc )
return - ENOMEM ;
ma - > caches = pcc ;
ma - > objcg = objcg ;
ma - > percpu = true ;
return 0 ;
}
int bpf_mem_alloc_percpu_unit_init ( struct bpf_mem_alloc * ma , int size )
{
struct bpf_mem_caches * cc , __percpu * pcc ;
int cpu , i , unit_size , percpu_size ;
struct obj_cgroup * objcg ;
struct bpf_mem_cache * c ;
i = bpf_mem_cache_idx ( size ) ;
if ( i < 0 )
return - EINVAL ;
/* room for llist_node and per-cpu pointer */
percpu_size = LLIST_NODE_SZ + sizeof ( void * ) ;
unit_size = sizes [ i ] ;
objcg = ma - > objcg ;
pcc = ma - > caches ;
for_each_possible_cpu ( cpu ) {
cc = per_cpu_ptr ( pcc , cpu ) ;
c = & cc - > cache [ i ] ;
2024-01-04 19:57:44 +03:00
if ( c - > unit_size )
bpf: Allow per unit prefill for non-fix-size percpu memory allocator
Commit 41a5db8d8161 ("Add support for non-fix-size percpu mem allocation")
added support for non-fix-size percpu memory allocation.
Such allocation will allocate percpu memory for all buckets on all
cpus and the memory consumption is in the order to quadratic.
For example, let us say, 4 cpus, unit size 16 bytes, so each
cpu has 16 * 4 = 64 bytes, with 4 cpus, total will be 64 * 4 = 256 bytes.
Then let us say, 8 cpus with the same unit size, each cpu
has 16 * 8 = 128 bytes, with 8 cpus, total will be 128 * 8 = 1024 bytes.
So if the number of cpus doubles, the number of memory consumption
will be 4 times. So for a system with large number of cpus, the
memory consumption goes up quickly with quadratic order.
For example, for 4KB percpu allocation, 128 cpus. The total memory
consumption will 4KB * 128 * 128 = 64MB. Things will become
worse if the number of cpus is bigger (e.g., 512, 1024, etc.)
In Commit 41a5db8d8161, the non-fix-size percpu memory allocation is
done in boot time, so for system with large number of cpus, the initial
percpu memory consumption is very visible. For example, for 128 cpu
system, the total percpu memory allocation will be at least
(16 + 32 + 64 + 96 + 128 + 196 + 256 + 512 + 1024 + 2048 + 4096)
* 128 * 128 = ~138MB.
which is pretty big. It will be even bigger for larger number of cpus.
Note that the current prefill also allocates 4 entries if the unit size
is less than 256. So on top of 138MB memory consumption, this will
add more consumption with
3 * (16 + 32 + 64 + 96 + 128 + 196 + 256) * 128 * 128 = ~38MB.
Next patch will try to reduce this memory consumption.
Later on, Commit 1fda5bb66ad8 ("bpf: Do not allocate percpu memory
at init stage") moved the non-fix-size percpu memory allocation
to bpf verificaiton stage. Once a particular bpf_percpu_obj_new()
is called by bpf program, the memory allocator will try to fill in
the cache with all sizes, causing the same amount of percpu memory
consumption as in the boot stage.
To reduce the initial percpu memory consumption for non-fix-size
percpu memory allocation, instead of filling the cache with all
supported allocation sizes, this patch intends to fill the cache
only for the requested size. As typically users will not use large
percpu data structure, this can save memory significantly.
For example, the allocation size is 64 bytes with 128 cpus.
Then total percpu memory amount will be 64 * 128 * 128 = 1MB,
much less than previous 138MB.
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231222031745.1289082-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2023-12-22 06:17:45 +03:00
break ;
c - > unit_size = unit_size ;
c - > objcg = objcg ;
c - > percpu_size = percpu_size ;
c - > tgt = c ;
init_refill_work ( c ) ;
prefill_mem_cache ( c , cpu ) ;
}
return 0 ;
}
2022-09-03 00:10:43 +03:00
static void drain_mem_cache ( struct bpf_mem_cache * c )
{
2023-06-06 06:53:08 +03:00
bool percpu = ! ! c - > percpu_size ;
2022-09-03 00:10:43 +03:00
2022-09-03 00:10:58 +03:00
/* No progs are using this bpf_mem_cache, but htab_map_free() called
* bpf_mem_cache_free ( ) for all remaining elements and they can be in
2023-07-06 06:34:34 +03:00
* free_by_rcu_ttrace or in waiting_for_gp_ttrace lists , so drain those lists now .
2022-10-21 14:49:13 +03:00
*
2023-07-06 06:34:34 +03:00
* Except for waiting_for_gp_ttrace list , there are no concurrent operations
2022-10-21 14:49:13 +03:00
* on these lists , so it is safe to use __llist_del_all ( ) .
2022-09-03 00:10:51 +03:00
*/
2023-07-06 06:34:41 +03:00
free_all ( llist_del_all ( & c - > free_by_rcu_ttrace ) , percpu ) ;
2023-07-06 06:34:34 +03:00
free_all ( llist_del_all ( & c - > waiting_for_gp_ttrace ) , percpu ) ;
2023-06-06 06:53:08 +03:00
free_all ( __llist_del_all ( & c - > free_llist ) , percpu ) ;
free_all ( __llist_del_all ( & c - > free_llist_extra ) , percpu ) ;
2023-07-06 06:34:45 +03:00
free_all ( __llist_del_all ( & c - > free_by_rcu ) , percpu ) ;
free_all ( __llist_del_all ( & c - > free_llist_extra_rcu ) , percpu ) ;
free_all ( llist_del_all ( & c - > waiting_for_gp ) , percpu ) ;
2022-09-03 00:10:43 +03:00
}
2023-07-06 06:34:47 +03:00
static void check_mem_cache ( struct bpf_mem_cache * c )
{
WARN_ON_ONCE ( ! llist_empty ( & c - > free_by_rcu_ttrace ) ) ;
WARN_ON_ONCE ( ! llist_empty ( & c - > waiting_for_gp_ttrace ) ) ;
WARN_ON_ONCE ( ! llist_empty ( & c - > free_llist ) ) ;
WARN_ON_ONCE ( ! llist_empty ( & c - > free_llist_extra ) ) ;
WARN_ON_ONCE ( ! llist_empty ( & c - > free_by_rcu ) ) ;
WARN_ON_ONCE ( ! llist_empty ( & c - > free_llist_extra_rcu ) ) ;
WARN_ON_ONCE ( ! llist_empty ( & c - > waiting_for_gp ) ) ;
}
static void check_leaked_objs ( struct bpf_mem_alloc * ma )
{
struct bpf_mem_caches * cc ;
struct bpf_mem_cache * c ;
int cpu , i ;
if ( ma - > cache ) {
for_each_possible_cpu ( cpu ) {
c = per_cpu_ptr ( ma - > cache , cpu ) ;
check_mem_cache ( c ) ;
}
}
if ( ma - > caches ) {
for_each_possible_cpu ( cpu ) {
cc = per_cpu_ptr ( ma - > caches , cpu ) ;
for ( i = 0 ; i < NUM_CACHES ; i + + ) {
c = & cc - > cache [ i ] ;
check_mem_cache ( c ) ;
}
}
}
}
2022-09-03 00:10:58 +03:00
static void free_mem_alloc_no_barrier ( struct bpf_mem_alloc * ma )
{
2023-07-06 06:34:47 +03:00
check_leaked_objs ( ma ) ;
2022-09-03 00:10:58 +03:00
free_percpu ( ma - > cache ) ;
free_percpu ( ma - > caches ) ;
ma - > cache = NULL ;
ma - > caches = NULL ;
}
/* Release @ma after waiting for RCU callbacks that may still be running.
 *
 * The waiting_for_gp[_ttrace] lists were already drained, but callbacks
 * queued earlier can still execute, so they have to finish before the
 * per-cpu areas go away.
 *
 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
 * but both barriers are only used here to wait for the pending
 * __free_rcu_tasks_trace() and __free_rcu(). If call_rcu(head, __free_rcu)
 * is skipped because rcu_trace_implies_rcu_gp() holds, the matching
 * rcu_barrier() can be skipped on the same condition.
 */
static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
	/* wait for __free_by_rcu */
	rcu_barrier();

	/* wait for __free_rcu */
	rcu_barrier_tasks_trace();

	if (!rcu_trace_implies_rcu_gp())
		rcu_barrier();

	free_mem_alloc_no_barrier(ma);
}
/* Work handler queued by destroy_mem_alloc(): runs the barrier-based
 * teardown on the bpf_mem_alloc that embeds @work, then kfree()s that
 * bpf_mem_alloc itself.
 */
static void free_mem_alloc_deferred(struct work_struct *work)
{
	struct bpf_mem_alloc *owner;

	owner = container_of(work, struct bpf_mem_alloc, work);
	free_mem_alloc(owner);
	kfree(owner);
}
static void destroy_mem_alloc ( struct bpf_mem_alloc * ma , int rcu_in_progress )
{
struct bpf_mem_alloc * copy ;
if ( ! rcu_in_progress ) {
/* Fast path. No callbacks are pending, hence no need to do
* rcu_barrier - s .
*/
free_mem_alloc_no_barrier ( ma ) ;
return ;
}
2023-07-06 06:34:35 +03:00
copy = kmemdup ( ma , sizeof ( * ma ) , GFP_KERNEL ) ;
2022-09-03 00:10:58 +03:00
if ( ! copy ) {
/* Slow path with inline barrier-s */
free_mem_alloc ( ma ) ;
return ;
}
/* Defer barriers into worker to let the rest of map memory to be freed */
2023-07-06 06:34:35 +03:00
memset ( ma , 0 , sizeof ( * ma ) ) ;
2022-09-03 00:10:58 +03:00
INIT_WORK ( & copy - > work , free_mem_alloc_deferred ) ;
queue_work ( system_unbound_wq , & copy - > work ) ;
}
2022-09-03 00:10:43 +03:00
void bpf_mem_alloc_destroy ( struct bpf_mem_alloc * ma )
{
struct bpf_mem_caches * cc ;
struct bpf_mem_cache * c ;
2022-09-03 00:10:58 +03:00
int cpu , i , rcu_in_progress ;
2022-09-03 00:10:43 +03:00
if ( ma - > cache ) {
2022-09-03 00:10:58 +03:00
rcu_in_progress = 0 ;
2022-09-03 00:10:43 +03:00
for_each_possible_cpu ( cpu ) {
c = per_cpu_ptr ( ma - > cache , cpu ) ;
2023-07-06 06:34:40 +03:00
WRITE_ONCE ( c - > draining , true ) ;
2022-10-21 14:49:12 +03:00
irq_work_sync ( & c - > refill_work ) ;
2022-09-03 00:10:43 +03:00
drain_mem_cache ( c ) ;
2023-07-06 06:34:34 +03:00
rcu_in_progress + = atomic_read ( & c - > call_rcu_ttrace_in_progress ) ;
2023-07-06 06:34:45 +03:00
rcu_in_progress + = atomic_read ( & c - > call_rcu_in_progress ) ;
2022-09-03 00:10:43 +03:00
}
2023-12-22 06:17:39 +03:00
if ( ma - > objcg )
obj_cgroup_put ( ma - > objcg ) ;
2022-09-03 00:10:58 +03:00
destroy_mem_alloc ( ma , rcu_in_progress ) ;
2022-09-03 00:10:43 +03:00
}
if ( ma - > caches ) {
2022-09-03 00:10:58 +03:00
rcu_in_progress = 0 ;
2022-09-03 00:10:43 +03:00
for_each_possible_cpu ( cpu ) {
cc = per_cpu_ptr ( ma - > caches , cpu ) ;
for ( i = 0 ; i < NUM_CACHES ; i + + ) {
c = & cc - > cache [ i ] ;
2023-07-06 06:34:40 +03:00
WRITE_ONCE ( c - > draining , true ) ;
2022-10-21 14:49:12 +03:00
irq_work_sync ( & c - > refill_work ) ;
2022-09-03 00:10:43 +03:00
drain_mem_cache ( c ) ;
2023-07-06 06:34:34 +03:00
rcu_in_progress + = atomic_read ( & c - > call_rcu_ttrace_in_progress ) ;
2023-07-06 06:34:45 +03:00
rcu_in_progress + = atomic_read ( & c - > call_rcu_in_progress ) ;
2022-09-03 00:10:43 +03:00
}
}
2023-12-22 06:17:39 +03:00
if ( ma - > objcg )
obj_cgroup_put ( ma - > objcg ) ;
2022-09-03 00:10:58 +03:00
destroy_mem_alloc ( ma , rcu_in_progress ) ;
2022-09-03 00:10:43 +03:00
}
}
/* notrace is necessary here and in other functions to make sure
* bpf programs cannot attach to them and cause llist corruptions .
*/
/* Pop one object from the free list of @c.
 *
 * Return: the llist_node at the head of c->free_llist, or NULL when the
 * list is empty or the 'active' counter shows the cache is already in use
 * on this cpu. On success a pointer to @c is stored in the first
 * pointer-sized slot of the returned node.
 */
static void notrace *unit_alloc(struct bpf_mem_cache *c)
{
	struct llist_node *node = NULL;
	unsigned long irq_flags;
	int remaining = 0;

	/* Interrupts are disabled to rule out, for most prog types:
	 *   prog_A
	 *     bpf_mem_alloc
	 *       preemption or irq -> prog_B
	 *         bpf_mem_alloc
	 *
	 * prog_B may still be a perf_event NMI prog, so the per-cpu
	 * 'active' counter additionally orders free_list access between
	 * unit_alloc/unit_free/bpf_mem_refill.
	 */
	local_irq_save(irq_flags);
	if (local_inc_return(&c->active) == 1) {
		node = __llist_del_first(&c->free_llist);
		if (node) {
			c->free_cnt--;
			remaining = c->free_cnt;
			/* remember which cache handed out this object */
			*(struct bpf_mem_cache **)node = c;
		}
	}
	local_dec(&c->active);

	WARN_ON(remaining < 0);

	if (remaining < c->low_watermark)
		irq_work_raise(c);

	/* Enable IRQ only after the irq work has been queued, so that the
	 * irq work runs once IRQ is enabled and free_llist may be refilled
	 * before another task preempts the current one.
	 */
	local_irq_restore(irq_flags);

	return node;
}
/* Though 'ptr' object could have been allocated on a different cpu
 * add it to the free_llist of the current cpu.
 * Let kfree() logic deal with it when it's later called from irq_work.
 *
 * 'ptr' is the pointer that was handed to the user, i.e. LLIST_NODE_SZ
 * bytes past the start of the element; the element start is recovered
 * by subtracting LLIST_NODE_SZ.
 */
static void notrace unit_free ( struct bpf_mem_cache * c , void * ptr )
{
struct llist_node * llnode = ptr - LLIST_NODE_SZ ;
unsigned long flags ;
int cnt = 0 ;
/* Compile-time check that struct llist_node is at most 8 bytes. */
BUILD_BUG_ON ( LLIST_NODE_SZ > 8 ) ;
2023-07-06 06:34:41 +03:00
/*
 * Remember bpf_mem_cache that allocated this object.
 * The hint is not accurate.
 */
c - > tgt = * ( struct bpf_mem_cache * * ) llnode ;
2022-09-03 00:10:43 +03:00
local_irq_save ( flags ) ;
if ( local_inc_return ( & c - > active ) = = 1 ) {
__llist_add ( llnode , & c - > free_llist ) ;
cnt = + + c - > free_cnt ;
} else {
/* unit_free() cannot fail. Therefore add an object to atomic
 * llist. free_bulk() will drain it. Though free_llist_extra is
 * a per-cpu list we have to use atomic llist_add here, since
 * it also can be interrupted by bpf nmi prog that does another
 * unit_free() into the same free_llist_extra.
 */
llist_add ( llnode , & c - > free_llist_extra ) ;
}
local_dec ( & c - > active ) ;
2022-09-03 00:10:50 +03:00
/* cnt stays 0 on the free_llist_extra path, so only a direct add to
 * free_llist can request trimming here.
 */
if ( cnt > c - > high_watermark )
2022-09-03 00:10:43 +03:00
/* free few objects from current cpu into global kmalloc pool */
irq_work_raise ( c ) ;
2023-09-01 14:19:53 +03:00
/* Enable IRQ after irq_work_raise() completes, otherwise when current
 * task is preempted by task which does unit_alloc(), unit_alloc() may
 * return NULL unexpectedly because irq work is already pending but
 * cannot be triggered and free_llist cannot be refilled in time.
 */
local_irq_restore ( flags ) ;
2022-09-03 00:10:43 +03:00
}
2023-07-06 06:34:45 +03:00
/* Variant of unit_free() that queues the object on the cache's RCU lists
 * instead of putting it straight back on free_llist.
 *
 * 'ptr' is the user pointer; the element starts LLIST_NODE_SZ bytes
 * before it. The element goes onto c->free_by_rcu when this is the only
 * active user of the cache on this cpu, otherwise onto the atomic
 * c->free_llist_extra_rcu list. The code that later consumes these lists
 * is outside this function.
 */
static void notrace unit_free_rcu ( struct bpf_mem_cache * c , void * ptr )
{
struct llist_node * llnode = ptr - LLIST_NODE_SZ ;
unsigned long flags ;
/* Read back the cache pointer stored in the element's header word. */
c - > tgt = * ( struct bpf_mem_cache * * ) llnode ;
local_irq_save ( flags ) ;
if ( local_inc_return ( & c - > active ) = = 1 ) {
/* NOTE(review): __llist_add() presumably returns true when the list
 * was empty before the add, making this node the tail - confirm
 * against the llist API.
 */
if ( __llist_add ( llnode , & c - > free_by_rcu ) )
c - > free_by_rcu_tail = llnode ;
} else {
llist_add ( llnode , & c - > free_llist_extra_rcu ) ;
}
local_dec ( & c - > active ) ;
/* Raise the irq_work only when call_rcu_in_progress reads as zero. */
if ( ! atomic_read ( & c - > call_rcu_in_progress ) )
irq_work_raise ( c ) ;
2023-09-01 14:19:53 +03:00
local_irq_restore ( flags ) ;
2023-07-06 06:34:45 +03:00
}
2022-09-03 00:10:43 +03:00
/* Called from BPF program or from sys_bpf syscall.
 * In both cases migration is disabled.
 *
 * Allocates 'size' bytes from the size-bucketed caches (ma->caches) of
 * the current cpu. Unless ma->percpu is set, LLIST_NODE_SZ is added to
 * the request before the bucket is selected with bpf_mem_cache_idx().
 *
 * Returns a pointer LLIST_NODE_SZ bytes past the start of the element,
 * or NULL when size is zero, when no bucket fits the size, or when
 * unit_alloc() returns nothing.
 */
void notrace * bpf_mem_alloc ( struct bpf_mem_alloc * ma , size_t size )
{
int idx ;
void * ret ;
/* Zero-sized requests get NULL. */
if ( ! size )
bpf: Use c->unit_size to select target cache during free
At present, bpf memory allocator uses check_obj_size() to ensure that
ksize() of allocated pointer is equal with the unit_size of used
bpf_mem_cache. Its purpose is to prevent bpf_mem_free() from selecting
a bpf_mem_cache which has different unit_size compared with the
bpf_mem_cache used for allocation. But as reported by lkp, the return
value of ksize() or kmalloc_size_roundup() may change due to slab merge
and it will lead to the warning report in check_obj_size().
The reported warning happened as follows:
(1) in bpf_mem_cache_adjust_size(), kmalloc_size_roundup(96) returns the
object_size of kmalloc-96 instead of kmalloc-cg-96. The object_size of
kmalloc-96 is 96, so size_index for 96 is not adjusted accordingly.
(2) the object_size of kmalloc-cg-96 is adjust from 96 to 128 due to
slab merge in __kmem_cache_alias(). For SLAB, SLAB_HWCACHE_ALIGN is
enabled by default for kmalloc slab, so align is 64 and size is 128 for
kmalloc-cg-96. SLUB has a similar merge logic, but its object_size will
not be changed, because its align is 8 under x86-64.
(3) when unit_alloc() does kmalloc_node(96, __GFP_ACCOUNT, node),
ksize() returns 128 instead of 96 for the returned pointer.
(4) the warning in check_obj_size() is triggered.
Considering the slab merge can happen in anytime (e.g, a slab created in
a new module), the following case is also possible: during the
initialization of bpf_global_ma, there is no slab merge and ksize() for
a 96-bytes object returns 96. But after that a new slab created by a
kernel module is merged to kmalloc-cg-96 and the object_size of
kmalloc-cg-96 is adjust from 96 to 128 (which is possible for x86-64 +
CONFIG_SLAB, because its alignment requirement is 64 for 96-bytes slab).
So soon or later, when bpf_global_ma frees a 96-byte-sized pointer
which is allocated from bpf_mem_cache with unit_size=96, bpf_mem_free()
will free the pointer through a bpf_mem_cache in which unit_size is 128,
because the return value of ksize() changes. The warning for the
mismatch will be triggered again.
A feasible fix is introducing similar APIs compared with ksize() and
kmalloc_size_roundup() to return the actually-allocated size instead of
size which may change due to slab merge, but it will introduce
unnecessary dependency on the implementation details of mm subsystem.
As for now the pointer of bpf_mem_cache is saved in the 8-bytes area
(or 4-bytes under 32-bit host) above the returned pointer, using
unit_size in the saved bpf_mem_cache to select the target cache instead
of inferring the size from the pointer itself. Beside no extra
dependency on mm subsystem, the performance for bpf_mem_free_rcu() is
also improved as shown below.
Before applying the patch, the performances of bpf_mem_alloc() and
bpf_mem_free_rcu() on 8-CPUs VM with one producer are as follows:
kmalloc : alloc 11.69 ± 0.28M/s free 29.58 ± 0.93M/s
percpu : alloc 14.11 ± 0.52M/s free 14.29 ± 0.99M/s
After apply the patch, the performance for bpf_mem_free_rcu() increases
9% and 146% for kmalloc memory and per-cpu memory respectively:
kmalloc: alloc 11.01 ± 0.03M/s free 32.42 ± 0.48M/s
percpu: alloc 12.84 ± 0.12M/s free 35.24 ± 0.23M/s
After the fixes, there is no need to adjust size_index to fix the
mismatch between allocation and free, so remove it as well. Also return
NULL instead of ZERO_SIZE_PTR for zero-sized alloc in bpf_mem_alloc(),
because there is no bpf_mem_cache pointer saved above ZERO_SIZE_PTR.
Fixes: 9077fc228f09 ("bpf: Use kmalloc_size_roundup() to adjust size_index")
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/bpf/202310302113.9f8fe705-oliver.sang@intel.com
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231216131052.27621-2-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2023-12-16 16:10:51 +03:00
return NULL ;
2022-09-03 00:10:43 +03:00
2023-12-22 06:17:34 +03:00
/* Non-percpu allocators account for the header in the bucket size. */
if ( ! ma - > percpu )
size + = LLIST_NODE_SZ ;
idx = bpf_mem_cache_idx ( size ) ;
2022-09-03 00:10:43 +03:00
/* bpf_mem_cache_idx() yields a negative index for unsupported sizes. */
if ( idx < 0 )
return NULL ;
ret = unit_alloc ( this_cpu_ptr ( ma - > caches ) - > cache + idx ) ;
/* Skip the header so the caller never sees it. */
return ! ret ? NULL : ret + LLIST_NODE_SZ ;
}
/* Free an object obtained from bpf_mem_alloc().
 *
 * The bpf_mem_cache pointer stored LLIST_NODE_SZ bytes before 'ptr' is
 * read back and its unit_size selects the bucket index. The object is
 * then handed to unit_free() on the current cpu's bucket of that index.
 * A NULL 'ptr' is ignored; a negative index triggers WARN_ON_ONCE() and
 * the object is not freed.
 */
void notrace bpf_mem_free ( struct bpf_mem_alloc * ma , void * ptr )
{
bpf: Use c->unit_size to select target cache during free
At present, bpf memory allocator uses check_obj_size() to ensure that
ksize() of allocated pointer is equal with the unit_size of used
bpf_mem_cache. Its purpose is to prevent bpf_mem_free() from selecting
a bpf_mem_cache which has different unit_size compared with the
bpf_mem_cache used for allocation. But as reported by lkp, the return
value of ksize() or kmalloc_size_roundup() may change due to slab merge
and it will lead to the warning report in check_obj_size().
The reported warning happened as follows:
(1) in bpf_mem_cache_adjust_size(), kmalloc_size_roundup(96) returns the
object_size of kmalloc-96 instead of kmalloc-cg-96. The object_size of
kmalloc-96 is 96, so size_index for 96 is not adjusted accordingly.
(2) the object_size of kmalloc-cg-96 is adjust from 96 to 128 due to
slab merge in __kmem_cache_alias(). For SLAB, SLAB_HWCACHE_ALIGN is
enabled by default for kmalloc slab, so align is 64 and size is 128 for
kmalloc-cg-96. SLUB has a similar merge logic, but its object_size will
not be changed, because its align is 8 under x86-64.
(3) when unit_alloc() does kmalloc_node(96, __GFP_ACCOUNT, node),
ksize() returns 128 instead of 96 for the returned pointer.
(4) the warning in check_obj_size() is triggered.
Considering the slab merge can happen in anytime (e.g, a slab created in
a new module), the following case is also possible: during the
initialization of bpf_global_ma, there is no slab merge and ksize() for
a 96-bytes object returns 96. But after that a new slab created by a
kernel module is merged to kmalloc-cg-96 and the object_size of
kmalloc-cg-96 is adjust from 96 to 128 (which is possible for x86-64 +
CONFIG_SLAB, because its alignment requirement is 64 for 96-bytes slab).
So soon or later, when bpf_global_ma frees a 96-byte-sized pointer
which is allocated from bpf_mem_cache with unit_size=96, bpf_mem_free()
will free the pointer through a bpf_mem_cache in which unit_size is 128,
because the return value of ksize() changes. The warning for the
mismatch will be triggered again.
A feasible fix is introducing similar APIs compared with ksize() and
kmalloc_size_roundup() to return the actually-allocated size instead of
size which may change due to slab merge, but it will introduce
unnecessary dependency on the implementation details of mm subsystem.
As for now the pointer of bpf_mem_cache is saved in the 8-bytes area
(or 4-bytes under 32-bit host) above the returned pointer, using
unit_size in the saved bpf_mem_cache to select the target cache instead
of inferring the size from the pointer itself. Beside no extra
dependency on mm subsystem, the performance for bpf_mem_free_rcu() is
also improved as shown below.
Before applying the patch, the performances of bpf_mem_alloc() and
bpf_mem_free_rcu() on 8-CPUs VM with one producer are as follows:
kmalloc : alloc 11.69 ± 0.28M/s free 29.58 ± 0.93M/s
percpu : alloc 14.11 ± 0.52M/s free 14.29 ± 0.99M/s
After apply the patch, the performance for bpf_mem_free_rcu() increases
9% and 146% for kmalloc memory and per-cpu memory respectively:
kmalloc: alloc 11.01 ± 0.03M/s free 32.42 ± 0.48M/s
percpu: alloc 12.84 ± 0.12M/s free 35.24 ± 0.23M/s
After the fixes, there is no need to adjust size_index to fix the
mismatch between allocation and free, so remove it as well. Also return
NULL instead of ZERO_SIZE_PTR for zero-sized alloc in bpf_mem_alloc(),
because there is no bpf_mem_cache pointer saved above ZERO_SIZE_PTR.
Fixes: 9077fc228f09 ("bpf: Use kmalloc_size_roundup() to adjust size_index")
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/bpf/202310302113.9f8fe705-oliver.sang@intel.com
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231216131052.27621-2-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2023-12-16 16:10:51 +03:00
struct bpf_mem_cache * c ;
2022-09-03 00:10:43 +03:00
int idx ;
if ( ! ptr )
return ;
bpf: Use c->unit_size to select target cache during free
At present, bpf memory allocator uses check_obj_size() to ensure that
ksize() of allocated pointer is equal with the unit_size of used
bpf_mem_cache. Its purpose is to prevent bpf_mem_free() from selecting
a bpf_mem_cache which has different unit_size compared with the
bpf_mem_cache used for allocation. But as reported by lkp, the return
value of ksize() or kmalloc_size_roundup() may change due to slab merge
and it will lead to the warning report in check_obj_size().
The reported warning happened as follows:
(1) in bpf_mem_cache_adjust_size(), kmalloc_size_roundup(96) returns the
object_size of kmalloc-96 instead of kmalloc-cg-96. The object_size of
kmalloc-96 is 96, so size_index for 96 is not adjusted accordingly.
(2) the object_size of kmalloc-cg-96 is adjust from 96 to 128 due to
slab merge in __kmem_cache_alias(). For SLAB, SLAB_HWCACHE_ALIGN is
enabled by default for kmalloc slab, so align is 64 and size is 128 for
kmalloc-cg-96. SLUB has a similar merge logic, but its object_size will
not be changed, because its align is 8 under x86-64.
(3) when unit_alloc() does kmalloc_node(96, __GFP_ACCOUNT, node),
ksize() returns 128 instead of 96 for the returned pointer.
(4) the warning in check_obj_size() is triggered.
Considering the slab merge can happen in anytime (e.g, a slab created in
a new module), the following case is also possible: during the
initialization of bpf_global_ma, there is no slab merge and ksize() for
a 96-bytes object returns 96. But after that a new slab created by a
kernel module is merged to kmalloc-cg-96 and the object_size of
kmalloc-cg-96 is adjust from 96 to 128 (which is possible for x86-64 +
CONFIG_SLAB, because its alignment requirement is 64 for 96-bytes slab).
So soon or later, when bpf_global_ma frees a 96-byte-sized pointer
which is allocated from bpf_mem_cache with unit_size=96, bpf_mem_free()
will free the pointer through a bpf_mem_cache in which unit_size is 128,
because the return value of ksize() changes. The warning for the
mismatch will be triggered again.
A feasible fix is introducing similar APIs compared with ksize() and
kmalloc_size_roundup() to return the actually-allocated size instead of
size which may change due to slab merge, but it will introduce
unnecessary dependency on the implementation details of mm subsystem.
As for now the pointer of bpf_mem_cache is saved in the 8-bytes area
(or 4-bytes under 32-bit host) above the returned pointer, using
unit_size in the saved bpf_mem_cache to select the target cache instead
of inferring the size from the pointer itself. Beside no extra
dependency on mm subsystem, the performance for bpf_mem_free_rcu() is
also improved as shown below.
Before applying the patch, the performances of bpf_mem_alloc() and
bpf_mem_free_rcu() on 8-CPUs VM with one producer are as follows:
kmalloc : alloc 11.69 ± 0.28M/s free 29.58 ± 0.93M/s
percpu : alloc 14.11 ± 0.52M/s free 14.29 ± 0.99M/s
After apply the patch, the performance for bpf_mem_free_rcu() increases
9% and 146% for kmalloc memory and per-cpu memory respectively:
kmalloc: alloc 11.01 ± 0.03M/s free 32.42 ± 0.48M/s
percpu: alloc 12.84 ± 0.12M/s free 35.24 ± 0.23M/s
After the fixes, there is no need to adjust size_index to fix the
mismatch between allocation and free, so remove it as well. Also return
NULL instead of ZERO_SIZE_PTR for zero-sized alloc in bpf_mem_alloc(),
because there is no bpf_mem_cache pointer saved above ZERO_SIZE_PTR.
Fixes: 9077fc228f09 ("bpf: Use kmalloc_size_roundup() to adjust size_index")
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/bpf/202310302113.9f8fe705-oliver.sang@intel.com
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231216131052.27621-2-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2023-12-16 16:10:51 +03:00
/* The owning cache pointer sits in the header word before 'ptr'. */
c = * ( void * * ) ( ptr - LLIST_NODE_SZ ) ;
idx = bpf_mem_cache_idx ( c - > unit_size ) ;
if ( WARN_ON_ONCE ( idx < 0 ) )
2022-09-03 00:10:43 +03:00
return ;
/* Free into the current cpu's bucket, whichever cpu allocated it. */
unit_free ( this_cpu_ptr ( ma - > caches ) - > cache + idx , ptr ) ;
}
2023-07-06 06:34:45 +03:00
/* Counterpart of bpf_mem_free() that hands the object to unit_free_rcu().
 *
 * The bucket is selected the same way: the bpf_mem_cache pointer stored
 * LLIST_NODE_SZ bytes before 'ptr' is read back and its unit_size is
 * mapped to an index with bpf_mem_cache_idx(). A NULL 'ptr' is ignored;
 * a negative index triggers WARN_ON_ONCE() and the object is not freed.
 */
void notrace bpf_mem_free_rcu ( struct bpf_mem_alloc * ma , void * ptr )
{
bpf: Use c->unit_size to select target cache during free
At present, bpf memory allocator uses check_obj_size() to ensure that
ksize() of allocated pointer is equal with the unit_size of used
bpf_mem_cache. Its purpose is to prevent bpf_mem_free() from selecting
a bpf_mem_cache which has different unit_size compared with the
bpf_mem_cache used for allocation. But as reported by lkp, the return
value of ksize() or kmalloc_size_roundup() may change due to slab merge
and it will lead to the warning report in check_obj_size().
The reported warning happened as follows:
(1) in bpf_mem_cache_adjust_size(), kmalloc_size_roundup(96) returns the
object_size of kmalloc-96 instead of kmalloc-cg-96. The object_size of
kmalloc-96 is 96, so size_index for 96 is not adjusted accordingly.
(2) the object_size of kmalloc-cg-96 is adjust from 96 to 128 due to
slab merge in __kmem_cache_alias(). For SLAB, SLAB_HWCACHE_ALIGN is
enabled by default for kmalloc slab, so align is 64 and size is 128 for
kmalloc-cg-96. SLUB has a similar merge logic, but its object_size will
not be changed, because its align is 8 under x86-64.
(3) when unit_alloc() does kmalloc_node(96, __GFP_ACCOUNT, node),
ksize() returns 128 instead of 96 for the returned pointer.
(4) the warning in check_obj_size() is triggered.
Considering the slab merge can happen in anytime (e.g, a slab created in
a new module), the following case is also possible: during the
initialization of bpf_global_ma, there is no slab merge and ksize() for
a 96-bytes object returns 96. But after that a new slab created by a
kernel module is merged to kmalloc-cg-96 and the object_size of
kmalloc-cg-96 is adjust from 96 to 128 (which is possible for x86-64 +
CONFIG_SLAB, because its alignment requirement is 64 for 96-bytes slab).
So soon or later, when bpf_global_ma frees a 96-byte-sized pointer
which is allocated from bpf_mem_cache with unit_size=96, bpf_mem_free()
will free the pointer through a bpf_mem_cache in which unit_size is 128,
because the return value of ksize() changes. The warning for the
mismatch will be triggered again.
A feasible fix is introducing similar APIs compared with ksize() and
kmalloc_size_roundup() to return the actually-allocated size instead of
size which may change due to slab merge, but it will introduce
unnecessary dependency on the implementation details of mm subsystem.
As for now the pointer of bpf_mem_cache is saved in the 8-bytes area
(or 4-bytes under 32-bit host) above the returned pointer, using
unit_size in the saved bpf_mem_cache to select the target cache instead
of inferring the size from the pointer itself. Beside no extra
dependency on mm subsystem, the performance for bpf_mem_free_rcu() is
also improved as shown below.
Before applying the patch, the performances of bpf_mem_alloc() and
bpf_mem_free_rcu() on 8-CPUs VM with one producer are as follows:
kmalloc : alloc 11.69 ± 0.28M/s free 29.58 ± 0.93M/s
percpu : alloc 14.11 ± 0.52M/s free 14.29 ± 0.99M/s
After apply the patch, the performance for bpf_mem_free_rcu() increases
9% and 146% for kmalloc memory and per-cpu memory respectively:
kmalloc: alloc 11.01 ± 0.03M/s free 32.42 ± 0.48M/s
percpu: alloc 12.84 ± 0.12M/s free 35.24 ± 0.23M/s
After the fixes, there is no need to adjust size_index to fix the
mismatch between allocation and free, so remove it as well. Also return
NULL instead of ZERO_SIZE_PTR for zero-sized alloc in bpf_mem_alloc(),
because there is no bpf_mem_cache pointer saved above ZERO_SIZE_PTR.
Fixes: 9077fc228f09 ("bpf: Use kmalloc_size_roundup() to adjust size_index")
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/bpf/202310302113.9f8fe705-oliver.sang@intel.com
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231216131052.27621-2-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2023-12-16 16:10:51 +03:00
struct bpf_mem_cache * c ;
2023-07-06 06:34:45 +03:00
int idx ;
if ( ! ptr )
return ;
bpf: Use c->unit_size to select target cache during free
At present, bpf memory allocator uses check_obj_size() to ensure that
ksize() of allocated pointer is equal with the unit_size of used
bpf_mem_cache. Its purpose is to prevent bpf_mem_free() from selecting
a bpf_mem_cache which has different unit_size compared with the
bpf_mem_cache used for allocation. But as reported by lkp, the return
value of ksize() or kmalloc_size_roundup() may change due to slab merge
and it will lead to the warning report in check_obj_size().
The reported warning happened as follows:
(1) in bpf_mem_cache_adjust_size(), kmalloc_size_roundup(96) returns the
object_size of kmalloc-96 instead of kmalloc-cg-96. The object_size of
kmalloc-96 is 96, so size_index for 96 is not adjusted accordingly.
(2) the object_size of kmalloc-cg-96 is adjust from 96 to 128 due to
slab merge in __kmem_cache_alias(). For SLAB, SLAB_HWCACHE_ALIGN is
enabled by default for kmalloc slab, so align is 64 and size is 128 for
kmalloc-cg-96. SLUB has a similar merge logic, but its object_size will
not be changed, because its align is 8 under x86-64.
(3) when unit_alloc() does kmalloc_node(96, __GFP_ACCOUNT, node),
ksize() returns 128 instead of 96 for the returned pointer.
(4) the warning in check_obj_size() is triggered.
Considering the slab merge can happen in anytime (e.g, a slab created in
a new module), the following case is also possible: during the
initialization of bpf_global_ma, there is no slab merge and ksize() for
a 96-bytes object returns 96. But after that a new slab created by a
kernel module is merged to kmalloc-cg-96 and the object_size of
kmalloc-cg-96 is adjust from 96 to 128 (which is possible for x86-64 +
CONFIG_SLAB, because its alignment requirement is 64 for 96-bytes slab).
So soon or later, when bpf_global_ma frees a 96-byte-sized pointer
which is allocated from bpf_mem_cache with unit_size=96, bpf_mem_free()
will free the pointer through a bpf_mem_cache in which unit_size is 128,
because the return value of ksize() changes. The warning for the
mismatch will be triggered again.
A feasible fix is introducing similar APIs compared with ksize() and
kmalloc_size_roundup() to return the actually-allocated size instead of
size which may change due to slab merge, but it will introduce
unnecessary dependency on the implementation details of mm subsystem.
As for now the pointer of bpf_mem_cache is saved in the 8-bytes area
(or 4-bytes under 32-bit host) above the returned pointer, using
unit_size in the saved bpf_mem_cache to select the target cache instead
of inferring the size from the pointer itself. Beside no extra
dependency on mm subsystem, the performance for bpf_mem_free_rcu() is
also improved as shown below.
Before applying the patch, the performances of bpf_mem_alloc() and
bpf_mem_free_rcu() on 8-CPUs VM with one producer are as follows:
kmalloc : alloc 11.69 ± 0.28M/s free 29.58 ± 0.93M/s
percpu : alloc 14.11 ± 0.52M/s free 14.29 ± 0.99M/s
After apply the patch, the performance for bpf_mem_free_rcu() increases
9% and 146% for kmalloc memory and per-cpu memory respectively:
kmalloc: alloc 11.01 ± 0.03M/s free 32.42 ± 0.48M/s
percpu: alloc 12.84 ± 0.12M/s free 35.24 ± 0.23M/s
After the fixes, there is no need to adjust size_index to fix the
mismatch between allocation and free, so remove it as well. Also return
NULL instead of ZERO_SIZE_PTR for zero-sized alloc in bpf_mem_alloc(),
because there is no bpf_mem_cache pointer saved above ZERO_SIZE_PTR.
Fixes: 9077fc228f09 ("bpf: Use kmalloc_size_roundup() to adjust size_index")
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/bpf/202310302113.9f8fe705-oliver.sang@intel.com
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231216131052.27621-2-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2023-12-16 16:10:51 +03:00
/* The owning cache pointer sits in the header word before 'ptr'. */
c = * ( void * * ) ( ptr - LLIST_NODE_SZ ) ;
idx = bpf_mem_cache_idx ( c - > unit_size ) ;
if ( WARN_ON_ONCE ( idx < 0 ) )
2023-07-06 06:34:45 +03:00
return ;
unit_free_rcu ( this_cpu_ptr ( ma - > caches ) - > cache + idx , ptr ) ;
}
2022-09-03 00:10:43 +03:00
/* Allocate one object from the current cpu's instance of the single
 * fixed-size cache (ma->cache).
 *
 * Returns a pointer LLIST_NODE_SZ bytes past the element that
 * unit_alloc() handed out, or NULL when unit_alloc() returned nothing.
 */
void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
{
	void *obj = unit_alloc(this_cpu_ptr(ma->cache));

	if (!obj)
		return NULL;

	/* Hide the header from the caller. */
	return obj + LLIST_NODE_SZ;
}
/* Return 'ptr' to the current cpu's instance of the single fixed-size
 * cache (ma->cache) through unit_free(). A NULL 'ptr' is a no-op.
 */
void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
{
	if (ptr)
		unit_free(this_cpu_ptr(ma->cache), ptr);
}
2023-03-23 00:52:42 +03:00
2023-07-06 06:34:45 +03:00
/* Same as bpf_mem_cache_free(), except that the object is passed to
 * unit_free_rcu() instead of unit_free(). A NULL 'ptr' is a no-op.
 */
void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
	if (ptr)
		unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
}
2023-03-23 00:52:42 +03:00
/* kfree() 'ptr' right away: the object is neither put back on a
 * free_llist for reuse nor held for a rcu_tasks_trace grace period.
 *
 * The caller is responsible for having gone through the
 * rcu_tasks_trace gp for 'ptr' before calling this function.
 * Intended for the case where the rcu_tasks_trace callback has no hold
 * on the bpf_mem_alloc object that allocated 'ptr'. Keep it to uncommon
 * code paths: objects freed this way cannot refill the bpf_mem_alloc's
 * free_llist, which may affect performance.
 *
 * A NULL 'ptr' is a no-op.
 */
void bpf_mem_cache_raw_free(void *ptr)
{
	void *elem;

	if (ptr) {
		/* The element starts LLIST_NODE_SZ bytes before 'ptr'. */
		elem = ptr - LLIST_NODE_SZ;
		kfree(elem);
	}
}
/* When flags == GFP_KERNEL, it signals that the caller will not cause
 * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
 * kmalloc if the free_llist is empty.
 *
 * The current cpu's cache (ma->cache) is tried first via unit_alloc().
 * Only when that returns NULL and flags is exactly GFP_KERNEL does the
 * function fall back to __alloc() on the same cache.
 *
 * Returns a pointer LLIST_NODE_SZ bytes past the start of the element,
 * or NULL when no object could be obtained.
 */
void notrace * bpf_mem_cache_alloc_flags ( struct bpf_mem_alloc * ma , gfp_t flags )
{
struct bpf_mem_cache * c ;
void * ret ;
c = this_cpu_ptr ( ma - > cache ) ;
ret = unit_alloc ( c ) ;
if ( ! ret & & flags = = GFP_KERNEL ) {
struct mem_cgroup * memcg , * old_memcg ;
/* Make the cache's memcg the active one around the allocation. */
memcg = get_memcg ( c ) ;
old_memcg = set_active_memcg ( memcg ) ;
ret = __alloc ( c , NUMA_NO_NODE , GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT ) ;
2023-11-11 07:38:21 +03:00
/* Store the owning cache in the header word, as unit_alloc() does. */
if ( ret )
* ( struct bpf_mem_cache * * ) ret = c ;
2023-03-23 00:52:42 +03:00
/* Restore the previous active memcg and drop the memcg reference. */
set_active_memcg ( old_memcg ) ;
mem_cgroup_put ( memcg ) ;
}
return ! ret ? NULL : ret + LLIST_NODE_SZ ;
}