// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
#include <linux/mm.h>
#include <linux/llist.h>
#include <linux/bpf.h>
#include <linux/irq_work.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
#include <asm/local.h>

/* Any context (including NMI) BPF specific memory allocator.
 *
 * Tracing BPF programs can attach to kprobe and fentry. Hence they
 * run in unknown context where calling plain kmalloc() might not be safe.
 *
 * Front-end kmalloc() with per-cpu per-bucket cache of free elements.
 * Refill this cache asynchronously from irq_work.
 *
 * CPU_0 buckets
 * 16 32 64 96 128 196 256 512 1024 2048 4096
 * ...
 * CPU_N buckets
 * 16 32 64 96 128 196 256 512 1024 2048 4096
 *
 * The buckets are prefilled at the start.
 * BPF programs always run with migration disabled.
 * It's safe to allocate from cache of the current cpu with irqs disabled.
 * Freeing is always done into the bucket of the current cpu as well.
 * irq_work trims extra free elements from buckets with kfree
 * and refills them with kmalloc, so global kmalloc logic takes care
 * of freeing objects allocated by one cpu and freed on another.
 *
 * Every allocated object is padded with extra 8 bytes that contains
 * struct llist_node.
 */
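
/* Minimal usage sketch (illustrative only; the element sizes below are
 * made-up examples and error handling is omitted). Callers are expected to
 * run with migration disabled, see bpf_mem_alloc() below.
 *
 * Fixed-size mode (one bucket, e.g. for a map with 64-byte elements):
 *
 *	struct bpf_mem_alloc ma;
 *	void *obj;
 *
 *	bpf_mem_alloc_init(&ma, 64, false);
 *	obj = bpf_mem_cache_alloc(&ma);
 *	bpf_mem_cache_free(&ma, obj);
 *	bpf_mem_alloc_destroy(&ma);
 *
 * size == 0 mode (11 kmalloc-like buckets, allocations up to 4096 bytes):
 *
 *	bpf_mem_alloc_init(&ma, 0, false);
 *	obj = bpf_mem_alloc(&ma, 100);	(served from the 128-byte bucket)
 *	bpf_mem_free(&ma, obj);
 *	bpf_mem_alloc_destroy(&ma);
 */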

#define LLIST_NODE_SZ sizeof(struct llist_node)

/* similar to kmalloc, but sizeof == 8 bucket is gone */
static u8 size_index[24] __ro_after_init = {
	3,	/* 8 */
	3,	/* 16 */
	4,	/* 24 */
	4,	/* 32 */
	5,	/* 40 */
	5,	/* 48 */
	5,	/* 56 */
	5,	/* 64 */
	1,	/* 72 */
	1,	/* 80 */
	1,	/* 88 */
	1,	/* 96 */
	6,	/* 104 */
	6,	/* 112 */
	6,	/* 120 */
	6,	/* 128 */
	2,	/* 136 */
	2,	/* 144 */
	2,	/* 152 */
	2,	/* 160 */
	2,	/* 168 */
	2,	/* 176 */
	2,	/* 184 */
	2	/* 192 */
};

static int bpf_mem_cache_idx(size_t size)
{
	if (!size || size > 4096)
		return -1;

	if (size <= 192)
		return size_index[(size - 1) / 8] - 1;

	return fls(size - 1) - 2;
}
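
/* Worked examples for the mapping above (illustrative; bucket sizes refer to
 * the sizes[] table in bpf_mem_alloc_init() below):
 *   bpf_mem_cache_idx(100)  == size_index[12] - 1 == 5  -> 128-byte bucket
 *   bpf_mem_cache_idx(300)  == fls(299) - 2       == 7  -> 512-byte bucket
 *   bpf_mem_cache_idx(4096) == fls(4095) - 2      == 10 -> 4096-byte bucket
 *   bpf_mem_cache_idx(5000) == -1 (larger than any bucket)
 */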

#define NUM_CACHES 11

struct bpf_mem_cache {
	/* per-cpu list of free objects of size 'unit_size'.
	 * All accesses are done with interrupts disabled and 'active' counter
	 * protection with __llist_add() and __llist_del_first().
	 */
	struct llist_head free_llist;
	local_t active;

	/* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
	 * are sequenced by per-cpu 'active' counter. But unit_free() cannot
	 * fail. When 'active' is busy the unit_free() will add an object to
	 * free_llist_extra.
	 */
	struct llist_head free_llist_extra;

	struct irq_work refill_work;
	struct obj_cgroup *objcg;
	int unit_size;
	/* count of objects in free_llist */
	int free_cnt;
	int low_watermark, high_watermark, batch;
	int percpu_size;

	struct rcu_head rcu;
	struct llist_head free_by_rcu;
	struct llist_head waiting_for_gp;
	atomic_t call_rcu_in_progress;
};

struct bpf_mem_caches {
	struct bpf_mem_cache cache[NUM_CACHES];
};

static struct llist_node notrace *__llist_del_first(struct llist_head *head)
{
	struct llist_node *entry, *next;

	entry = head->first;
	if (!entry)
		return NULL;
	next = entry->next;
	head->first = next;
	return entry;
}

static void *__alloc(struct bpf_mem_cache *c, int node)
{
	/* Allocate, but don't deplete atomic reserves that typical
	 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
	 * will allocate from the current numa node which is what we
	 * want here.
	 */
	gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT;

	if (c->percpu_size) {
		void **obj = kmalloc_node(c->percpu_size, flags, node);
		void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);

		if (!obj || !pptr) {
			free_percpu(pptr);
			kfree(obj);
			return NULL;
		}
		obj[1] = pptr;
		return obj;
	}

	return kmalloc_node(c->unit_size, flags, node);
}
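
/* Resulting free-element layout (illustrative sketch derived from the code
 * above):
 *
 *   regular: kmalloc'ed unit_size bytes = [ llist_node (8) | payload ... ]
 *   percpu:  kmalloc'ed LLIST_NODE_SZ + sizeof(void *) bytes
 *            = [ llist_node (8) | pptr (8) ], where pptr points at a
 *            separately allocated per-cpu area of unit_size bytes.
 *
 * unit_alloc() hands back the llist_node address; bpf_mem_alloc() and
 * bpf_mem_cache_alloc() add LLIST_NODE_SZ, so callers receive the payload
 * (or, in the percpu case, the slot holding the per-cpu pointer).
 */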

static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
{
#ifdef CONFIG_MEMCG_KMEM
	if (c->objcg)
		return get_mem_cgroup_from_objcg(c->objcg);
#endif

#ifdef CONFIG_MEMCG
	return root_mem_cgroup;
#else
	return NULL;
#endif
}

/* Mostly runs from irq_work except __init phase. */
static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
{
	struct mem_cgroup *memcg = NULL, *old_memcg;
	unsigned long flags;
	void *obj;
	int i;

	memcg = get_memcg(c);
	old_memcg = set_active_memcg(memcg);
	for (i = 0; i < cnt; i++) {
		obj = __alloc(c, node);
		if (!obj)
			break;
		if (IS_ENABLED(CONFIG_PREEMPT_RT))
			/* In RT irq_work runs in per-cpu kthread, so disable
			 * interrupts to avoid preemption and interrupts and
			 * reduce the chance of bpf prog executing on this cpu
			 * when active counter is busy.
			 */
			local_irq_save(flags);
		/* alloc_bulk runs from irq_work which will not preempt a bpf
		 * program that does unit_alloc/unit_free since IRQs are
		 * disabled there. There is no race to increment 'active'
		 * counter. It protects free_llist from corruption in case NMI
		 * bpf prog preempted this loop.
		 */
		WARN_ON_ONCE(local_inc_return(&c->active) != 1);
		__llist_add(obj, &c->free_llist);
		c->free_cnt++;
		local_dec(&c->active);
		if (IS_ENABLED(CONFIG_PREEMPT_RT))
			local_irq_restore(flags);
	}
	set_active_memcg(old_memcg);
	mem_cgroup_put(memcg);
}

static void free_one(struct bpf_mem_cache *c, void *obj)
{
	if (c->percpu_size) {
		free_percpu(((void **)obj)[1]);
		kfree(obj);
		return;
	}

	kfree(obj);
}

static void __free_rcu(struct rcu_head *head)
{
	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
	struct llist_node *llnode = llist_del_all(&c->waiting_for_gp);
	struct llist_node *pos, *t;

	llist_for_each_safe(pos, t, llnode)
		free_one(c, pos);
	atomic_set(&c->call_rcu_in_progress, 0);
}

static void __free_rcu_tasks_trace(struct rcu_head *head)
{
	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);

	call_rcu(&c->rcu, __free_rcu);
}

static void enque_to_free(struct bpf_mem_cache *c, void *obj)
{
	struct llist_node *llnode = obj;

	/* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
	 * Nothing races to add to free_by_rcu list.
	 */
	__llist_add(llnode, &c->free_by_rcu);
}

static void do_call_rcu(struct bpf_mem_cache *c)
{
	struct llist_node *llnode, *t;

	if (atomic_xchg(&c->call_rcu_in_progress, 1))
		return;

	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
	llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
		/* There is no concurrent __llist_add(waiting_for_gp) access.
		 * It doesn't race with llist_del_all either.
		 * But there could be two concurrent llist_del_all(waiting_for_gp):
		 * from __free_rcu() and from drain_mem_cache().
		 */
		__llist_add(llnode, &c->waiting_for_gp);
	/* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
	 * Then use call_rcu() to wait for normal progs to finish
	 * and finally do free_one() on each element.
	 */
	call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace);
}

static void free_bulk(struct bpf_mem_cache *c)
{
	struct llist_node *llnode, *t;
	unsigned long flags;
	int cnt;

	do {
		if (IS_ENABLED(CONFIG_PREEMPT_RT))
			local_irq_save(flags);
		WARN_ON_ONCE(local_inc_return(&c->active) != 1);
		llnode = __llist_del_first(&c->free_llist);
		if (llnode)
			cnt = --c->free_cnt;
		else
			cnt = 0;
		local_dec(&c->active);
		if (IS_ENABLED(CONFIG_PREEMPT_RT))
			local_irq_restore(flags);
		if (llnode)
			enque_to_free(c, llnode);
	} while (cnt > (c->high_watermark + c->low_watermark) / 2);

	/* and drain free_llist_extra */
	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
		enque_to_free(c, llnode);
	do_call_rcu(c);
}

static void bpf_mem_refill(struct irq_work *work)
{
	struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
	int cnt;

	/* Racy access to free_cnt. It doesn't need to be 100% accurate */
	cnt = c->free_cnt;
	if (cnt < c->low_watermark)
		/* irq_work runs on this cpu and kmalloc will allocate
		 * from the current numa node which is what we want here.
		 */
		alloc_bulk(c, c->batch, NUMA_NO_NODE);
	else if (cnt > c->high_watermark)
		free_bulk(c);
}

static void notrace irq_work_raise(struct bpf_mem_cache *c)
{
	irq_work_queue(&c->refill_work);
}

/* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket
 * the freelist cache will be elem_size * 64 (or less) on each cpu.
 *
 * For bpf programs that don't have statically known allocation sizes and
 * assuming (low_mark + high_mark) / 2 as an average number of elements per
 * bucket and all buckets are used the total amount of memory in freelists
 * on each cpu will be:
 * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*196 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
 * == ~ 116 Kbyte using below heuristic.
 * Initialized, but unused bpf allocator (not bpf map specific one) will
 * consume ~ 11 Kbyte per cpu.
 * Typical case will be between 11K and 116K closer to 11K.
 * bpf progs can and should share bpf_mem_cache when possible.
 */
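
/* Worked numbers for the estimate above (illustrative): with the watermarks
 * chosen in prefill_mem_cache(), (low_mark + high_mark) / 2 is 64 elements
 * for every bucket of unit_size <= 256 and 32/16/8/4 elements for the
 * 512/1024/2048/4096 buckets, so
 * 64 * (16+32+64+96+128+196+256) + 32*512 + 16*1024 + 8*2048 + 4*4096
 * = 50432 + 65536 = 115968 bytes, i.e. ~116 Kbyte.
 */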

static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
{
	init_irq_work(&c->refill_work, bpf_mem_refill);
	if (c->unit_size <= 256) {
		c->low_watermark = 32;
		c->high_watermark = 96;
	} else {
		/* When page_size == 4k, order-0 cache will have low_mark == 2
		 * and high_mark == 6 with batch alloc of 3 individual pages at
		 * a time.
		 * 8k allocs and above low == 1, high == 3, batch == 1.
		 */
		c->low_watermark = max(32 * 256 / c->unit_size, 1);
		c->high_watermark = max(96 * 256 / c->unit_size, 3);
	}
	c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
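
	/* Illustrative values of the heuristic above: unit_size == 64 gives
	 * low/high/batch of 32/96/48, unit_size == 4096 gives 2/6/3 and a
	 * hypothetical 8k unit_size would give 1/3/1 as noted in the comment.
	 */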

	/* To avoid consuming memory assume that 1st run of bpf
	 * prog won't be doing more than 4 map_update_elem from
	 * irq disabled region
	 */
	alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu));
}

/* When size != 0 allocate a single bpf_mem_cache for each cpu.
 * This is typical bpf hash map use case when all elements have equal size.
 *
 * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
 * kmalloc/kfree. Max allocation size is 4096 in this case.
 * This is bpf_dynptr and bpf_kptr use case.
 */
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
{
	static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
	struct bpf_mem_caches *cc, __percpu *pcc;
	struct bpf_mem_cache *c, __percpu *pc;
	struct obj_cgroup *objcg = NULL;
	int cpu, i, unit_size, percpu_size = 0;

	if (size) {
		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
		if (!pc)
			return -ENOMEM;

		if (percpu)
			/* room for llist_node and per-cpu pointer */
			percpu_size = LLIST_NODE_SZ + sizeof(void *);
		else
			size += LLIST_NODE_SZ; /* room for llist_node */
		unit_size = size;

#ifdef CONFIG_MEMCG_KMEM
		objcg = get_obj_cgroup_from_current();
#endif
		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(pc, cpu);
			c->unit_size = unit_size;
			c->objcg = objcg;
			c->percpu_size = percpu_size;
			prefill_mem_cache(c, cpu);
		}
		ma->cache = pc;
		return 0;
	}

	/* size == 0 && percpu is an invalid combination */
	if (WARN_ON_ONCE(percpu))
		return -EINVAL;

	pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
	if (!pcc)
		return -ENOMEM;
#ifdef CONFIG_MEMCG_KMEM
	objcg = get_obj_cgroup_from_current();
#endif
	for_each_possible_cpu(cpu) {
		cc = per_cpu_ptr(pcc, cpu);
		for (i = 0; i < NUM_CACHES; i++) {
			c = &cc->cache[i];
			c->unit_size = sizes[i];
			c->objcg = objcg;
			prefill_mem_cache(c, cpu);
		}
	}
	ma->caches = pcc;
	return 0;
}

static void drain_mem_cache(struct bpf_mem_cache *c)
{
	struct llist_node *llnode, *t;

	/* The caller has done rcu_barrier() and no progs are using this
	 * bpf_mem_cache, but htab_map_free() called bpf_mem_cache_free() for
	 * all remaining elements and they can be in free_by_rcu or in
	 * waiting_for_gp lists, so drain those lists now.
	 */
	llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
		free_one(c, llnode);
	llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp))
		free_one(c, llnode);
	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist))
		free_one(c, llnode);
	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
		free_one(c, llnode);
}

void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
{
	struct bpf_mem_caches *cc;
	struct bpf_mem_cache *c;
	int cpu, i;

	if (ma->cache) {
		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(ma->cache, cpu);
			drain_mem_cache(c);
		}
		/* objcg is the same across cpus */
		if (c->objcg)
			obj_cgroup_put(c->objcg);
		/* c->waiting_for_gp list was drained, but __free_rcu might
		 * still execute. Wait for it now before we free 'c'.
		 */
		rcu_barrier_tasks_trace();
		rcu_barrier();
		free_percpu(ma->cache);
		ma->cache = NULL;
	}
	if (ma->caches) {
		for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(ma->caches, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
				c = &cc->cache[i];
				drain_mem_cache(c);
			}
		}
		if (c->objcg)
			obj_cgroup_put(c->objcg);
		rcu_barrier_tasks_trace();
		rcu_barrier();
		free_percpu(ma->caches);
		ma->caches = NULL;
	}
}

/* notrace is necessary here and in other functions to make sure
 * bpf programs cannot attach to them and cause llist corruptions.
 */
static void notrace *unit_alloc(struct bpf_mem_cache *c)
{
	struct llist_node *llnode = NULL;
	unsigned long flags;
	int cnt = 0;

	/* Disable irqs to prevent the following race for majority of prog types:
	 * prog_A
	 *   bpf_mem_alloc
	 *      preemption or irq -> prog_B
	 *        bpf_mem_alloc
	 *
	 * but prog_B could be a perf_event NMI prog.
	 * Use per-cpu 'active' counter to order free_list access between
	 * unit_alloc/unit_free/bpf_mem_refill.
	 */
	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		llnode = __llist_del_first(&c->free_llist);
		if (llnode)
			cnt = --c->free_cnt;
	}
	local_dec(&c->active);
	local_irq_restore(flags);

	WARN_ON(cnt < 0);

	if (cnt < c->low_watermark)
		irq_work_raise(c);
	return llnode;
}

/* Though 'ptr' object could have been allocated on a different cpu
 * add it to the free_llist of the current cpu.
 * Let kfree() logic deal with it when it's later called from irq_work.
 */
static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
{
	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
	unsigned long flags;
	int cnt = 0;

	BUILD_BUG_ON(LLIST_NODE_SZ > 8);

	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		__llist_add(llnode, &c->free_llist);
		cnt = ++c->free_cnt;
	} else {
		/* unit_free() cannot fail. Therefore add an object to atomic
		 * llist. free_bulk() will drain it. Though free_llist_extra is
		 * a per-cpu list we have to use atomic llist_add here, since
		 * it also can be interrupted by bpf nmi prog that does another
		 * unit_free() into the same free_llist_extra.
		 */
		llist_add(llnode, &c->free_llist_extra);
	}
	local_dec(&c->active);
	local_irq_restore(flags);

	if (cnt > c->high_watermark)
		/* free few objects from current cpu into global kmalloc pool */
		irq_work_raise(c);
}

/* Called from BPF program or from sys_bpf syscall.
 * In both cases migration is disabled.
 */
void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
{
	int idx;
	void *ret;

	if (!size)
		return ZERO_SIZE_PTR;

	idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ);
	if (idx < 0)
		return NULL;

	ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
	return !ret ? NULL : ret + LLIST_NODE_SZ;
}

void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
{
	int idx;

	if (!ptr)
		return;

	idx = bpf_mem_cache_idx(__ksize(ptr - LLIST_NODE_SZ));
	if (idx < 0)
		return;

	unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}

void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
{
	void *ret;

	ret = unit_alloc(this_cpu_ptr(ma->cache));
	return !ret ? NULL : ret + LLIST_NODE_SZ;
}

void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
{
	if (!ptr)
		return;

	unit_free(this_cpu_ptr(ma->cache), ptr);
}