2023-10-17 21:56:50 +08:00
// SPDX-License-Identifier: GPL-2.0
# include <linux/objpool.h>
# include <linux/slab.h>
# include <linux/vmalloc.h>
# include <linux/atomic.h>
# include <linux/irqflags.h>
# include <linux/cpumask.h>
# include <linux/log2.h>
/*
 * objpool: ring-array based lockless MPMC/FIFO queues
 *
 * Copyright: wuqiang.matt@bytedance.com, mhiramat@kernel.org
 */
/*
 * Populate one per-cpu slot with its share of objects.
 *
 * The @nodes objects are laid out back to back right behind the slot's
 * entries[] ring. Each one is optionally constructed through @objinit
 * (called with @context) and then queued at the slot's tail.
 *
 * Returns 0 on success, or the first non-zero value returned by @objinit;
 * objects queued before that failure stay accounted in the slot and pool.
 */
static int
objpool_init_percpu_slot(struct objpool_head *pool,
			 struct objpool_slot *slot,
			 int nodes, void *context,
			 objpool_init_obj_cb objinit)
{
	/* first object starts where the ring array of 'capacity' entries ends */
	void *cursor = (void *)&slot->entries[pool->capacity];
	int done = 0;

	/* index mask used to wrap head/tail positions into the ring */
	slot->mask = pool->capacity - 1;

	while (done < nodes) {
		int err = objinit ? objinit(cursor, context) : 0;

		if (err)
			return err;

		/* queue the object, then step to the next one in the buffer */
		slot->entries[slot->tail & slot->mask] = cursor;
		cursor += pool->obj_size;

		/* publish the new tail as the last valid position */
		slot->last = ++slot->tail;
		pool->nr_objs++;
		done++;
	}

	return 0;
}
/*
 * Allocate and initialize the per-cpu slots of @pool.
 *
 * @pool:    object pool whose cpu_slots[] array is already allocated
 * @nr_objs: total number of objects to spread over all possible cpus
 * @context: opaque pointer handed through to @objinit
 * @objinit: optional per-object constructor, may be NULL
 *
 * Objects are distributed evenly; the first (nr_objs % possible cpus)
 * possible cpus receive one extra object.
 *
 * Returns 0 on success, -ENOMEM if a slot cannot be allocated, or the
 * error reported by @objinit. Slots allocated before a failure are left
 * in pool->cpu_slots[]; releasing them is up to the caller.
 */
static int
objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs,
			  void *context, objpool_init_obj_cb objinit)
{
	/* the possible-cpu count is loop invariant, so read it only once */
	unsigned int nr_possible = num_possible_cpus();
	int i, cpu_count = 0;

	for (i = 0; i < pool->nr_cpus; i++) {
		struct objpool_slot *slot;
		size_t size;
		int nodes, rc;

		/* skip the cpu node which could never be present */
		if (!cpu_possible(i))
			continue;

		/* compute how many objects to be allocated with this slot */
		nodes = nr_objs / nr_possible;
		if (cpu_count < (nr_objs % nr_possible))
			nodes++;
		cpu_count++;

		/*
		 * do the arithmetic in size_t: an int product of obj_size
		 * and nodes could overflow and yield an undersized buffer
		 */
		size = struct_size(slot, entries, pool->capacity) +
			(size_t)pool->obj_size * nodes;

		/*
		 * here we allocate percpu-slot & objs together in a single
		 * allocation to make it more compact, taking advantage of
		 * warm caches and TLB hits. by default vmalloc is used to
		 * reduce the pressure on the kernel slab system. as we know,
		 * the minimal size of vmalloc is one page since vmalloc would
		 * always align the requested size to page size
		 *
		 * GFP_ATOMIC is a combination of flag bits, some of which are
		 * also part of non-atomic masks such as GFP_KERNEL, so it has
		 * to be matched as a whole: a plain '&' test would be true for
		 * GFP_KERNEL as well and vmalloc would never be used
		 */
		if ((pool->gfp & GFP_ATOMIC) == GFP_ATOMIC)
			slot = kmalloc_node(size, pool->gfp, cpu_to_node(i));
		else
			slot = __vmalloc_node(size, sizeof(void *), pool->gfp,
				cpu_to_node(i), __builtin_return_address(0));
		if (!slot)
			return -ENOMEM;

		/* __GFP_ZERO is stripped from pool->gfp, so clear explicitly */
		memset(slot, 0, size);
		pool->cpu_slots[i] = slot;

		/* initialize the objpool_slot of cpu node i */
		rc = objpool_init_percpu_slot(pool, slot, nodes, context, objinit);
		if (rc)
			return rc;
	}

	return 0;
}
/* cleanup all percpu slots of the object pool */
static void objpool_fini_percpu_slots ( struct objpool_head * pool )
{
int i ;
if ( ! pool - > cpu_slots )
return ;
for ( i = 0 ; i < pool - > nr_cpus ; i + + )
kvfree ( pool - > cpu_slots [ i ] ) ;
kfree ( pool - > cpu_slots ) ;
}
/**
 * objpool_init() - initialize an object pool and pre-allocate its objects
 * @pool:        object pool to be initialized; fully overwritten
 * @nr_objs:     total number of objects, 1..OBJPOOL_NR_OBJECT_MAX
 * @object_size: size of one object, 1..OBJPOOL_OBJECT_SIZE_MAX; rounded up
 *               to a multiple of sizeof(long)
 * @gfp:         allocation flags; __GFP_ZERO is masked out before being
 *               stored in the pool
 * @context:     opaque pointer stored in the pool and passed to @objinit
 * @objinit:     optional constructor called for every object, may be NULL
 * @release:     optional callback stored in the pool, invoked by
 *               objpool_free()
 *
 * On success the pool's refcount is set to the number of objects plus one
 * extra reference for the pool itself.
 *
 * Return: 0 on success, -EINVAL for out-of-range parameters, -ENOMEM when an
 * allocation fails, or the error returned by @objinit. On failure all
 * per-cpu slots allocated so far are released.
 */
int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
		 gfp_t gfp, void *context, objpool_init_obj_cb objinit,
		 objpool_fini_cb release)
{
	int rc, capacity;

	/* check input parameters */
	if (nr_objs <= 0 || nr_objs > OBJPOOL_NR_OBJECT_MAX ||
	    object_size <= 0 || object_size > OBJPOOL_OBJECT_SIZE_MAX)
		return -EINVAL;

	/* align up to unsigned long size */
	object_size = ALIGN(object_size, sizeof(long));

	/* calculate capacity of percpu objpool_slot */
	capacity = roundup_pow_of_two(nr_objs);
	if (!capacity)
		return -EINVAL;

	/* initialize objpool pool */
	memset(pool, 0, sizeof(*pool));
	pool->nr_cpus = nr_cpu_ids;
	pool->obj_size = object_size;
	pool->capacity = capacity;
	pool->gfp = gfp & ~__GFP_ZERO;
	pool->context = context;
	pool->release = release;

	/*
	 * cpu_slots[] is an array of slot pointers, so size it by its
	 * element type rather than by struct objpool_slot; kcalloc() zeroes
	 * the array and checks the multiplication for overflow
	 */
	pool->cpu_slots = kcalloc(pool->nr_cpus, sizeof(*pool->cpu_slots),
				  pool->gfp);
	if (!pool->cpu_slots)
		return -ENOMEM;

	/* initialize per-cpu slots */
	rc = objpool_init_percpu_slots(pool, nr_objs, context, objinit);
	if (rc)
		objpool_fini_percpu_slots(pool);
	else
		/* one reference per object plus one for the pool itself */
		refcount_set(&pool->ref, pool->nr_objs + 1);

	return rc;
}
EXPORT_SYMBOL_GPL(objpool_init);
/*
 * Add @obj to the ring of the slot that belongs to @cpu.
 *
 * A tail position is first reserved with a cmpxchg loop, the object
 * pointer is then stored at that position, and finally slot->last is
 * updated so that pop() can pick the object up.
 *
 * Note that a full slot is not rejected here: when 'tail - head' exceeds
 * pool->nr_objs only a one-time warning is raised. Always returns 0.
 */
static inline int
objpool_try_add_slot ( void * obj , struct objpool_head * pool , int cpu )
{
struct objpool_slot * slot = pool - > cpu_slots [ cpu ] ;
uint32_t head , tail ;
/* loading tail and head as a local snapshot, tail first */
tail = READ_ONCE ( slot - > tail ) ;
do {
head = READ_ONCE ( slot - > head ) ;
/* fault caught: something must be wrong */
WARN_ON_ONCE ( tail - head > pool - > nr_objs ) ;
/* on failure try_cmpxchg refreshes 'tail', so the loop retries with it */
} while ( ! try_cmpxchg_acquire ( & slot - > tail , & tail , tail + 1 ) ) ;
/* now the tail position is reserved for the given obj */
WRITE_ONCE ( slot - > entries [ tail & slot - > mask ] , obj ) ;
/* update sequence to make this obj available for pop() */
smp_store_release ( & slot - > last , tail + 1 ) ;
return 0 ;
}
/*
 * Return @obj to @pool by queueing it on the slot of the current cpu.
 * Yields whatever objpool_try_add_slot() reports.
 */
int objpool_push(void *obj, struct objpool_head *pool)
{
	unsigned long irqflags;
	int ret;

	/* local irqs stay off so this cpu's slot update is not interrupted */
	raw_local_irq_save(irqflags);
	ret = objpool_try_add_slot(obj, pool, raw_smp_processor_id());
	raw_local_irq_restore(irqflags);

	return ret;
}
EXPORT_SYMBOL_GPL(objpool_push);
/* try to retrieve object from slot */
static inline void * objpool_try_get_slot ( struct objpool_head * pool , int cpu )
{
struct objpool_slot * slot = pool - > cpu_slots [ cpu ] ;
/* load head snapshot, other cpus may change it */
uint32_t head = smp_load_acquire ( & slot - > head ) ;
while ( head ! = READ_ONCE ( slot - > last ) ) {
void * obj ;
lib: objpool: fix head overrun on RK3588 SBC
objpool overrun stress with test_objpool on OrangePi5+ SBC triggered the
following kernel warnings:
WARNING: CPU: 6 PID: 3115 at lib/objpool.c:168 objpool_push+0xc0/0x100
This message is from objpool.c:168:
WARN_ON_ONCE(tail - head > pool->nr_objs);
The overrun test case is to validate the case that pre-allocated objects
are insufficient: 8 objects are pre-allocated for each node and consumer
thread per node tries to grab 16 objects in a row. The testing system is
OrangePI 5+, with RK3588, a big.LITTLE SOC with 4x A76 and 4x A55. When
disabling either all 4 big or 4 little cores, the overrun tests run well,
and once with big and little cores mixed together, the overrun test would
always cause an overrun loop. It's likely the memory timing differences
of big and little cores cause this trouble. Here are the debugging data
of objpool_try_get_slot after try_cmpxchg_release:
objpool_pop: cpu: 4/0 0:0 head: 278/279 tail:278 last:276/278
The local copies of 'head' and 'last' were 278 and 276, and reloading of
'slot->head' and 'slot->last' got 279 and 278. After try_cmpxchg_release
'slot->head' became 'head + 1', which is correct. But what's wrong here
is the stale value of 'last', and that stale value of 'last' finally led
the overrun of 'head'.
Memory updating of 'last' and 'head' are performed in push() and pop()
independently, which could be the culprit leading this out of order
visibility of 'last' and 'head'. So for objpool_try_get_slot(), it's
not enough only checking the condition of 'head != slot', the implicit
condition 'last - head <= nr_objs' must also be explicitly asserted to
guarantee 'last' is always behind 'head' before the object retrieving.
This patch will check and try reloading of 'head' and 'last' to ensure
'last' is behind 'head' at the time of object retrieving. Performance
testings show the average impact is about 0.1% for X86_64 and 1.12% for
ARM64. Here are the results:
OS: Debian 10 X86_64, Linux 6.6rc
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T
native: 49543304 99277826 199017659 399070324 795185848
objpool: 29909085 59865637 119692073 239750369 478005250
objpool+: 29879313 59230743 119609856 239067773 478509029
32T 48T 64T 96T 128T
native: 1596927073 2390099988 2929397330 3183875848 3257546602
objpool: 957553042 1435814086 1680872925 2043126796 2165424198
objpool+: 956476281 1434491297 1666055740 2041556569 2157415622
OS: Debian 11 AARCH64, Linux 6.6rc
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: 30890508 60399915 123111980 242257008 494002946
objpool: 14742531 28883047 57739948 115886644 232455421
objpool+: 14107220 29032998 57286084 113730493 232232850
24T 32T 48T 64T 96T
native: 746406039 1000174750 1493236240 1998318364 2942911180
objpool: 349164852 467284332 702296756 934459713 1387898285
objpool+: 348388180 462750976 696606096 927865887 1368402195
Link: https://lore.kernel.org/all/20231114115148.298821-1-wuqiang.matt@bytedance.com/
Fixes: b4edb8d2d464 ("lib: objpool added: ring-array based lockless MPMC")
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-12-01 14:53:55 +09:00
/*
* data visibility of ' last ' and ' head ' could be out of
* order since memory updating of ' last ' and ' head ' are
* performed in push ( ) and pop ( ) independently
*
* before any retrieving attempts , pop ( ) must guarantee
* ' last ' is behind ' head ' , that is to say , there must
* be available objects in slot , which could be ensured
* by condition ' last ! = head & & last - head < = nr_objs '
* that is equivalent to ' last - head - 1 < nr_objs ' as
* ' last ' and ' head ' are both unsigned int32
*/
if ( READ_ONCE ( slot - > last ) - head - 1 > = pool - > nr_objs ) {
head = READ_ONCE ( slot - > head ) ;
continue ;
}
2023-10-17 21:56:50 +08:00
/* obj must be retrieved before moving forward head */
obj = READ_ONCE ( slot - > entries [ head & slot - > mask ] ) ;
/* move head forward to mark it's consumption */
if ( try_cmpxchg_release ( & slot - > head , & head , head + 1 ) )
return obj ;
}
return NULL ;
}
/*
 * Take one object out of @pool.
 *
 * The slot of the current cpu is tried first; after that the remaining
 * possible cpus are visited in wrap-around order. Returns the object, or
 * NULL if no visited slot could provide one.
 */
void *objpool_pop(struct objpool_head *pool)
{
	unsigned long irqflags;
	void *found = NULL;
	int remaining, cur;

	/* local irqs stay off while the slots are scanned from this cpu */
	raw_local_irq_save(irqflags);

	cur = raw_smp_processor_id();
	/* at most one attempt per possible cpu */
	for (remaining = num_possible_cpus(); remaining > 0; remaining--) {
		found = objpool_try_get_slot(pool, cur);
		if (found)
			break;
		cur = cpumask_next_wrap(cur, cpu_possible_mask, -1, 1);
	}

	raw_local_irq_restore(irqflags);

	return found;
}
EXPORT_SYMBOL_GPL(objpool_pop);
/* release whole objpool forcely */
void objpool_free ( struct objpool_head * pool )
{
if ( ! pool - > cpu_slots )
return ;
/* release percpu slots */
objpool_fini_percpu_slots ( pool ) ;
/* call user's cleanup callback if provided */
if ( pool - > release )
pool - > release ( pool , pool - > context ) ;
}
EXPORT_SYMBOL_GPL ( objpool_free ) ;
/* drop the allocated object, rather reclaim it to objpool */
int objpool_drop ( void * obj , struct objpool_head * pool )
{
if ( ! obj | | ! pool )
return - EINVAL ;
if ( refcount_dec_and_test ( & pool - > ref ) ) {
objpool_free ( pool ) ;
return 0 ;
}
return - EAGAIN ;
}
EXPORT_SYMBOL_GPL ( objpool_drop ) ;
/* drop unused objects and defref objpool for releasing */
void objpool_fini ( struct objpool_head * pool )
{
int count = 1 ; /* extra ref for objpool itself */
/* drop all remained objects from objpool */
while ( objpool_pop ( pool ) )
count + + ;
if ( refcount_sub_and_test ( count , & pool - > ref ) )
objpool_free ( pool ) ;
}
EXPORT_SYMBOL_GPL ( objpool_fini ) ;