// SPDX-License-Identifier: GPL-2.0
/*
 * fprobe - Simple ftrace probe wrapper for function entry.
*/
#define pr_fmt(fmt) "fprobe: " fmt

#include <linux/err.h>
#include <linux/fprobe.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/rethook.h>
#include <linux/slab.h>
#include <linux/sort.h>

#include "trace.h"
struct fprobe_rethook_node {
	struct rethook_node node;
	unsigned long entry_ip;
	unsigned long entry_parent_ip;
	char data[];
};
static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip,
				    struct ftrace_ops *ops,
				    struct ftrace_regs *fregs)
{
	struct fprobe_rethook_node *fpr;
	struct rethook_node *rh = NULL;
	struct fprobe *fp;
	void *entry_data = NULL;
	int ret = 0;

	fp = container_of(ops, struct fprobe, ops);

	if (fp->exit_handler) {
		rh = rethook_try_get(fp->rethook);
		if (!rh) {
			fp->nmissed++;
			return;
		}
		fpr = container_of(rh, struct fprobe_rethook_node, node);
		fpr->entry_ip = ip;
		fpr->entry_parent_ip = parent_ip;
		if (fp->entry_data_size)
			entry_data = fpr->data;
	}

	if (fp->entry_handler)
		ret = fp->entry_handler(fp, ip, parent_ip, ftrace_get_regs(fregs), entry_data);

	/* If entry_handler returns !0, nmissed is not counted. */
	if (rh) {
		if (ret)
			rethook_recycle(rh);
		else
			rethook_hook(rh, ftrace_get_regs(fregs), true);
	}
}
static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
			   struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
	struct fprobe *fp;
	int bit;

	fp = container_of(ops, struct fprobe, ops);
	if (fprobe_disabled(fp))
		return;

	/* recursion detection has to go before any traceable function and
	 * all functions before this point should be marked as notrace
	 */
	bit = ftrace_test_recursion_trylock(ip, parent_ip);
	if (bit < 0) {
		fp->nmissed++;
		return;
	}
	__fprobe_handler(ip, parent_ip, ops, fregs);
	ftrace_test_recursion_unlock(bit);
}
NOKPROBE_SYMBOL(fprobe_handler);
static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
				  struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
	struct fprobe *fp;
	int bit;

	fp = container_of(ops, struct fprobe, ops);
	if (fprobe_disabled(fp))
		return;

	/* recursion detection has to go before any traceable function and
	 * all functions called before this point should be marked as notrace
	 */
	bit = ftrace_test_recursion_trylock(ip, parent_ip);
	if (bit < 0) {
		fp->nmissed++;
		return;
	}

	/*
	 * This user handler is shared with other kprobes and is not expected to be
	 * called recursively. So if any other kprobe handler is running, this will
	 * exit as kprobe does. See the section 'Share the callbacks with kprobes'
	 * in Documentation/trace/fprobe.rst for more information.
	 */
	if (unlikely(kprobe_running())) {
		fp->nmissed++;
		goto recursion_unlock;
	}

	kprobe_busy_begin();
	__fprobe_handler(ip, parent_ip, ops, fregs);
	kprobe_busy_end();

recursion_unlock:
	ftrace_test_recursion_unlock(bit);
}
static void fprobe_exit_handler(struct rethook_node *rh, void *data,
				unsigned long ret_ip, struct pt_regs *regs)
{
	struct fprobe *fp = (struct fprobe *)data;
	struct fprobe_rethook_node *fpr;
	int bit;

	if (!fp || fprobe_disabled(fp))
		return;

	fpr = container_of(rh, struct fprobe_rethook_node, node);

	/*
	 * we need to assure no calls to traceable functions in-between the
	 * end of fprobe_handler and the beginning of fprobe_exit_handler.
	 */
	bit = ftrace_test_recursion_trylock(fpr->entry_ip, fpr->entry_parent_ip);
	if (bit < 0) {
		fp->nmissed++;
		return;
	}

	fp->exit_handler(fp, fpr->entry_ip, ret_ip, regs,
			 fp->entry_data_size ? (void *)fpr->data : NULL);
	ftrace_test_recursion_unlock(bit);
}
NOKPROBE_SYMBOL(fprobe_exit_handler);
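/*
 * Illustrative sketch (not part of this file) of how per-probe entry data
 * flows from the entry handler to the exit handler through the
 * fprobe_rethook_node's flexible data[] array. The handler names and the
 * u64 timestamp payload are hypothetical; @entry_data is only non-NULL
 * when fp->entry_data_size is set.
 *
 *	static int sample_entry(struct fprobe *fp, unsigned long ip,
 *				unsigned long parent_ip,
 *				struct pt_regs *regs, void *entry_data)
 *	{
 *		*(u64 *)entry_data = ktime_get_ns();
 *		return 0;	(0 arms the rethook; !0 recycles it)
 *	}
 *
 *	static void sample_exit(struct fprobe *fp, unsigned long ip,
 *				unsigned long ret_ip, struct pt_regs *regs,
 *				void *entry_data)
 *	{
 *		pr_info("%pS took %llu ns\n", (void *)ip,
 *			ktime_get_ns() - *(u64 *)entry_data);
 *	}
 *
 *	static struct fprobe sample_probe = {
 *		.entry_handler	 = sample_entry,
 *		.exit_handler	 = sample_exit,
 *		.entry_data_size = sizeof(u64),
 *	};
 */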
static int symbols_cmp(const void *a, const void *b)
{
	const char **str_a = (const char **)a;
	const char **str_b = (const char **)b;

	return strcmp(*str_a, *str_b);
}
/* Convert ftrace location address from symbols */
static unsigned long *get_ftrace_locations(const char **syms, int num)
{
	unsigned long *addrs;

	/* Convert symbols to symbol address */
	addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL);
	if (!addrs)
		return ERR_PTR(-ENOMEM);

	/* ftrace_lookup_symbols expects sorted symbols */
	sort(syms, num, sizeof(*syms), symbols_cmp, NULL);

	if (!ftrace_lookup_symbols(syms, num, addrs))
		return addrs;

	kfree(addrs);
	return ERR_PTR(-ENOENT);
}
static void fprobe_init(struct fprobe *fp)
{
	fp->nmissed = 0;
	if (fprobe_shared_with_kprobes(fp))
		fp->ops.func = fprobe_kprobe_handler;
	else
		fp->ops.func = fprobe_handler;
	fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS;
}
static int fprobe_init_rethook(struct fprobe *fp, int num)
{
	int size;

	if (!fp->exit_handler) {
		fp->rethook = NULL;
		return 0;
	}

	/* Initialize rethook if needed */
	if (fp->nr_maxactive)
		num = fp->nr_maxactive;
	else
		num *= num_possible_cpus() * 2;
	if (num <= 0)
		return -EINVAL;

	size = sizeof(struct fprobe_rethook_node) + fp->entry_data_size;
	/* Initialize rethook */
	fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler, size, num);
	if (IS_ERR(fp->rethook))
		return PTR_ERR(fp->rethook);

	return 0;
}
static void fprobe_fail_cleanup(struct fprobe *fp)
{
	if (!IS_ERR_OR_NULL(fp->rethook)) {
		/* Don't need to cleanup rethook->handler because this is not used. */
		rethook_free(fp->rethook);
		fp->rethook = NULL;
	}
	ftrace_free_filter(&fp->ops);
}
/**
 * register_fprobe() - Register fprobe to ftrace by pattern.
 * @fp: A fprobe data structure to be registered.
 * @filter: A wildcard pattern of probed symbols.
 * @notfilter: A wildcard pattern of NOT probed symbols.
 *
 * Register @fp to ftrace for enabling the probe on the symbols matching
 * @filter. If @notfilter is not NULL, the symbols matching @notfilter are
 * not probed.
 *
 * Return 0 if @fp is registered successfully, -errno if not.
 */
int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter)
{
	struct ftrace_hash *hash;
	unsigned char *str;
	int ret, len;

	if (!fp || !filter)
		return -EINVAL;

	fprobe_init(fp);

	len = strlen(filter);
	str = kstrdup(filter, GFP_KERNEL);
	ret = ftrace_set_filter(&fp->ops, str, len, 0);
	kfree(str);
	if (ret)
		return ret;

	if (notfilter) {
		len = strlen(notfilter);
		str = kstrdup(notfilter, GFP_KERNEL);
		ret = ftrace_set_notrace(&fp->ops, str, len, 0);
		kfree(str);
		if (ret)
			goto out;
	}

	/* TODO:
	 * correctly calculate the total number of filtered symbols
	 * from both filter and notfilter.
	 */
	hash = rcu_access_pointer(fp->ops.local_hash.filter_hash);
	if (WARN_ON_ONCE(!hash))
		goto out;

	ret = fprobe_init_rethook(fp, (int)hash->count);
	if (!ret)
		ret = register_ftrace_function(&fp->ops);

out:
	if (ret)
		fprobe_fail_cleanup(fp);
	return ret;
}
EXPORT_SYMBOL_GPL(register_fprobe);
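/*
 * Example usage (illustrative sketch): registering the hypothetical
 * sample_probe sketched above on every symbol matching "vfs_*" except
 * those matching "vfs_caches_*". Wildcard handling follows the
 * ftrace_set_filter()/ftrace_set_notrace() semantics.
 *
 *	int ret = register_fprobe(&sample_probe, "vfs_*", "vfs_caches_*");
 *
 *	if (ret < 0)
 *		pr_err("register_fprobe failed: %d\n", ret);
 */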
/**
 * register_fprobe_ips() - Register fprobe to ftrace by address.
 * @fp: A fprobe data structure to be registered.
 * @addrs: An array of target ftrace location addresses.
 * @num: The number of entries of @addrs.
 *
 * Register @fp to ftrace for enabling the probe on the addresses given by
 * @addrs. Each entry of @addrs must be an ftrace location address, which
 * may be the symbol address + an arch-dependent offset.
 * If you are unsure what this means, please use the other registration
 * functions.
 *
 * Return 0 if @fp is registered successfully, -errno if not.
 */
int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
{
	int ret;

	if (!fp || !addrs || num <= 0)
		return -EINVAL;

	fprobe_init(fp);
	ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0);
	if (ret)
		return ret;

	ret = fprobe_init_rethook(fp, num);
	if (!ret)
		ret = register_ftrace_function(&fp->ops);

	if (ret)
		fprobe_fail_cleanup(fp);
	return ret;
}
EXPORT_SYMBOL_GPL(register_fprobe_ips);
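/*
 * Example usage (illustrative sketch): the addresses passed in must already
 * be ftrace location addresses, e.g. obtained the same way
 * get_ftrace_locations() does via ftrace_lookup_symbols(). The array
 * contents below are placeholders.
 *
 *	unsigned long addrs[2];	(filled with two ftrace location addresses)
 *
 *	int ret = register_fprobe_ips(&sample_probe, addrs, 2);
 */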
/**
 * register_fprobe_syms() - Register fprobe to ftrace by symbols.
 * @fp: A fprobe data structure to be registered.
 * @syms: An array of target symbols.
 * @num: The number of entries of @syms.
 *
 * Register @fp to the symbols given by the @syms array. This will be useful
 * if you are sure the symbols exist in the kernel.
 *
 * Return 0 if @fp is registered successfully, -errno if not.
 */
int register_fprobe_syms(struct fprobe *fp, const char **syms, int num)
{
	unsigned long *addrs;
	int ret;

	if (!fp || !syms || num <= 0)
		return -EINVAL;

	addrs = get_ftrace_locations(syms, num);
	if (IS_ERR(addrs))
		return PTR_ERR(addrs);

	ret = register_fprobe_ips(fp, addrs, num);

	kfree(addrs);

	return ret;
}
EXPORT_SYMBOL_GPL(register_fprobe_syms);
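/*
 * Example usage (illustrative sketch, hypothetical symbol list). Note that
 * get_ftrace_locations() sorts @syms in place, so the pointer array must be
 * writable:
 *
 *	static const char *syms[] = { "schedule_timeout", "msleep" };
 *
 *	int ret = register_fprobe_syms(&sample_probe, syms, ARRAY_SIZE(syms));
 */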
bool fprobe_is_registered(struct fprobe *fp)
{
	if (!fp || (fp->ops.saved_func != fprobe_handler &&
		    fp->ops.saved_func != fprobe_kprobe_handler))
		return false;

	return true;
}
/**
 * unregister_fprobe() - Unregister fprobe from ftrace
 * @fp: A fprobe data structure to be unregistered.
 *
 * Unregister fprobe (and remove ftrace hooks from the function entries).
 *
 * Return 0 if @fp is unregistered successfully, -errno if not.
 */
int unregister_fprobe(struct fprobe *fp)
{
	int ret;

	if (!fprobe_is_registered(fp))
		return -EINVAL;
	if (!IS_ERR_OR_NULL(fp->rethook))
		rethook_stop(fp->rethook);

	ret = unregister_ftrace_function(&fp->ops);
	if (ret < 0)
		return ret;
	if (!IS_ERR_OR_NULL(fp->rethook))
		rethook_free(fp->rethook);

	ftrace_free_filter(&fp->ops);

	return ret;
}
EXPORT_SYMBOL_GPL(unregister_fprobe);
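/*
 * Example usage (illustrative sketch): unregistering pairs with any of the
 * register_fprobe*() variants above, typically from a module exit path:
 *
 *	static void __exit sample_module_exit(void)
 *	{
 *		int ret = unregister_fprobe(&sample_probe);
 *
 *		if (ret < 0)
 *			pr_err("unregister_fprobe failed: %d\n", ret);
 *	}
 */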