2019-05-27 09:55:05 +03:00
// SPDX-License-Identifier: GPL-2.0-or-later
2005-04-17 02:20:36 +04:00
/*
* Kernel Probes ( KProbes )
*
* Copyright ( C ) IBM Corporation , 2002 , 2004
*
* 2002 - Oct Created by Vamsi Krishna S < vamsi_krishna @ in . ibm . com > Kernel
* Probes initial implementation ( includes suggestions from
* Rusty Russell ) .
* 2004 - Aug Updated by Prasanna S Panchamukhi < prasanna @ in . ibm . com > with
* hlists and exceptions notifier as suggested by Andi Kleen .
* 2004 - July Suparna Bhattacharya < suparna @ in . ibm . com > added jumper probes
* interface to access function arguments .
* 2004 - Sep Prasanna S Panchamukhi < prasanna @ in . ibm . com > Changed Kprobes
* exceptions notifier to be first on the priority list .
[PATCH] kprobes: function-return probes
This patch adds function-return probes to kprobes for the i386
architecture. This enables you to establish a handler to be run when a
function returns.
1. API
Two new functions are added to kprobes:
int register_kretprobe(struct kretprobe *rp);
void unregister_kretprobe(struct kretprobe *rp);
2. Registration and unregistration
2.1 Register
To register a function-return probe, the user populates the following
fields in a kretprobe object and calls register_kretprobe() with the
kretprobe address as an argument:
kp.addr - the function's address
handler - this function is run after the ret instruction executes, but
before control returns to the return address in the caller.
maxactive - The maximum number of instances of the probed function that
can be active concurrently. For example, if the function is non-
recursive and is called with a spinlock or mutex held, maxactive = 1
should be enough. If the function is non-recursive and can never
relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should
be enough. maxactive is used to determine how many kretprobe_instance
objects to allocate for this particular probed function. If maxactive <=
0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 *
NR_CPUS) else maxactive=NR_CPUS)
For example:
struct kretprobe rp;
rp.kp.addr = /* entrypoint address */
rp.handler = /*return probe handler */
rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */
register_kretprobe(&rp);
The following field may also be of interest:
nmissed - Initialized to zero when the function-return probe is
registered, and incremented every time the probed function is entered but
there is no kretprobe_instance object available for establishing the
function-return probe (i.e., because maxactive was set too low).
2.2 Unregister
To unregister a function-return probe, the user calls
unregister_kretprobe() with the same kretprobe object as registered
previously. If a probed function is running when the return probe is
unregistered, the function will return as expected, but the handler won't
be run.
3. Limitations
3.1 This patch supports only the i386 architecture, but patches for
x86_64 and ppc64 are anticipated soon.
3.2 Return probes operate by replacing the return address in the stack
(or in a known register, such as the lr register for ppc). This may
cause __builtin_return_address(0), when invoked from the return-probed
function, to return the address of the return-probes trampoline.
3.3 This implementation uses the "Multiprobes at an address" feature in
2.6.12-rc3-mm3.
3.4 Due to a limitation in multi-probes, you cannot currently establish
a return probe and a jprobe on the same function. A patch to remove
this limitation is being tested.
This feature is required by SystemTap (http://sourceware.org/systemtap),
and reflects ideas contributed by several SystemTap developers, including
Will Cohen and Ananth Mavinakayanahalli.
Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Frederik Deweerdt <frederik.deweerdt@laposte.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:09:19 +04:00
* 2005 - May Hien Nguyen < hien @ us . ibm . com > , Jim Keniston
* < jkenisto @ us . ibm . com > and Prasanna S Panchamukhi
* < prasanna @ in . ibm . com > added function - return probes .
2005-04-17 02:20:36 +04:00
*/
2021-09-14 17:39:25 +03:00
# define pr_fmt(fmt) "kprobes: " fmt
2005-04-17 02:20:36 +04:00
# include <linux/kprobes.h>
# include <linux/hash.h>
# include <linux/init.h>
2005-10-31 02:03:48 +03:00
# include <linux/slab.h>
2007-05-08 11:27:01 +04:00
# include <linux/stddef.h>
2011-05-23 22:51:41 +04:00
# include <linux/export.h>
2005-06-28 02:17:01 +04:00
# include <linux/moduleloader.h>
2006-10-02 13:17:30 +04:00
# include <linux/kallsyms.h>
2006-12-07 07:38:11 +03:00
# include <linux/freezer.h>
2007-02-21 00:57:54 +03:00
# include <linux/seq_file.h>
# include <linux/debugfs.h>
2010-02-25 16:34:15 +03:00
# include <linux/sysctl.h>
2007-05-08 11:27:03 +04:00
# include <linux/kdebug.h>
2009-03-06 18:36:38 +03:00
# include <linux/memory.h>
2010-02-03 00:49:18 +03:00
# include <linux/ftrace.h>
2010-02-25 16:34:07 +03:00
# include <linux/cpu.h>
2010-09-17 19:09:00 +04:00
# include <linux/jump_label.h>
2021-06-28 14:24:12 +03:00
# include <linux/static_call.h>
2020-05-12 15:19:11 +03:00
# include <linux/perf_event.h>
2007-05-08 11:34:16 +04:00
2016-10-11 23:52:22 +03:00
# include <asm/sections.h>
2005-04-17 02:20:36 +04:00
# include <asm/cacheflush.h>
# include <asm/errno.h>
2016-12-24 22:46:01 +03:00
# include <linux/uaccess.h>
2005-04-17 02:20:36 +04:00
/* log2 of the number of buckets in the kprobe hash table */
#define KPROBE_HASH_BITS	6
/* Number of buckets in the kprobe hash table */
#define KPROBE_TABLE_SIZE	(1 << KPROBE_HASH_BITS)
2022-01-22 09:13:41 +03:00
/* Without both optprobes and sysctl support the sysctl setup is a no-op. */
#if !(defined(CONFIG_OPTPROBES) && defined(CONFIG_SYSCTL))
#define kprobe_sysctls_init()	do { } while (0)
#endif
2006-10-02 13:17:30 +04:00
2008-07-25 12:46:04 +04:00
static int kprobes_initialized ;
2020-05-12 11:02:44 +03:00
/* kprobe_table can be accessed by
2021-09-14 17:39:34 +03:00
* - Normal hlist traversal and RCU add / del under ' kprobe_mutex ' is held .
2020-05-12 11:02:44 +03:00
* Or
* - RCU hlist traversal under disabling preempt ( breakpoint handlers )
*/
2005-04-17 02:20:36 +04:00
static struct hlist_head kprobe_table [ KPROBE_TABLE_SIZE ] ;
2021-09-14 17:39:34 +03:00
/* NOTE: change this value only with 'kprobe_mutex' held */
2009-04-07 06:01:01 +04:00
static bool kprobes_all_disarmed ;
2007-05-08 11:34:16 +04:00
2021-09-14 17:39:34 +03:00
/* This protects 'kprobe_table' and 'optimizing_list' */
2010-10-25 17:18:01 +04:00
static DEFINE_MUTEX ( kprobe_mutex ) ;
2021-09-14 17:39:34 +03:00
static DEFINE_PER_CPU ( struct kprobe * , kprobe_instance ) ;
2008-07-25 12:46:04 +04:00
2017-04-19 15:51:01 +03:00
/*
 * Default symbol resolver: look 'name' up through kallsyms and return the
 * result as an opcode pointer. The second argument is ignored here.
 * Declared __weak, presumably so architecture code can supply its own version.
 */
kprobe_opcode_t * __weak kprobe_lookup_name(const char *name,
					    unsigned int __unused)
{
	return (kprobe_opcode_t *)kallsyms_lookup_name(name);
}
2021-09-14 17:39:34 +03:00
/*
 * Blacklist: a list of 'struct kprobe_blacklist_entry' recording the
 * places that kprobes are not allowed to probe.
 */
static LIST_HEAD(kprobe_blacklist);
2008-04-28 13:14:26 +04:00
2006-01-10 07:52:41 +03:00
# ifdef __ARCH_WANT_KPROBES_INSN_SLOT
2005-06-28 02:17:01 +04:00
/*
2021-09-14 17:39:34 +03:00
* ' kprobe : : ainsn . insn ' points to the copy of the instruction to be
2005-06-28 02:17:01 +04:00
* single - stepped . x86_64 , POWER4 and above have no - exec support and
* stepping on the instruction on a vmalloced / kmalloced / data page
* is a recipe for disaster
*/
struct kprobe_insn_page {
2009-07-01 01:08:14 +04:00
struct list_head list ;
2005-06-28 02:17:01 +04:00
kprobe_opcode_t * insns ; /* Page of instruction slots */
2013-09-12 01:24:13 +04:00
struct kprobe_insn_cache * cache ;
2005-06-28 02:17:01 +04:00
int nused ;
2006-12-07 07:38:11 +03:00
int ngarbage ;
2010-02-25 16:33:59 +03:00
char slot_used [ ] ;
2005-06-28 02:17:01 +04:00
} ;
2010-02-25 16:33:59 +03:00
/* Allocation size of a kprobe_insn_page that tracks 'slots' slot states */
#define KPROBE_INSN_PAGE_SIZE(slots)				\
	(offsetof(struct kprobe_insn_page, slot_used) +		\
	 (sizeof(char) * (slots)))
static int slots_per_page ( struct kprobe_insn_cache * c )
{
return PAGE_SIZE / ( c - > insn_size * sizeof ( kprobe_opcode_t ) ) ;
}
2007-01-31 01:36:06 +03:00
/* State of one instruction slot, stored in kprobe_insn_page::slot_used[] */
enum kprobe_slot_state {
	SLOT_CLEAN = 0,		/* free, may be handed out */
	SLOT_DIRTY = 1,		/* released as garbage, not yet collected */
	SLOT_USED = 2,		/* currently handed out */
};
2017-08-18 11:24:00 +03:00
/* Default page allocator for instruction slots; returns what module_alloc() gives. */
void __weak *alloc_insn_page(void)
{
	/*
	 * module_alloc() is used so that the page lands within +/- 2GB of
	 * the kernel image and the loaded module images. Most architectures
	 * require this (e.g. x86-64 needs it to handle the %rip-relative
	 * fixups).
	 */
	return module_alloc(PAGE_SIZE);
}
2021-07-01 04:56:31 +03:00
/* Release a slot page through module_memfree(), the counterpart of alloc_insn_page(). */
static void free_insn_page(void *page)
{
	module_memfree(page);
}
2013-09-12 01:24:11 +04:00
struct kprobe_insn_cache kprobe_insn_slots = {
. mutex = __MUTEX_INITIALIZER ( kprobe_insn_slots . mutex ) ,
2013-09-12 01:24:13 +04:00
. alloc = alloc_insn_page ,
. free = free_insn_page ,
2020-05-28 11:00:58 +03:00
. sym = KPROBE_INSN_PAGE_SYM ,
2010-02-25 16:33:59 +03:00
. pages = LIST_HEAD_INIT ( kprobe_insn_slots . pages ) ,
. insn_size = MAX_INSN_SIZE ,
. nr_garbage = 0 ,
} ;
2014-04-17 12:17:54 +04:00
static int collect_garbage_slots ( struct kprobe_insn_cache * c ) ;
2006-12-07 07:38:11 +03:00
2005-06-28 02:17:01 +04:00
/**
2009-01-07 01:41:50 +03:00
* __get_insn_slot ( ) - Find a slot on an executable page for an instruction .
2005-06-28 02:17:01 +04:00
* We allocate an executable page if there ' s no room on existing ones .
*/
2014-04-17 12:17:54 +04:00
kprobe_opcode_t * __get_insn_slot ( struct kprobe_insn_cache * c )
2005-06-28 02:17:01 +04:00
{
struct kprobe_insn_page * kip ;
2013-09-12 01:24:11 +04:00
kprobe_opcode_t * slot = NULL ;
2005-06-28 02:17:01 +04:00
2017-01-08 17:58:09 +03:00
/* Since the slot array is not protected by rcu, we need a mutex */
2013-09-12 01:24:11 +04:00
mutex_lock ( & c - > mutex ) ;
2007-05-08 11:34:13 +04:00
retry :
2017-01-08 17:58:09 +03:00
rcu_read_lock ( ) ;
list_for_each_entry_rcu ( kip , & c - > pages , list ) {
2010-02-25 16:33:59 +03:00
if ( kip - > nused < slots_per_page ( c ) ) {
2005-06-28 02:17:01 +04:00
int i ;
2021-09-14 17:39:34 +03:00
2010-02-25 16:33:59 +03:00
for ( i = 0 ; i < slots_per_page ( c ) ; i + + ) {
2007-01-31 01:36:06 +03:00
if ( kip - > slot_used [ i ] = = SLOT_CLEAN ) {
kip - > slot_used [ i ] = SLOT_USED ;
2005-06-28 02:17:01 +04:00
kip - > nused + + ;
2013-09-12 01:24:11 +04:00
slot = kip - > insns + ( i * c - > insn_size ) ;
2017-01-08 17:58:09 +03:00
rcu_read_unlock ( ) ;
2013-09-12 01:24:11 +04:00
goto out ;
2005-06-28 02:17:01 +04:00
}
}
2010-02-25 16:33:59 +03:00
/* kip->nused is broken. Fix it. */
kip - > nused = slots_per_page ( c ) ;
WARN_ON ( 1 ) ;
2005-06-28 02:17:01 +04:00
}
}
2017-01-08 17:58:09 +03:00
rcu_read_unlock ( ) ;
2005-06-28 02:17:01 +04:00
2006-12-07 07:38:11 +03:00
/* If there are any garbage slots, collect it and try again. */
2010-02-25 16:33:59 +03:00
if ( c - > nr_garbage & & collect_garbage_slots ( c ) = = 0 )
2006-12-07 07:38:11 +03:00
goto retry ;
2010-02-25 16:33:59 +03:00
/* All out of space. Need to allocate a new page. */
kip = kmalloc ( KPROBE_INSN_PAGE_SIZE ( slots_per_page ( c ) ) , GFP_KERNEL ) ;
2007-05-08 11:34:13 +04:00
if ( ! kip )
2013-09-12 01:24:11 +04:00
goto out ;
2005-06-28 02:17:01 +04:00
2013-09-12 01:24:13 +04:00
kip - > insns = c - > alloc ( ) ;
2005-06-28 02:17:01 +04:00
if ( ! kip - > insns ) {
kfree ( kip ) ;
2013-09-12 01:24:11 +04:00
goto out ;
2005-06-28 02:17:01 +04:00
}
2009-07-01 01:08:14 +04:00
INIT_LIST_HEAD ( & kip - > list ) ;
2010-02-25 16:33:59 +03:00
memset ( kip - > slot_used , SLOT_CLEAN , slots_per_page ( c ) ) ;
2007-01-31 01:36:06 +03:00
kip - > slot_used [ 0 ] = SLOT_USED ;
2005-06-28 02:17:01 +04:00
kip - > nused = 1 ;
2006-12-07 07:38:11 +03:00
kip - > ngarbage = 0 ;
2013-09-12 01:24:13 +04:00
kip - > cache = c ;
2017-01-08 17:58:09 +03:00
list_add_rcu ( & kip - > list , & c - > pages ) ;
2013-09-12 01:24:11 +04:00
slot = kip - > insns ;
2020-05-12 15:19:11 +03:00
/* Record the perf ksymbol register event after adding the page */
perf_event_ksymbol ( PERF_RECORD_KSYMBOL_TYPE_OOL , ( unsigned long ) kip - > insns ,
PAGE_SIZE , false , c - > sym ) ;
2013-09-12 01:24:11 +04:00
out :
mutex_unlock ( & c - > mutex ) ;
return slot ;
2009-01-07 01:41:50 +03:00
}
2021-09-14 17:40:16 +03:00
/* Return true if all garbages are collected, otherwise false. */
static bool collect_one_slot ( struct kprobe_insn_page * kip , int idx )
2006-12-07 07:38:11 +03:00
{
2007-01-31 01:36:06 +03:00
kip - > slot_used [ idx ] = SLOT_CLEAN ;
2006-12-07 07:38:11 +03:00
kip - > nused - - ;
if ( kip - > nused = = 0 ) {
/*
* Page is no longer in use . Free it unless
* it ' s the last one . We keep the last one
* so as not to have to set it up again the
* next time somebody inserts a probe .
*/
2010-02-25 16:33:59 +03:00
if ( ! list_is_singular ( & kip - > list ) ) {
2020-05-12 15:19:11 +03:00
/*
* Record perf ksymbol unregister event before removing
* the page .
*/
perf_event_ksymbol ( PERF_RECORD_KSYMBOL_TYPE_OOL ,
( unsigned long ) kip - > insns , PAGE_SIZE , true ,
kip - > cache - > sym ) ;
2017-01-08 17:58:09 +03:00
list_del_rcu ( & kip - > list ) ;
synchronize_rcu ( ) ;
2013-09-12 01:24:13 +04:00
kip - > cache - > free ( kip - > insns ) ;
2006-12-07 07:38:11 +03:00
kfree ( kip ) ;
}
2021-09-14 17:40:16 +03:00
return true ;
2006-12-07 07:38:11 +03:00
}
2021-09-14 17:40:16 +03:00
return false ;
2006-12-07 07:38:11 +03:00
}
2014-04-17 12:17:54 +04:00
static int collect_garbage_slots ( struct kprobe_insn_cache * c )
2006-12-07 07:38:11 +03:00
{
2009-07-01 01:08:14 +04:00
struct kprobe_insn_page * kip , * next ;
2006-12-07 07:38:11 +03:00
2010-02-03 00:49:04 +03:00
/* Ensure no-one is interrupted on the garbages */
2018-11-07 06:04:39 +03:00
synchronize_rcu ( ) ;
2006-12-07 07:38:11 +03:00
2010-02-25 16:33:59 +03:00
list_for_each_entry_safe ( kip , next , & c - > pages , list ) {
2006-12-07 07:38:11 +03:00
int i ;
2021-09-14 17:39:34 +03:00
2006-12-07 07:38:11 +03:00
if ( kip - > ngarbage = = 0 )
continue ;
kip - > ngarbage = 0 ; /* we will collect all garbages */
2010-02-25 16:33:59 +03:00
for ( i = 0 ; i < slots_per_page ( c ) ; i + + ) {
2017-01-08 17:58:09 +03:00
if ( kip - > slot_used [ i ] = = SLOT_DIRTY & & collect_one_slot ( kip , i ) )
2006-12-07 07:38:11 +03:00
break ;
}
}
2010-02-25 16:33:59 +03:00
c - > nr_garbage = 0 ;
2006-12-07 07:38:11 +03:00
return 0 ;
}
2014-04-17 12:17:54 +04:00
/*
 * Give 'slot' back to cache 'c'.
 *
 * With 'dirty' set the slot is only marked as garbage, and garbage is
 * collected once more than a page worth of slots has piled up. Otherwise
 * the slot is cleaned right away. Warns if the slot belongs to no page of
 * the cache or is not currently marked as used.
 */
void __free_insn_slot(struct kprobe_insn_cache *c,
		      kprobe_opcode_t *slot, int dirty)
{
	struct kprobe_insn_page *kip;
	struct kprobe_insn_page *owner = NULL;
	long idx;

	mutex_lock(&c->mutex);
	rcu_read_lock();
	list_for_each_entry_rcu(kip, &c->pages, list) {
		idx = ((long)slot - (long)kip->insns) /
			(c->insn_size * sizeof(kprobe_opcode_t));
		if (idx >= 0 && idx < slots_per_page(c)) {
			owner = kip;
			break;
		}
	}
	/* Could not find this slot. */
	if (!owner)
		WARN_ON(1);
	rcu_read_unlock();

	/* Mark and sweep: this may sleep */
	if (owner) {
		/* Check double free */
		WARN_ON(owner->slot_used[idx] != SLOT_USED);
		if (dirty) {
			owner->slot_used[idx] = SLOT_DIRTY;
			owner->ngarbage++;
			if (++c->nr_garbage > slots_per_page(c))
				collect_garbage_slots(c);
		} else {
			collect_one_slot(owner, idx);
		}
	}
	mutex_unlock(&c->mutex);
}
2007-05-08 11:34:13 +04:00
2017-01-08 17:58:09 +03:00
/*
* Check given address is on the page of kprobe instruction slots .
* This will be used for checking whether the address on a stack
* is on a text area or not .
*/
bool __is_insn_slot_addr ( struct kprobe_insn_cache * c , unsigned long addr )
{
struct kprobe_insn_page * kip ;
bool ret = false ;
rcu_read_lock ( ) ;
list_for_each_entry_rcu ( kip , & c - > pages , list ) {
if ( addr > = ( unsigned long ) kip - > insns & &
addr < ( unsigned long ) kip - > insns + PAGE_SIZE ) {
ret = true ;
break ;
}
}
rcu_read_unlock ( ) ;
return ret ;
}
2020-05-28 11:00:58 +03:00
int kprobe_cache_get_kallsym ( struct kprobe_insn_cache * c , unsigned int * symnum ,
unsigned long * value , char * type , char * sym )
{
struct kprobe_insn_page * kip ;
int ret = - ERANGE ;
rcu_read_lock ( ) ;
list_for_each_entry_rcu ( kip , & c - > pages , list ) {
if ( ( * symnum ) - - )
continue ;
2021-09-14 17:39:34 +03:00
strscpy ( sym , c - > sym , KSYM_NAME_LEN ) ;
2020-05-28 11:00:58 +03:00
* type = ' t ' ;
* value = ( unsigned long ) kip - > insns ;
ret = 0 ;
break ;
}
rcu_read_unlock ( ) ;
return ret ;
}
2010-02-25 16:34:07 +03:00
# ifdef CONFIG_OPTPROBES
2021-05-13 12:07:51 +03:00
/* Default allocator for optimized-probe slot pages: same as alloc_insn_page(). */
void __weak *alloc_optinsn_page(void)
{
	return alloc_insn_page();
}
/* Default release routine for optimized-probe slot pages: same as free_insn_page(). */
void __weak free_optinsn_page(void *page)
{
	free_insn_page(page);
}
2010-02-25 16:34:07 +03:00
/* For optimized_kprobe buffer */
2013-09-12 01:24:11 +04:00
struct kprobe_insn_cache kprobe_optinsn_slots = {
. mutex = __MUTEX_INITIALIZER ( kprobe_optinsn_slots . mutex ) ,
2021-05-13 12:07:51 +03:00
. alloc = alloc_optinsn_page ,
. free = free_optinsn_page ,
2020-05-28 11:00:58 +03:00
. sym = KPROBE_OPTINSN_PAGE_SYM ,
2010-02-25 16:34:07 +03:00
. pages = LIST_HEAD_INIT ( kprobe_optinsn_slots . pages ) ,
/* .insn_size is initialized later */
. nr_garbage = 0 ,
} ;
# endif
2006-01-10 07:52:41 +03:00
# endif
2005-06-28 02:17:01 +04:00
2005-11-07 12:00:07 +03:00
/* We have preemption disabled.. so it is safe to use __ versions */
static inline void set_kprobe_instance ( struct kprobe * kp )
{
2010-12-06 20:16:25 +03:00
__this_cpu_write ( kprobe_instance , kp ) ;
2005-11-07 12:00:07 +03:00
}
static inline void reset_kprobe_instance ( void )
{
2010-12-06 20:16:25 +03:00
__this_cpu_write ( kprobe_instance , NULL ) ;
2005-11-07 12:00:07 +03:00
}
2005-11-07 12:00:13 +03:00
/*
* This routine is called either :
2021-09-14 17:39:34 +03:00
* - under the ' kprobe_mutex ' - during kprobe_ [ un ] register ( ) .
* OR
* - with preemption disabled - from architecture specific code .
2005-11-07 12:00:13 +03:00
*/
2014-04-17 12:18:21 +04:00
struct kprobe * get_kprobe ( void * addr )
2005-04-17 02:20:36 +04:00
{
struct hlist_head * head ;
2005-11-07 12:00:13 +03:00
struct kprobe * p ;
2005-04-17 02:20:36 +04:00
head = & kprobe_table [ hash_ptr ( addr , KPROBE_HASH_BITS ) ] ;
2020-05-12 11:02:33 +03:00
hlist_for_each_entry_rcu ( p , head , hlist ,
lockdep_is_held ( & kprobe_mutex ) ) {
2005-04-17 02:20:36 +04:00
if ( p - > addr = = addr )
return p ;
}
2010-02-25 16:34:07 +03:00
2005-04-17 02:20:36 +04:00
return NULL ;
}
2014-04-17 12:18:21 +04:00
NOKPROBE_SYMBOL ( get_kprobe ) ;
2005-04-17 02:20:36 +04:00
2014-04-17 12:18:21 +04:00
static int aggr_pre_handler ( struct kprobe * p , struct pt_regs * regs ) ;
2010-02-25 16:34:07 +03:00
2021-09-14 17:39:34 +03:00
/* Return true if 'p' is an aggregator */
2021-09-14 17:40:16 +03:00
static inline bool kprobe_aggrprobe ( struct kprobe * p )
2010-02-25 16:34:07 +03:00
{
return p - > pre_handler = = aggr_pre_handler ;
}
2021-09-14 17:39:34 +03:00
/* Return true if 'p' is unused */
2021-09-14 17:40:16 +03:00
static inline bool kprobe_unused ( struct kprobe * p )
2010-12-03 12:54:09 +03:00
{
return kprobe_aggrprobe ( p ) & & kprobe_disabled ( p ) & &
list_empty ( & p - > list ) ;
}
2021-09-14 17:39:34 +03:00
/* Keep all fields in the kprobe consistent. */
2010-12-03 12:53:50 +03:00
static inline void copy_kprobe ( struct kprobe * ap , struct kprobe * p )
2010-02-25 16:34:07 +03:00
{
2010-12-03 12:53:50 +03:00
memcpy ( & p - > opcode , & ap - > opcode , sizeof ( kprobe_opcode_t ) ) ;
memcpy ( & p - > ainsn , & ap - > ainsn , sizeof ( struct arch_specific_insn ) ) ;
2010-02-25 16:34:07 +03:00
}
# ifdef CONFIG_OPTPROBES
2021-09-14 17:39:34 +03:00
/* NOTE: This is protected by 'kprobe_mutex'. */
2010-02-25 16:34:15 +03:00
static bool kprobes_allow_optimization ;
2010-02-25 16:34:07 +03:00
/*
2021-09-14 17:39:34 +03:00
* Call all ' kprobe : : pre_handler ' on the list , but ignores its return value .
2010-02-25 16:34:07 +03:00
* This must be called from arch - dep optimized caller .
*/
2014-04-17 12:18:21 +04:00
void opt_pre_handler ( struct kprobe * p , struct pt_regs * regs )
2010-02-25 16:34:07 +03:00
{
struct kprobe * kp ;
list_for_each_entry_rcu ( kp , & p - > list , list ) {
if ( kp - > pre_handler & & likely ( ! kprobe_disabled ( kp ) ) ) {
set_kprobe_instance ( kp ) ;
2017-10-17 11:18:34 +03:00
kp - > pre_handler ( kp , regs ) ;
2010-02-25 16:34:07 +03:00
}
reset_kprobe_instance ( ) ;
}
}
2014-04-17 12:18:21 +04:00
NOKPROBE_SYMBOL ( opt_pre_handler ) ;
2010-02-25 16:34:07 +03:00
2010-12-03 12:54:09 +03:00
/* Free optimized instructions and optimized_kprobe */
2014-04-17 12:17:54 +04:00
static void free_aggr_kprobe ( struct kprobe * p )
2010-12-03 12:54:09 +03:00
{
struct optimized_kprobe * op ;
op = container_of ( p , struct optimized_kprobe , kp ) ;
arch_remove_optimized_kprobe ( op ) ;
arch_remove_kprobe ( p ) ;
kfree ( op ) ;
}
2021-09-14 17:39:34 +03:00
/* Return true if the kprobe is ready for optimization. */
2010-02-25 16:34:07 +03:00
static inline int kprobe_optready ( struct kprobe * p )
{
struct optimized_kprobe * op ;
if ( kprobe_aggrprobe ( p ) ) {
op = container_of ( p , struct optimized_kprobe , kp ) ;
return arch_prepared_optinsn ( & op - > optinsn ) ;
}
return 0 ;
}
2021-09-14 17:39:34 +03:00
/* Return true if the kprobe is disarmed. Note: p must be on hash list */
x86/kprobes: Fix arch_check_optimized_kprobe check within optimized_kprobe range
When arch_prepare_optimized_kprobe calculates the jump destination address,
it copies the original instructions from the jmp-optimized kprobe (see
__recover_optprobed_insn), and the address is calculated based on the length
of the original instruction.
arch_check_optimized_kprobe does not check KPROBE_FLAG_OPTIMIZED when
checking whether a jmp-optimized kprobe exists.
As a result, setup_detour_execution may jump to a range that has been
overwritten by jump destination address, resulting in an inval opcode error.
For example, assume that two kprobes are registered at addresses
<func+9> and <func+11> in the "func" function.
The original code of "func" function is as follows:
0xffffffff816cb5e9 <+9>: push %r12
0xffffffff816cb5eb <+11>: xor %r12d,%r12d
0xffffffff816cb5ee <+14>: test %rdi,%rdi
0xffffffff816cb5f1 <+17>: setne %r12b
0xffffffff816cb5f5 <+21>: push %rbp
1.Register the kprobe for <func+11>, assume that is kp1, corresponding optimized_kprobe is op1.
After the optimization, "func" code changes to:
0xffffffff816cc079 <+9>: push %r12
0xffffffff816cc07b <+11>: jmp 0xffffffffa0210000
0xffffffff816cc080 <+16>: incl 0xf(%rcx)
0xffffffff816cc083 <+19>: xchg %eax,%ebp
0xffffffff816cc084 <+20>: (bad)
0xffffffff816cc085 <+21>: push %rbp
Now op1->flags == KPROBE_FLAG_OPTIMIZED;
2. Register the kprobe for <func+9>, assume that is kp2, corresponding optimized_kprobe is op2.
register_kprobe(kp2)
register_aggr_kprobe
alloc_aggr_kprobe
__prepare_optimized_kprobe
arch_prepare_optimized_kprobe
__recover_optprobed_insn // copy original bytes from kp1->optinsn.copied_insn,
// jump address = <func+14>
3. disable kp1:
disable_kprobe(kp1)
__disable_kprobe
...
if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
ret = disarm_kprobe(orig_p, true) // add op1 in unoptimizing_list, not unoptimized
orig_p->flags |= KPROBE_FLAG_DISABLED; // op1->flags == KPROBE_FLAG_OPTIMATED | KPROBE_FLAG_DISABLED
...
4. unregister kp2
__unregister_kprobe_top
...
if (!kprobe_disabled(ap) && !kprobes_all_disarmed) {
optimize_kprobe(op)
...
if (arch_check_optimized_kprobe(op) < 0) // because op1 has KPROBE_FLAG_DISABLED, here not return
return;
p->kp.flags |= KPROBE_FLAG_OPTIMIZED; // now op2 has KPROBE_FLAG_OPTIMIZED
}
"func" code now is:
0xffffffff816cc079 <+9>: int3
0xffffffff816cc07a <+10>: push %rsp
0xffffffff816cc07b <+11>: jmp 0xffffffffa0210000
0xffffffff816cc080 <+16>: incl 0xf(%rcx)
0xffffffff816cc083 <+19>: xchg %eax,%ebp
0xffffffff816cc084 <+20>: (bad)
0xffffffff816cc085 <+21>: push %rbp
5. if call "func", int3 handler call setup_detour_execution:
if (p->flags & KPROBE_FLAG_OPTIMIZED) {
...
regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
...
}
The code for the destination address is
0xffffffffa021072c: push %r12
0xffffffffa021072e: xor %r12d,%r12d
0xffffffffa0210731: jmp 0xffffffff816cb5ee <func+14>
However, <func+14> is not a valid start instruction address. As a result, an error occurs.
Link: https://lore.kernel.org/all/20230216034247.32348-3-yangjihong1@huawei.com/
Fixes: f66c0447cca1 ("kprobes: Set unoptimized flag after unoptimizing code")
Signed-off-by: Yang Jihong <yangjihong1@huawei.com>
Cc: stable@vger.kernel.org
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-02-21 02:49:16 +03:00
bool kprobe_disarmed ( struct kprobe * p )
2010-12-03 12:54:09 +03:00
{
struct optimized_kprobe * op ;
/* If kprobe is not aggr/opt probe, just return kprobe is disabled */
if ( ! kprobe_aggrprobe ( p ) )
return kprobe_disabled ( p ) ;
op = container_of ( p , struct optimized_kprobe , kp ) ;
return kprobe_disabled ( p ) & & list_empty ( & op - > list ) ;
}
2021-09-14 17:39:34 +03:00
/* Return true if the probe is queued on (un)optimizing lists */
2021-09-14 17:40:16 +03:00
static bool kprobe_queued ( struct kprobe * p )
2010-12-03 12:54:09 +03:00
{
struct optimized_kprobe * op ;
if ( kprobe_aggrprobe ( p ) ) {
op = container_of ( p , struct optimized_kprobe , kp ) ;
if ( ! list_empty ( & op - > list ) )
2021-09-14 17:40:16 +03:00
return true ;
2010-12-03 12:54:09 +03:00
}
2021-09-14 17:40:16 +03:00
return false ;
2010-12-03 12:54:09 +03:00
}
2010-02-25 16:34:07 +03:00
/*
* Return an optimized kprobe whose optimizing code replaces
2021-09-14 17:39:34 +03:00
* instructions including ' addr ' ( exclude breakpoint ) .
2010-02-25 16:34:07 +03:00
*/
2021-09-14 17:40:07 +03:00
static struct kprobe * get_optimized_kprobe ( kprobe_opcode_t * addr )
2010-02-25 16:34:07 +03:00
{
int i ;
struct kprobe * p = NULL ;
struct optimized_kprobe * op ;
/* Don't check i == 0, since that is a breakpoint case. */
2021-09-14 17:40:07 +03:00
for ( i = 1 ; ! p & & i < MAX_OPTIMIZED_LENGTH / sizeof ( kprobe_opcode_t ) ; i + + )
p = get_kprobe ( addr - i ) ;
2010-02-25 16:34:07 +03:00
if ( p & & kprobe_optready ( p ) ) {
op = container_of ( p , struct optimized_kprobe , kp ) ;
if ( arch_within_optimized_kprobe ( op , addr ) )
return p ;
}
return NULL ;
}
2021-09-14 17:39:34 +03:00
/* Optimization staging lists, protected by 'kprobe_mutex' */
static LIST_HEAD(optimizing_list);	/* probes waiting to be optimized */
static LIST_HEAD(unoptimizing_list);	/* probes waiting to be unoptimized */
static LIST_HEAD(freeing_list);		/* unoptimized probes handed over for cleanup */

static void kprobe_optimizer(struct work_struct *work);
static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
/* Delay handed to schedule_delayed_work() when kicking the optimizer */
#define OPTIMIZE_DELAY 5
2010-12-03 12:54:03 +03:00
/*
* Optimize ( replace a breakpoint with a jump ) kprobes listed on
2021-09-14 17:39:34 +03:00
* ' optimizing_list ' .
2010-12-03 12:54:03 +03:00
*/
2014-04-17 12:17:54 +04:00
static void do_optimize_kprobes ( void )
2010-02-25 16:34:07 +03:00
{
2019-08-12 21:43:02 +03:00
lockdep_assert_held ( & text_mutex ) ;
2010-02-25 16:34:07 +03:00
/*
2021-09-14 17:39:34 +03:00
* The optimization / unoptimization refers ' online_cpus ' via
* stop_machine ( ) and cpu - hotplug modifies the ' online_cpus ' .
* And same time , ' text_mutex ' will be held in cpu - hotplug and here .
* This combination can cause a deadlock ( cpu - hotplug tries to lock
* ' text_mutex ' but stop_machine ( ) can not be done because
* the ' online_cpus ' has been changed )
* To avoid this deadlock , caller must have locked cpu - hotplug
* for preventing cpu - hotplug outside of ' text_mutex ' locking .
2010-02-25 16:34:07 +03:00
*/
2017-05-24 11:15:36 +03:00
lockdep_assert_cpus_held ( ) ;
/* Optimization never be done when disarmed */
if ( kprobes_all_disarmed | | ! kprobes_allow_optimization | |
list_empty ( & optimizing_list ) )
return ;
2010-12-03 12:54:28 +03:00
arch_optimize_kprobes ( & optimizing_list ) ;
2010-12-03 12:54:03 +03:00
}
2010-12-03 12:54:09 +03:00
/*
* Unoptimize ( replace a jump with a breakpoint and remove the breakpoint
2021-09-14 17:39:34 +03:00
* if need ) kprobes listed on ' unoptimizing_list ' .
2010-12-03 12:54:09 +03:00
*/
2014-04-17 12:17:54 +04:00
static void do_unoptimize_kprobes ( void )
2010-12-03 12:54:09 +03:00
{
struct optimized_kprobe * op , * tmp ;
2019-08-12 21:43:02 +03:00
lockdep_assert_held ( & text_mutex ) ;
2017-05-24 11:15:36 +03:00
/* See comment in do_optimize_kprobes() */
lockdep_assert_cpus_held ( ) ;
2023-02-21 02:49:16 +03:00
if ( ! list_empty ( & unoptimizing_list ) )
arch_unoptimize_kprobes ( & unoptimizing_list , & freeing_list ) ;
2010-12-03 12:54:09 +03:00
2023-02-21 02:49:16 +03:00
/* Loop on 'freeing_list' for disarming and removing from kprobe hash list */
2013-05-22 13:34:09 +04:00
list_for_each_entry_safe ( op , tmp , & freeing_list , list ) {
2019-11-27 08:57:04 +03:00
/* Switching from detour code to origin */
op - > kp . flags & = ~ KPROBE_FLAG_OPTIMIZED ;
2023-02-21 02:49:16 +03:00
/* Disarm probes if marked disabled and not gone */
if ( kprobe_disabled ( & op - > kp ) & & ! kprobe_gone ( & op - > kp ) )
2010-12-03 12:54:09 +03:00
arch_disarm_kprobe ( & op - > kp ) ;
if ( kprobe_unused ( & op - > kp ) ) {
/*
* Remove unused probes from hash list . After waiting
* for synchronization , these probes are reclaimed .
2021-09-14 17:39:34 +03:00
* ( reclaiming is done by do_free_cleaned_kprobes ( ) . )
2010-12-03 12:54:09 +03:00
*/
hlist_del_rcu ( & op - > kp . hlist ) ;
} else
list_del_init ( & op - > list ) ;
}
}
2021-09-14 17:39:34 +03:00
/* Reclaim all kprobes on the 'freeing_list' */
2014-04-17 12:17:54 +04:00
static void do_free_cleaned_kprobes ( void )
2010-12-03 12:54:09 +03:00
{
struct optimized_kprobe * op , * tmp ;
2013-05-22 13:34:09 +04:00
list_for_each_entry_safe ( op , tmp , & freeing_list , list ) {
2010-12-03 12:54:09 +03:00
list_del_init ( & op - > list ) ;
2018-09-11 13:21:09 +03:00
if ( WARN_ON_ONCE ( ! kprobe_unused ( & op - > kp ) ) ) {
/*
* This must not happen , but if there is a kprobe
* still in use , keep it on kprobes hash list .
*/
continue ;
}
2010-12-03 12:54:09 +03:00
free_aggr_kprobe ( & op - > kp ) ;
}
}
/* Start optimizer after OPTIMIZE_DELAY passed */
2014-04-17 12:17:54 +04:00
static void kick_kprobe_optimizer ( void )
2010-12-03 12:54:09 +03:00
{
2012-12-22 05:57:00 +04:00
schedule_delayed_work ( & optimizing_work , OPTIMIZE_DELAY ) ;
2010-12-03 12:54:09 +03:00
}
2010-12-03 12:54:03 +03:00
/* Kprobe jump optimizer */
2014-04-17 12:17:54 +04:00
static void kprobe_optimizer ( struct work_struct * work )
2010-12-03 12:54:03 +03:00
{
2012-06-05 14:28:14 +04:00
mutex_lock ( & kprobe_mutex ) ;
2017-05-24 11:15:36 +03:00
cpus_read_lock ( ) ;
2019-08-12 21:43:02 +03:00
mutex_lock ( & text_mutex ) ;
2010-12-03 12:54:03 +03:00
/*
2010-12-03 12:54:09 +03:00
* Step 1 : Unoptimize kprobes and collect cleaned ( unused and disarmed )
* kprobes before waiting for quiesence period .
*/
2013-05-22 13:34:09 +04:00
do_unoptimize_kprobes ( ) ;
2010-12-03 12:54:09 +03:00
/*
2017-10-20 02:43:39 +03:00
* Step 2 : Wait for quiesence period to ensure all potentially
* preempted tasks to have normally scheduled . Because optprobe
* may modify multiple instructions , there is a chance that Nth
* instruction is preempted . In that case , such tasks can return
* to 2 nd - Nth byte of jump instruction . This wait is for avoiding it .
* Note that on non - preemptive kernel , this is transparently converted
* to synchronoze_sched ( ) to wait for all interrupts to have completed .
2010-12-03 12:54:03 +03:00
*/
2017-10-20 02:43:39 +03:00
synchronize_rcu_tasks ( ) ;
2010-12-03 12:54:03 +03:00
2010-12-03 12:54:09 +03:00
/* Step 3: Optimize kprobes after quiesence period */
2010-12-03 12:54:03 +03:00
do_optimize_kprobes ( ) ;
2010-12-03 12:54:09 +03:00
/* Step 4: Free cleaned kprobes after quiesence period */
2013-05-22 13:34:09 +04:00
do_free_cleaned_kprobes ( ) ;
2010-12-03 12:54:09 +03:00
2019-08-12 21:43:02 +03:00
mutex_unlock ( & text_mutex ) ;
2017-05-24 11:15:36 +03:00
cpus_read_unlock ( ) ;
2010-12-03 12:54:09 +03:00
2010-12-03 12:54:28 +03:00
/* Step 5: Kick optimizer again if needed */
2010-12-03 12:54:34 +03:00
if ( ! list_empty ( & optimizing_list ) | | ! list_empty ( & unoptimizing_list ) )
2010-12-03 12:54:28 +03:00
kick_kprobe_optimizer ( ) ;
2020-05-12 11:02:56 +03:00
mutex_unlock ( & kprobe_mutex ) ;
2010-12-03 12:54:09 +03:00
}
/* Wait for completing optimization and unoptimization */
2017-05-17 11:19:49 +03:00
void wait_for_kprobe_optimizer ( void )
2010-12-03 12:54:09 +03:00
{
2012-12-22 05:57:00 +04:00
mutex_lock ( & kprobe_mutex ) ;
while ( ! list_empty ( & optimizing_list ) | | ! list_empty ( & unoptimizing_list ) ) {
mutex_unlock ( & kprobe_mutex ) ;
2021-09-14 17:39:34 +03:00
/* This will also make 'optimizing_work' execute immmediately */
2012-12-22 05:57:00 +04:00
flush_delayed_work ( & optimizing_work ) ;
2021-09-14 17:39:34 +03:00
/* 'optimizing_work' might not have been queued yet, relax */
2012-12-22 05:57:00 +04:00
cpu_relax ( ) ;
mutex_lock ( & kprobe_mutex ) ;
}
mutex_unlock ( & kprobe_mutex ) ;
2010-02-25 16:34:07 +03:00
}
2023-02-21 02:49:16 +03:00
bool optprobe_queued_unopt ( struct optimized_kprobe * op )
2020-01-07 17:42:24 +03:00
{
struct optimized_kprobe * _op ;
list_for_each_entry ( _op , & unoptimizing_list , list ) {
if ( op = = _op )
return true ;
}
return false ;
}
2010-02-25 16:34:07 +03:00
/* Optimize kprobe if p is ready to be optimized */
2014-04-17 12:17:54 +04:00
static void optimize_kprobe ( struct kprobe * p )
2010-02-25 16:34:07 +03:00
{
struct optimized_kprobe * op ;
/* Check if the kprobe is disabled or not ready for optimization. */
2010-02-25 16:34:15 +03:00
if ( ! kprobe_optready ( p ) | | ! kprobes_allow_optimization | |
2010-02-25 16:34:07 +03:00
( kprobe_disabled ( p ) | | kprobes_all_disarmed ) )
return ;
2021-09-14 17:39:34 +03:00
/* kprobes with 'post_handler' can not be optimized */
2018-06-19 19:10:27 +03:00
if ( p - > post_handler )
2010-02-25 16:34:07 +03:00
return ;
op = container_of ( p , struct optimized_kprobe , kp ) ;
/* Check there is no other kprobes at the optimized instructions */
if ( arch_check_optimized_kprobe ( op ) < 0 )
return ;
/* Check if it is already optimized. */
2020-01-07 17:42:24 +03:00
if ( op - > kp . flags & KPROBE_FLAG_OPTIMIZED ) {
if ( optprobe_queued_unopt ( op ) ) {
/* This is under unoptimizing. Just dequeue the probe */
list_del_init ( & op - > list ) ;
}
2010-02-25 16:34:07 +03:00
return ;
2020-01-07 17:42:24 +03:00
}
2010-02-25 16:34:07 +03:00
op - > kp . flags | = KPROBE_FLAG_OPTIMIZED ;
2010-12-03 12:54:09 +03:00
2021-09-14 17:39:34 +03:00
/*
* On the ' unoptimizing_list ' and ' optimizing_list ' ,
* ' op ' must have OPTIMIZED flag
*/
2020-01-07 17:42:24 +03:00
if ( WARN_ON_ONCE ( ! list_empty ( & op - > list ) ) )
return ;
list_add ( & op - > list , & optimizing_list ) ;
kick_kprobe_optimizer ( ) ;
2010-12-03 12:54:09 +03:00
}
/* Short cut to direct unoptimizing */
2014-04-17 12:17:54 +04:00
static void force_unoptimize_kprobe ( struct optimized_kprobe * op )
2010-12-03 12:54:09 +03:00
{
2017-05-24 11:15:36 +03:00
lockdep_assert_cpus_held ( ) ;
2010-12-03 12:54:09 +03:00
arch_unoptimize_kprobe ( op ) ;
2019-11-27 08:57:04 +03:00
op - > kp . flags & = ~ KPROBE_FLAG_OPTIMIZED ;
2010-02-25 16:34:07 +03:00
}
/* Unoptimize a kprobe if p is optimized */
2014-04-17 12:17:54 +04:00
static void unoptimize_kprobe ( struct kprobe * p , bool force )
2010-02-25 16:34:07 +03:00
{
struct optimized_kprobe * op ;
2010-12-03 12:54:09 +03:00
if ( ! kprobe_aggrprobe ( p ) | | kprobe_disarmed ( p ) )
return ; /* This is not an optprobe nor optimized */
op = container_of ( p , struct optimized_kprobe , kp ) ;
2020-01-07 17:42:24 +03:00
if ( ! kprobe_optimized ( p ) )
2010-12-03 12:54:09 +03:00
return ;
if ( ! list_empty ( & op - > list ) ) {
2020-01-07 17:42:24 +03:00
if ( optprobe_queued_unopt ( op ) ) {
/* Queued in unoptimizing queue */
if ( force ) {
/*
* Forcibly unoptimize the kprobe here , and queue it
* in the freeing list for release afterwards .
*/
force_unoptimize_kprobe ( op ) ;
list_move ( & op - > list , & freeing_list ) ;
}
} else {
/* Dequeue from the optimizing queue */
list_del_init ( & op - > list ) ;
op - > kp . flags & = ~ KPROBE_FLAG_OPTIMIZED ;
}
2010-12-03 12:54:09 +03:00
return ;
}
2020-01-07 17:42:24 +03:00
2010-12-03 12:54:09 +03:00
/* Optimized kprobe case */
2020-01-07 17:42:24 +03:00
if ( force ) {
2010-12-03 12:54:09 +03:00
/* Forcibly update the code: this is a special case */
force_unoptimize_kprobe ( op ) ;
2020-01-07 17:42:24 +03:00
} else {
2010-12-03 12:54:09 +03:00
list_add ( & op - > list , & unoptimizing_list ) ;
kick_kprobe_optimizer ( ) ;
2010-02-25 16:34:07 +03:00
}
}
2010-12-03 12:54:16 +03:00
/* Cancel unoptimizing for reusing */
2018-09-11 13:20:40 +03:00
static int reuse_unused_kprobe ( struct kprobe * ap )
2010-12-03 12:54:16 +03:00
{
struct optimized_kprobe * op ;
/*
* Unused kprobe MUST be on the way of delayed unoptimizing ( means
* there is still a relative jump ) and disabled .
*/
op = container_of ( ap , struct optimized_kprobe , kp ) ;
2018-04-28 15:36:33 +03:00
WARN_ON_ONCE ( list_empty ( & op - > list ) ) ;
2010-12-03 12:54:16 +03:00
/* Enable the probe again */
ap - > flags & = ~ KPROBE_FLAG_DISABLED ;
2021-09-14 17:39:34 +03:00
/* Optimize it again. (remove from 'op->list') */
kprobes: Fix error check when reusing optimized probes
The following commit introduced a bug in one of our error paths:
819319fc9346 ("kprobes: Return error if we fail to reuse kprobe instead of BUG_ON()")
it missed to handle the return value of kprobe_optready() as
error-value. In reality, the kprobe_optready() returns a bool
result, so "true" case must be passed instead of 0.
This causes some errors on kprobe boot-time selftests on ARM:
[ ] Beginning kprobe tests...
[ ] Probe ARM code
[ ] kprobe
[ ] kretprobe
[ ] ARM instruction simulation
[ ] Check decoding tables
[ ] Run test cases
[ ] FAIL: test_case_handler not run
[ ] FAIL: Test andge r10, r11, r14, asr r7
[ ] FAIL: Scenario 11
...
[ ] FAIL: Scenario 7
[ ] Total instruction simulation tests=1631, pass=1433 fail=198
[ ] kprobe tests failed
This can happen if an optimized probe is unregistered and next
kprobe is registered on same address until the previous probe
is not reclaimed.
If this happens, a hidden aggregated probe may be kept in memory,
and no new kprobe can probe same address. Also, in that case
register_kprobe() will return "1" instead of minus error value,
which can mislead caller logic.
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S . Miller <davem@davemloft.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Naveen N . Rao <naveen.n.rao@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org # v5.0+
Fixes: 819319fc9346 ("kprobes: Return error if we fail to reuse kprobe instead of BUG_ON()")
Link: http://lkml.kernel.org/r/155530808559.32517.539898325433642204.stgit@devnote2
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2019-04-15 09:01:25 +03:00
if ( ! kprobe_optready ( ap ) )
return - EINVAL ;
2018-09-11 13:20:40 +03:00
2010-12-03 12:54:16 +03:00
optimize_kprobe ( ap ) ;
2018-09-11 13:20:40 +03:00
return 0 ;
2010-12-03 12:54:16 +03:00
}
2010-02-25 16:34:07 +03:00
/* Remove optimized instructions */
2014-04-17 12:17:54 +04:00
static void kill_optimized_kprobe ( struct kprobe * p )
2010-02-25 16:34:07 +03:00
{
struct optimized_kprobe * op ;
op = container_of ( p , struct optimized_kprobe , kp ) ;
2010-12-03 12:54:09 +03:00
if ( ! list_empty ( & op - > list ) )
/* Dequeue from the (un)optimization queue */
2010-02-25 16:34:07 +03:00
list_del_init ( & op - > list ) ;
2010-12-03 12:54:09 +03:00
op - > kp . flags & = ~ KPROBE_FLAG_OPTIMIZED ;
2013-05-22 13:34:09 +04:00
if ( kprobe_unused ( p ) ) {
/*
2023-02-21 02:49:16 +03:00
* Unused kprobe is on unoptimizing or freeing list . We move it
* to freeing_list and let the kprobe_optimizer ( ) remove it from
* the kprobe hash list and free it .
2013-05-22 13:34:09 +04:00
*/
2023-02-21 02:49:16 +03:00
if ( optprobe_queued_unopt ( op ) )
list_move ( & op - > list , & freeing_list ) ;
2013-05-22 13:34:09 +04:00
}
2010-12-03 12:54:09 +03:00
/* Don't touch the code, because it is already freed. */
2010-02-25 16:34:07 +03:00
arch_remove_optimized_kprobe ( op ) ;
}
2017-04-19 15:52:25 +03:00
/*
 * Ask the architecture to prepare the optimized instructions of 'op' for
 * 'p'. Skipped for ftrace-based kprobes. The arch hook's return value is
 * not examined here.
 */
static inline
void __prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
{
	if (kprobe_ftrace(p))
		return;

	arch_prepare_optimized_kprobe(op, p);
}
2010-02-25 16:34:07 +03:00
/* Try to prepare optimized instructions */
2014-04-17 12:17:54 +04:00
static void prepare_optimized_kprobe ( struct kprobe * p )
2010-02-25 16:34:07 +03:00
{
struct optimized_kprobe * op ;
op = container_of ( p , struct optimized_kprobe , kp ) ;
2017-04-19 15:52:25 +03:00
__prepare_optimized_kprobe ( op , p ) ;
2010-02-25 16:34:07 +03:00
}
2021-09-14 17:39:34 +03:00
/* Allocate new optimized_kprobe and try to prepare optimized instructions. */
2014-04-17 12:17:54 +04:00
static struct kprobe * alloc_aggr_kprobe ( struct kprobe * p )
2010-02-25 16:34:07 +03:00
{
struct optimized_kprobe * op ;
op = kzalloc ( sizeof ( struct optimized_kprobe ) , GFP_KERNEL ) ;
if ( ! op )
return NULL ;
INIT_LIST_HEAD ( & op - > list ) ;
op - > kp . addr = p - > addr ;
2017-04-19 15:52:25 +03:00
__prepare_optimized_kprobe ( op , p ) ;
2010-02-25 16:34:07 +03:00
return & op - > kp ;
}
2014-04-17 12:17:54 +04:00
static void init_aggr_kprobe ( struct kprobe * ap , struct kprobe * p ) ;
2010-02-25 16:34:07 +03:00
/*
2021-09-14 17:39:34 +03:00
* Prepare an optimized_kprobe and optimize it .
* NOTE : ' p ' must be a normal registered kprobe .
2010-02-25 16:34:07 +03:00
*/
2014-04-17 12:17:54 +04:00
static void try_to_optimize_kprobe ( struct kprobe * p )
2010-02-25 16:34:07 +03:00
{
struct kprobe * ap ;
struct optimized_kprobe * op ;
2021-09-14 17:39:34 +03:00
/* Impossible to optimize ftrace-based kprobe. */
2012-06-05 14:28:32 +04:00
if ( kprobe_ftrace ( p ) )
return ;
2021-09-14 17:39:34 +03:00
/* For preparing optimization, jump_label_text_reserved() is called. */
2017-05-24 11:15:36 +03:00
cpus_read_lock ( ) ;
2012-06-05 14:28:26 +04:00
jump_label_lock ( ) ;
mutex_lock ( & text_mutex ) ;
2010-02-25 16:34:07 +03:00
ap = alloc_aggr_kprobe ( p ) ;
if ( ! ap )
2012-06-05 14:28:26 +04:00
goto out ;
2010-02-25 16:34:07 +03:00
op = container_of ( ap , struct optimized_kprobe , kp ) ;
if ( ! arch_prepared_optinsn ( & op - > optinsn ) ) {
2021-09-14 17:39:34 +03:00
/* If failed to setup optimizing, fallback to kprobe. */
2010-12-03 12:54:09 +03:00
arch_remove_optimized_kprobe ( op ) ;
kfree ( op ) ;
2012-06-05 14:28:26 +04:00
goto out ;
2010-02-25 16:34:07 +03:00
}
init_aggr_kprobe ( ap , p ) ;
2021-09-14 17:39:34 +03:00
optimize_kprobe ( ap ) ; /* This just kicks optimizer thread. */
2012-06-05 14:28:26 +04:00
out :
mutex_unlock ( & text_mutex ) ;
jump_label_unlock ( ) ;
2017-05-24 11:15:36 +03:00
cpus_read_unlock ( ) ;
2010-02-25 16:34:07 +03:00
}
2014-04-17 12:17:54 +04:00
static void optimize_all_kprobes ( void )
2010-02-25 16:34:15 +03:00
{
struct hlist_head * head ;
struct kprobe * p ;
unsigned int i ;
2013-04-18 13:33:18 +04:00
mutex_lock ( & kprobe_mutex ) ;
2021-09-14 17:39:34 +03:00
/* If optimization is already allowed, just return. */
2010-02-25 16:34:15 +03:00
if ( kprobes_allow_optimization )
2013-04-18 13:33:18 +04:00
goto out ;
2010-02-25 16:34:15 +03:00
2017-05-24 11:15:36 +03:00
cpus_read_lock ( ) ;
2010-02-25 16:34:15 +03:00
kprobes_allow_optimization = true ;
for ( i = 0 ; i < KPROBE_TABLE_SIZE ; i + + ) {
head = & kprobe_table [ i ] ;
2020-05-12 11:02:44 +03:00
hlist_for_each_entry ( p , head , hlist )
2010-02-25 16:34:15 +03:00
if ( ! kprobe_disabled ( p ) )
optimize_kprobe ( p ) ;
}
2017-05-24 11:15:36 +03:00
cpus_read_unlock ( ) ;
2021-09-14 17:39:25 +03:00
pr_info ( " kprobe jump-optimization is enabled. All kprobes are optimized if possible. \n " ) ;
2013-04-18 13:33:18 +04:00
out :
mutex_unlock ( & kprobe_mutex ) ;
2010-02-25 16:34:15 +03:00
}
2021-02-18 17:29:23 +03:00
# ifdef CONFIG_SYSCTL
2014-04-17 12:17:54 +04:00
static void unoptimize_all_kprobes ( void )
2010-02-25 16:34:15 +03:00
{
struct hlist_head * head ;
struct kprobe * p ;
unsigned int i ;
2013-04-18 13:33:18 +04:00
mutex_lock ( & kprobe_mutex ) ;
2021-09-14 17:39:34 +03:00
/* If optimization is already prohibited, just return. */
2013-04-18 13:33:18 +04:00
if ( ! kprobes_allow_optimization ) {
mutex_unlock ( & kprobe_mutex ) ;
2010-02-25 16:34:15 +03:00
return ;
2013-04-18 13:33:18 +04:00
}
2010-02-25 16:34:15 +03:00
2017-05-24 11:15:36 +03:00
cpus_read_lock ( ) ;
2010-02-25 16:34:15 +03:00
kprobes_allow_optimization = false ;
for ( i = 0 ; i < KPROBE_TABLE_SIZE ; i + + ) {
head = & kprobe_table [ i ] ;
2020-05-12 11:02:44 +03:00
hlist_for_each_entry ( p , head , hlist ) {
2010-02-25 16:34:15 +03:00
if ( ! kprobe_disabled ( p ) )
2010-12-03 12:54:09 +03:00
unoptimize_kprobe ( p , false ) ;
2010-02-25 16:34:15 +03:00
}
}
2017-05-24 11:15:36 +03:00
cpus_read_unlock ( ) ;
2013-04-18 13:33:18 +04:00
mutex_unlock ( & kprobe_mutex ) ;
2021-09-14 17:39:34 +03:00
/* Wait for unoptimizing completion. */
2010-12-03 12:54:09 +03:00
wait_for_kprobe_optimizer ( ) ;
2021-09-14 17:39:25 +03:00
pr_info ( " kprobe jump-optimization is disabled. All kprobes are based on software breakpoint. \n " ) ;
2010-02-25 16:34:15 +03:00
}
2013-04-18 13:33:18 +04:00
/* Serializes proc_kprobes_optimization_handler(). */
static DEFINE_MUTEX(kprobe_sysctl_mutex);
/* Integer view of 'kprobes_allow_optimization' handed to the sysctl core. */
static int sysctl_kprobes_optimization;

/*
 * sysctl handler for the optimization switch.
 *
 * Loads the current state into 'sysctl_kprobes_optimization', lets
 * proc_dointvec_minmax() process the request, then applies whatever value
 * the variable holds afterwards by calling optimize_all_kprobes() or
 * unoptimize_all_kprobes().
 *
 * Returns the result of proc_dointvec_minmax().
 */
static int proc_kprobes_optimization_handler(struct ctl_table *table,
					     int write, void *buffer,
					     size_t *length, loff_t *ppos)
{
	int ret;

	mutex_lock(&kprobe_sysctl_mutex);

	if (kprobes_allow_optimization)
		sysctl_kprobes_optimization = 1;
	else
		sysctl_kprobes_optimization = 0;

	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);

	if (sysctl_kprobes_optimization)
		optimize_all_kprobes();
	else
		unoptimize_all_kprobes();

	mutex_unlock(&kprobe_sysctl_mutex);

	return ret;
}
2022-01-22 09:13:41 +03:00
/*
 * sysctl table for kprobes: a single integer entry backed by
 * 'sysctl_kprobes_optimization' and served by
 * proc_kprobes_optimization_handler(). 'extra1'/'extra2' are presumably the
 * 0..1 bounds enforced by proc_dointvec_minmax() -- confirm against the
 * sysctl core. The entry name must not contain padding blanks, otherwise
 * the file would be created under a different name.
 */
static struct ctl_table kprobe_sysctls[] = {
	{
		.procname	= "kprobes-optimization",
		.data		= &sysctl_kprobes_optimization,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_kprobes_optimization_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{}
};
/*
 * Register 'kprobe_sysctls' with the sysctl core at init time. The path
 * string is passed without padding blanks so the table is attached to the
 * "debug" directory.
 */
static void __init kprobe_sysctls_init(void)
{
	register_sysctl_init("debug", kprobe_sysctls);
}
2010-02-25 16:34:15 +03:00
# endif /* CONFIG_SYSCTL */
2021-09-14 17:39:55 +03:00
/* Put a breakpoint for a probe. */
2014-04-17 12:17:54 +04:00
static void __arm_kprobe ( struct kprobe * p )
2010-02-25 16:34:07 +03:00
{
2010-12-03 12:53:50 +03:00
struct kprobe * _p ;
2010-02-25 16:34:07 +03:00
2021-09-14 17:39:55 +03:00
lockdep_assert_held ( & text_mutex ) ;
2021-09-14 17:39:34 +03:00
/* Find the overlapping optimized kprobes. */
2021-09-14 17:40:07 +03:00
_p = get_optimized_kprobe ( p - > addr ) ;
2010-12-03 12:53:50 +03:00
if ( unlikely ( _p ) )
2010-12-03 12:54:09 +03:00
/* Fallback to unoptimized kprobe */
unoptimize_kprobe ( _p , true ) ;
2010-02-25 16:34:07 +03:00
arch_arm_kprobe ( p ) ;
optimize_kprobe ( p ) ; /* Try to optimize (add kprobe to a list) */
}
2021-09-14 17:39:55 +03:00
/* Remove the breakpoint of a probe. */
2014-04-17 12:17:54 +04:00
static void __disarm_kprobe ( struct kprobe * p , bool reopt )
2010-02-25 16:34:07 +03:00
{
2010-12-03 12:53:50 +03:00
struct kprobe * _p ;
2010-02-25 16:34:07 +03:00
2021-09-14 17:39:55 +03:00
lockdep_assert_held ( & text_mutex ) ;
2015-02-14 01:40:26 +03:00
/* Try to unoptimize */
unoptimize_kprobe ( p , kprobes_all_disarmed ) ;
2010-02-25 16:34:07 +03:00
2010-12-03 12:54:09 +03:00
if ( ! kprobe_queued ( p ) ) {
arch_disarm_kprobe ( p ) ;
2021-09-14 17:39:34 +03:00
/* If another kprobe was blocked, re-optimize it. */
2021-09-14 17:40:07 +03:00
_p = get_optimized_kprobe ( p - > addr ) ;
2010-12-03 12:54:09 +03:00
if ( unlikely ( _p ) & & reopt )
optimize_kprobe ( _p ) ;
}
2021-09-14 17:39:34 +03:00
/*
* TODO : Since unoptimization and real disarming will be done by
* the worker thread , we can not check whether another probe are
* unoptimized because of this probe here . It should be re - optimized
* by the worker thread .
*/
2010-02-25 16:34:07 +03:00
}
# else /* !CONFIG_OPTPROBES */
/*
 * Stubs used in the !CONFIG_OPTPROBES branch: the optimization hooks
 * expand to empty statements, arming and disarming map directly onto the
 * arch primitives, and "disarmed" is the same as "disabled".
 */
#define optimize_kprobe(p)			do {} while (0)
#define unoptimize_kprobe(p, f)			do {} while (0)
#define kill_optimized_kprobe(p)		do {} while (0)
#define prepare_optimized_kprobe(p)		do {} while (0)
#define try_to_optimize_kprobe(p)		do {} while (0)
#define __arm_kprobe(p)				arch_arm_kprobe(p)
#define __disarm_kprobe(p, o)			arch_disarm_kprobe(p)
#define kprobe_disarmed(p)			kprobe_disabled(p)
#define wait_for_kprobe_optimizer()		do {} while (0)
2010-02-25 16:34:07 +03:00
2018-09-11 13:20:40 +03:00
static int reuse_unused_kprobe ( struct kprobe * ap )
2010-12-03 12:54:16 +03:00
{
2018-09-11 13:20:40 +03:00
/*
* If the optimized kprobe is NOT supported , the aggr kprobe is
* released at the same time that the last aggregated kprobe is
* unregistered .
* Thus there should be no chance to reuse unused kprobe .
*/
2021-09-14 17:39:25 +03:00
WARN_ON_ONCE ( 1 ) ;
2018-09-11 13:20:40 +03:00
return - EINVAL ;
2010-12-03 12:54:16 +03:00
}
2014-04-17 12:17:54 +04:00
/* Let the arch drop what it holds for 'p', then free the kprobe itself. */
static void free_aggr_kprobe(struct kprobe *p)
{
	arch_remove_kprobe(p);

	kfree(p);
}
2014-04-17 12:17:54 +04:00
static struct kprobe * alloc_aggr_kprobe ( struct kprobe * p )
2010-02-25 16:34:07 +03:00
{
return kzalloc ( sizeof ( struct kprobe ) , GFP_KERNEL ) ;
}
# endif /* CONFIG_OPTPROBES */
2012-09-28 12:15:20 +04:00
# ifdef CONFIG_KPROBES_ON_FTRACE
2012-06-05 14:28:32 +04:00
/* ftrace ops selected for kprobes that have no post_handler. */
static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
	.flags	= FTRACE_OPS_FL_SAVE_REGS,
	.func	= kprobe_ftrace_handler,
};

/* ftrace ops selected for kprobes with a post_handler; adds IPMODIFY. */
static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = {
	.flags	= FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
	.func	= kprobe_ftrace_handler,
};

/* Number of armed probes per ops; the ops is registered on the 0 -> 1 step. */
static int kprobe_ipmodify_enabled;
static int kprobe_ftrace_enabled;
2019-07-25 09:24:37 +03:00
static int __arm_kprobe_ftrace ( struct kprobe * p , struct ftrace_ops * ops ,
int * cnt )
2012-06-05 14:28:32 +04:00
{
2023-07-11 21:53:53 +03:00
int ret ;
2012-06-05 14:28:32 +04:00
2021-09-14 17:39:55 +03:00
lockdep_assert_held ( & kprobe_mutex ) ;
2019-07-25 09:24:37 +03:00
ret = ftrace_set_filter_ip ( ops , ( unsigned long ) p - > addr , 0 , 0 ) ;
2021-09-14 17:39:25 +03:00
if ( WARN_ONCE ( ret < 0 , " Failed to arm kprobe-ftrace at %pS (error %d) \n " , p - > addr , ret ) )
2018-01-10 02:51:23 +03:00
return ret ;
2019-07-25 09:24:37 +03:00
if ( * cnt = = 0 ) {
ret = register_ftrace_function ( ops ) ;
2021-09-14 17:39:25 +03:00
if ( WARN ( ret < 0 , " Failed to register kprobe-ftrace (error %d) \n " , ret ) )
2018-01-10 02:51:23 +03:00
goto err_ftrace ;
2012-06-05 14:28:32 +04:00
}
2018-01-10 02:51:23 +03:00
2019-07-25 09:24:37 +03:00
( * cnt ) + + ;
2018-01-10 02:51:23 +03:00
return ret ;
err_ftrace :
/*
2019-07-25 09:24:37 +03:00
* At this point , sinec ops is not registered , we should be sefe from
* registering empty filter .
2018-01-10 02:51:23 +03:00
*/
2019-07-25 09:24:37 +03:00
ftrace_set_filter_ip ( ops , ( unsigned long ) p - > addr , 1 , 0 ) ;
2018-01-10 02:51:23 +03:00
return ret ;
2012-06-05 14:28:32 +04:00
}
2019-07-25 09:24:37 +03:00
static int arm_kprobe_ftrace ( struct kprobe * p )
{
bool ipmodify = ( p - > post_handler ! = NULL ) ;
return __arm_kprobe_ftrace ( p ,
ipmodify ? & kprobe_ipmodify_ops : & kprobe_ftrace_ops ,
ipmodify ? & kprobe_ipmodify_enabled : & kprobe_ftrace_enabled ) ;
}
static int __disarm_kprobe_ftrace ( struct kprobe * p , struct ftrace_ops * ops ,
int * cnt )
2012-06-05 14:28:32 +04:00
{
2023-07-11 21:53:53 +03:00
int ret ;
2012-06-05 14:28:32 +04:00
2021-09-14 17:39:55 +03:00
lockdep_assert_held ( & kprobe_mutex ) ;
2019-07-25 09:24:37 +03:00
if ( * cnt = = 1 ) {
ret = unregister_ftrace_function ( ops ) ;
2021-09-14 17:39:25 +03:00
if ( WARN ( ret < 0 , " Failed to unregister kprobe-ftrace (error %d) \n " , ret ) )
2018-01-10 02:51:24 +03:00
return ret ;
2012-06-05 14:28:32 +04:00
}
2018-01-10 02:51:24 +03:00
2019-07-25 09:24:37 +03:00
( * cnt ) - - ;
2018-01-10 02:51:24 +03:00
2019-07-25 09:24:37 +03:00
ret = ftrace_set_filter_ip ( ops , ( unsigned long ) p - > addr , 1 , 0 ) ;
2021-09-14 17:39:25 +03:00
WARN_ONCE ( ret < 0 , " Failed to disarm kprobe-ftrace at %pS (error %d) \n " ,
2018-04-28 15:36:33 +03:00
p - > addr , ret ) ;
2018-01-10 02:51:24 +03:00
return ret ;
2012-06-05 14:28:32 +04:00
}
2019-07-25 09:24:37 +03:00
static int disarm_kprobe_ftrace ( struct kprobe * p )
{
bool ipmodify = ( p - > post_handler ! = NULL ) ;
return __disarm_kprobe_ftrace ( p ,
ipmodify ? & kprobe_ipmodify_ops : & kprobe_ftrace_ops ,
ipmodify ? & kprobe_ipmodify_enabled : & kprobe_ftrace_enabled ) ;
}
2012-09-28 12:15:20 +04:00
# else /* !CONFIG_KPROBES_ON_FTRACE */
2020-08-05 20:20:46 +03:00
/* !CONFIG_KPROBES_ON_FTRACE stub: arming through ftrace always fails with -ENODEV. */
static inline int arm_kprobe_ftrace(struct kprobe *p)
{
	return -ENODEV;
}
/* !CONFIG_KPROBES_ON_FTRACE stub: disarming through ftrace always fails with -ENODEV. */
static inline int disarm_kprobe_ftrace(struct kprobe *p)
{
	return -ENODEV;
}
2012-06-05 14:28:32 +04:00
# endif
2021-09-14 17:38:57 +03:00
/*
 * Run the arch preparation for 'p', using the ftrace flavour for
 * ftrace-based kprobes. Returns whatever the selected arch hook returns.
 */
static int prepare_kprobe(struct kprobe *p)
{
	/* Must ensure p->addr is really on ftrace */
	return kprobe_ftrace(p) ? arch_prepare_kprobe_ftrace(p)
				: arch_prepare_kprobe(p);
}
2018-01-10 02:51:23 +03:00
static int arm_kprobe ( struct kprobe * kp )
2009-05-08 00:31:26 +04:00
{
2018-01-10 02:51:23 +03:00
if ( unlikely ( kprobe_ftrace ( kp ) ) )
return arm_kprobe_ftrace ( kp ) ;
2017-05-24 11:15:36 +03:00
cpus_read_lock ( ) ;
2009-05-08 00:31:26 +04:00
mutex_lock ( & text_mutex ) ;
2010-02-25 16:34:07 +03:00
__arm_kprobe ( kp ) ;
2009-05-08 00:31:26 +04:00
mutex_unlock ( & text_mutex ) ;
2017-05-24 11:15:36 +03:00
cpus_read_unlock ( ) ;
2018-01-10 02:51:23 +03:00
return 0 ;
2009-05-08 00:31:26 +04:00
}
2018-01-10 02:51:24 +03:00
/*
 * Disarm 'kp'.
 *
 * ftrace-based kprobes are delegated to disarm_kprobe_ftrace() and its
 * result is returned ('reopt' is not used on that path). All others are
 * disarmed by __disarm_kprobe() under the CPU hotplug read lock and
 * text_mutex, and 0 is returned.
 */
static int disarm_kprobe(struct kprobe *kp, bool reopt)
{
	int ret = 0;

	if (unlikely(kprobe_ftrace(kp))) {
		ret = disarm_kprobe_ftrace(kp);
	} else {
		cpus_read_lock();
		mutex_lock(&text_mutex);
		__disarm_kprobe(kp, reopt);
		mutex_unlock(&text_mutex);
		cpus_read_unlock();
	}

	return ret;
}
2005-05-06 03:15:42 +04:00
/*
* Aggregate handlers for multiple kprobes support - these handlers
* take care of invoking the individual kprobe handlers on p - > list
*/
2014-04-17 12:18:21 +04:00
static int aggr_pre_handler ( struct kprobe * p , struct pt_regs * regs )
2005-05-06 03:15:42 +04:00
{
struct kprobe * kp ;
2005-11-07 12:00:13 +03:00
list_for_each_entry_rcu ( kp , & p - > list , list ) {
2009-04-07 06:01:02 +04:00
if ( kp - > pre_handler & & likely ( ! kprobe_disabled ( kp ) ) ) {
2005-11-07 12:00:07 +03:00
set_kprobe_instance ( kp ) ;
2005-06-23 11:09:41 +04:00
if ( kp - > pre_handler ( kp , regs ) )
return 1 ;
2005-05-06 03:15:42 +04:00
}
2005-11-07 12:00:07 +03:00
reset_kprobe_instance ( ) ;
2005-05-06 03:15:42 +04:00
}
return 0 ;
}
2014-04-17 12:18:21 +04:00
NOKPROBE_SYMBOL ( aggr_pre_handler ) ;
2005-05-06 03:15:42 +04:00
2014-04-17 12:18:21 +04:00
static void aggr_post_handler ( struct kprobe * p , struct pt_regs * regs ,
unsigned long flags )
2005-05-06 03:15:42 +04:00
{
struct kprobe * kp ;
2005-11-07 12:00:13 +03:00
list_for_each_entry_rcu ( kp , & p - > list , list ) {
2009-04-07 06:01:02 +04:00
if ( kp - > post_handler & & likely ( ! kprobe_disabled ( kp ) ) ) {
2005-11-07 12:00:07 +03:00
set_kprobe_instance ( kp ) ;
2005-05-06 03:15:42 +04:00
kp - > post_handler ( kp , regs , flags ) ;
2005-11-07 12:00:07 +03:00
reset_kprobe_instance ( ) ;
2005-05-06 03:15:42 +04:00
}
}
}
2014-04-17 12:18:21 +04:00
NOKPROBE_SYMBOL ( aggr_post_handler ) ;
2005-05-06 03:15:42 +04:00
2021-09-14 17:39:34 +03:00
/* Walks the list and increments 'nmissed' if 'p' has child probes. */
2014-04-17 12:18:21 +04:00
void kprobes_inc_nmissed_count ( struct kprobe * p )
2005-12-12 11:37:34 +03:00
{
struct kprobe * kp ;
2021-09-14 17:39:34 +03:00
2010-02-25 16:34:07 +03:00
if ( ! kprobe_aggrprobe ( p ) ) {
2005-12-12 11:37:34 +03:00
p - > nmissed + + ;
} else {
list_for_each_entry_rcu ( kp , & p - > list , list )
kp - > nmissed + + ;
}
}
2014-04-17 12:18:21 +04:00
NOKPROBE_SYMBOL ( kprobes_inc_nmissed_count ) ;
2005-12-12 11:37:34 +03:00
2022-03-26 05:27:05 +03:00
static struct kprobe kprobe_busy = {
. addr = ( void * ) get_kprobe ,
} ;
void kprobe_busy_begin ( void )
{
struct kprobe_ctlblk * kcb ;
preempt_disable ( ) ;
__this_cpu_write ( current_kprobe , & kprobe_busy ) ;
kcb = get_kprobe_ctlblk ( ) ;
kcb - > kprobe_status = KPROBE_HIT_ACTIVE ;
}
/* Clear this CPU's current kprobe and re-enable preemption. */
void kprobe_busy_end(void)
{
	__this_cpu_write(current_kprobe, NULL);

	preempt_enable();
}
2021-09-14 17:39:34 +03:00
/* Add the new probe to 'ap->list'. */
2014-04-17 12:17:54 +04:00
static int add_new_kprobe ( struct kprobe * ap , struct kprobe * p )
2005-06-23 11:09:41 +04:00
{
2018-06-19 19:10:27 +03:00
if ( p - > post_handler )
2010-12-03 12:54:09 +03:00
unoptimize_kprobe ( ap , true ) ; /* Fall back to normal kprobe */
2010-02-25 16:34:07 +03:00
2018-06-19 19:10:27 +03:00
list_add_rcu ( & p - > list , & ap - > list ) ;
2009-04-07 06:00:58 +04:00
if ( p - > post_handler & & ! ap - > post_handler )
ap - > post_handler = aggr_post_handler ;
2009-04-07 06:01:02 +04:00
2005-06-23 11:09:41 +04:00
return 0 ;
}
2005-05-06 03:15:42 +04:00
/*
2021-09-14 17:39:34 +03:00
* Fill in the required fields of the aggregator kprobe . Replace the
* earlier kprobe in the hlist with the aggregator kprobe .
2005-05-06 03:15:42 +04:00
*/
2014-04-17 12:17:54 +04:00
static void init_aggr_kprobe ( struct kprobe * ap , struct kprobe * p )
2005-05-06 03:15:42 +04:00
{
2021-09-14 17:39:34 +03:00
/* Copy the insn slot of 'p' to 'ap'. */
2005-06-23 11:09:41 +04:00
copy_kprobe ( p , ap ) ;
2006-07-30 14:03:26 +04:00
flush_insn_slot ( ap ) ;
2005-05-06 03:15:42 +04:00
ap - > addr = p - > addr ;
2010-02-25 16:34:07 +03:00
ap - > flags = p - > flags & ~ KPROBE_FLAG_OPTIMIZED ;
2005-05-06 03:15:42 +04:00
ap - > pre_handler = aggr_pre_handler ;
2009-01-07 01:41:52 +03:00
/* We don't care the kprobe which has gone. */
if ( p - > post_handler & & ! kprobe_gone ( p ) )
2006-06-26 11:25:22 +04:00
ap - > post_handler = aggr_post_handler ;
2005-05-06 03:15:42 +04:00
INIT_LIST_HEAD ( & ap - > list ) ;
2010-02-25 16:34:07 +03:00
INIT_HLIST_NODE ( & ap - > hlist ) ;
2005-05-06 03:15:42 +04:00
2010-02-25 16:34:07 +03:00
list_add_rcu ( & p - > list , & ap - > list ) ;
2005-12-12 11:37:12 +03:00
hlist_replace_rcu ( & p - > hlist , & ap - > hlist ) ;
2005-05-06 03:15:42 +04:00
}
/*
2021-09-14 17:39:34 +03:00
* This registers the second or subsequent kprobe at the same address .
2005-05-06 03:15:42 +04:00
*/
2014-04-17 12:17:54 +04:00
static int register_aggr_kprobe ( struct kprobe * orig_p , struct kprobe * p )
2005-05-06 03:15:42 +04:00
{
int ret = 0 ;
2010-12-03 12:53:50 +03:00
struct kprobe * ap = orig_p ;
2005-05-06 03:15:42 +04:00
2017-05-24 11:15:36 +03:00
cpus_read_lock ( ) ;
2012-06-05 14:28:26 +04:00
/* For preparing optimization, jump_label_text_reserved() is called */
jump_label_lock ( ) ;
mutex_lock ( & text_mutex ) ;
2010-12-03 12:53:50 +03:00
if ( ! kprobe_aggrprobe ( orig_p ) ) {
2021-09-14 17:39:34 +03:00
/* If 'orig_p' is not an 'aggr_kprobe', create new one. */
2010-12-03 12:53:50 +03:00
ap = alloc_aggr_kprobe ( orig_p ) ;
2012-06-05 14:28:26 +04:00
if ( ! ap ) {
ret = - ENOMEM ;
goto out ;
}
2010-12-03 12:53:50 +03:00
init_aggr_kprobe ( ap , orig_p ) ;
2018-09-11 13:20:40 +03:00
} else if ( kprobe_unused ( ap ) ) {
2010-12-03 12:54:16 +03:00
/* This probe is going to die. Rescue it */
2018-09-11 13:20:40 +03:00
ret = reuse_unused_kprobe ( ap ) ;
if ( ret )
goto out ;
}
2009-04-07 06:00:58 +04:00
if ( kprobe_gone ( ap ) ) {
2009-01-07 01:41:52 +03:00
/*
* Attempting to insert new probe at the same location that
* had a probe in the module vaddr area which already
* freed . So , the instruction slot has already been
* released . We need a new slot for the new probe .
*/
2009-04-07 06:00:58 +04:00
ret = arch_prepare_kprobe ( ap ) ;
2009-01-07 01:41:52 +03:00
if ( ret )
2009-04-07 06:00:58 +04:00
/*
* Even if fail to allocate new slot , don ' t need to
2021-09-14 17:39:34 +03:00
* free the ' ap ' . It will be used next time , or
* freed by unregister_kprobe ( ) .
2009-04-07 06:00:58 +04:00
*/
2012-06-05 14:28:26 +04:00
goto out ;
2009-04-07 06:01:02 +04:00
2010-02-25 16:34:07 +03:00
/* Prepare optimized instructions if possible. */
prepare_optimized_kprobe ( ap ) ;
2009-01-07 01:41:52 +03:00
/*
2009-04-07 06:01:02 +04:00
* Clear gone flag to prevent allocating new slot again , and
* set disabled flag because it is not armed yet .
2009-01-07 01:41:52 +03:00
*/
2009-04-07 06:01:02 +04:00
ap - > flags = ( ap - > flags & ~ KPROBE_FLAG_GONE )
| KPROBE_FLAG_DISABLED ;
2009-01-07 01:41:52 +03:00
}
2009-04-07 06:00:58 +04:00
2021-09-14 17:39:34 +03:00
/* Copy the insn slot of 'p' to 'ap'. */
2009-04-07 06:00:58 +04:00
copy_kprobe ( ap , p ) ;
2012-06-05 14:28:26 +04:00
ret = add_new_kprobe ( ap , p ) ;
out :
mutex_unlock ( & text_mutex ) ;
jump_label_unlock ( ) ;
2017-05-24 11:15:36 +03:00
cpus_read_unlock ( ) ;
2012-06-05 14:28:26 +04:00
if ( ret = = 0 & & kprobe_disabled ( ap ) & & ! kprobe_disabled ( p ) ) {
ap - > flags & = ~ KPROBE_FLAG_DISABLED ;
2018-01-10 02:51:23 +03:00
if ( ! kprobes_all_disarmed ) {
2012-06-05 14:28:26 +04:00
/* Arm the breakpoint again. */
2018-01-10 02:51:23 +03:00
ret = arm_kprobe ( ap ) ;
if ( ret ) {
ap - > flags | = KPROBE_FLAG_DISABLED ;
list_del_rcu ( & p - > list ) ;
2018-11-07 06:04:39 +03:00
synchronize_rcu ( ) ;
2018-01-10 02:51:23 +03:00
}
}
2012-06-05 14:28:26 +04:00
}
return ret ;
2005-05-06 03:15:42 +04:00
}
2014-04-17 12:16:58 +04:00
/*
 * Default ('__weak', so an architecture may override it) blacklist check.
 *
 * Returns true when 'addr' lies in the half-open range
 * [__kprobes_text_start, __kprobes_text_end).
 */
bool __weak arch_within_kprobe_blacklist(unsigned long addr)
{
	unsigned long text_lo = (unsigned long)__kprobes_text_start;
	unsigned long text_hi = (unsigned long)__kprobes_text_end;

	/* The '__kprobes' functions and entry code must not be probed. */
	return text_lo <= addr && addr < text_hi;
}
2019-02-12 19:13:12 +03:00
static bool __within_kprobe_blacklist ( unsigned long addr )
2005-09-07 02:19:26 +04:00
{
2014-04-17 12:17:05 +04:00
struct kprobe_blacklist_entry * ent ;
2008-04-28 13:14:26 +04:00
2014-04-17 12:16:58 +04:00
if ( arch_within_kprobe_blacklist ( addr ) )
2014-04-17 12:17:05 +04:00
return true ;
2008-04-28 13:14:26 +04:00
/*
2021-09-14 17:39:34 +03:00
* If ' kprobe_blacklist ' is defined , check the address and
* reject any probe registration in the prohibited area .
2008-04-28 13:14:26 +04:00
*/
2014-04-17 12:17:05 +04:00
list_for_each_entry ( ent , & kprobe_blacklist , list ) {
if ( addr > = ent - > start_addr & & addr < ent - > end_addr )
return true ;
2008-04-28 13:14:26 +04:00
}
2019-02-12 19:13:12 +03:00
return false ;
}
2014-04-17 12:17:05 +04:00
2019-02-12 19:13:12 +03:00
/*
 * Return true if 'addr' must not be probed.
 *
 * Besides the direct blacklist check, an address that belongs to a suffixed
 * symbol ("name.suffix") is also rejected when the address of the base symbol
 * "name" is itself blacklisted.
 */
bool within_kprobe_blacklist(unsigned long addr)
{
	char symname[KSYM_NAME_LEN];
	unsigned long base_addr;
	char *dot;

	if (__within_kprobe_blacklist(addr))
		return true;

	/* Check if the address is on a suffixed-symbol */
	if (lookup_symbol_name(addr, symname))
		return false;	/* non-zero result: no name to inspect */

	dot = strchr(symname, '.');
	if (!dot)
		return false;

	/* Truncate at the first '.' and resolve the remaining base name. */
	*dot = '\0';
	base_addr = (unsigned long)kprobe_lookup_name(symname, 0);
	if (!base_addr)
		return false;

	return __within_kprobe_blacklist(base_addr);
}
2022-03-08 18:30:32 +03:00
/*
 * arch_adjust_kprobe_addr - adjust the address
 * @addr: symbol base address
 * @offset: offset within the symbol
 * @on_func_entry: set to whether @addr + @offset is the function entry
 *
 * Typically returns @addr + @offset, except for special cases where the
 * function might be prefixed by a CFI landing pad; in that case any offset
 * inside the landing pad is mapped to the first 'real' instruction of the
 * symbol.
 *
 * Specifically, for things like IBT/BTI, skip the resp. ENDBR/BTI.C
 * instruction at +0.
 *
 * This '__weak' default does no such skipping: it reports a function entry
 * exactly when @offset is zero and returns the plain sum.
 */
kprobe_opcode_t *__weak arch_adjust_kprobe_addr(unsigned long addr,
						unsigned long offset,
						bool *on_func_entry)
{
	unsigned long target = addr + offset;

	*on_func_entry = (offset == 0);

	return (kprobe_opcode_t *)target;
}
2008-03-05 01:29:44 +03:00
/*
2021-09-14 17:39:34 +03:00
* If ' symbol_name ' is specified , look it up and add the ' offset '
2008-03-05 01:29:44 +03:00
* to it . This way , we can specify a relative address to a symbol .
2011-06-27 11:26:50 +04:00
* This returns encoded errors if it fails to look up symbol or invalid
* combination of parameters .
2008-03-05 01:29:44 +03:00
*/
2022-03-08 18:30:32 +03:00
static kprobe_opcode_t *
_kprobe_addr ( kprobe_opcode_t * addr , const char * symbol_name ,
unsigned long offset , bool * on_func_entry )
2008-03-05 01:29:44 +03:00
{
2017-03-08 11:26:06 +03:00
if ( ( symbol_name & & addr ) | | ( ! symbol_name & & ! addr ) )
2011-06-27 11:26:50 +04:00
goto invalid ;
2017-03-08 11:26:06 +03:00
if ( symbol_name ) {
2022-03-08 18:30:32 +03:00
/*
* Input : @ sym + @ offset
* Output : @ addr + @ offset
*
* NOTE : kprobe_lookup_name ( ) does * NOT * fold the offset
* argument into it ' s output !
*/
powerpc updates for 4.12 part 1.
Highlights include:
- Larger virtual address space on 64-bit server CPUs. By default we use a 128TB
virtual address space, but a process can request access to the full 512TB by
passing a hint to mmap().
- Support for the new Power9 "XIVE" interrupt controller.
- TLB flushing optimisations for the radix MMU on Power9.
- Support for CAPI cards on Power9, using the "Coherent Accelerator Interface
Architecture 2.0".
- The ability to configure the mmap randomisation limits at build and runtime.
- Several small fixes and cleanups to the kprobes code, as well as support for
KPROBES_ON_FTRACE.
- Major improvements to handling of system reset interrupts, correctly treating
them as NMIs, giving them a dedicated stack and using a new hypervisor call
to trigger them, all of which should aid debugging and robustness.
Many fixes and other minor enhancements.
Thanks to:
Alastair D'Silva, Alexey Kardashevskiy, Alistair Popple, Andrew Donnellan,
Aneesh Kumar K.V, Anshuman Khandual, Anton Blanchard, Balbir Singh, Ben
Hutchings, Benjamin Herrenschmidt, Bhupesh Sharma, Chris Packham, Christian
Zigotzky, Christophe Leroy, Christophe Lombard, Daniel Axtens, David Gibson,
Gautham R. Shenoy, Gavin Shan, Geert Uytterhoeven, Guilherme G. Piccoli,
Hamish Martin, Hari Bathini, Kees Cook, Laurent Dufour, Madhavan Srinivasan,
Mahesh J Salgaonkar, Mahesh Salgaonkar, Masami Hiramatsu, Matt Brown, Matthew
R. Ochs, Michael Neuling, Naveen N. Rao, Nicholas Piggin, Oliver O'Halloran,
Pan Xinhui, Paul Mackerras, Rashmica Gupta, Russell Currey, Sukadev
Bhattiprolu, Thadeu Lima de Souza Cascardo, Tobin C. Harding, Tyrel Datwyler,
Uma Krishnan, Vaibhav Jain, Vipin K Parashar, Yang Shi.
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1
iQIcBAABAgAGBQJZDHUMAAoJEFHr6jzI4aWAT7oQALkE2Nj3gjcn1z0SkFhq/1iO
Py9Elmqm4E+L6NKYtBY5dS8xVAJ088ffzERyqJ1FY1LHkB8tn8bWRcMQmbjAFzTI
V4TAzDNI890BN/F4ptrYRwNFxRBHAvZ4NDunTzagwYnwmTzW9PYHmOi4pvWTo3Tw
KFUQ0joLSEgHzyfXxYB3fyj41u8N0FZvhfazdNSqia2Y5Vwwv/ION5jKplDM+09Y
EtVEXFvaKAS1sjbM/d/Jo5rblHfR0D9/lYV10+jjyIokjzslIpyTbnj3izeYoM5V
I4h99372zfsEjBGPPXyM3khL3zizGMSDYRmJHQSaKxjtecS9SPywPTZ8ufO/aSzV
Ngq6nlND+f1zep29VQ0cxd3Jh40skWOXzxJaFjfDT25xa6FbfsWP2NCtk8PGylZ7
EyqTuCWkMgIP02KlX3oHvEB2LRRPCDmRU2zECecRGNJrIQwYC2xjoiVi7Q8Qe8rY
gr7Ib5Jj/a+uiTcCIy37+5nXq2s14/JBOKqxuYZIxeuZFvKYuRUipbKWO05WDOAz
m/pSzeC3J8AAoYiqR0gcSOuJTOnJpGhs7zrQFqnEISbXIwLW+ICumzOmTAiBqOEY
Rt8uW2gYkPwKLrE05445RfVUoERaAjaE06eRMOWS6slnngHmmnRJbf3PcoALiJkT
ediqGEj0/N1HMB31V5tS
=vSF3
-----END PGP SIGNATURE-----
Merge tag 'powerpc-4.12-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux
Pull powerpc updates from Michael Ellerman:
"Highlights include:
- Larger virtual address space on 64-bit server CPUs. By default we
use a 128TB virtual address space, but a process can request access
to the full 512TB by passing a hint to mmap().
- Support for the new Power9 "XIVE" interrupt controller.
- TLB flushing optimisations for the radix MMU on Power9.
- Support for CAPI cards on Power9, using the "Coherent Accelerator
Interface Architecture 2.0".
- The ability to configure the mmap randomisation limits at build and
runtime.
- Several small fixes and cleanups to the kprobes code, as well as
support for KPROBES_ON_FTRACE.
- Major improvements to handling of system reset interrupts,
correctly treating them as NMIs, giving them a dedicated stack and
using a new hypervisor call to trigger them, all of which should
aid debugging and robustness.
- Many fixes and other minor enhancements.
Thanks to: Alastair D'Silva, Alexey Kardashevskiy, Alistair Popple,
Andrew Donnellan, Aneesh Kumar K.V, Anshuman Khandual, Anton
Blanchard, Balbir Singh, Ben Hutchings, Benjamin Herrenschmidt,
Bhupesh Sharma, Chris Packham, Christian Zigotzky, Christophe Leroy,
Christophe Lombard, Daniel Axtens, David Gibson, Gautham R. Shenoy,
Gavin Shan, Geert Uytterhoeven, Guilherme G. Piccoli, Hamish Martin,
Hari Bathini, Kees Cook, Laurent Dufour, Madhavan Srinivasan, Mahesh J
Salgaonkar, Mahesh Salgaonkar, Masami Hiramatsu, Matt Brown, Matthew
R. Ochs, Michael Neuling, Naveen N. Rao, Nicholas Piggin, Oliver
O'Halloran, Pan Xinhui, Paul Mackerras, Rashmica Gupta, Russell
Currey, Sukadev Bhattiprolu, Thadeu Lima de Souza Cascardo, Tobin C.
Harding, Tyrel Datwyler, Uma Krishnan, Vaibhav Jain, Vipin K Parashar,
Yang Shi"
* tag 'powerpc-4.12-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (214 commits)
powerpc/64s: Power9 has no LPCR[VRMASD] field so don't set it
powerpc/powernv: Fix TCE kill on NVLink2
powerpc/mm/radix: Drop support for CPUs without lockless tlbie
powerpc/book3s/mce: Move add_taint() later in virtual mode
powerpc/sysfs: Move #ifdef CONFIG_HOTPLUG_CPU out of the function body
powerpc/smp: Document irq enable/disable after migrating IRQs
powerpc/mpc52xx: Don't select user-visible RTAS_PROC
powerpc/powernv: Document cxl dependency on special case in pnv_eeh_reset()
powerpc/eeh: Clean up and document event handling functions
powerpc/eeh: Avoid use after free in eeh_handle_special_event()
cxl: Mask slice error interrupts after first occurrence
cxl: Route eeh events to all drivers in cxl_pci_error_detected()
cxl: Force context lock during EEH flow
powerpc/64: Allow CONFIG_RELOCATABLE if COMPILE_TEST
powerpc/xmon: Teach xmon oops about radix vectors
powerpc/mm/hash: Fix off-by-one in comment about kernel contexts ids
powerpc/pseries: Enable VFIO
powerpc/powernv: Fix iommu table size calculation hook for small tables
powerpc/powernv: Check kzalloc() return value in pnv_pci_table_alloc
powerpc: Add arch/powerpc/tools directory
...
2017-05-05 21:36:44 +03:00
addr = kprobe_lookup_name ( symbol_name , offset ) ;
2011-06-27 11:26:50 +04:00
if ( ! addr )
return ERR_PTR ( - ENOENT ) ;
2008-03-05 01:29:44 +03:00
}
2022-03-08 18:30:32 +03:00
/*
* So here we have @ addr + @ offset , displace it into a new
* @ addr ' + @ offset ' where @ addr ' is the symbol start address .
*/
addr = ( void * ) addr + offset ;
if ( ! kallsyms_lookup_size_offset ( ( unsigned long ) addr , NULL , & offset ) )
return ERR_PTR ( - ENOENT ) ;
addr = ( void * ) addr - offset ;
/*
* Then ask the architecture to re - combine them , taking care of
* magical function entry details while telling us if this was indeed
* at the start of the function .
*/
addr = arch_adjust_kprobe_addr ( ( unsigned long ) addr , offset , on_func_entry ) ;
2011-06-27 11:26:50 +04:00
if ( addr )
return addr ;
invalid :
return ERR_PTR ( - EINVAL ) ;
2008-03-05 01:29:44 +03:00
}
2017-03-08 11:26:06 +03:00
static kprobe_opcode_t * kprobe_addr ( struct kprobe * p )
{
2022-03-08 18:30:32 +03:00
bool on_func_entry ;
return _kprobe_addr ( p - > addr , p - > symbol_name , p - > offset , & on_func_entry ) ;
2017-03-08 11:26:06 +03:00
}
2021-09-14 17:39:34 +03:00
/*
* Check the ' p ' is valid and return the aggregator kprobe
* at the same address .
*/
2014-04-17 12:17:54 +04:00
static struct kprobe * __get_valid_kprobe ( struct kprobe * p )
2009-09-15 09:13:07 +04:00
{
2010-12-03 12:53:50 +03:00
struct kprobe * ap , * list_p ;
2009-09-15 09:13:07 +04:00
2020-05-12 11:02:44 +03:00
lockdep_assert_held ( & kprobe_mutex ) ;
2010-12-03 12:53:50 +03:00
ap = get_kprobe ( p - > addr ) ;
if ( unlikely ( ! ap ) )
2009-09-15 09:13:07 +04:00
return NULL ;
2010-12-03 12:53:50 +03:00
if ( p ! = ap ) {
2020-05-12 11:02:44 +03:00
list_for_each_entry ( list_p , & ap - > list , list )
2009-09-15 09:13:07 +04:00
if ( list_p = = p )
/* kprobe p is a valid probe */
goto valid ;
return NULL ;
}
valid :
2010-12-03 12:53:50 +03:00
return ap ;
2009-09-15 09:13:07 +04:00
}
2021-02-03 17:59:27 +03:00
/*
* Warn and return error if the kprobe is being re - registered since
* there must be a software bug .
*/
static inline int warn_kprobe_rereg ( struct kprobe * p )
2009-09-15 09:13:07 +04:00
{
int ret = 0 ;
mutex_lock ( & kprobe_mutex ) ;
2021-02-03 17:59:27 +03:00
if ( WARN_ON_ONCE ( __get_valid_kprobe ( p ) ) )
2009-09-15 09:13:07 +04:00
ret = - EINVAL ;
mutex_unlock ( & kprobe_mutex ) ;
2010-12-03 12:53:50 +03:00
2009-09-15 09:13:07 +04:00
return ret ;
}
2021-09-14 17:39:16 +03:00
static int check_ftrace_location ( struct kprobe * p )
2005-04-17 02:20:36 +04:00
{
2022-03-08 18:30:29 +03:00
unsigned long addr = ( unsigned long ) p - > addr ;
2012-06-05 14:28:32 +04:00
2022-03-08 18:30:29 +03:00
if ( ftrace_location ( addr ) = = addr ) {
2012-09-28 12:15:20 +04:00
# ifdef CONFIG_KPROBES_ON_FTRACE
2012-06-05 14:28:32 +04:00
p - > flags | = KPROBE_FLAG_FTRACE ;
2012-09-28 12:15:20 +04:00
# else /* !CONFIG_KPROBES_ON_FTRACE */
2012-06-05 14:28:32 +04:00
return - EINVAL ;
# endif
}
2014-10-15 14:17:34 +04:00
return 0 ;
}
2023-07-11 04:50:47 +03:00
/*
 * Return true if 'addr' resolves to a symbol whose name starts with
 * "__cfi_" or "__pfx_".
 *
 * Returns false when no symbol name can be looked up for 'addr'.
 */
static bool is_cfi_preamble_symbol(unsigned long addr)
{
	char symbuf[KSYM_NAME_LEN];

	if (lookup_symbol_name(addr, symbuf))
		return false;

	/*
	 * str_has_prefix() takes the string first and the prefix second.
	 * Passing them the other way round asks whether the literal starts
	 * with the symbol name, which is not the intended check.
	 */
	return str_has_prefix(symbuf, "__cfi_") ||
	       str_has_prefix(symbuf, "__pfx_");
}
2014-10-15 14:17:34 +04:00
static int check_kprobe_address_safe ( struct kprobe * p ,
struct module * * probed_mod )
{
int ret ;
2009-09-15 09:13:07 +04:00
2021-09-14 17:39:16 +03:00
ret = check_ftrace_location ( p ) ;
2014-10-15 14:17:34 +04:00
if ( ret )
return ret ;
2010-10-02 01:23:48 +04:00
jump_label_lock ( ) ;
2010-10-18 18:38:58 +04:00
preempt_disable ( ) ;
2012-06-05 14:28:20 +04:00
/* Ensure it is not in reserved area nor out of text */
2022-08-01 06:37:19 +03:00
if ( ! ( core_kernel_text ( ( unsigned long ) p - > addr ) | |
is_module_text_address ( ( unsigned long ) p - > addr ) ) | |
kprobes: Prohibit probes in gate area
The system call gate area counts as kernel text but trying
to install a kprobe in this area fails with an Oops later on.
To fix this explicitly disallow the gate area for kprobes.
Found by syzkaller with the following reproducer:
perf_event_open$cgroup(&(0x7f00000001c0)={0x6, 0x80, 0x0, 0x0, 0x0, 0x0, 0x80ffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_config_ext={0x0, 0xffffffffff600000}}, 0xffffffffffffffff, 0x0, 0xffffffffffffffff, 0x0)
Sample report:
BUG: unable to handle page fault for address: fffffbfff3ac6000
PGD 6dfcb067 P4D 6dfcb067 PUD 6df8f067 PMD 6de4d067 PTE 0
Oops: 0000 [#1] PREEMPT SMP KASAN NOPTI
CPU: 0 PID: 21978 Comm: syz-executor.2 Not tainted 6.0.0-rc3-00363-g7726d4c3e60b-dirty #6
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
RIP: 0010:__insn_get_emulate_prefix arch/x86/lib/insn.c:91 [inline]
RIP: 0010:insn_get_emulate_prefix arch/x86/lib/insn.c:106 [inline]
RIP: 0010:insn_get_prefixes.part.0+0xa8/0x1110 arch/x86/lib/insn.c:134
Code: 49 be 00 00 00 00 00 fc ff df 48 8b 40 60 48 89 44 24 08 e9 81 00 00 00 e8 e5 4b 39 ff 4c 89 fa 4c 89 f9 48 c1 ea 03 83 e1 07 <42> 0f b6 14 32 38 ca 7f 08 84 d2 0f 85 06 10 00 00 48 89 d8 48 89
RSP: 0018:ffffc900088bf860 EFLAGS: 00010246
RAX: 0000000000040000 RBX: ffffffff9b9bebc0 RCX: 0000000000000000
RDX: 1ffffffff3ac6000 RSI: ffffc90002d82000 RDI: ffffc900088bf9e8
RBP: ffffffff9d630001 R08: 0000000000000000 R09: ffffc900088bf9e8
R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000001
R13: ffffffff9d630000 R14: dffffc0000000000 R15: ffffffff9d630000
FS: 00007f63eef63640(0000) GS:ffff88806d000000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: fffffbfff3ac6000 CR3: 0000000029d90005 CR4: 0000000000770ef0
PKRU: 55555554
Call Trace:
<TASK>
insn_get_prefixes arch/x86/lib/insn.c:131 [inline]
insn_get_opcode arch/x86/lib/insn.c:272 [inline]
insn_get_modrm+0x64a/0x7b0 arch/x86/lib/insn.c:343
insn_get_sib+0x29a/0x330 arch/x86/lib/insn.c:421
insn_get_displacement+0x350/0x6b0 arch/x86/lib/insn.c:464
insn_get_immediate arch/x86/lib/insn.c:632 [inline]
insn_get_length arch/x86/lib/insn.c:707 [inline]
insn_decode+0x43a/0x490 arch/x86/lib/insn.c:747
can_probe+0xfc/0x1d0 arch/x86/kernel/kprobes/core.c:282
arch_prepare_kprobe+0x79/0x1c0 arch/x86/kernel/kprobes/core.c:739
prepare_kprobe kernel/kprobes.c:1160 [inline]
register_kprobe kernel/kprobes.c:1641 [inline]
register_kprobe+0xb6e/0x1690 kernel/kprobes.c:1603
__register_trace_kprobe kernel/trace/trace_kprobe.c:509 [inline]
__register_trace_kprobe+0x26a/0x2d0 kernel/trace/trace_kprobe.c:477
create_local_trace_kprobe+0x1f7/0x350 kernel/trace/trace_kprobe.c:1833
perf_kprobe_init+0x18c/0x280 kernel/trace/trace_event_perf.c:271
perf_kprobe_event_init+0xf8/0x1c0 kernel/events/core.c:9888
perf_try_init_event+0x12d/0x570 kernel/events/core.c:11261
perf_init_event kernel/events/core.c:11325 [inline]
perf_event_alloc.part.0+0xf7f/0x36a0 kernel/events/core.c:11619
perf_event_alloc kernel/events/core.c:12059 [inline]
__do_sys_perf_event_open+0x4a8/0x2a00 kernel/events/core.c:12157
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x63/0xcd
RIP: 0033:0x7f63ef7efaed
Code: 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f63eef63028 EFLAGS: 00000246 ORIG_RAX: 000000000000012a
RAX: ffffffffffffffda RBX: 00007f63ef90ff80 RCX: 00007f63ef7efaed
RDX: 0000000000000000 RSI: ffffffffffffffff RDI: 00000000200001c0
RBP: 00007f63ef86019c R08: 0000000000000000 R09: 0000000000000000
R10: ffffffffffffffff R11: 0000000000000246 R12: 0000000000000000
R13: 0000000000000002 R14: 00007f63ef90ff80 R15: 00007f63eef43000
</TASK>
Modules linked in:
CR2: fffffbfff3ac6000
---[ end trace 0000000000000000 ]---
RIP: 0010:__insn_get_emulate_prefix arch/x86/lib/insn.c:91 [inline]
RIP: 0010:insn_get_emulate_prefix arch/x86/lib/insn.c:106 [inline]
RIP: 0010:insn_get_prefixes.part.0+0xa8/0x1110 arch/x86/lib/insn.c:134
Code: 49 be 00 00 00 00 00 fc ff df 48 8b 40 60 48 89 44 24 08 e9 81 00 00 00 e8 e5 4b 39 ff 4c 89 fa 4c 89 f9 48 c1 ea 03 83 e1 07 <42> 0f b6 14 32 38 ca 7f 08 84 d2 0f 85 06 10 00 00 48 89 d8 48 89
RSP: 0018:ffffc900088bf860 EFLAGS: 00010246
RAX: 0000000000040000 RBX: ffffffff9b9bebc0 RCX: 0000000000000000
RDX: 1ffffffff3ac6000 RSI: ffffc90002d82000 RDI: ffffc900088bf9e8
RBP: ffffffff9d630001 R08: 0000000000000000 R09: ffffc900088bf9e8
R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000001
R13: ffffffff9d630000 R14: dffffc0000000000 R15: ffffffff9d630000
FS: 00007f63eef63640(0000) GS:ffff88806d000000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: fffffbfff3ac6000 CR3: 0000000029d90005 CR4: 0000000000770ef0
PKRU: 55555554
==================================================================
Link: https://lkml.kernel.org/r/20220907200917.654103-1-lk@c--e.de
cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
cc: "David S. Miller" <davem@davemloft.net>
Cc: stable@vger.kernel.org
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Christian A. Ehrhardt <lk@c--e.de>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2022-09-07 23:09:17 +03:00
in_gate_area_no_mm ( ( unsigned long ) p - > addr ) | |
2014-04-17 12:17:05 +04:00
within_kprobe_blacklist ( ( unsigned long ) p - > addr ) | |
2019-09-03 14:08:21 +03:00
jump_label_text_reserved ( p - > addr , p - > addr ) | |
2021-06-28 14:24:12 +03:00
static_call_text_reserved ( p - > addr , p - > addr ) | |
2023-07-11 04:50:47 +03:00
find_bug ( ( unsigned long ) p - > addr ) | |
is_cfi_preamble_symbol ( ( unsigned long ) p - > addr ) ) {
2012-03-06 02:59:12 +04:00
ret = - EINVAL ;
2012-06-05 14:28:20 +04:00
goto out ;
2012-03-06 02:59:12 +04:00
}
2005-12-12 11:37:00 +03:00
2021-09-14 17:39:34 +03:00
/* Check if 'p' is probing a module. */
2012-06-05 14:28:20 +04:00
* probed_mod = __module_text_address ( ( unsigned long ) p - > addr ) ;
if ( * probed_mod ) {
2007-05-08 11:34:13 +04:00
/*
2009-01-07 01:41:52 +03:00
* We must hold a refcount of the probed module while updating
* its code to prohibit unexpected unloading .
2006-01-11 23:17:41 +03:00
*/
2012-06-05 14:28:20 +04:00
if ( unlikely ( ! try_module_get ( * probed_mod ) ) ) {
ret = - ENOENT ;
goto out ;
}
2010-10-18 18:38:58 +04:00
2009-01-07 01:41:55 +03:00
/*
2021-09-14 17:39:34 +03:00
* If the module freed ' . init . text ' , we couldn ' t insert
2009-01-07 01:41:55 +03:00
* kprobes in there .
*/
2012-06-05 14:28:20 +04:00
if ( within_module_init ( ( unsigned long ) p - > addr , * probed_mod ) & &
( * probed_mod ) - > state ! = MODULE_STATE_COMING ) {
module_put ( * probed_mod ) ;
* probed_mod = NULL ;
ret = - ENOENT ;
2009-01-07 01:41:55 +03:00
}
2006-01-11 23:17:41 +03:00
}
2012-06-05 14:28:20 +04:00
out :
2008-11-13 00:26:51 +03:00
preempt_enable ( ) ;
2010-10-18 18:38:58 +04:00
jump_label_unlock ( ) ;
2005-04-17 02:20:36 +04:00
2012-06-05 14:28:20 +04:00
return ret ;
}
2014-04-17 12:17:54 +04:00
int register_kprobe ( struct kprobe * p )
2012-06-05 14:28:20 +04:00
{
int ret ;
struct kprobe * old_p ;
struct module * probed_mod ;
kprobe_opcode_t * addr ;
2022-09-26 18:33:35 +03:00
bool on_func_entry ;
2012-06-05 14:28:20 +04:00
/* Adjust probe address from symbol */
2022-09-26 18:33:35 +03:00
addr = _kprobe_addr ( p - > addr , p - > symbol_name , p - > offset , & on_func_entry ) ;
2012-06-05 14:28:20 +04:00
if ( IS_ERR ( addr ) )
return PTR_ERR ( addr ) ;
p - > addr = addr ;
2021-02-03 17:59:27 +03:00
ret = warn_kprobe_rereg ( p ) ;
2012-06-05 14:28:20 +04:00
if ( ret )
return ret ;
/* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
p - > flags & = KPROBE_FLAG_DISABLED ;
2005-11-07 12:00:13 +03:00
p - > nmissed = 0 ;
2008-04-28 13:14:28 +04:00
INIT_LIST_HEAD ( & p - > list ) ;
2010-02-25 16:34:07 +03:00
2012-06-05 14:28:20 +04:00
ret = check_kprobe_address_safe ( p , & probed_mod ) ;
if ( ret )
return ret ;
mutex_lock ( & kprobe_mutex ) ;
2010-02-25 16:34:07 +03:00
2022-09-26 18:33:35 +03:00
if ( on_func_entry )
p - > flags | = KPROBE_FLAG_ON_FUNC_ENTRY ;
2005-05-06 03:15:42 +04:00
old_p = get_kprobe ( p - > addr ) ;
if ( old_p ) {
2021-09-14 17:39:34 +03:00
/* Since this may unoptimize 'old_p', locking 'text_mutex'. */
2005-05-06 03:15:42 +04:00
ret = register_aggr_kprobe ( old_p , p ) ;
2005-04-17 02:20:36 +04:00
goto out ;
}
2017-05-24 11:15:36 +03:00
cpus_read_lock ( ) ;
/* Prevent text modification */
mutex_lock ( & text_mutex ) ;
2012-06-05 14:28:32 +04:00
ret = prepare_kprobe ( p ) ;
2012-06-05 14:28:26 +04:00
mutex_unlock ( & text_mutex ) ;
2017-05-24 11:15:36 +03:00
cpus_read_unlock ( ) ;
2007-05-08 11:34:13 +04:00
if ( ret )
2010-02-25 16:34:07 +03:00
goto out ;
2006-01-10 07:52:43 +03:00
2005-05-06 03:15:42 +04:00
INIT_HLIST_NODE ( & p - > hlist ) ;
2005-11-07 12:00:13 +03:00
hlist_add_head_rcu ( & p - > hlist ,
2005-04-17 02:20:36 +04:00
& kprobe_table [ hash_ptr ( p - > addr , KPROBE_HASH_BITS ) ] ) ;
2018-01-10 02:51:23 +03:00
if ( ! kprobes_all_disarmed & & ! kprobe_disabled ( p ) ) {
ret = arm_kprobe ( p ) ;
if ( ret ) {
hlist_del_rcu ( & p - > hlist ) ;
2018-11-07 06:04:39 +03:00
synchronize_rcu ( ) ;
2018-01-10 02:51:23 +03:00
goto out ;
}
}
2010-02-25 16:34:07 +03:00
/* Try to optimize kprobe */
try_to_optimize_kprobe ( p ) ;
2005-04-17 02:20:36 +04:00
out :
2006-03-23 14:00:35 +03:00
mutex_unlock ( & kprobe_mutex ) ;
2006-01-10 07:52:43 +03:00
2009-01-07 01:41:52 +03:00
if ( probed_mod )
2006-01-11 23:17:41 +03:00
module_put ( probed_mod ) ;
2009-01-07 01:41:52 +03:00
2005-04-17 02:20:36 +04:00
return ret ;
}
2009-04-07 06:00:59 +04:00
EXPORT_SYMBOL_GPL ( register_kprobe ) ;
2005-04-17 02:20:36 +04:00
2021-09-14 17:39:34 +03:00
/* Check if all probes on the 'ap' are disabled. */
2021-09-14 17:40:16 +03:00
static bool aggr_kprobe_disabled ( struct kprobe * ap )
2010-12-03 12:53:57 +03:00
{
struct kprobe * kp ;
2020-05-12 11:02:44 +03:00
lockdep_assert_held ( & kprobe_mutex ) ;
list_for_each_entry ( kp , & ap - > list , list )
2010-12-03 12:53:57 +03:00
if ( ! kprobe_disabled ( kp ) )
/*
2021-09-14 17:39:34 +03:00
* Since there is an active probe on the list ,
* we can ' t disable this ' ap ' .
2010-12-03 12:53:57 +03:00
*/
2021-09-14 17:40:16 +03:00
return false ;
2010-12-03 12:53:57 +03:00
2021-09-14 17:40:16 +03:00
return true ;
2010-12-03 12:53:57 +03:00
}
2014-04-17 12:17:54 +04:00
static struct kprobe * __disable_kprobe ( struct kprobe * p )
2010-12-03 12:53:57 +03:00
{
struct kprobe * orig_p ;
2018-01-10 02:51:24 +03:00
int ret ;
2010-12-03 12:53:57 +03:00
2021-09-14 17:39:55 +03:00
lockdep_assert_held ( & kprobe_mutex ) ;
2010-12-03 12:53:57 +03:00
/* Get an original kprobe for return */
orig_p = __get_valid_kprobe ( p ) ;
if ( unlikely ( orig_p = = NULL ) )
2018-01-10 02:51:24 +03:00
return ERR_PTR ( - EINVAL ) ;
2010-12-03 12:53:57 +03:00
if ( ! kprobe_disabled ( p ) ) {
/* Disable probe if it is a child probe */
if ( p ! = orig_p )
p - > flags | = KPROBE_FLAG_DISABLED ;
/* Try to disarm and disable this/parent probe */
if ( p = = orig_p | | aggr_kprobe_disabled ( orig_p ) ) {
2015-02-14 01:40:26 +03:00
/*
kprobes: don't call disarm_kprobe() for disabled kprobes
The assumption in __disable_kprobe() is wrong, and it could try to disarm
an already disarmed kprobe and fire the WARN_ONCE() below. [0] We can
easily reproduce this issue.
1. Write 0 to /sys/kernel/debug/kprobes/enabled.
# echo 0 > /sys/kernel/debug/kprobes/enabled
2. Run execsnoop. At this time, one kprobe is disabled.
# /usr/share/bcc/tools/execsnoop &
[1] 2460
PCOMM PID PPID RET ARGS
# cat /sys/kernel/debug/kprobes/list
ffffffff91345650 r __x64_sys_execve+0x0 [FTRACE]
ffffffff91345650 k __x64_sys_execve+0x0 [DISABLED][FTRACE]
3. Write 1 to /sys/kernel/debug/kprobes/enabled, which changes
kprobes_all_disarmed to false but does not arm the disabled kprobe.
# echo 1 > /sys/kernel/debug/kprobes/enabled
# cat /sys/kernel/debug/kprobes/list
ffffffff91345650 r __x64_sys_execve+0x0 [FTRACE]
ffffffff91345650 k __x64_sys_execve+0x0 [DISABLED][FTRACE]
4. Kill execsnoop, when __disable_kprobe() calls disarm_kprobe() for the
disabled kprobe and hits the WARN_ONCE() in __disarm_kprobe_ftrace().
# fg
/usr/share/bcc/tools/execsnoop
^C
Actually, WARN_ONCE() is fired twice, and __unregister_kprobe_top() misses
some cleanups and leaves the aggregated kprobe in the hash table. Then,
__unregister_trace_kprobe() initialises tk->rp.kp.list and creates an
infinite loop like this.
aggregated kprobe.list -> kprobe.list -.
^ |
'.__.'
In this situation, these commands fall into the infinite loop and result
in RCU stall or soft lockup.
cat /sys/kernel/debug/kprobes/list : show_kprobe_addr() enters into the
infinite loop with RCU.
/usr/share/bcc/tools/execsnoop : warn_kprobe_rereg() holds kprobe_mutex,
and __get_valid_kprobe() is stuck in
the loop.
To avoid the issue, make sure we don't call disarm_kprobe() for disabled
kprobes.
[0]
Failed to disarm kprobe-ftrace at __x64_sys_execve+0x0/0x40 (error -2)
WARNING: CPU: 6 PID: 2460 at kernel/kprobes.c:1130 __disarm_kprobe_ftrace.isra.19 (kernel/kprobes.c:1129)
Modules linked in: ena
CPU: 6 PID: 2460 Comm: execsnoop Not tainted 5.19.0+ #28
Hardware name: Amazon EC2 c5.2xlarge/, BIOS 1.0 10/16/2017
RIP: 0010:__disarm_kprobe_ftrace.isra.19 (kernel/kprobes.c:1129)
Code: 24 8b 02 eb c1 80 3d c4 83 f2 01 00 75 d4 48 8b 75 00 89 c2 48 c7 c7 90 fa 0f 92 89 04 24 c6 05 ab 83 01 e8 e4 94 f0 ff <0f> 0b 8b 04 24 eb b1 89 c6 48 c7 c7 60 fa 0f 92 89 04 24 e8 cc 94
RSP: 0018:ffff9e6ec154bd98 EFLAGS: 00010282
RAX: 0000000000000000 RBX: ffffffff930f7b00 RCX: 0000000000000001
RDX: 0000000080000001 RSI: ffffffff921461c5 RDI: 00000000ffffffff
RBP: ffff89c504286da8 R08: 0000000000000000 R09: c0000000fffeffff
R10: 0000000000000000 R11: ffff9e6ec154bc28 R12: ffff89c502394e40
R13: ffff89c502394c00 R14: ffff9e6ec154bc00 R15: 0000000000000000
FS: 00007fe800398740(0000) GS:ffff89c812d80000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000000c00057f010 CR3: 0000000103b54006 CR4: 00000000007706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
<TASK>
__disable_kprobe (kernel/kprobes.c:1716)
disable_kprobe (kernel/kprobes.c:2392)
__disable_trace_kprobe (kernel/trace/trace_kprobe.c:340)
disable_trace_kprobe (kernel/trace/trace_kprobe.c:429)
perf_trace_event_unreg.isra.2 (./include/linux/tracepoint.h:93 kernel/trace/trace_event_perf.c:168)
perf_kprobe_destroy (kernel/trace/trace_event_perf.c:295)
_free_event (kernel/events/core.c:4971)
perf_event_release_kernel (kernel/events/core.c:5176)
perf_release (kernel/events/core.c:5186)
__fput (fs/file_table.c:321)
task_work_run (./include/linux/sched.h:2056 (discriminator 1) kernel/task_work.c:179 (discriminator 1))
exit_to_user_mode_prepare (./include/linux/resume_user_mode.h:49 kernel/entry/common.c:169 kernel/entry/common.c:201)
syscall_exit_to_user_mode (./arch/x86/include/asm/jump_label.h:55 ./arch/x86/include/asm/nospec-branch.h:384 ./arch/x86/include/asm/entry-common.h:94 kernel/entry/common.c:133 kernel/entry/common.c:296)
do_syscall_64 (arch/x86/entry/common.c:87)
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)
RIP: 0033:0x7fe7ff210654
Code: 15 79 89 20 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb be 0f 1f 00 8b 05 9a cd 20 00 48 63 ff 85 c0 75 11 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 3a f3 c3 48 83 ec 18 48 89 7c 24 08 e8 34 fc
RSP: 002b:00007ffdbd1d3538 EFLAGS: 00000246 ORIG_RAX: 0000000000000003
RAX: 0000000000000000 RBX: 0000000000000008 RCX: 00007fe7ff210654
RDX: 0000000000000000 RSI: 0000000000002401 RDI: 0000000000000008
RBP: 0000000000000000 R08: 94ae31d6fda838a4 R0900007fe8001c9d30
R10: 00007ffdbd1d34b0 R11: 0000000000000246 R12: 00007ffdbd1d3600
R13: 0000000000000000 R14: fffffffffffffffc R15: 00007ffdbd1d3560
</TASK>
Link: https://lkml.kernel.org/r/20220813020509.90805-1-kuniyu@amazon.com
Fixes: 69d54b916d83 ("kprobes: makes kprobes/enabled works correctly for optimized kprobes.")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reported-by: Ayushman Dutta <ayudutta@amazon.com>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: Kuniyuki Iwashima <kuniyu@amazon.com>
Cc: Kuniyuki Iwashima <kuni1840@gmail.com>
Cc: Ayushman Dutta <ayudutta@amazon.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-08-13 05:05:09 +03:00
* Don ' t be lazy here . Even if ' kprobes_all_disarmed '
* is false , ' orig_p ' might not have been armed yet .
* Note arm_all_kprobes ( ) __tries__ to arm all kprobes
* on the best effort basis .
2015-02-14 01:40:26 +03:00
*/
kprobes: don't call disarm_kprobe() for disabled kprobes
The assumption in __disable_kprobe() is wrong, and it could try to disarm
an already disarmed kprobe and fire the WARN_ONCE() below. [0] We can
easily reproduce this issue.
1. Write 0 to /sys/kernel/debug/kprobes/enabled.
# echo 0 > /sys/kernel/debug/kprobes/enabled
2. Run execsnoop. At this time, one kprobe is disabled.
# /usr/share/bcc/tools/execsnoop &
[1] 2460
PCOMM PID PPID RET ARGS
# cat /sys/kernel/debug/kprobes/list
ffffffff91345650 r __x64_sys_execve+0x0 [FTRACE]
ffffffff91345650 k __x64_sys_execve+0x0 [DISABLED][FTRACE]
3. Write 1 to /sys/kernel/debug/kprobes/enabled, which changes
kprobes_all_disarmed to false but does not arm the disabled kprobe.
# echo 1 > /sys/kernel/debug/kprobes/enabled
# cat /sys/kernel/debug/kprobes/list
ffffffff91345650 r __x64_sys_execve+0x0 [FTRACE]
ffffffff91345650 k __x64_sys_execve+0x0 [DISABLED][FTRACE]
4. Kill execsnoop, when __disable_kprobe() calls disarm_kprobe() for the
disabled kprobe and hits the WARN_ONCE() in __disarm_kprobe_ftrace().
# fg
/usr/share/bcc/tools/execsnoop
^C
Actually, WARN_ONCE() is fired twice, and __unregister_kprobe_top() misses
some cleanups and leaves the aggregated kprobe in the hash table. Then,
__unregister_trace_kprobe() initialises tk->rp.kp.list and creates an
infinite loop like this.
aggregated kprobe.list -> kprobe.list -.
^ |
'.__.'
In this situation, these commands fall into the infinite loop and result
in RCU stall or soft lockup.
cat /sys/kernel/debug/kprobes/list : show_kprobe_addr() enters into the
infinite loop with RCU.
/usr/share/bcc/tools/execsnoop : warn_kprobe_rereg() holds kprobe_mutex,
and __get_valid_kprobe() is stuck in
the loop.
To avoid the issue, make sure we don't call disarm_kprobe() for disabled
kprobes.
[0]
Failed to disarm kprobe-ftrace at __x64_sys_execve+0x0/0x40 (error -2)
WARNING: CPU: 6 PID: 2460 at kernel/kprobes.c:1130 __disarm_kprobe_ftrace.isra.19 (kernel/kprobes.c:1129)
Modules linked in: ena
CPU: 6 PID: 2460 Comm: execsnoop Not tainted 5.19.0+ #28
Hardware name: Amazon EC2 c5.2xlarge/, BIOS 1.0 10/16/2017
RIP: 0010:__disarm_kprobe_ftrace.isra.19 (kernel/kprobes.c:1129)
Code: 24 8b 02 eb c1 80 3d c4 83 f2 01 00 75 d4 48 8b 75 00 89 c2 48 c7 c7 90 fa 0f 92 89 04 24 c6 05 ab 83 01 e8 e4 94 f0 ff <0f> 0b 8b 04 24 eb b1 89 c6 48 c7 c7 60 fa 0f 92 89 04 24 e8 cc 94
RSP: 0018:ffff9e6ec154bd98 EFLAGS: 00010282
RAX: 0000000000000000 RBX: ffffffff930f7b00 RCX: 0000000000000001
RDX: 0000000080000001 RSI: ffffffff921461c5 RDI: 00000000ffffffff
RBP: ffff89c504286da8 R08: 0000000000000000 R09: c0000000fffeffff
R10: 0000000000000000 R11: ffff9e6ec154bc28 R12: ffff89c502394e40
R13: ffff89c502394c00 R14: ffff9e6ec154bc00 R15: 0000000000000000
FS: 00007fe800398740(0000) GS:ffff89c812d80000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000000c00057f010 CR3: 0000000103b54006 CR4: 00000000007706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
<TASK>
__disable_kprobe (kernel/kprobes.c:1716)
disable_kprobe (kernel/kprobes.c:2392)
__disable_trace_kprobe (kernel/trace/trace_kprobe.c:340)
disable_trace_kprobe (kernel/trace/trace_kprobe.c:429)
perf_trace_event_unreg.isra.2 (./include/linux/tracepoint.h:93 kernel/trace/trace_event_perf.c:168)
perf_kprobe_destroy (kernel/trace/trace_event_perf.c:295)
_free_event (kernel/events/core.c:4971)
perf_event_release_kernel (kernel/events/core.c:5176)
perf_release (kernel/events/core.c:5186)
__fput (fs/file_table.c:321)
task_work_run (./include/linux/sched.h:2056 (discriminator 1) kernel/task_work.c:179 (discriminator 1))
exit_to_user_mode_prepare (./include/linux/resume_user_mode.h:49 kernel/entry/common.c:169 kernel/entry/common.c:201)
syscall_exit_to_user_mode (./arch/x86/include/asm/jump_label.h:55 ./arch/x86/include/asm/nospec-branch.h:384 ./arch/x86/include/asm/entry-common.h:94 kernel/entry/common.c:133 kernel/entry/common.c:296)
do_syscall_64 (arch/x86/entry/common.c:87)
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)
RIP: 0033:0x7fe7ff210654
Code: 15 79 89 20 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb be 0f 1f 00 8b 05 9a cd 20 00 48 63 ff 85 c0 75 11 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 3a f3 c3 48 83 ec 18 48 89 7c 24 08 e8 34 fc
RSP: 002b:00007ffdbd1d3538 EFLAGS: 00000246 ORIG_RAX: 0000000000000003
RAX: 0000000000000000 RBX: 0000000000000008 RCX: 00007fe7ff210654
RDX: 0000000000000000 RSI: 0000000000002401 RDI: 0000000000000008
RBP: 0000000000000000 R08: 94ae31d6fda838a4 R09: 00007fe8001c9d30
R10: 00007ffdbd1d34b0 R11: 0000000000000246 R12: 00007ffdbd1d3600
R13: 0000000000000000 R14: fffffffffffffffc R15: 00007ffdbd1d3560
</TASK>
Link: https://lkml.kernel.org/r/20220813020509.90805-1-kuniyu@amazon.com
Fixes: 69d54b916d83 ("kprobes: makes kprobes/enabled works correctly for optimized kprobes.")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reported-by: Ayushman Dutta <ayudutta@amazon.com>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: Kuniyuki Iwashima <kuniyu@amazon.com>
Cc: Kuniyuki Iwashima <kuni1840@gmail.com>
Cc: Ayushman Dutta <ayudutta@amazon.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-08-13 05:05:09 +03:00
if ( ! kprobes_all_disarmed & & ! kprobe_disabled ( orig_p ) ) {
2018-01-10 02:51:24 +03:00
ret = disarm_kprobe ( orig_p , true ) ;
if ( ret ) {
p - > flags & = ~ KPROBE_FLAG_DISABLED ;
return ERR_PTR ( ret ) ;
}
}
2010-12-03 12:53:57 +03:00
orig_p - > flags | = KPROBE_FLAG_DISABLED ;
}
}
return orig_p ;
}
2009-04-07 06:01:02 +04:00
/*
* Unregister a kprobe without a scheduler synchronization .
*/
2014-04-17 12:17:54 +04:00
/*
 * First stage of unregistering 'p': disable the probe and unlink it from the
 * structures it is reachable through. unregister_kprobes() calls this with
 * kprobe_mutex held and only runs __unregister_kprobe_bottom() after a
 * synchronize_rcu().
 *
 * Returns 0 on success, or the error code extracted with PTR_ERR() when
 * __disable_kprobe() hands back an error pointer.
 */
static int __unregister_kprobe_top ( struct kprobe * p )
2009-04-07 06:01:02 +04:00
{
2010-12-03 12:53:50 +03:00
struct kprobe * ap , * list_p ;
2009-04-07 06:01:02 +04:00
2010-12-03 12:53:57 +03:00
/* Disable kprobe. This will disarm it if needed. */
ap = __disable_kprobe ( p ) ;
2018-01-10 02:51:24 +03:00
if ( IS_ERR ( ap ) )
return PTR_ERR ( ap ) ;
2009-04-07 06:01:02 +04:00
2010-12-03 12:53:57 +03:00
/* __disable_kprobe() returned 'p' itself, so there is no separate aggrprobe. */
if ( ap = = p )
2007-05-08 11:34:16 +04:00
/*
2010-12-03 12:53:57 +03:00
* This probe is an independent ( and non - optimized ) kprobe
* ( not an aggrprobe ) . Remove from the hash list .
2007-05-08 11:34:16 +04:00
*/
2010-12-03 12:53:57 +03:00
goto disarmed ;
/* Following process expects this probe is an aggrprobe */
WARN_ON ( ! kprobe_aggrprobe ( ap ) ) ;
2010-12-03 12:54:09 +03:00
/* Last entry on the aggrprobe's list and already disarmed: drop the aggrprobe itself. */
if ( list_is_singular ( & ap - > list ) & & kprobe_disarmed ( ap ) )
/*
* ! disarmed could happen if the probe is under delayed
* unoptimizing .
*/
2010-12-03 12:53:57 +03:00
goto disarmed ;
else {
/* If disabling probe has special handlers, update aggrprobe */
2009-01-07 01:41:52 +03:00
if ( p - > post_handler & & ! kprobe_gone ( p ) ) {
2020-05-12 11:02:44 +03:00
/* Keep ap->post_handler if any other entry on the list still has one. */
list_for_each_entry ( list_p , & ap - > list , list ) {
2008-04-28 13:14:28 +04:00
if ( ( list_p ! = p ) & & ( list_p - > post_handler ) )
goto noclean ;
}
2022-11-18 04:15:34 +03:00
/*
* For the kprobe - on - ftrace case , we keep the
* post_handler setting to identify this aggrprobe
* armed with kprobe_ipmodify_ops .
*/
if ( ! kprobe_ftrace ( ap ) )
ap - > post_handler = NULL ;
2008-04-28 13:14:28 +04:00
}
noclean :
2010-12-03 12:53:57 +03:00
/*
* Remove from the aggrprobe : this path will do nothing in
* __unregister_kprobe_bottom ( ) .
*/
2006-01-10 07:52:43 +03:00
list_del_rcu ( & p - > list ) ;
2010-12-03 12:53:57 +03:00
if ( ! kprobe_disabled ( ap ) & & ! kprobes_all_disarmed )
/*
* Try to optimize this probe again , because post
* handler may have been changed .
*/
optimize_kprobe ( ap ) ;
2006-01-10 07:52:43 +03:00
}
2008-04-28 13:14:28 +04:00
return 0 ;
2010-12-03 12:53:57 +03:00
/* Reached when 'ap' (which may be 'p' itself) has to leave the hash list. */
disarmed :
hlist_del_rcu ( & ap - > hlist ) ;
return 0 ;
2008-04-28 13:14:28 +04:00
}
2005-11-07 12:00:13 +03:00
2014-04-17 12:17:54 +04:00
static void __unregister_kprobe_bottom ( struct kprobe * p )
2008-04-28 13:14:28 +04:00
{
2010-12-03 12:53:50 +03:00
struct kprobe * ap ;
2005-12-12 11:37:00 +03:00
2009-01-07 01:41:52 +03:00
if ( list_empty ( & p - > list ) )
2010-12-03 12:54:09 +03:00
/* This is an independent kprobe */
2006-01-10 07:52:46 +03:00
arch_remove_kprobe ( p ) ;
2009-01-07 01:41:52 +03:00
else if ( list_is_singular ( & p - > list ) ) {
2010-12-03 12:54:09 +03:00
/* This is the last child of an aggrprobe */
2010-12-03 12:53:50 +03:00
ap = list_entry ( p - > list . next , struct kprobe , list ) ;
2009-01-07 01:41:52 +03:00
list_del ( & p - > list ) ;
2010-12-03 12:53:50 +03:00
free_aggr_kprobe ( ap ) ;
2008-04-28 13:14:28 +04:00
}
2010-12-03 12:54:09 +03:00
/* Otherwise, do nothing. */
2008-04-28 13:14:28 +04:00
}
2014-04-17 12:17:54 +04:00
int register_kprobes ( struct kprobe * * kps , int num )
2008-04-28 13:14:28 +04:00
{
int i , ret = 0 ;
if ( num < = 0 )
return - EINVAL ;
for ( i = 0 ; i < num ; i + + ) {
2009-01-07 01:41:53 +03:00
ret = register_kprobe ( kps [ i ] ) ;
2008-06-13 02:21:35 +04:00
if ( ret < 0 ) {
if ( i > 0 )
unregister_kprobes ( kps , i ) ;
2008-04-28 13:14:28 +04:00
break ;
2006-06-26 11:25:22 +04:00
}
2006-01-10 07:52:43 +03:00
}
2008-04-28 13:14:28 +04:00
return ret ;
}
2009-04-07 06:00:59 +04:00
EXPORT_SYMBOL_GPL ( register_kprobes ) ;
2008-04-28 13:14:28 +04:00
2014-04-17 12:17:54 +04:00
/* Unregister a single probe by passing it to unregister_kprobes() as a one-element array. */
void unregister_kprobe(struct kprobe *p)
{
	struct kprobe **single = &p;

	unregister_kprobes(single, 1);
}
EXPORT_SYMBOL_GPL(unregister_kprobe);
2008-04-28 13:14:28 +04:00
2014-04-17 12:17:54 +04:00
void unregister_kprobes ( struct kprobe * * kps , int num )
2008-04-28 13:14:28 +04:00
{
int i ;
if ( num < = 0 )
return ;
mutex_lock ( & kprobe_mutex ) ;
for ( i = 0 ; i < num ; i + + )
if ( __unregister_kprobe_top ( kps [ i ] ) < 0 )
kps [ i ] - > addr = NULL ;
mutex_unlock ( & kprobe_mutex ) ;
2018-11-07 06:04:39 +03:00
synchronize_rcu ( ) ;
2008-04-28 13:14:28 +04:00
for ( i = 0 ; i < num ; i + + )
if ( kps [ i ] - > addr )
__unregister_kprobe_bottom ( kps [ i ] ) ;
2005-04-17 02:20:36 +04:00
}
2009-04-07 06:00:59 +04:00
EXPORT_SYMBOL_GPL ( unregister_kprobes ) ;
2005-04-17 02:20:36 +04:00
2017-03-08 20:04:15 +03:00
/*
 * Weak default notifier callback: ignores all of its arguments and reports
 * NOTIFY_DONE.
 */
int __weak kprobe_exceptions_notify(struct notifier_block *self,
				    unsigned long val, void *data)
{
	return NOTIFY_DONE;
}
NOKPROBE_SYMBOL(kprobe_exceptions_notify);
2017-02-07 22:54:14 +03:00
2005-04-17 02:20:36 +04:00
static struct notifier_block kprobe_exceptions_nb = {
2006-06-26 11:25:28 +04:00
. notifier_call = kprobe_exceptions_notify ,
. priority = 0x7fffffff /* we need to be notified first */
} ;
2008-03-05 01:28:37 +03:00
# ifdef CONFIG_KRETPROBES
2020-08-29 16:00:01 +03:00
2022-03-26 05:27:05 +03:00
# if !defined(CONFIG_KRETPROBE_ON_RETHOOK)
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 16:56:52 +03:00
/* callbacks for objpool of kretprobe instances */
static int kretprobe_init_inst ( void * nod , void * context )
{
struct kretprobe_instance * ri = nod ;
ri - > rph = context ;
return 0 ;
}
/*
 * Pool release callback: kfree() the 'context' pointer. 'head' is not used.
 * Always returns 0.
 */
static int kretprobe_fini_pool(struct objpool_head *head, void *context)
{
	kfree(context);

	return 0;
}
2022-05-04 06:36:31 +03:00
static void free_rp_inst_rcu ( struct rcu_head * head )
{
struct kretprobe_instance * ri = container_of ( head , struct kretprobe_instance , rcu ) ;
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 16:56:52 +03:00
struct kretprobe_holder * rph = ri - > rph ;
2022-05-04 06:36:31 +03:00
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 16:56:52 +03:00
objpool_drop ( ri , & rph - > pool ) ;
2022-05-04 06:36:31 +03:00
}
NOKPROBE_SYMBOL ( free_rp_inst_rcu ) ;
static void recycle_rp_inst ( struct kretprobe_instance * ri )
{
struct kretprobe * rp = get_kretprobe ( ri ) ;
if ( likely ( rp ) )
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 16:56:52 +03:00
objpool_push ( ri , & rp - > rph - > pool ) ;
2022-05-04 06:36:31 +03:00
else
call_rcu ( & ri - > rcu , free_rp_inst_rcu ) ;
}
NOKPROBE_SYMBOL ( recycle_rp_inst ) ;
/*
* This function is called from delayed_put_task_struct ( ) when a task is
* dead and cleaned up to recycle any kretprobe instances associated with
* this task . These left over instances represent probed functions that
* have been called but will never return .
*/
void kprobe_flush_task ( struct task_struct * tk )
{
struct kretprobe_instance * ri ;
struct llist_node * node ;
/* Early boot, not yet initialized. */
if ( unlikely ( ! kprobes_initialized ) )
return ;
kprobe_busy_begin ( ) ;
node = __llist_del_all ( & tk - > kretprobe_instances ) ;
while ( node ) {
ri = container_of ( node , struct kretprobe_instance , llist ) ;
node = node - > next ;
recycle_rp_inst ( ri ) ;
}
kprobe_busy_end ( ) ;
}
NOKPROBE_SYMBOL ( kprobe_flush_task ) ;
static inline void free_rp_inst ( struct kretprobe * rp )
{
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 16:56:52 +03:00
struct kretprobe_holder * rph = rp - > rph ;
2022-05-04 06:36:31 +03:00
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 16:56:52 +03:00
if ( ! rph )
return ;
rp - > rph = NULL ;
objpool_fini ( & rph - > pool ) ;
2022-05-04 06:36:31 +03:00
}
2021-09-14 17:41:04 +03:00
/* This assumes that 'tsk' is the current task or a task that is not running. */
static kprobe_opcode_t * __kretprobe_find_ret_addr ( struct task_struct * tsk ,
struct llist_node * * cur )
2007-07-19 12:48:11 +04:00
{
2020-08-29 16:03:24 +03:00
struct kretprobe_instance * ri = NULL ;
2021-09-14 17:41:04 +03:00
struct llist_node * node = * cur ;
if ( ! node )
node = tsk - > kretprobe_instances . first ;
else
node = node - > next ;
2020-08-29 16:00:01 +03:00
2020-08-29 16:03:24 +03:00
while ( node ) {
ri = container_of ( node , struct kretprobe_instance , llist ) ;
2021-09-14 17:40:45 +03:00
if ( ri - > ret_addr ! = kretprobe_trampoline_addr ( ) ) {
2021-09-14 17:41:04 +03:00
* cur = node ;
return ri - > ret_addr ;
2020-08-29 16:03:24 +03:00
}
node = node - > next ;
2020-08-29 16:00:01 +03:00
}
2021-09-14 17:41:04 +03:00
return NULL ;
2007-07-19 12:48:11 +04:00
}
2021-09-14 17:41:04 +03:00
NOKPROBE_SYMBOL ( __kretprobe_find_ret_addr ) ;
2005-04-17 02:20:36 +04:00
2021-09-14 17:41:04 +03:00
/**
* kretprobe_find_ret_addr - - Find correct return address modified by kretprobe
* @ tsk : Target task
* @ fp : A frame pointer
* @ cur : a storage of the loop cursor llist_node pointer for next call
*
* Find the correct return address modified by a kretprobe on @ tsk in unsigned
* long type . If it finds the return address , this returns that address value ,
* or this returns 0.
* The @ tsk must be ' current ' or a task which is not running . @ fp is a hint
* to get the currect return address - which is compared with the
* kretprobe_instance : : fp field . The @ cur is a loop cursor for searching the
* kretprobe return addresses on the @ tsk . The ' * @ cur ' should be NULL at the
* first call , but ' @ cur ' itself must NOT NULL .
*/
unsigned long kretprobe_find_ret_addr ( struct task_struct * tsk , void * fp ,
struct llist_node * * cur )
{
struct kretprobe_instance * ri = NULL ;
kprobe_opcode_t * ret ;
if ( WARN_ON_ONCE ( ! cur ) )
return 0 ;
2020-08-29 16:00:01 +03:00
2021-09-14 17:41:04 +03:00
do {
ret = __kretprobe_find_ret_addr ( tsk , cur ) ;
if ( ! ret )
break ;
ri = container_of ( * cur , struct kretprobe_instance , llist ) ;
} while ( ri - > fp ! = fp ) ;
return ( unsigned long ) ret ;
}
NOKPROBE_SYMBOL ( kretprobe_find_ret_addr ) ;
2021-09-14 17:42:51 +03:00
/*
 * Weak default, intentionally empty. An architecture that can do so should
 * provide its own version that replaces the fake return address on the stack
 * with the correct one.
 */
void __weak arch_kretprobe_fixup_return(struct pt_regs *regs,
					kprobe_opcode_t *correct_ret_addr)
{
	/* Do nothing by default. */
}
2020-08-29 16:00:01 +03:00
/*
 * Handle a return through the kretprobe trampoline on the current task.
 *
 * Looks up the real return address with __kretprobe_find_ret_addr(), runs the
 * user handler of each kretprobe instance from the head of
 * current->kretprobe_instances up to and including the node that lookup
 * stopped at, then unlinks those nodes and recycles them.
 *
 * Returns the real return address as an unsigned long. If no address is
 * found, an error is logged and BUG_ON(1) is hit.
 */
unsigned long __kretprobe_trampoline_handler ( struct pt_regs * regs ,
void * frame_pointer )
{
2020-08-29 16:03:24 +03:00
struct kretprobe_instance * ri = NULL ;
2021-09-14 17:41:04 +03:00
struct llist_node * first , * node = NULL ;
2023-07-04 22:43:59 +03:00
kprobe_opcode_t * correct_ret_addr ;
2020-08-29 16:03:24 +03:00
struct kretprobe * rp ;
2020-08-29 16:00:01 +03:00
2021-09-14 17:41:04 +03:00
/* Find correct address and all nodes for this frame. */
correct_ret_addr = __kretprobe_find_ret_addr ( current , & node ) ;
if ( ! correct_ret_addr ) {
pr_err ( " kretprobe: Return address not found, not execute handler. Maybe there is a bug in the kernel. \n " ) ;
BUG_ON ( 1 ) ;
2020-08-29 16:00:01 +03:00
}
2021-09-14 17:42:12 +03:00
/*
* Set the return address as the instruction pointer , because if the
* user handler calls stack_trace_save_regs ( ) with this ' regs ' ,
* the stack trace will start from the instruction pointer .
*/
instruction_pointer_set ( regs , ( unsigned long ) correct_ret_addr ) ;
2020-08-29 16:00:01 +03:00
2021-09-14 17:41:04 +03:00
/* Run the user handler of the nodes. */
first = current - > kretprobe_instances . first ;
2020-08-29 16:03:24 +03:00
while ( first ) {
ri = container_of ( first , struct kretprobe_instance , llist ) ;
2021-09-14 17:41:04 +03:00
/* Every instance handled here is expected to carry this frame pointer. */
if ( WARN_ON_ONCE ( ri - > fp ! = frame_pointer ) )
break ;
2020-08-29 16:00:01 +03:00
2020-08-29 16:03:24 +03:00
rp = get_kretprobe ( ri ) ;
if ( rp & & rp - > handler ) {
2020-08-29 16:00:01 +03:00
/* Save the per-CPU current_kprobe and restore it after the handler ran. */
struct kprobe * prev = kprobe_running ( ) ;
2020-08-29 16:03:24 +03:00
__this_cpu_write ( current_kprobe , & rp - > kp ) ;
2020-08-29 16:00:01 +03:00
ri - > ret_addr = correct_ret_addr ;
2020-08-29 16:03:24 +03:00
rp - > handler ( ri , regs ) ;
2020-08-29 16:00:01 +03:00
__this_cpu_write ( current_kprobe , prev ) ;
}
2021-09-14 17:41:04 +03:00
/* 'node' is where the address lookup stopped; it is the last one to handle. */
if ( first = = node )
break ;
first = first - > next ;
}
2021-09-14 17:42:51 +03:00
arch_kretprobe_fixup_return ( regs , correct_ret_addr ) ;
2021-09-14 17:41:04 +03:00
/* Unlink all nodes for this frame. */
first = current - > kretprobe_instances . first ;
current - > kretprobe_instances . first = node - > next ;
/* Terminate the detached chain so the recycle loop below stops at 'node'. */
node - > next = NULL ;
/* Recycle free instances. */
while ( first ) {
ri = container_of ( first , struct kretprobe_instance , llist ) ;
/* Advance before recycling: 'ri' is handed back by recycle_rp_inst(). */
first = first - > next ;
2020-08-29 16:00:01 +03:00
2020-08-29 16:02:47 +03:00
recycle_rp_inst ( ri ) ;
2020-08-29 16:00:01 +03:00
}
return ( unsigned long ) correct_ret_addr ;
}
/* Terminated with ';' like every other NOKPROBE_SYMBOL() use in this file. */
NOKPROBE_SYMBOL(__kretprobe_trampoline_handler);
2006-02-03 14:03:42 +03:00
/*
* This kprobe pre_handler is registered with every kretprobe . When probe
* hits it will set up the return probe .
*/
2014-04-17 12:18:21 +04:00
static int pre_handler_kretprobe ( struct kprobe * p , struct pt_regs * regs )
2006-02-03 14:03:42 +03:00
{
struct kretprobe * rp = container_of ( p , struct kretprobe , kp ) ;
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 16:56:52 +03:00
struct kretprobe_holder * rph = rp - > rph ;
2008-07-25 12:46:04 +04:00
struct kretprobe_instance * ri ;
2006-02-03 14:03:42 +03:00
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Tests on a 96-core ARM64 system show similar results, with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 16:56:52 +03:00
ri = objpool_pop ( & rph - > pool ) ;
if ( ! ri ) {
2020-08-29 16:03:56 +03:00
rp - > nmissed + + ;
return 0 ;
}
2007-05-08 11:34:14 +04:00
2020-08-29 16:03:56 +03:00
if ( rp - > entry_handler & & rp - > entry_handler ( ri , regs ) ) {
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Tests on a 96-core ARM64 system show similar results, with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 16:56:52 +03:00
objpool_push ( ri , & rph - > pool ) ;
2020-08-29 16:03:56 +03:00
return 0 ;
2008-07-25 12:46:04 +04:00
}
2020-08-29 16:03:56 +03:00
arch_prepare_kretprobe ( ri , regs ) ;
__llist_add ( & ri - > llist , & current - > kretprobe_instances ) ;
2006-02-03 14:03:42 +03:00
return 0 ;
}
2014-04-17 12:18:21 +04:00
NOKPROBE_SYMBOL ( pre_handler_kretprobe ) ;
2022-03-26 05:27:05 +03:00
# else /* CONFIG_KRETPROBE_ON_RETHOOK */
/*
* This kprobe pre_handler is registered with every kretprobe . When probe
* hits it will set up the return probe .
*/
static int pre_handler_kretprobe ( struct kprobe * p , struct pt_regs * regs )
{
struct kretprobe * rp = container_of ( p , struct kretprobe , kp ) ;
struct kretprobe_instance * ri ;
struct rethook_node * rhn ;
rhn = rethook_try_get ( rp - > rh ) ;
if ( ! rhn ) {
rp - > nmissed + + ;
return 0 ;
}
ri = container_of ( rhn , struct kretprobe_instance , node ) ;
if ( rp - > entry_handler & & rp - > entry_handler ( ri , regs ) )
rethook_recycle ( rhn ) ;
else
rethook_hook ( rhn , regs , kprobe_ftrace ( p ) ) ;
return 0 ;
}
NOKPROBE_SYMBOL ( pre_handler_kretprobe ) ;
/*
 * Return-side rethook callback; register_kretprobe() passes it to
 * rethook_alloc() together with the kretprobe itself as @data.
 *
 * @rh:       rethook node embedded in a struct kretprobe_instance.
 * @data:     the owning struct kretprobe (cast back from void *).
 * @ret_addr: not used by this function.
 * @regs:     register state forwarded to the user's return handler.
 *
 * Does nothing when @data is NULL (after a one-time warning) or when the
 * kretprobe has no ->handler. Otherwise it calls rp->handler() with
 * current_kprobe pointing at rp->kp for the duration of the call.
 */
static void kretprobe_rethook_handler ( struct rethook_node * rh , void * data ,
2023-06-06 15:39:55 +03:00
unsigned long ret_addr ,
2022-03-26 05:27:05 +03:00
struct pt_regs * regs )
{
struct kretprobe * rp = ( struct kretprobe * ) data ;
struct kretprobe_instance * ri ;
struct kprobe_ctlblk * kcb ;
/* The data must NOT be null. This means rethook data structure is broken. */
/* Short-circuit: rp->handler is only read once data is known non-NULL. */
2022-04-22 19:40:27 +03:00
if ( WARN_ON_ONCE ( ! data ) | | ! rp - > handler )
2022-03-26 05:27:05 +03:00
return ;
/* Publish this kretprobe's kprobe as the per-CPU current_kprobe. */
__this_cpu_write ( current_kprobe , & rp - > kp ) ;
kcb = get_kprobe_ctlblk ( ) ;
kcb - > kprobe_status = KPROBE_HIT_ACTIVE ;
/* Recover the instance that embeds this rethook node, then run the handler. */
ri = container_of ( rh , struct kretprobe_instance , node ) ;
rp - > handler ( ri , regs ) ;
/* Handler finished: clear the per-CPU current_kprobe again. */
__this_cpu_write ( current_kprobe , NULL ) ;
}
NOKPROBE_SYMBOL ( kretprobe_rethook_handler ) ;
# endif /* !CONFIG_KRETPROBE_ON_RETHOOK */
2006-02-03 14:03:42 +03:00
2021-01-27 18:37:51 +03:00
/**
 * kprobe_on_func_entry() -- check whether given address is function entry
 * @addr: Target address
 * @sym: Target symbol name
 * @offset: The offset from the symbol or the address
 *
 * This checks whether the given @addr + @offset or @sym + @offset is on the
 * function entry address or not.
 * The caller must pass either @addr or @sym; the other one must be NULL,
 * otherwise this returns -EINVAL.
 *
 * Return: 0 if it is the function entry, -EINVAL if it is not, or -ENOENT
 * if the symbol or address lookup fails.
 */
int kprobe_on_func_entry ( kprobe_opcode_t * addr , const char * sym , unsigned long offset )
2017-03-08 11:26:06 +03:00
{
2022-03-08 18:30:32 +03:00
bool on_func_entry ;
/* Resolve the probe address; on_func_entry is filled in by _kprobe_addr(). */
kprobe_opcode_t * kp_addr = _kprobe_addr ( addr , sym , offset , & on_func_entry ) ;
2017-03-08 11:26:06 +03:00
/* Propagate a lookup failure to the caller as its errno value. */
if ( IS_ERR ( kp_addr ) )
2021-01-27 18:37:51 +03:00
return PTR_ERR ( kp_addr ) ;
2017-03-08 11:26:06 +03:00
2022-03-08 18:30:32 +03:00
/* The address resolved, but it is not reported as a function entry. */
if ( ! on_func_entry )
2021-01-27 18:37:51 +03:00
return - EINVAL ;
return 0 ;
2017-03-08 11:26:06 +03:00
}
2014-04-17 12:17:54 +04:00
int register_kretprobe ( struct kretprobe * rp )
[PATCH] kprobes: function-return probes
This patch adds function-return probes to kprobes for the i386
architecture. This enables you to establish a handler to be run when a
function returns.
1. API
Two new functions are added to kprobes:
int register_kretprobe(struct kretprobe *rp);
void unregister_kretprobe(struct kretprobe *rp);
2. Registration and unregistration
2.1 Register
To register a function-return probe, the user populates the following
fields in a kretprobe object and calls register_kretprobe() with the
kretprobe address as an argument:
kp.addr - the function's address
handler - this function is run after the ret instruction executes, but
before control returns to the return address in the caller.
maxactive - The maximum number of instances of the probed function that
can be active concurrently. For example, if the function is non-
recursive and is called with a spinlock or mutex held, maxactive = 1
should be enough. If the function is non-recursive and can never
relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should
be enough. maxactive is used to determine how many kretprobe_instance
objects to allocate for this particular probed function. If maxactive <=
0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 *
NR_CPUS) else maxactive=NR_CPUS)
For example:
struct kretprobe rp;
rp.kp.addr = /* entrypoint address */
rp.handler = /*return probe handler */
rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */
register_kretprobe(&rp);
The following field may also be of interest:
nmissed - Initialized to zero when the function-return probe is
registered, and incremented every time the probed function is entered but
there is no kretprobe_instance object available for establishing the
function-return probe (i.e., because maxactive was set too low).
2.2 Unregister
To unregister a function-return probe, the user calls
unregister_kretprobe() with the same kretprobe object as registered
previously. If a probed function is running when the return probe is
unregistered, the function will return as expected, but the handler won't
be run.
3. Limitations
3.1 This patch supports only the i386 architecture, but patches for
x86_64 and ppc64 are anticipated soon.
3.2 Return probes operate by replacing the return address in the stack
(or in a known register, such as the lr register for ppc). This may
cause __builtin_return_address(0), when invoked from the return-probed
function, to return the address of the return-probes trampoline.
3.3 This implementation uses the "Multiprobes at an address" feature in
2.6.12-rc3-mm3.
3.4 Due to a limitation in multi-probes, you cannot currently establish
a return probe and a jprobe on the same function. A patch to remove
this limitation is being tested.
This feature is required by SystemTap (http://sourceware.org/systemtap),
and reflects ideas contributed by several SystemTap developers, including
Will Cohen and Ananth Mavinakayanahalli.
Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Frederik Deweerdt <frederik.deweerdt@laposte.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:09:19 +04:00
{
2021-01-27 18:37:51 +03:00
int ret ;
[PATCH] kprobes: function-return probes
This patch adds function-return probes to kprobes for the i386
architecture. This enables you to establish a handler to be run when a
function returns.
1. API
Two new functions are added to kprobes:
int register_kretprobe(struct kretprobe *rp);
void unregister_kretprobe(struct kretprobe *rp);
2. Registration and unregistration
2.1 Register
To register a function-return probe, the user populates the following
fields in a kretprobe object and calls register_kretprobe() with the
kretprobe address as an argument:
kp.addr - the function's address
handler - this function is run after the ret instruction executes, but
before control returns to the return address in the caller.
maxactive - The maximum number of instances of the probed function that
can be active concurrently. For example, if the function is non-
recursive and is called with a spinlock or mutex held, maxactive = 1
should be enough. If the function is non-recursive and can never
relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should
be enough. maxactive is used to determine how many kretprobe_instance
objects to allocate for this particular probed function. If maxactive <=
0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 *
NR_CPUS) else maxactive=NR_CPUS)
For example:
struct kretprobe rp;
rp.kp.addr = /* entrypoint address */
rp.handler = /*return probe handler */
rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */
register_kretprobe(&rp);
The following field may also be of interest:
nmissed - Initialized to zero when the function-return probe is
registered, and incremented every time the probed function is entered but
there is no kretprobe_instance object available for establishing the
function-return probe (i.e., because maxactive was set too low).
2.2 Unregister
To unregister a function-return probe, the user calls
unregister_kretprobe() with the same kretprobe object as registered
previously. If a probed function is running when the return probe is
unregistered, the function will return as expected, but the handler won't
be run.
3. Limitations
3.1 This patch supports only the i386 architecture, but patches for
x86_64 and ppc64 are anticipated soon.
3.2 Return probes operate by replacing the return address in the stack
(or in a known register, such as the lr register for ppc). This may
cause __builtin_return_address(0), when invoked from the return-probed
function, to return the address of the return-probes trampoline.
3.3 This implementation uses the "Multiprobes at an address" feature in
2.6.12-rc3-mm3.
3.4 Due to a limitation in multi-probes, you cannot currently establish
a return probe and a jprobe on the same function. A patch to remove
this limitation is being tested.
This feature is required by SystemTap (http://sourceware.org/systemtap),
and reflects ideas contributed by several SystemTap developers, including
Will Cohen and Ananth Mavinakayanahalli.
Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Frederik Deweerdt <frederik.deweerdt@laposte.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:09:19 +04:00
int i ;
2008-03-05 01:29:44 +03:00
void * addr ;
2017-02-22 16:53:37 +03:00
2021-01-27 18:37:51 +03:00
ret = kprobe_on_func_entry ( rp - > kp . addr , rp - > kp . symbol_name , rp - > kp . offset ) ;
if ( ret )
return ret ;
2007-10-16 12:27:49 +04:00
2021-09-14 17:39:34 +03:00
/* If only 'rp->kp.addr' is specified, check reregistering kprobes */
2021-02-03 17:59:27 +03:00
if ( rp - > kp . addr & & warn_kprobe_rereg ( & rp - > kp ) )
2021-01-28 15:44:27 +03:00
return - EINVAL ;
2007-10-16 12:27:49 +04:00
if ( kretprobe_blacklist_size ) {
2008-03-05 01:29:44 +03:00
addr = kprobe_addr ( & rp - > kp ) ;
2011-06-27 11:26:50 +04:00
if ( IS_ERR ( addr ) )
return PTR_ERR ( addr ) ;
2007-10-16 12:27:49 +04:00
for ( i = 0 ; kretprobe_blacklist [ i ] . name ! = NULL ; i + + ) {
if ( kretprobe_blacklist [ i ] . addr = = addr )
return - EINVAL ;
}
}
[PATCH] kprobes: function-return probes
This patch adds function-return probes to kprobes for the i386
architecture. This enables you to establish a handler to be run when a
function returns.
1. API
Two new functions are added to kprobes:
int register_kretprobe(struct kretprobe *rp);
void unregister_kretprobe(struct kretprobe *rp);
2. Registration and unregistration
2.1 Register
To register a function-return probe, the user populates the following
fields in a kretprobe object and calls register_kretprobe() with the
kretprobe address as an argument:
kp.addr - the function's address
handler - this function is run after the ret instruction executes, but
before control returns to the return address in the caller.
maxactive - The maximum number of instances of the probed function that
can be active concurrently. For example, if the function is non-
recursive and is called with a spinlock or mutex held, maxactive = 1
should be enough. If the function is non-recursive and can never
relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should
be enough. maxactive is used to determine how many kretprobe_instance
objects to allocate for this particular probed function. If maxactive <=
0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 *
NR_CPUS) else maxactive=NR_CPUS)
For example:
struct kretprobe rp;
rp.kp.addr = /* entrypoint address */
rp.handler = /*return probe handler */
rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */
register_kretprobe(&rp);
The following field may also be of interest:
nmissed - Initialized to zero when the function-return probe is
registered, and incremented every time the probed function is entered but
there is no kretprobe_instance object available for establishing the
function-return probe (i.e., because maxactive was set too low).
2.2 Unregister
To unregister a function-return probe, the user calls
unregister_kretprobe() with the same kretprobe object as registered
previously. If a probed function is running when the return probe is
unregistered, the function will return as expected, but the handler won't
be run.
3. Limitations
3.1 This patch supports only the i386 architecture, but patches for
x86_64 and ppc64 are anticipated soon.
3.2 Return probes operate by replacing the return address in the stack
(or in a known register, such as the lr register for ppc). This may
cause __builtin_return_address(0), when invoked from the return-probed
function, to return the address of the return-probes trampoline.
3.3 This implementation uses the "Multiprobes at an address" feature in
2.6.12-rc3-mm3.
3.4 Due to a limitation in multi-probes, you cannot currently establish
a return probe and a jprobe on the same function. A patch to remove
this limitation is being tested.
This feature is required by SystemTap (http://sourceware.org/systemtap),
and reflects ideas contributed by several SystemTap developers, including
Will Cohen and Ananth Mavinakayanahalli.
Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Frederik Deweerdt <frederik.deweerdt@laposte.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:09:19 +04:00
2021-12-01 17:45:50 +03:00
if ( rp - > data_size > KRETPROBE_MAX_DATA_SIZE )
return - E2BIG ;
[PATCH] kprobes: function-return probes
This patch adds function-return probes to kprobes for the i386
architecture. This enables you to establish a handler to be run when a
function returns.
1. API
Two new functions are added to kprobes:
int register_kretprobe(struct kretprobe *rp);
void unregister_kretprobe(struct kretprobe *rp);
2. Registration and unregistration
2.1 Register
To register a function-return probe, the user populates the following
fields in a kretprobe object and calls register_kretprobe() with the
kretprobe address as an argument:
kp.addr - the function's address
handler - this function is run after the ret instruction executes, but
before control returns to the return address in the caller.
maxactive - The maximum number of instances of the probed function that
can be active concurrently. For example, if the function is non-
recursive and is called with a spinlock or mutex held, maxactive = 1
should be enough. If the function is non-recursive and can never
relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should
be enough. maxactive is used to determine how many kretprobe_instance
objects to allocate for this particular probed function. If maxactive <=
0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 *
NR_CPUS) else maxactive=NR_CPUS)
For example:
struct kretprobe rp;
rp.kp.addr = /* entrypoint address */
rp.handler = /*return probe handler */
rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */
register_kretprobe(&rp);
The following field may also be of interest:
nmissed - Initialized to zero when the function-return probe is
registered, and incremented every time the probed function is entered but
there is no kretprobe_instance object available for establishing the
function-return probe (i.e., because maxactive was set too low).
2.2 Unregister
To unregister a function-return probe, the user calls
unregister_kretprobe() with the same kretprobe object as registered
previously. If a probed function is running when the return probe is
unregistered, the function will return as expected, but the handler won't
be run.
3. Limitations
3.1 This patch supports only the i386 architecture, but patches for
x86_64 and ppc64 are anticipated soon.
3.2 Return probes operate by replacing the return address in the stack
(or in a known register, such as the lr register for ppc). This may
cause __builtin_return_address(0), when invoked from the return-probed
function, to return the address of the return-probes trampoline.
3.3 This implementation uses the "Multiprobes at an address" feature in
2.6.12-rc3-mm3.
3.4 Due to a limitation in multi-probes, you cannot currently establish
a return probe and a jprobe on the same function. A patch to remove
this limitation is being tested.
This feature is required by SystemTap (http://sourceware.org/systemtap),
and reflects ideas contributed by several SystemTap developers, including
Will Cohen and Ananth Mavinakayanahalli.
Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Frederik Deweerdt <frederik.deweerdt@laposte.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:09:19 +04:00
rp - > kp . pre_handler = pre_handler_kretprobe ;
2006-04-20 13:43:11 +04:00
rp - > kp . post_handler = NULL ;
[PATCH] kprobes: function-return probes
This patch adds function-return probes to kprobes for the i386
architecture. This enables you to establish a handler to be run when a
function returns.
1. API
Two new functions are added to kprobes:
int register_kretprobe(struct kretprobe *rp);
void unregister_kretprobe(struct kretprobe *rp);
2. Registration and unregistration
2.1 Register
To register a function-return probe, the user populates the following
fields in a kretprobe object and calls register_kretprobe() with the
kretprobe address as an argument:
kp.addr - the function's address
handler - this function is run after the ret instruction executes, but
before control returns to the return address in the caller.
maxactive - The maximum number of instances of the probed function that
can be active concurrently. For example, if the function is non-
recursive and is called with a spinlock or mutex held, maxactive = 1
should be enough. If the function is non-recursive and can never
relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should
be enough. maxactive is used to determine how many kretprobe_instance
objects to allocate for this particular probed function. If maxactive <=
0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 *
NR_CPUS) else maxactive=NR_CPUS)
For example:
struct kretprobe rp;
rp.kp.addr = /* entrypoint address */
rp.handler = /*return probe handler */
rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */
register_kretprobe(&rp);
The following field may also be of interest:
nmissed - Initialized to zero when the function-return probe is
registered, and incremented every time the probed function is entered but
there is no kretprobe_instance object available for establishing the
function-return probe (i.e., because maxactive was set too low).
2.2 Unregister
To unregister a function-return probe, the user calls
unregister_kretprobe() with the same kretprobe object as registered
previously. If a probed function is running when the return probe is
unregistered, the function will return as expected, but the handler won't
be run.
3. Limitations
3.1 This patch supports only the i386 architecture, but patches for
x86_64 and ppc64 are anticipated soon.
3.2 Return probes operate by replacing the return address in the stack
(or in a known register, such as the lr register for ppc). This may
cause __builtin_return_address(0), when invoked from the return-probed
function, to return the address of the return-probes trampoline.
3.3 This implementation uses the "Multiprobes at an address" feature in
2.6.12-rc3-mm3.
3.4 Due to a limitation in multi-probes, you cannot currently establish
a return probe and a jprobe on the same function. A patch to remove
this limitation is being tested.
This feature is required by SystemTap (http://sourceware.org/systemtap),
and reflects ideas contributed by several SystemTap developers, including
Will Cohen and Ananth Mavinakayanahalli.
Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Frederik Deweerdt <frederik.deweerdt@laposte.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:09:19 +04:00
/* Pre-allocate memory for max kretprobe instances */
2022-11-10 11:15:02 +03:00
if ( rp - > maxactive < = 0 )
2009-12-21 15:02:24 +03:00
rp - > maxactive = max_t ( unsigned int , 10 , 2 * num_possible_cpus ( ) ) ;
2022-11-10 11:15:02 +03:00
2022-03-26 05:27:05 +03:00
# ifdef CONFIG_KRETPROBE_ON_RETHOOK
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Tests on a 96-core ARM64 system show similar results, with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 16:56:52 +03:00
rp - > rh = rethook_alloc ( ( void * ) rp , kretprobe_rethook_handler ,
sizeof ( struct kretprobe_instance ) +
rp - > data_size , rp - > maxactive ) ;
if ( IS_ERR ( rp - > rh ) )
return PTR_ERR ( rp - > rh ) ;
2022-03-26 05:27:05 +03:00
rp - > nmissed = 0 ;
/* Establish function entry probe point */
ret = register_kprobe ( & rp - > kp ) ;
if ( ret ! = 0 ) {
rethook_free ( rp - > rh ) ;
rp - > rh = NULL ;
}
# else /* !CONFIG_KRETPROBE_ON_RETHOOK */
2020-08-29 16:03:24 +03:00
rp - > rph = kzalloc ( sizeof ( struct kretprobe_holder ) , GFP_KERNEL ) ;
if ( ! rp - > rph )
return - ENOMEM ;
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Tests on a 96-core ARM64 system show similar results, with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 16:56:52 +03:00
if ( objpool_init ( & rp - > rph - > pool , rp - > maxactive , rp - > data_size +
sizeof ( struct kretprobe_instance ) , GFP_KERNEL ,
rp - > rph , kretprobe_init_inst , kretprobe_fini_pool ) ) {
kfree ( rp - > rph ) ;
rp - > rph = NULL ;
return - ENOMEM ;
[PATCH] kprobes: function-return probes
This patch adds function-return probes to kprobes for the i386
architecture. This enables you to establish a handler to be run when a
function returns.
1. API
Two new functions are added to kprobes:
int register_kretprobe(struct kretprobe *rp);
void unregister_kretprobe(struct kretprobe *rp);
2. Registration and unregistration
2.1 Register
To register a function-return probe, the user populates the following
fields in a kretprobe object and calls register_kretprobe() with the
kretprobe address as an argument:
kp.addr - the function's address
handler - this function is run after the ret instruction executes, but
before control returns to the return address in the caller.
maxactive - The maximum number of instances of the probed function that
can be active concurrently. For example, if the function is non-
recursive and is called with a spinlock or mutex held, maxactive = 1
should be enough. If the function is non-recursive and can never
relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should
be enough. maxactive is used to determine how many kretprobe_instance
objects to allocate for this particular probed function. If maxactive <=
0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 *
NR_CPUS) else maxactive=NR_CPUS)
For example:
struct kretprobe rp;
rp.kp.addr = /* entrypoint address */
rp.handler = /*return probe handler */
rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */
register_kretprobe(&rp);
The following field may also be of interest:
nmissed - Initialized to zero when the function-return probe is
registered, and incremented every time the probed function is entered but
there is no kretprobe_instance object available for establishing the
function-return probe (i.e., because maxactive was set too low).
2.2 Unregister
To unregister a function-return probe, the user calls
unregister_kretprobe() with the same kretprobe object as registered
previously. If a probed function is running when the return probe is
unregistered, the function will return as expected, but the handler won't
be run.
3. Limitations
3.1 This patch supports only the i386 architecture, but patches for
x86_64 and ppc64 are anticipated soon.
3.2 Return probes operate by replacing the return address in the stack
(or in a known register, such as the lr register for ppc). This may
cause __builtin_return_address(0), when invoked from the return-probed
function, to return the address of the return-probes trampoline.
3.3 This implementation uses the "Multiprobes at an address" feature in
2.6.12-rc3-mm3.
3.4 Due to a limitation in multi-probes, you cannot currently establish
a return probe and a jprobe on the same function. A patch to remove
this limitation is being tested.
This feature is required by SystemTap (http://sourceware.org/systemtap),
and reflects ideas contributed by several SystemTap developers, including
Will Cohen and Ananth Mavinakayanahalli.
Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Frederik Deweerdt <frederik.deweerdt@laposte.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:09:19 +04:00
}
kprobes: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.
Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):
OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
1T 2T 4T 8T 16T 24T
24150045 29317964 15446741 12494489 18287272 17708768
32T 48T 64T 72T 96T 128T
16200682 13737658 11645677 11269858 10470118 9931051
This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:
1T 2T 4T 8T 16T
native: 41186213 82336866 164250978 328662645 658810299
freelist: 24150045 29317964 15446741 12494489 18287272
objpool: 23926730 48010314 96125218 191782984 385091769
32T 48T 64T 96T 128T
native: 1330338351 1969957941 2512291791 2615754135 2671040914
freelist: 16200682 13737658 11645677 10470118 9931051
objpool: 764481096 1147149781 1456220214 1502109662 1579015050
Testing on a 96-core ARM64 system shows similar results, with the biggest
ratio up to 448x:
OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
1T 2T 4T 8T 16T
native: . 30066096 63569843 126194076 257447289 505800181
freelist: 16152090 11064397 11124068 7215768 5663013
objpool: 13997541 28032100 55726624 110099926 221498787
24T 32T 48T 64T 96T
native: 763305277 1015925192 1521075123 2033009392 3021013752
freelist: 5015810 4602893 3766792 3382478 2945292
objpool: 328192025 439439564 668534502 887401381 1319972072
Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2023-10-17 16:56:52 +03:00
rp - > rph - > rp = rp ;
[PATCH] kprobes: function-return probes
This patch adds function-return probes to kprobes for the i386
architecture. This enables you to establish a handler to be run when a
function returns.
1. API
Two new functions are added to kprobes:
int register_kretprobe(struct kretprobe *rp);
void unregister_kretprobe(struct kretprobe *rp);
2. Registration and unregistration
2.1 Register
To register a function-return probe, the user populates the following
fields in a kretprobe object and calls register_kretprobe() with the
kretprobe address as an argument:
kp.addr - the function's address
handler - this function is run after the ret instruction executes, but
before control returns to the return address in the caller.
maxactive - The maximum number of instances of the probed function that
can be active concurrently. For example, if the function is non-
recursive and is called with a spinlock or mutex held, maxactive = 1
should be enough. If the function is non-recursive and can never
relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should
be enough. maxactive is used to determine how many kretprobe_instance
objects to allocate for this particular probed function. If maxactive <=
0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 *
NR_CPUS) else maxactive=NR_CPUS)
For example:
struct kretprobe rp;
rp.kp.addr = /* entrypoint address */
rp.handler = /*return probe handler */
rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */
register_kretprobe(&rp);
The following field may also be of interest:
nmissed - Initialized to zero when the function-return probe is
registered, and incremented every time the probed function is entered but
there is no kretprobe_instance object available for establishing the
function-return probe (i.e., because maxactive was set too low).
2.2 Unregister
To unregister a function-return probe, the user calls
unregister_kretprobe() with the same kretprobe object as registered
previously. If a probed function is running when the return probe is
unregistered, the function will return as expected, but the handler won't
be run.
3. Limitations
3.1 This patch supports only the i386 architecture, but patches for
x86_64 and ppc64 are anticipated soon.
3.2 Return probes operate by replacing the return address in the stack
(or in a known register, such as the lr register for ppc). This may
cause __builtin_return_address(0), when invoked from the return-probed
function, to return the address of the return-probes trampoline.
3.3 This implementation uses the "Multiprobes at an address" feature in
2.6.12-rc3-mm3.
3.4 Due to a limitation in multi-probes, you cannot currently establish
a return probe and a jprobe on the same function. A patch to remove
this limitation is being tested.
This feature is required by SystemTap (http://sourceware.org/systemtap),
and reflects ideas contributed by several SystemTap developers, including
Will Cohen and Ananth Mavinakayanahalli.
Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Frederik Deweerdt <frederik.deweerdt@laposte.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:09:19 +04:00
rp - > nmissed = 0 ;
/* Establish function entry probe point */
2009-01-07 01:41:53 +03:00
ret = register_kprobe ( & rp - > kp ) ;
2008-04-28 13:14:29 +04:00
if ( ret ! = 0 )
[PATCH] kprobes: function-return probes
This patch adds function-return probes to kprobes for the i386
architecture. This enables you to establish a handler to be run when a
function returns.
1. API
Two new functions are added to kprobes:
int register_kretprobe(struct kretprobe *rp);
void unregister_kretprobe(struct kretprobe *rp);
2. Registration and unregistration
2.1 Register
To register a function-return probe, the user populates the following
fields in a kretprobe object and calls register_kretprobe() with the
kretprobe address as an argument:
kp.addr - the function's address
handler - this function is run after the ret instruction executes, but
before control returns to the return address in the caller.
maxactive - The maximum number of instances of the probed function that
can be active concurrently. For example, if the function is non-
recursive and is called with a spinlock or mutex held, maxactive = 1
should be enough. If the function is non-recursive and can never
relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should
be enough. maxactive is used to determine how many kretprobe_instance
objects to allocate for this particular probed function. If maxactive <=
0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 *
NR_CPUS) else maxactive=NR_CPUS)
For example:
struct kretprobe rp;
rp.kp.addr = /* entrypoint address */
rp.handler = /*return probe handler */
rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */
register_kretprobe(&rp);
The following field may also be of interest:
nmissed - Initialized to zero when the function-return probe is
registered, and incremented every time the probed function is entered but
there is no kretprobe_instance object available for establishing the
function-return probe (i.e., because maxactive was set too low).
2.2 Unregister
To unregister a function-return probe, the user calls
unregister_kretprobe() with the same kretprobe object as registered
previously. If a probed function is running when the return probe is
unregistered, the function will return as expected, but the handler won't
be run.
3. Limitations
3.1 This patch supports only the i386 architecture, but patches for
x86_64 and ppc64 are anticipated soon.
3.2 Return probes operate by replacing the return address in the stack
(or in a known register, such as the lr register for ppc). This may
cause __builtin_return_address(0), when invoked from the return-probed
function, to return the address of the return-probes trampoline.
3.3 This implementation uses the "Multiprobes at an address" feature in
2.6.12-rc3-mm3.
3.4 Due to a limitation in multi-probes, you cannot currently establish
a return probe and a jprobe on the same function. A patch to remove
this limitation is being tested.
This feature is required by SystemTap (http://sourceware.org/systemtap),
and reflects ideas contributed by several SystemTap developers, including
Will Cohen and Ananth Mavinakayanahalli.
Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Frederik Deweerdt <frederik.deweerdt@laposte.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:09:19 +04:00
free_rp_inst ( rp ) ;
2022-03-26 05:27:05 +03:00
# endif
[PATCH] kprobes: function-return probes
This patch adds function-return probes to kprobes for the i386
architecture. This enables you to establish a handler to be run when a
function returns.
1. API
Two new functions are added to kprobes:
int register_kretprobe(struct kretprobe *rp);
void unregister_kretprobe(struct kretprobe *rp);
2. Registration and unregistration
2.1 Register
To register a function-return probe, the user populates the following
fields in a kretprobe object and calls register_kretprobe() with the
kretprobe address as an argument:
kp.addr - the function's address
handler - this function is run after the ret instruction executes, but
before control returns to the return address in the caller.
maxactive - The maximum number of instances of the probed function that
can be active concurrently. For example, if the function is non-
recursive and is called with a spinlock or mutex held, maxactive = 1
should be enough. If the function is non-recursive and can never
relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should
be enough. maxactive is used to determine how many kretprobe_instance
objects to allocate for this particular probed function. If maxactive <=
0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 *
NR_CPUS) else maxactive=NR_CPUS)
For example:
struct kretprobe rp;
rp.kp.addr = /* entrypoint address */
rp.handler = /*return probe handler */
rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */
register_kretprobe(&rp);
The following field may also be of interest:
nmissed - Initialized to zero when the function-return probe is
registered, and incremented every time the probed function is entered but
there is no kretprobe_instance object available for establishing the
function-return probe (i.e., because maxactive was set too low).
2.2 Unregister
To unregister a function-return probe, the user calls
unregister_kretprobe() with the same kretprobe object as registered
previously. If a probed function is running when the return probe is
unregistered, the function will return as expected, but the handler won't
be run.
3. Limitations
3.1 This patch supports only the i386 architecture, but patches for
x86_64 and ppc64 are anticipated soon.
3.2 Return probes operate by replacing the return address in the stack
(or in a known register, such as the lr register for ppc). This may
cause __builtin_return_address(0), when invoked from the return-probed
function, to return the address of the return-probes trampoline.
3.3 This implementation uses the "Multiprobes at an address" feature in
2.6.12-rc3-mm3.
3.4 Due to a limitation in multi-probes, you cannot currently establish
a return probe and a jprobe on the same function. A patch to remove
this limitation is being tested.
This feature is required by SystemTap (http://sourceware.org/systemtap),
and reflects ideas contributed by several SystemTap developers, including
Will Cohen and Ananth Mavinakayanahalli.
Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Frederik Deweerdt <frederik.deweerdt@laposte.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:09:19 +04:00
return ret ;
}
2009-04-07 06:00:59 +04:00
EXPORT_SYMBOL_GPL ( register_kretprobe ) ;
[PATCH] kprobes: function-return probes
This patch adds function-return probes to kprobes for the i386
architecture. This enables you to establish a handler to be run when a
function returns.
1. API
Two new functions are added to kprobes:
int register_kretprobe(struct kretprobe *rp);
void unregister_kretprobe(struct kretprobe *rp);
2. Registration and unregistration
2.1 Register
To register a function-return probe, the user populates the following
fields in a kretprobe object and calls register_kretprobe() with the
kretprobe address as an argument:
kp.addr - the function's address
handler - this function is run after the ret instruction executes, but
before control returns to the return address in the caller.
maxactive - The maximum number of instances of the probed function that
can be active concurrently. For example, if the function is non-
recursive and is called with a spinlock or mutex held, maxactive = 1
should be enough. If the function is non-recursive and can never
relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should
be enough. maxactive is used to determine how many kretprobe_instance
objects to allocate for this particular probed function. If maxactive <=
0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 *
NR_CPUS) else maxactive=NR_CPUS)
For example:
struct kretprobe rp;
rp.kp.addr = /* entrypoint address */
rp.handler = /*return probe handler */
rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */
register_kretprobe(&rp);
The following field may also be of interest:
nmissed - Initialized to zero when the function-return probe is
registered, and incremented every time the probed function is entered but
there is no kretprobe_instance object available for establishing the
function-return probe (i.e., because maxactive was set too low).
2.2 Unregister
To unregister a function-return probe, the user calls
unregister_kretprobe() with the same kretprobe object as registered
previously. If a probed function is running when the return probe is
unregistered, the function will return as expected, but the handler won't
be run.
3. Limitations
3.1 This patch supports only the i386 architecture, but patches for
x86_64 and ppc64 are anticipated soon.
3.2 Return probes operate by replacing the return address in the stack
(or in a known register, such as the lr register for ppc). This may
cause __builtin_return_address(0), when invoked from the return-probed
function, to return the address of the return-probes trampoline.
3.3 This implementation uses the "Multiprobes at an address" feature in
2.6.12-rc3-mm3.
3.4 Due to a limitation in multi-probes, you cannot currently establish
a return probe and a jprobe on the same function. A patch to remove
this limitation is being tested.
This feature is required by SystemTap (http://sourceware.org/systemtap),
and reflects ideas contributed by several SystemTap developers, including
Will Cohen and Ananth Mavinakayanahalli.
Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Frederik Deweerdt <frederik.deweerdt@laposte.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:09:19 +04:00
2014-04-17 12:17:54 +04:00
/*
 * Register an array of return probes in one call.
 *
 * rps: array of pointers to the kretprobes to register
 * num: number of entries in rps
 *
 * Returns -EINVAL when num is not positive. Otherwise each entry is handed to
 * register_kretprobe() in array order. On the first negative result the
 * entries registered so far are unregistered again and that error code is
 * returned; if every call succeeds, the result of the last call is returned.
 */
int register_kretprobes ( struct kretprobe * * rps , int num )
2008-04-28 13:14:29 +04:00
{
int ret = 0 , i ;
if ( num < = 0 )
return - EINVAL ;
for ( i = 0 ; i < num ; i + + ) {
2009-01-07 01:41:53 +03:00
ret = register_kretprobe ( rps [ i ] ) ;
2008-06-13 02:21:35 +04:00
if ( ret < 0 ) {
/* Roll back: only entries 0 .. i-1 were registered successfully. */
if ( i > 0 )
unregister_kretprobes ( rps , i ) ;
2008-04-28 13:14:29 +04:00
break ;
}
}
return ret ;
}
2009-04-07 06:00:59 +04:00
EXPORT_SYMBOL_GPL ( register_kretprobes ) ;
2008-04-28 13:14:29 +04:00
2014-04-17 12:17:54 +04:00
/*
 * Unregister a single return probe.
 *
 * Thin wrapper: treats rp as a one-element array and forwards it to
 * unregister_kretprobes().
 */
void unregister_kretprobe ( struct kretprobe * rp )
2008-04-28 13:14:29 +04:00
{
unregister_kretprobes ( & rp , 1 ) ;
}
2009-04-07 06:00:59 +04:00
EXPORT_SYMBOL_GPL ( unregister_kretprobe ) ;
2008-04-28 13:14:29 +04:00
2014-04-17 12:17:54 +04:00
/*
 * Unregister an array of return probes.
 *
 * rps: array of pointers to the kretprobes to unregister
 * num: number of entries in rps; nothing is done when it is not positive
 *
 * Works in two passes separated by synchronize_rcu():
 *   1. With kprobe_mutex held, run __unregister_kprobe_top() on every entry
 *      and detach its return-probe state (rethook_free() when
 *      CONFIG_KRETPROBE_ON_RETHOOK is defined, otherwise clear rph->rp).
 *   2. For every entry whose kp.addr is still set, run
 *      __unregister_kprobe_bottom() and, without rethook, free_rp_inst().
 */
void unregister_kretprobes ( struct kretprobe * * rps , int num )
2008-04-28 13:14:29 +04:00
{
int i ;
if ( num < = 0 )
return ;
mutex_lock ( & kprobe_mutex ) ;
2020-08-29 16:03:24 +03:00
for ( i = 0 ; i < num ; i + + ) {
2008-04-28 13:14:29 +04:00
/* A cleared kp.addr marks this entry so the second pass skips it. */
if ( __unregister_kprobe_top ( & rps [ i ] - > kp ) < 0 )
rps [ i ] - > kp . addr = NULL ;
2022-03-26 05:27:05 +03:00
# ifdef CONFIG_KRETPROBE_ON_RETHOOK
rethook_free ( rps [ i ] - > rh ) ;
# else
2020-08-29 16:03:24 +03:00
rps [ i ] - > rph - > rp = NULL ;
2022-03-26 05:27:05 +03:00
# endif
2020-08-29 16:03:24 +03:00
}
2008-04-28 13:14:29 +04:00
mutex_unlock ( & kprobe_mutex ) ;
2018-11-07 06:04:39 +03:00
/* NOTE(review): presumably lets in-flight RCU readers finish before the teardown below - confirm. */
synchronize_rcu ( ) ;
2008-04-28 13:14:29 +04:00
for ( i = 0 ; i < num ; i + + ) {
/* Entries that failed the first pass had kp.addr cleared and are skipped. */
if ( rps [ i ] - > kp . addr ) {
__unregister_kprobe_bottom ( & rps [ i ] - > kp ) ;
2022-03-26 05:27:05 +03:00
# ifndef CONFIG_KRETPROBE_ON_RETHOOK
2020-08-29 16:03:24 +03:00
free_rp_inst ( rps [ i ] ) ;
2022-03-26 05:27:05 +03:00
# endif
2008-04-28 13:14:29 +04:00
}
}
}
2009-04-07 06:00:59 +04:00
EXPORT_SYMBOL_GPL ( unregister_kretprobes ) ;
2008-04-28 13:14:29 +04:00
2008-03-05 01:28:37 +03:00
# else /* CONFIG_KRETPROBES */
2014-04-17 12:17:54 +04:00
/*
 * Stub variant of register_kretprobe(), located in the #else branch of the
 * CONFIG_KRETPROBES conditional.
 *
 * rp: ignored
 *
 * Always returns -EOPNOTSUPP; no probe is registered.
 */
int register_kretprobe ( struct kretprobe * rp )
[PATCH] kprobes: function-return probes
This patch adds function-return probes to kprobes for the i386
architecture. This enables you to establish a handler to be run when a
function returns.
1. API
Two new functions are added to kprobes:
int register_kretprobe(struct kretprobe *rp);
void unregister_kretprobe(struct kretprobe *rp);
2. Registration and unregistration
2.1 Register
To register a function-return probe, the user populates the following
fields in a kretprobe object and calls register_kretprobe() with the
kretprobe address as an argument:
kp.addr - the function's address
handler - this function is run after the ret instruction executes, but
before control returns to the return address in the caller.
maxactive - The maximum number of instances of the probed function that
can be active concurrently. For example, if the function is non-
recursive and is called with a spinlock or mutex held, maxactive = 1
should be enough. If the function is non-recursive and can never
relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should
be enough. maxactive is used to determine how many kretprobe_instance
objects to allocate for this particular probed function. If maxactive <=
0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 *
NR_CPUS) else maxactive=NR_CPUS)
For example:
struct kretprobe rp;
rp.kp.addr = /* entrypoint address */
rp.handler = /*return probe handler */
rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */
register_kretprobe(&rp);
The following field may also be of interest:
nmissed - Initialized to zero when the function-return probe is
registered, and incremented every time the probed function is entered but
there is no kretprobe_instance object available for establishing the
function-return probe (i.e., because maxactive was set too low).
2.2 Unregister
To unregiter a function-return probe, the user calls
unregister_kretprobe() with the same kretprobe object as registered
previously. If a probed function is running when the return probe is
unregistered, the function will return as expected, but the handler won't
be run.
3. Limitations
3.1 This patch supports only the i386 architecture, but patches for
x86_64 and ppc64 are anticipated soon.
3.2 Return probes operates by replacing the return address in the stack
(or in a known register, such as the lr register for ppc). This may
cause __builtin_return_address(0), when invoked from the return-probed
function, to return the address of the return-probes trampoline.
3.3 This implementation uses the "Multiprobes at an address" feature in
2.6.12-rc3-mm3.
3.4 Due to a limitation in multi-probes, you cannot currently establish
a return probe and a jprobe on the same function. A patch to remove
this limitation is being tested.
This feature is required by SystemTap (http://sourceware.org/systemtap),
and reflects ideas contributed by several SystemTap developers, including
Will Cohen and Ananth Mavinakayanahalli.
Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Frederik Deweerdt <frederik.deweerdt@laposte.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:09:19 +04:00
{
2021-09-14 17:39:34 +03:00
return - EOPNOTSUPP ;
[PATCH] kprobes: function-return probes
This patch adds function-return probes to kprobes for the i386
architecture. This enables you to establish a handler to be run when a
function returns.
1. API
Two new functions are added to kprobes:
int register_kretprobe(struct kretprobe *rp);
void unregister_kretprobe(struct kretprobe *rp);
2. Registration and unregistration
2.1 Register
To register a function-return probe, the user populates the following
fields in a kretprobe object and calls register_kretprobe() with the
kretprobe address as an argument:
kp.addr - the function's address
handler - this function is run after the ret instruction executes, but
before control returns to the return address in the caller.
maxactive - The maximum number of instances of the probed function that
can be active concurrently. For example, if the function is non-
recursive and is called with a spinlock or mutex held, maxactive = 1
should be enough. If the function is non-recursive and can never
relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should
be enough. maxactive is used to determine how many kretprobe_instance
objects to allocate for this particular probed function. If maxactive <=
0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 *
NR_CPUS) else maxactive=NR_CPUS)
For example:
struct kretprobe rp;
rp.kp.addr = /* entrypoint address */
rp.handler = /*return probe handler */
rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */
register_kretprobe(&rp);
The following field may also be of interest:
nmissed - Initialized to zero when the function-return probe is
registered, and incremented every time the probed function is entered but
there is no kretprobe_instance object available for establishing the
function-return probe (i.e., because maxactive was set too low).
2.2 Unregister
To unregiter a function-return probe, the user calls
unregister_kretprobe() with the same kretprobe object as registered
previously. If a probed function is running when the return probe is
unregistered, the function will return as expected, but the handler won't
be run.
3. Limitations
3.1 This patch supports only the i386 architecture, but patches for
x86_64 and ppc64 are anticipated soon.
3.2 Return probes operates by replacing the return address in the stack
(or in a known register, such as the lr register for ppc). This may
cause __builtin_return_address(0), when invoked from the return-probed
function, to return the address of the return-probes trampoline.
3.3 This implementation uses the "Multiprobes at an address" feature in
2.6.12-rc3-mm3.
3.4 Due to a limitation in multi-probes, you cannot currently establish
a return probe and a jprobe on the same function. A patch to remove
this limitation is being tested.
This feature is required by SystemTap (http://sourceware.org/systemtap),
and reflects ideas contributed by several SystemTap developers, including
Will Cohen and Ananth Mavinakayanahalli.
Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Frederik Deweerdt <frederik.deweerdt@laposte.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:09:19 +04:00
}
2009-04-07 06:00:59 +04:00
EXPORT_SYMBOL_GPL ( register_kretprobe ) ;
[PATCH] kprobes: function-return probes
This patch adds function-return probes to kprobes for the i386
architecture. This enables you to establish a handler to be run when a
function returns.
1. API
Two new functions are added to kprobes:
int register_kretprobe(struct kretprobe *rp);
void unregister_kretprobe(struct kretprobe *rp);
2. Registration and unregistration
2.1 Register
To register a function-return probe, the user populates the following
fields in a kretprobe object and calls register_kretprobe() with the
kretprobe address as an argument:
kp.addr - the function's address
handler - this function is run after the ret instruction executes, but
before control returns to the return address in the caller.
maxactive - The maximum number of instances of the probed function that
can be active concurrently. For example, if the function is non-
recursive and is called with a spinlock or mutex held, maxactive = 1
should be enough. If the function is non-recursive and can never
relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should
be enough. maxactive is used to determine how many kretprobe_instance
objects to allocate for this particular probed function. If maxactive <=
0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 *
NR_CPUS) else maxactive=NR_CPUS)
For example:
struct kretprobe rp;
rp.kp.addr = /* entrypoint address */
rp.handler = /*return probe handler */
rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */
register_kretprobe(&rp);
The following field may also be of interest:
nmissed - Initialized to zero when the function-return probe is
registered, and incremented every time the probed function is entered but
there is no kretprobe_instance object available for establishing the
function-return probe (i.e., because maxactive was set too low).
2.2 Unregister
To unregister a function-return probe, the user calls
unregister_kretprobe() with the same kretprobe object as registered
previously. If a probed function is running when the return probe is
unregistered, the function will return as expected, but the handler won't
be run.
3. Limitations
3.1 This patch supports only the i386 architecture, but patches for
x86_64 and ppc64 are anticipated soon.
3.2 Return probes operate by replacing the return address in the stack
(or in a known register, such as the lr register for ppc). This may
cause __builtin_return_address(0), when invoked from the return-probed
function, to return the address of the return-probes trampoline.
3.3 This implementation uses the "Multiprobes at an address" feature in
2.6.12-rc3-mm3.
3.4 Due to a limitation in multi-probes, you cannot currently establish
a return probe and a jprobe on the same function. A patch to remove
this limitation is being tested.
This feature is required by SystemTap (http://sourceware.org/systemtap),
and reflects ideas contributed by several SystemTap developers, including
Will Cohen and Ananth Mavinakayanahalli.
Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Frederik Deweerdt <frederik.deweerdt@laposte.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:09:19 +04:00
2014-04-17 12:17:54 +04:00
/*
 * Stub variant of register_kretprobes().
 *
 * rps, num: ignored
 *
 * Always returns -EOPNOTSUPP; no probe is registered.
 */
int register_kretprobes ( struct kretprobe * * rps , int num )
2007-02-21 00:57:54 +03:00
{
2021-09-14 17:39:34 +03:00
return - EOPNOTSUPP ;
2007-02-21 00:57:54 +03:00
}
2009-04-07 06:00:59 +04:00
EXPORT_SYMBOL_GPL ( register_kretprobes ) ;
2014-04-17 12:17:54 +04:00
/*
 * Unregister a single kretprobe. This definition has an empty body: @rp is
 * ignored and nothing is done.
 * NOTE(review): presumably the stub for builds without CONFIG_KRETPROBES --
 * the enclosing preprocessor branch starts outside this view, confirm.
 */
void unregister_kretprobe ( struct kretprobe * rp )
[PATCH] kprobes: function-return probes
This patch adds function-return probes to kprobes for the i386
architecture. This enables you to establish a handler to be run when a
function returns.
1. API
Two new functions are added to kprobes:
int register_kretprobe(struct kretprobe *rp);
void unregister_kretprobe(struct kretprobe *rp);
2. Registration and unregistration
2.1 Register
To register a function-return probe, the user populates the following
fields in a kretprobe object and calls register_kretprobe() with the
kretprobe address as an argument:
kp.addr - the function's address
handler - this function is run after the ret instruction executes, but
before control returns to the return address in the caller.
maxactive - The maximum number of instances of the probed function that
can be active concurrently. For example, if the function is non-
recursive and is called with a spinlock or mutex held, maxactive = 1
should be enough. If the function is non-recursive and can never
relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should
be enough. maxactive is used to determine how many kretprobe_instance
objects to allocate for this particular probed function. If maxactive <=
0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 *
NR_CPUS) else maxactive=NR_CPUS)
For example:
struct kretprobe rp;
rp.kp.addr = /* entrypoint address */
rp.handler = /*return probe handler */
rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */
register_kretprobe(&rp);
The following field may also be of interest:
nmissed - Initialized to zero when the function-return probe is
registered, and incremented every time the probed function is entered but
there is no kretprobe_instance object available for establishing the
function-return probe (i.e., because maxactive was set too low).
2.2 Unregister
To unregiter a function-return probe, the user calls
unregister_kretprobe() with the same kretprobe object as registered
previously. If a probed function is running when the return probe is
unregistered, the function will return as expected, but the handler won't
be run.
3. Limitations
3.1 This patch supports only the i386 architecture, but patches for
x86_64 and ppc64 are anticipated soon.
3.2 Return probes operates by replacing the return address in the stack
(or in a known register, such as the lr register for ppc). This may
cause __builtin_return_address(0), when invoked from the return-probed
function, to return the address of the return-probes trampoline.
3.3 This implementation uses the "Multiprobes at an address" feature in
2.6.12-rc3-mm3.
3.4 Due to a limitation in multi-probes, you cannot currently establish
a return probe and a jprobe on the same function. A patch to remove
this limitation is being tested.
This feature is required by SystemTap (http://sourceware.org/systemtap),
and reflects ideas contributed by several SystemTap developers, including
Will Cohen and Ananth Mavinakayanahalli.
Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Frederik Deweerdt <frederik.deweerdt@laposte.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:09:19 +04:00
{
2008-04-28 13:14:29 +04:00
}
2009-04-07 06:00:59 +04:00
EXPORT_SYMBOL_GPL ( unregister_kretprobe ) ;
[PATCH] kprobes: function-return probes
This patch adds function-return probes to kprobes for the i386
architecture. This enables you to establish a handler to be run when a
function returns.
1. API
Two new functions are added to kprobes:
int register_kretprobe(struct kretprobe *rp);
void unregister_kretprobe(struct kretprobe *rp);
2. Registration and unregistration
2.1 Register
To register a function-return probe, the user populates the following
fields in a kretprobe object and calls register_kretprobe() with the
kretprobe address as an argument:
kp.addr - the function's address
handler - this function is run after the ret instruction executes, but
before control returns to the return address in the caller.
maxactive - The maximum number of instances of the probed function that
can be active concurrently. For example, if the function is non-
recursive and is called with a spinlock or mutex held, maxactive = 1
should be enough. If the function is non-recursive and can never
relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should
be enough. maxactive is used to determine how many kretprobe_instance
objects to allocate for this particular probed function. If maxactive <=
0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 *
NR_CPUS) else maxactive=NR_CPUS)
For example:
struct kretprobe rp;
rp.kp.addr = /* entrypoint address */
rp.handler = /*return probe handler */
rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */
register_kretprobe(&rp);
The following field may also be of interest:
nmissed - Initialized to zero when the function-return probe is
registered, and incremented every time the probed function is entered but
there is no kretprobe_instance object available for establishing the
function-return probe (i.e., because maxactive was set too low).
2.2 Unregister
To unregister a function-return probe, the user calls
unregister_kretprobe() with the same kretprobe object as registered
previously. If a probed function is running when the return probe is
unregistered, the function will return as expected, but the handler won't
be run.
3. Limitations
3.1 This patch supports only the i386 architecture, but patches for
x86_64 and ppc64 are anticipated soon.
3.2 Return probes operate by replacing the return address in the stack
(or in a known register, such as the lr register for ppc). This may
cause __builtin_return_address(0), when invoked from the return-probed
function, to return the address of the return-probes trampoline.
3.3 This implementation uses the "Multiprobes at an address" feature in
2.6.12-rc3-mm3.
3.4 Due to a limitation in multi-probes, you cannot currently establish
a return probe and a jprobe on the same function. A patch to remove
this limitation is being tested.
This feature is required by SystemTap (http://sourceware.org/systemtap),
and reflects ideas contributed by several SystemTap developers, including
Will Cohen and Ananth Mavinakayanahalli.
Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Frederik Deweerdt <frederik.deweerdt@laposte.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:09:19 +04:00
2014-04-17 12:17:54 +04:00
/*
 * Array variant of kretprobe unregistration. This definition has an empty
 * body: @rps and @num are ignored and nothing is done.
 */
void unregister_kretprobes ( struct kretprobe * * rps , int num )
2008-04-28 13:14:29 +04:00
{
}
2009-04-07 06:00:59 +04:00
EXPORT_SYMBOL_GPL ( unregister_kretprobes ) ;
2007-05-08 11:34:14 +04:00
2014-04-17 12:18:21 +04:00
/*
 * Pre-handler placeholder for kretprobes: ignores @p and @regs and returns 0.
 * The function's address is still meaningful in this file -- report_probe()
 * compares a probe's pre_handler against it to label the probe as "r".
 */
static int pre_handler_kretprobe ( struct kprobe * p , struct pt_regs * regs )
2008-04-28 13:14:29 +04:00
{
return 0 ;
[PATCH] kprobes: function-return probes
This patch adds function-return probes to kprobes for the i386
architecture. This enables you to establish a handler to be run when a
function returns.
1. API
Two new functions are added to kprobes:
int register_kretprobe(struct kretprobe *rp);
void unregister_kretprobe(struct kretprobe *rp);
2. Registration and unregistration
2.1 Register
To register a function-return probe, the user populates the following
fields in a kretprobe object and calls register_kretprobe() with the
kretprobe address as an argument:
kp.addr - the function's address
handler - this function is run after the ret instruction executes, but
before control returns to the return address in the caller.
maxactive - The maximum number of instances of the probed function that
can be active concurrently. For example, if the function is non-
recursive and is called with a spinlock or mutex held, maxactive = 1
should be enough. If the function is non-recursive and can never
relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should
be enough. maxactive is used to determine how many kretprobe_instance
objects to allocate for this particular probed function. If maxactive <=
0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 *
NR_CPUS) else maxactive=NR_CPUS)
For example:
struct kretprobe rp;
rp.kp.addr = /* entrypoint address */
rp.handler = /*return probe handler */
rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */
register_kretprobe(&rp);
The following field may also be of interest:
nmissed - Initialized to zero when the function-return probe is
registered, and incremented every time the probed function is entered but
there is no kretprobe_instance object available for establishing the
function-return probe (i.e., because maxactive was set too low).
2.2 Unregister
To unregiter a function-return probe, the user calls
unregister_kretprobe() with the same kretprobe object as registered
previously. If a probed function is running when the return probe is
unregistered, the function will return as expected, but the handler won't
be run.
3. Limitations
3.1 This patch supports only the i386 architecture, but patches for
x86_64 and ppc64 are anticipated soon.
3.2 Return probes operates by replacing the return address in the stack
(or in a known register, such as the lr register for ppc). This may
cause __builtin_return_address(0), when invoked from the return-probed
function, to return the address of the return-probes trampoline.
3.3 This implementation uses the "Multiprobes at an address" feature in
2.6.12-rc3-mm3.
3.4 Due to a limitation in multi-probes, you cannot currently establish
a return probe and a jprobe on the same function. A patch to remove
this limitation is being tested.
This feature is required by SystemTap (http://sourceware.org/systemtap),
and reflects ideas contributed by several SystemTap developers, including
Will Cohen and Ananth Mavinakayanahalli.
Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Frederik Deweerdt <frederik.deweerdt@laposte.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:09:19 +04:00
}
2014-04-17 12:18:21 +04:00
NOKPROBE_SYMBOL ( pre_handler_kretprobe ) ;
[PATCH] kprobes: function-return probes
This patch adds function-return probes to kprobes for the i386
architecture. This enables you to establish a handler to be run when a
function returns.
1. API
Two new functions are added to kprobes:
int register_kretprobe(struct kretprobe *rp);
void unregister_kretprobe(struct kretprobe *rp);
2. Registration and unregistration
2.1 Register
To register a function-return probe, the user populates the following
fields in a kretprobe object and calls register_kretprobe() with the
kretprobe address as an argument:
kp.addr - the function's address
handler - this function is run after the ret instruction executes, but
before control returns to the return address in the caller.
maxactive - The maximum number of instances of the probed function that
can be active concurrently. For example, if the function is non-
recursive and is called with a spinlock or mutex held, maxactive = 1
should be enough. If the function is non-recursive and can never
relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should
be enough. maxactive is used to determine how many kretprobe_instance
objects to allocate for this particular probed function. If maxactive <=
0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 *
NR_CPUS) else maxactive=NR_CPUS)
For example:
struct kretprobe rp;
rp.kp.addr = /* entrypoint address */
rp.handler = /*return probe handler */
rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */
register_kretprobe(&rp);
The following field may also be of interest:
nmissed - Initialized to zero when the function-return probe is
registered, and incremented every time the probed function is entered but
there is no kretprobe_instance object available for establishing the
function-return probe (i.e., because maxactive was set too low).
2.2 Unregister
To unregister a function-return probe, the user calls
unregister_kretprobe() with the same kretprobe object as registered
previously. If a probed function is running when the return probe is
unregistered, the function will return as expected, but the handler won't
be run.
3. Limitations
3.1 This patch supports only the i386 architecture, but patches for
x86_64 and ppc64 are anticipated soon.
3.2 Return probes operate by replacing the return address in the stack
(or in a known register, such as the lr register for ppc). This may
cause __builtin_return_address(0), when invoked from the return-probed
function, to return the address of the return-probes trampoline.
3.3 This implementation uses the "Multiprobes at an address" feature in
2.6.12-rc3-mm3.
3.4 Due to a limitation in multi-probes, you cannot currently establish
a return probe and a jprobe on the same function. A patch to remove
this limitation is being tested.
This feature is required by SystemTap (http://sourceware.org/systemtap),
and reflects ideas contributed by several SystemTap developers, including
Will Cohen and Ananth Mavinakayanahalli.
Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Frederik Deweerdt <frederik.deweerdt@laposte.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:09:19 +04:00
2008-04-28 13:14:29 +04:00
# endif /* CONFIG_KRETPROBES */
2009-01-07 01:41:52 +03:00
/*
 * Set the kprobe gone and remove its instruction buffer.
 *
 * The caller must hold kprobe_mutex (asserted through lockdep). @p is marked
 * with KPROBE_FLAG_GONE; if @p is an aggregate probe, every probe chained on
 * its list is marked as well. Finally arch_remove_kprobe() is called on @p.
 */
2014-04-17 12:17:54 +04:00
static void kill_kprobe ( struct kprobe * p )
2009-01-07 01:41:52 +03:00
{
struct kprobe * kp ;
2009-04-07 06:01:02 +04:00
2020-05-12 11:02:44 +03:00
lockdep_assert_held ( & kprobe_mutex ) ;
2022-11-26 14:43:16 +03:00
/*
* The module is going away . We should disarm the kprobe which
* is using ftrace , because ftrace framework is still available at
* ' MODULE_STATE_GOING ' notification .
*/
/* Only an ftrace-based probe that is enabled and currently armed is disarmed. */
if ( kprobe_ftrace ( p ) & & ! kprobe_disabled ( p ) & & ! kprobes_all_disarmed )
disarm_kprobe_ftrace ( p ) ;
2009-01-07 01:41:52 +03:00
p - > flags | = KPROBE_FLAG_GONE ;
2010-02-25 16:34:07 +03:00
if ( kprobe_aggrprobe ( p ) ) {
2009-01-07 01:41:52 +03:00
/*
* If this is an aggr_kprobe , we have to list all the
* chained probes and mark them GONE .
*/
2020-05-12 11:02:44 +03:00
list_for_each_entry ( kp , & p - > list , list )
2009-01-07 01:41:52 +03:00
kp - > flags | = KPROBE_FLAG_GONE ;
/* The aggregate itself no longer carries a post_handler. */
p - > post_handler = NULL ;
2010-02-25 16:34:07 +03:00
kill_optimized_kprobe ( p ) ;
2009-01-07 01:41:52 +03:00
}
/*
* Here , we can remove insn_slot safely , because no thread calls
* the original probed function ( which will be freed soon ) any more .
*/
arch_remove_kprobe ( p ) ;
}
2010-04-28 02:33:12 +04:00
/*
 * Disable one kprobe.
 *
 * Calls __disable_kprobe() on @kp while holding kprobe_mutex.
 *
 * Returns 0 on success, or the error code carried by the pointer that
 * __disable_kprobe() returned (extracted with PTR_ERR()).
 */
2014-04-17 12:17:54 +04:00
int disable_kprobe ( struct kprobe * kp )
2010-04-28 02:33:12 +04:00
{
int ret = 0 ;
2018-01-10 02:51:24 +03:00
struct kprobe * p ;
2010-04-28 02:33:12 +04:00
mutex_lock ( & kprobe_mutex ) ;
2010-12-03 12:53:57 +03:00
/* Disable this kprobe */
2018-01-10 02:51:24 +03:00
p = __disable_kprobe ( kp ) ;
/* An ERR_PTR-style return signals failure; anything else leaves ret at 0. */
if ( IS_ERR ( p ) )
ret = PTR_ERR ( p ) ;
2010-04-28 02:33:12 +04:00
mutex_unlock ( & kprobe_mutex ) ;
return ret ;
}
EXPORT_SYMBOL_GPL ( disable_kprobe ) ;
/*
 * Enable one kprobe.
 *
 * Runs under kprobe_mutex. @kp is first validated with __get_valid_kprobe(),
 * which yields the probe @p that is actually armed.
 * NOTE(review): p differs from kp presumably when kp is chained on an
 * aggregate probe -- confirm against __get_valid_kprobe().
 *
 * Returns 0 on success, -EINVAL if @kp is not a valid probe or is already
 * gone, or the error from arm_kprobe(). When arming fails, the DISABLED
 * flags cleared here are set again.
 */
2014-04-17 12:17:54 +04:00
int enable_kprobe ( struct kprobe * kp )
2010-04-28 02:33:12 +04:00
{
int ret = 0 ;
struct kprobe * p ;
mutex_lock ( & kprobe_mutex ) ;
/* Check whether specified probe is valid. */
p = __get_valid_kprobe ( kp ) ;
if ( unlikely ( p = = NULL ) ) {
ret = - EINVAL ;
goto out ;
}
if ( kprobe_gone ( kp ) ) {
/* This kprobe has gone, we couldn't enable it. */
ret = - EINVAL ;
goto out ;
}
/* kp is not the probe returned by the lookup: clear its own flag too. */
if ( p ! = kp )
kp - > flags & = ~ KPROBE_FLAG_DISABLED ;
/* Arm only when kprobes are not globally disarmed and p is still disabled. */
if ( ! kprobes_all_disarmed & & kprobe_disabled ( p ) ) {
p - > flags & = ~ KPROBE_FLAG_DISABLED ;
2018-01-10 02:51:23 +03:00
ret = arm_kprobe ( p ) ;
2022-11-04 02:49:31 +03:00
if ( ret ) {
/* Arming failed: restore the DISABLED flag on p (and on kp if distinct). */
2018-01-10 02:51:23 +03:00
p - > flags | = KPROBE_FLAG_DISABLED ;
2022-11-04 02:49:31 +03:00
if ( p ! = kp )
kp - > flags | = KPROBE_FLAG_DISABLED ;
}
2010-04-28 02:33:12 +04:00
}
out :
mutex_unlock ( & kprobe_mutex ) ;
return ret ;
}
EXPORT_SYMBOL_GPL ( enable_kprobe ) ;
2018-04-28 15:36:33 +03:00
/*
 * Caller must NOT call this in usual path. This is only for critical case.
 *
 * Logs @kp's symbol_name, offset and addr through pr_err().
 */
2014-04-17 12:18:21 +04:00
void dump_kprobe ( struct kprobe * kp )
2009-08-27 01:38:30 +04:00
{
2021-09-14 17:39:25 +03:00
pr_err ( " Dump kprobe: \n .symbol_name = %s, .offset = %x, .addr = %pS \n " ,
2018-04-28 15:36:33 +03:00
kp - > symbol_name , kp - > offset , kp - > addr ) ;
2009-08-27 01:38:30 +04:00
}
2014-04-17 12:18:21 +04:00
NOKPROBE_SYMBOL ( dump_kprobe ) ;
2009-08-27 01:38:30 +04:00
2018-12-17 11:20:55 +03:00
int kprobe_add_ksym_blacklist ( unsigned long entry )
{
struct kprobe_blacklist_entry * ent ;
unsigned long offset = 0 , size = 0 ;
if ( ! kernel_text_address ( entry ) | |
! kallsyms_lookup_size_offset ( entry , & size , & offset ) )
return - EINVAL ;
ent = kmalloc ( sizeof ( * ent ) , GFP_KERNEL ) ;
if ( ! ent )
return - ENOMEM ;
ent - > start_addr = entry ;
ent - > end_addr = entry + size ;
INIT_LIST_HEAD ( & ent - > list ) ;
list_add_tail ( & ent - > list , & kprobe_blacklist ) ;
return ( int ) size ;
}
/*
 * Add all symbols in the area [start, end) to the kprobe blacklist.
 *
 * Walks the area symbol by symbol: each successful
 * kprobe_add_ksym_blacklist() call reports how far to advance.
 *
 * Returns 0 once the whole area has been walked, or the first negative error
 * returned by kprobe_add_ksym_blacklist().
 */
int kprobe_add_area_blacklist(unsigned long start, unsigned long end)
{
	unsigned long addr = start;
	int step;

	while (addr < end) {
		step = kprobe_add_ksym_blacklist(addr);
		if (step < 0)
			return step;

		/* A zero size means an alias symbol: still move on by one. */
		if (step == 0)
			step = 1;

		addr += step;
	}

	return 0;
}
2020-03-26 17:49:48 +03:00
/*
 * Remove every blacklist entry whose start address lies in [start, end).
 *
 * Matching entries are unlinked from kprobe_blacklist and freed; the safe
 * list iterator is used because entries are deleted during the walk.
 */
static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
{
	struct kprobe_blacklist_entry *cur, *next;

	list_for_each_entry_safe(cur, next, &kprobe_blacklist, list) {
		if (cur->start_addr >= start && cur->start_addr < end) {
			list_del(&cur->list);
			kfree(cur);
		}
	}
}
2020-03-26 17:50:00 +03:00
/*
 * Remove the blacklist entry (or entries) starting exactly at @entry, by
 * removing the one-byte area [entry, entry + 1).
 */
static void kprobe_remove_ksym_blacklist(unsigned long entry)
{
	unsigned long area_end = entry + 1;

	kprobe_remove_area_blacklist(entry, area_end);
}
2020-05-28 11:00:58 +03:00
/*
 * Weak default for the architecture kallsym hook: it provides no symbols,
 * leaves all of its arguments untouched and always returns -ERANGE.
 */
int __weak arch_kprobe_get_kallsym(unsigned int *symnum,
				   unsigned long *value,
				   char *type,
				   char *sym)
{
	return -ERANGE;
}
/*
 * Look up the kprobes-owned symbol number @symnum.
 *
 * The providers are tried in order, each one receiving a pointer to the local
 * copy of @symnum: the kprobe instruction slot cache, the optimized
 * instruction slot cache (each only when configured in), and finally the
 * architecture hook.
 *
 * Returns 0 as soon as one provider succeeds, otherwise -ERANGE.
 */
int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
		       char *sym)
{
#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
	if (kprobe_cache_get_kallsym(&kprobe_insn_slots, &symnum, value, type, sym) == 0)
		return 0;
#endif
#if defined(__ARCH_WANT_KPROBES_INSN_SLOT) && defined(CONFIG_OPTPROBES)
	if (kprobe_cache_get_kallsym(&kprobe_optinsn_slots, &symnum, value, type, sym) == 0)
		return 0;
#endif
	if (arch_kprobe_get_kallsym(&symnum, value, type, sym) != 0)
		return -ERANGE;

	return 0;
}
2018-12-17 11:20:55 +03:00
/*
 * Weak default for the architecture blacklist hook: adds nothing to the
 * blacklist and reports success (0).
 */
int __init __weak arch_populate_kprobe_blacklist(void)
{
	return 0;
}
2014-04-17 12:17:05 +04:00
/*
 * Lookup and populate the kprobe_blacklist.
 *
 * Unlike the kretprobe blacklist, we'll need to determine
 * the range of addresses that belong to the said functions,
 * since a kprobe need not necessarily be at the beginning
 * of a function.
 *
 * @start/@end delimit an array of unsigned long entries; each one is passed
 * through dereference_symbol_descriptor() and blacklisted. Afterwards the
 * '__kprobes_text' and 'noinstr' areas are blacklisted, and finally the
 * architecture hook is run.
 *
 * Returns 0 on success or the first error encountered. An -EINVAL from a
 * single array entry is not treated as an error; that entry is skipped.
 */
static int __init populate_kprobe_blacklist ( unsigned long * start ,
unsigned long * end )
{
2018-12-17 11:20:55 +03:00
unsigned long entry ;
2014-04-17 12:17:05 +04:00
unsigned long * iter ;
2018-12-17 11:20:55 +03:00
int ret ;
2014-04-17 12:17:05 +04:00
for ( iter = start ; iter < end ; iter + + ) {
2021-09-14 17:40:36 +03:00
entry = ( unsigned long ) dereference_symbol_descriptor ( ( void * ) * iter ) ;
2018-12-17 11:20:55 +03:00
ret = kprobe_add_ksym_blacklist ( entry ) ;
/* -EINVAL: not kernel text or no kallsyms match; skip this entry. */
if ( ret = = - EINVAL )
2014-04-17 12:17:05 +04:00
continue ;
2018-12-17 11:20:55 +03:00
if ( ret < 0 )
return ret ;
2014-04-17 12:17:05 +04:00
}
2018-12-17 11:20:55 +03:00
2021-09-14 17:39:34 +03:00
/* Symbols in '__kprobes_text' are blacklisted */
2018-12-17 11:20:55 +03:00
ret = kprobe_add_area_blacklist ( ( unsigned long ) __kprobes_text_start ,
( unsigned long ) __kprobes_text_end ) ;
2020-03-10 16:04:34 +03:00
if ( ret )
return ret ;
2021-09-14 17:39:34 +03:00
/* Symbols in 'noinstr' section are blacklisted */
2020-03-10 16:04:34 +03:00
ret = kprobe_add_area_blacklist ( ( unsigned long ) __noinstr_text_start ,
( unsigned long ) __noinstr_text_end ) ;
2018-12-17 11:20:55 +03:00
/* GNU "?:" form: return ret when non-zero, else the arch hook's result. */
return ret ? : arch_populate_kprobe_blacklist ( ) ;
2014-04-17 12:17:05 +04:00
}
2020-03-26 17:49:48 +03:00
/*
 * Blacklist everything @mod asks to keep probe-free: each address listed in
 * mod->kprobe_blacklist, then the module's kprobes text area and its noinstr
 * text area (each only when its start address is non-zero).
 *
 * The return values of the blacklist helpers are not checked here.
 */
static void add_module_kprobe_blacklist ( struct module * mod )
{
unsigned long start , end ;
2020-03-26 17:50:00 +03:00
int i ;
if ( mod - > kprobe_blacklist ) {
for ( i = 0 ; i < mod - > num_kprobe_blacklist ; i + + )
kprobe_add_ksym_blacklist ( mod - > kprobe_blacklist [ i ] ) ;
}
2020-03-26 17:49:48 +03:00
/* Module kprobes text area: [start, start + kprobes_text_size). */
start = ( unsigned long ) mod - > kprobes_text_start ;
if ( start ) {
end = start + mod - > kprobes_text_size ;
kprobe_add_area_blacklist ( start , end ) ;
}
2020-03-10 16:04:34 +03:00
/* Module noinstr text area: [start, start + noinstr_text_size). */
start = ( unsigned long ) mod - > noinstr_text_start ;
if ( start ) {
end = start + mod - > noinstr_text_size ;
kprobe_add_area_blacklist ( start , end ) ;
}
2020-03-26 17:49:48 +03:00
}
/*
 * Drop @mod's entries from the kprobe blacklist: each address listed in
 * mod->kprobe_blacklist, then every entry starting inside the module's
 * kprobes text area and its noinstr text area (each only when its start
 * address is non-zero).
 */
static void remove_module_kprobe_blacklist ( struct module * mod )
{
unsigned long start , end ;
2020-03-26 17:50:00 +03:00
int i ;
if ( mod - > kprobe_blacklist ) {
for ( i = 0 ; i < mod - > num_kprobe_blacklist ; i + + )
kprobe_remove_ksym_blacklist ( mod - > kprobe_blacklist [ i ] ) ;
}
2020-03-26 17:49:48 +03:00
/* Module kprobes text area: [start, start + kprobes_text_size). */
start = ( unsigned long ) mod - > kprobes_text_start ;
if ( start ) {
end = start + mod - > kprobes_text_size ;
kprobe_remove_area_blacklist ( start , end ) ;
}
2020-03-10 16:04:34 +03:00
/* Module noinstr text area: [start, start + noinstr_text_size). */
start = ( unsigned long ) mod - > noinstr_text_start ;
if ( start ) {
end = start + mod - > noinstr_text_size ;
kprobe_remove_area_blacklist ( start , end ) ;
}
2020-03-26 17:49:48 +03:00
}
2009-01-07 01:41:52 +03:00
/*
 * Module notifier call back, checking kprobes on the module.
 *
 * @val is the module state being notified and @data the struct module.
 *  - MODULE_STATE_COMING: the module's blacklist entries are added, and the
 *    callback then returns without touching any probe.
 *  - MODULE_STATE_LIVE: probes placed in the module's init area are killed.
 *  - MODULE_STATE_GOING: probes placed in the module's init or core area are
 *    killed and the module's blacklist entries are removed.
 * Any other state is ignored. Always returns NOTIFY_DONE.
 */
2014-04-17 12:17:54 +04:00
static int kprobes_module_callback ( struct notifier_block * nb ,
unsigned long val , void * data )
2009-01-07 01:41:52 +03:00
{
struct module * mod = data ;
struct hlist_head * head ;
struct kprobe * p ;
unsigned int i ;
2009-01-07 01:41:55 +03:00
/* The core area is only checked when the module is going away. */
int checkcore = ( val = = MODULE_STATE_GOING ) ;
2009-01-07 01:41:52 +03:00
2020-03-26 17:49:48 +03:00
if ( val = = MODULE_STATE_COMING ) {
mutex_lock ( & kprobe_mutex ) ;
add_module_kprobe_blacklist ( mod ) ;
mutex_unlock ( & kprobe_mutex ) ;
}
2009-01-07 01:41:55 +03:00
if ( val ! = MODULE_STATE_GOING & & val ! = MODULE_STATE_LIVE )
2009-01-07 01:41:52 +03:00
return NOTIFY_DONE ;
/*
2021-09-14 17:39:34 +03:00
* When ' MODULE_STATE_GOING ' was notified , both of module ' . text ' and
* ' . init . text ' sections would be freed . When ' MODULE_STATE_LIVE ' was
* notified , only ' . init . text ' section would be freed . We need to
2009-01-07 01:41:55 +03:00
* disable kprobes which have been inserted in the sections .
2009-01-07 01:41:52 +03:00
*/
mutex_lock ( & kprobe_mutex ) ;
for ( i = 0 ; i < KPROBE_TABLE_SIZE ; i + + ) {
head = & kprobe_table [ i ] ;
2020-05-12 11:02:44 +03:00
hlist_for_each_entry ( p , head , hlist )
2009-01-07 01:41:55 +03:00
if ( within_module_init ( ( unsigned long ) p - > addr , mod ) | |
( checkcore & &
within_module_core ( ( unsigned long ) p - > addr , mod ) ) ) {
2009-01-07 01:41:52 +03:00
/*
* The vaddr this probe is installed will soon
* be vfreed but not synced to disk . Hence ,
* disarming the breakpoint isn ' t needed .
2017-05-16 21:58:35 +03:00
*
* Note , this will also move any optimized probes
* that are pending to be removed from their
2021-09-14 17:39:34 +03:00
* corresponding lists to the ' freeing_list ' and
2017-05-16 21:58:35 +03:00
* will not be touched by the delayed
2021-09-14 17:39:34 +03:00
* kprobe_optimizer ( ) work handler .
2009-01-07 01:41:52 +03:00
*/
kill_kprobe ( p ) ;
}
}
2020-03-26 17:49:48 +03:00
if ( val = = MODULE_STATE_GOING )
remove_module_kprobe_blacklist ( mod ) ;
2009-01-07 01:41:52 +03:00
mutex_unlock ( & kprobe_mutex ) ;
return NOTIFY_DONE ;
}
/* Notifier block that routes module state changes to kprobes_module_callback(). */
static struct notifier_block kprobe_module_nb = {
	.priority	= 0,
	.notifier_call	= kprobes_module_callback,
};
2020-09-10 11:55:05 +03:00
/*
 * Kill every registered kprobe whose address lies in the init memory range
 * [__init_begin, __init_end). The whole kprobe_table is scanned under
 * kprobe_mutex.
 */
void kprobe_free_init_mem ( void )
{
void * start = ( void * ) ( & __init_begin ) ;
void * end = ( void * ) ( & __init_end ) ;
struct hlist_head * head ;
struct kprobe * p ;
int i ;
mutex_lock ( & kprobe_mutex ) ;
2021-09-14 17:39:34 +03:00
/* Kill all kprobes on initmem because the target code has been freed. */
2020-09-10 11:55:05 +03:00
for ( i = 0 ; i < KPROBE_TABLE_SIZE ; i + + ) {
head = & kprobe_table [ i ] ;
hlist_for_each_entry ( p , head , hlist ) {
/* Half-open range check: start <= addr < end. */
if ( start < = ( void * ) p - > addr & & ( void * ) p - > addr < end )
kill_kprobe ( p ) ;
}
}
mutex_unlock ( & kprobe_mutex ) ;
}
2005-04-17 02:20:36 +04:00
/*
 * One-time kprobes initialisation (registered below as an early initcall).
 *
 * Steps, in order: initialise every bucket of kprobe_table; populate the
 * kprobe blacklist (a failure is only logged); resolve the address of each
 * named kretprobe_blacklist entry (a failed lookup is only logged); mark
 * kprobes as armed; call arch_init_kprobes(); register the die notifier and
 * then the module notifier, each only if the previous step succeeded.
 * kprobes_initialized records whether that chain succeeded, and
 * kprobe_sysctls_init() is called regardless of the outcome.
 *
 * Returns 0 on success, or the first error from arch_init_kprobes(),
 * register_die_notifier() or register_module_notifier().
 */
static int __init init_kprobes ( void )
{
2023-07-11 21:53:53 +03:00
int i , err ;
2005-04-17 02:20:36 +04:00
/* FIXME allocate the probe table, currently defined statically */
/* initialize all list heads */
2020-08-29 16:03:24 +03:00
for ( i = 0 ; i < KPROBE_TABLE_SIZE ; i + + )
2005-04-17 02:20:36 +04:00
INIT_HLIST_HEAD ( & kprobe_table [ i ] ) ;
2014-04-17 12:17:05 +04:00
err = populate_kprobe_blacklist ( __start_kprobe_blacklist ,
__stop_kprobe_blacklist ) ;
2021-09-14 17:39:34 +03:00
/* Not fatal: err is overwritten by arch_init_kprobes() below. */
if ( err )
2021-09-14 17:39:25 +03:00
pr_err ( " Failed to populate blacklist (error %d), kprobes not restricted, be careful using them! \n " , err ) ;
2008-04-28 13:14:26 +04:00
2007-10-16 12:27:49 +04:00
if ( kretprobe_blacklist_size ) {
/* lookup the function address from its name */
for ( i = 0 ; kretprobe_blacklist [ i ] . name ! = NULL ; i + + ) {
2017-04-19 15:51:00 +03:00
kretprobe_blacklist [ i ] . addr =
2017-04-19 15:51:01 +03:00
kprobe_lookup_name ( kretprobe_blacklist [ i ] . name , 0 ) ;
2007-10-16 12:27:49 +04:00
if ( ! kretprobe_blacklist [ i ] . addr )
2021-09-14 17:39:25 +03:00
pr_err ( " Failed to lookup symbol '%s' for kretprobe blacklist. Maybe the target function is removed or renamed. \n " ,
2007-10-16 12:27:49 +04:00
kretprobe_blacklist [ i ] . name ) ;
}
}
2009-04-07 06:01:01 +04:00
/* By default, kprobes are armed */
kprobes_all_disarmed = false ;
2007-05-08 11:34:16 +04:00
2021-02-18 17:29:23 +03:00
# if defined(CONFIG_OPTPROBES) && defined(__ARCH_WANT_KPROBES_INSN_SLOT)
2021-09-14 17:39:34 +03:00
/* Init 'kprobe_optinsn_slots' for allocation */
2021-02-18 17:29:23 +03:00
kprobe_optinsn_slots . insn_size = MAX_OPTINSN_SIZE ;
# endif
2005-07-06 05:54:50 +04:00
err = arch_init_kprobes ( ) ;
[PATCH] Return probe redesign: architecture independent changes
The following is the second version of the function return probe patches
I sent out earlier this week. Changes since my last submission include:
* Fix in ppc64 code removing an unneeded call to re-enable preemption
* Fix a build problem in ia64 when kprobes was turned off
* Added another BUG_ON check to each of the architecture trampoline
handlers
My initial patch description ==>
From my experiences with adding return probes to x86_64 and ia64, and the
feedback on LKML to those patches, I think we can simplify the design
for return probes.
The following patch tweaks the original design such that:
* Instead of storing the stack address in the return probe instance, the
task pointer is stored. This gives us all we need in order to:
- find the correct return probe instance when we enter the trampoline
(even if we are recursing)
- find all left-over return probe instances when the task is going away
This has the side effect of simplifying the implementation since more
work can be done in kernel/kprobes.c since architecture specific knowledge
of the stack layout is no longer required. Specifically, we no longer have:
- arch_get_kprobe_task()
- arch_kprobe_flush_task()
- get_rp_inst_tsk()
- get_rp_inst()
- trampoline_post_handler() <see next bullet>
* Instead of splitting the return probe handling and cleanup logic across
the pre and post trampoline handlers, all the work is pushed into the
pre function (trampoline_probe_handler), and then we skip single stepping
the original function. In this case the original instruction to be single
stepped was just a NOP, and we can do without the extra interruption.
The new flow of events to having a return probe handler execute when a target
function exits is:
* At system initialization time, a kprobe is inserted at the beginning of
kretprobe_trampoline. kernel/kprobes.c use to handle this on it's own,
but ia64 needed to do this a little differently (i.e. a function pointer
is really a pointer to a structure containing the instruction pointer and
a global pointer), so I added the notion of arch_init(), so that
kernel/kprobes.c:init_kprobes() now allows architecture specific
initialization by calling arch_init() before exiting. Each architecture
now registers a kprobe on it's own trampoline function.
* register_kretprobe() will insert a kprobe at the beginning of the targeted
function with the kprobe pre_handler set to arch_prepare_kretprobe
(still no change)
* When the target function is entered, the kprobe is fired, calling
arch_prepare_kretprobe (still no change)
* In arch_prepare_kretprobe() we try to get a free instance and if one is
available then we fill out the instance with a pointer to the return probe,
the original return address, and a pointer to the task structure (instead
of the stack address.) Just like before we change the return address
to the trampoline function and mark the instance as used.
If multiple return probes are registered for a given target function,
then arch_prepare_kretprobe() will get called multiple times for the same
task (since our kprobe implementation is able to handle multiple kprobes
at the same address.) Past the first call to arch_prepare_kretprobe,
we end up with the original address stored in the return probe instance
pointing to our trampoline function. (This is a significant difference
from the original arch_prepare_kretprobe design.)
* Target function executes like normal and then returns to kretprobe_trampoline.
* kprobe inserted on the first instruction of kretprobe_trampoline is fired
and calls trampoline_probe_handler() (no change here)
* trampoline_probe_handler() consumes each of the instances associated with
the current task by calling the registered handler function and marking
the instance as unused until an instance is found that has a return address
different then the trampoline function.
(change similar to my previous ia64 RFC)
* If the task is killed with some left-over return probe instances (meaning
that a target function was entered, but never returned), then we just
free any instances associated with the task. (Not much different other
then we can handle this without calling architecture specific functions.)
There is a known problem that this patch does not yet solve where
registering a return probe flush_old_exec or flush_thread will put us
in a bad state. Most likely the best way to handle this is to not allow
registering return probes on these two functions.
(Significant change)
This patch series applies to the 2.6.12-rc6-mm1 kernel, and provides:
* kernel/kprobes.c changes
* i386 patch of existing return probes implementation
* x86_64 patch of existing return probe implementation
* ia64 implementation
* ppc64 implementation (provided by Ananth)
This patch implements the architecture independant changes for a reworking
of the kprobes based function return probes design. Changes include:
* Removing functions for querying a return probe instance off a stack address
* Removing the stack_addr field from the kretprobe_instance definition,
and adding a task pointer
* Adding architecture specific initialization via arch_init()
* Removing extern definitions for the architecture trampoline functions
(this isn't needed anymore since the architecture handles the
initialization of the kprobe in the return probe trampoline function.)
Signed-off-by: Rusty Lynch <rusty.lynch@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-28 02:17:08 +04:00
/* Each notifier is registered only if everything before it succeeded. */
if ( ! err )
err = register_die_notifier ( & kprobe_exceptions_nb ) ;
2009-01-07 01:41:52 +03:00
if ( ! err )
err = register_module_notifier ( & kprobe_module_nb ) ;
2008-07-25 12:46:04 +04:00
kprobes_initialized = ( err = = 0 ) ;
2022-01-22 09:13:41 +03:00
kprobe_sysctls_init ( ) ;
2005-04-17 02:20:36 +04:00
return err ;
}
2020-09-10 15:38:39 +03:00
early_initcall ( init_kprobes ) ;
2005-04-17 02:20:36 +04:00
2021-02-18 17:29:23 +03:00
# if defined(CONFIG_OPTPROBES)
/*
 * Deferred start of kprobe optimization. Always returns 0.
 */
static int __init init_optprobes(void)
{
	/*
	 * Turning optimization on kicks the optimizer, which relies on
	 * synchronize_rcu_tasks() and on ksoftirqd. ksoftirqd has not been
	 * spawned yet at early-initcall time, hence the optimization is
	 * postponed until this point.
	 */
	optimize_all_kprobes();

	return 0;
}
subsys_initcall ( init_optprobes ) ;
# endif
2007-02-21 00:57:54 +03:00
# ifdef CONFIG_DEBUG_FS
2014-04-17 12:17:54 +04:00
/*
 * Print one line describing probe @p into the seq_file @pi.
 *
 * The line holds the probe address, a type letter ("r" when the pre_handler
 * is pre_handler_kretprobe, otherwise "k"), the location (either
 * @sym+@offset followed by @modname, or a %pS rendering of the address when
 * @sym is NULL) and the state tags [GONE], [DISABLED], [OPTIMIZED], [FTRACE].
 *
 * @pp is the probe whose optimized/ftrace state is reported; when NULL, @p
 * itself is used. The raw address is replaced by NULL unless
 * kallsyms_show_value() allows the opener's credentials to see it.
 */
static void report_probe ( struct seq_file * pi , struct kprobe * p ,
2010-02-25 16:34:07 +03:00
const char * sym , int offset , char * modname , struct kprobe * pp )
2007-02-21 00:57:54 +03:00
{
char * kprobe_type ;
2018-04-28 15:36:02 +03:00
void * addr = p - > addr ;
2007-02-21 00:57:54 +03:00
if ( p - > pre_handler = = pre_handler_kretprobe )
kprobe_type = " r " ;
else
kprobe_type = " k " ;
2010-02-25 16:34:07 +03:00
2020-07-03 01:20:22 +03:00
/* Hide the raw address from readers not permitted to see kernel addresses. */
if ( ! kallsyms_show_value ( pi - > file - > f_cred ) )
2018-04-28 15:36:02 +03:00
addr = NULL ;
2007-02-21 00:57:54 +03:00
if ( sym )
2018-04-28 15:36:02 +03:00
seq_printf ( pi , " %px %s %s+0x%x %s " ,
addr , kprobe_type , sym , offset ,
2010-02-25 16:34:07 +03:00
( modname ? modname : " " ) ) ;
2018-04-28 15:36:02 +03:00
else /* try to use %pS */
seq_printf ( pi , " %px %s %pS " ,
addr , kprobe_type , p - > addr ) ;
2010-02-25 16:34:07 +03:00
if ( ! pp )
pp = p ;
2012-06-05 14:28:32 +04:00
/* [DISABLED] is suppressed for a probe that is already reported as [GONE]. */
seq_printf ( pi , " %s%s%s%s \n " ,
2010-02-25 16:34:07 +03:00
( kprobe_gone ( p ) ? " [GONE] " : " " ) ,
( ( kprobe_disabled ( p ) & & ! kprobe_gone ( p ) ) ? " [DISABLED] " : " " ) ,
2012-06-05 14:28:32 +04:00
( kprobe_optimized ( pp ) ? " [OPTIMIZED] " : " " ) ,
( kprobe_ftrace ( pp ) ? " [FTRACE] " : " " ) ) ;
2007-02-21 00:57:54 +03:00
}
2014-04-17 12:17:54 +04:00
/*
 * seq_file ->start callback for the kprobes listing.
 *
 * The position pointer itself is used as the iterator cursor. Returns @pos
 * while *pos is below KPROBE_TABLE_SIZE, and NULL once it is not.
 */
static void * kprobe_seq_start ( struct seq_file * f , loff_t * pos )
2007-02-21 00:57:54 +03:00
{
return ( * pos < KPROBE_TABLE_SIZE ) ? pos : NULL ;
}
2014-04-17 12:17:54 +04:00
/*
 * seq_file ->next callback for the kprobes listing.
 *
 * Increments *pos and returns @pos as the new cursor, or NULL once *pos
 * has reached KPROBE_TABLE_SIZE. @v (the previous cursor) is not used.
 */
static void * kprobe_seq_next ( struct seq_file * f , void * v , loff_t * pos )
2007-02-21 00:57:54 +03:00
{
( * pos ) + + ;
if ( * pos > = KPROBE_TABLE_SIZE )
return NULL ;
return pos ;
}
2014-04-17 12:17:54 +04:00
/*
 * seq_file ->stop callback for the kprobes listing. Intentionally empty:
 * kprobe_seq_start() acquires nothing that would need releasing here.
 */
static void kprobe_seq_stop ( struct seq_file * f , void * v )
2007-02-21 00:57:54 +03:00
{
/* Nothing to do */
}
2014-04-17 12:17:54 +04:00
/*
 * seq_file ->show callback for the kprobes listing: prints every probe
 * found in one bucket of kprobe_table via report_probe().
 *
 * pi: seq_file being written
 * v:  iterator cursor; dereferenced as a loff_t and used as the bucket index
 *
 * Returns 0.
 */
static int show_kprobe_addr ( struct seq_file * pi , void * v )
2007-02-21 00:57:54 +03:00
{
struct hlist_head * head ;
struct kprobe * p , * kp ;
const char * sym = NULL ;
/* The cursor handed out by the start/next callbacks points at the position. */
unsigned int i = * ( loff_t * ) v ;
2007-05-08 11:28:41 +04:00
/*
 * NOTE(review): offset is unsigned long here but is later passed to
 * report_probe(), whose offset parameter is int.
 */
unsigned long offset = 0 ;
2013-11-13 03:10:23 +04:00
char * modname , namebuf [ KSYM_NAME_LEN ] ;
2007-02-21 00:57:54 +03:00
head = & kprobe_table [ i ] ;
/* Preemption stays disabled for the whole RCU list walk that follows. */
preempt_disable ( ) ;
hlist: drop the node parameter from iterators
I'm not sure why, but the hlist for-each-entry iterators were conceived differently from the list ones. The list iterator looks like:
list_for_each_entry(pos, head, member)
The hlist ones were greedy and wanted an extra parameter:
hlist_for_each_entry(tpos, pos, head, member)
Why did they need an extra pos parameter? I'm not quite sure. Not only
do they not really need it, it also prevents the iterator from looking
exactly like the list iterator, which is unfortunate.
Besides the semantic patch, there was some manual work required:
- Fix up the actual hlist iterators in linux/list.h
- Fix up the declaration of other iterators based on the hlist ones.
- A very small number of places were using the 'node' parameter; these
were modified to use 'obj->member' instead.
- Coccinelle didn't handle the hlist_for_each_entry_safe iterator
properly, so those had to be fixed up manually.
The semantic patch which is mostly the work of Peter Senna Tschudin is here:
@@
iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host;
type T;
expression a,c,d,e;
identifier b;
statement S;
@@
-T b;
<+... when != b
(
hlist_for_each_entry(a,
- b,
c, d) S
|
hlist_for_each_entry_continue(a,
- b,
c) S
|
hlist_for_each_entry_from(a,
- b,
c) S
|
hlist_for_each_entry_rcu(a,
- b,
c, d) S
|
hlist_for_each_entry_rcu_bh(a,
- b,
c, d) S
|
hlist_for_each_entry_continue_rcu_bh(a,
- b,
c) S
|
for_each_busy_worker(a, c,
- b,
d) S
|
ax25_uid_for_each(a,
- b,
c) S
|
ax25_for_each(a,
- b,
c) S
|
inet_bind_bucket_for_each(a,
- b,
c) S
|
sctp_for_each_hentry(a,
- b,
c) S
|
sk_for_each(a,
- b,
c) S
|
sk_for_each_rcu(a,
- b,
c) S
|
sk_for_each_from
-(a, b)
+(a)
S
+ sk_for_each_from(a) S
|
sk_for_each_safe(a,
- b,
c, d) S
|
sk_for_each_bound(a,
- b,
c) S
|
hlist_for_each_entry_safe(a,
- b,
c, d, e) S
|
hlist_for_each_entry_continue_rcu(a,
- b,
c) S
|
nr_neigh_for_each(a,
- b,
c) S
|
nr_neigh_for_each_safe(a,
- b,
c, d) S
|
nr_node_for_each(a,
- b,
c) S
|
nr_node_for_each_safe(a,
- b,
c, d) S
|
- for_each_gfn_sp(a, c, d, b) S
+ for_each_gfn_sp(a, c, d) S
|
- for_each_gfn_indirect_valid_sp(a, c, d, b) S
+ for_each_gfn_indirect_valid_sp(a, c, d) S
|
for_each_host(a,
- b,
c) S
|
for_each_host_safe(a,
- b,
c, d) S
|
for_each_mesh_entry(a,
- b,
c, d) S
)
...+>
[akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c]
[akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c]
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix warnings]
[akpm@linux-foundation.org: redo intrusive kvm changes]
Tested-by: Peter Senna Tschudin <peter.senna@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 05:06:00 +04:00
/* Walk every probe hashed into this bucket. */
hlist_for_each_entry_rcu ( p , head , hlist ) {
2007-05-08 11:28:41 +04:00
/* Look up symbol name, offset and module name for the probe address. */
sym = kallsyms_lookup ( ( unsigned long ) p - > addr , NULL ,
2007-02-21 00:57:54 +03:00
& offset , & modname , namebuf ) ;
2010-02-25 16:34:07 +03:00
/*
 * For an aggregate probe, report each probe on its ->list and pass the
 * aggregate as pp; otherwise report p itself with pp == NULL.
 */
if ( kprobe_aggrprobe ( p ) ) {
2007-02-21 00:57:54 +03:00
list_for_each_entry_rcu ( kp , & p - > list , list )
2010-02-25 16:34:07 +03:00
report_probe ( pi , kp , sym , offset , modname , p ) ;
2007-02-21 00:57:54 +03:00
} else
2010-02-25 16:34:07 +03:00
report_probe ( pi , p , sym , offset , modname , NULL ) ;
2007-02-21 00:57:54 +03:00
}
/* Pairs with the preempt_disable() issued before the walk. */
preempt_enable ( ) ;
return 0 ;
}
2020-06-05 02:51:11 +03:00
/*
 * seq_file iterator for the kprobes listing: the cursor steps through
 * kprobe_table bucket indices and show_kprobe_addr() prints one bucket.
 */
static const struct seq_operations kprobes_sops = {
2007-02-21 00:57:54 +03:00
. start = kprobe_seq_start ,
. next = kprobe_seq_next ,
. stop = kprobe_seq_stop ,
. show = show_kprobe_addr
} ;
2020-06-05 02:51:11 +03:00
/*
 * Presumably generates kprobes_fops (used by debugfs_kprobe_init()) from
 * kprobes_sops -- confirm against the DEFINE_SEQ_ATTRIBUTE definition.
 */
DEFINE_SEQ_ATTRIBUTE ( kprobes ) ;
2007-02-21 00:57:54 +03:00
2014-04-17 12:18:49 +04:00
/* kprobes/blacklist -- shows which functions can not be probed */
/*
 * seq_file ->start callback for the blacklist listing.
 *
 * Takes kprobe_mutex, which kprobe_blacklist_seq_stop() releases, and
 * returns the seq_list_start() result for kprobe_blacklist at *pos.
 */
static void * kprobe_blacklist_seq_start ( struct seq_file * m , loff_t * pos )
{
2020-03-26 17:49:36 +03:00
mutex_lock ( & kprobe_mutex ) ;
2014-04-17 12:18:49 +04:00
return seq_list_start ( & kprobe_blacklist , * pos ) ;
}
/*
 * seq_file ->next callback for the blacklist listing: delegates to
 * seq_list_next() on kprobe_blacklist and returns its result.
 */
static void * kprobe_blacklist_seq_next ( struct seq_file * m , void * v , loff_t * pos )
{
return seq_list_next ( v , & kprobe_blacklist , pos ) ;
}
/*
 * seq_file ->show callback for the blacklist listing.
 *
 * @v is the list node of a struct kprobe_blacklist_entry. Prints the
 * entry's start-end address range followed by the %ps rendering of its
 * start address. Both range values are printed as NULL when
 * kallsyms_show_value() returns false for the file's credentials.
 * Returns 0.
 */
static int kprobe_blacklist_seq_show ( struct seq_file * m , void * v )
{
struct kprobe_blacklist_entry * ent =
list_entry ( v , struct kprobe_blacklist_entry , list ) ;
2018-04-28 15:35:32 +03:00
/*
2021-09-14 17:39:34 +03:00
* If ' / proc / kallsyms ' is not showing kernel address , we won ' t
2018-04-28 15:35:32 +03:00
* show them here either .
*/
2020-07-03 01:20:22 +03:00
if ( ! kallsyms_show_value ( m - > file - > f_cred ) )
2018-04-28 15:35:32 +03:00
seq_printf ( m , " 0x%px-0x%px \t %ps \n " , NULL , NULL ,
( void * ) ent - > start_addr ) ;
else
seq_printf ( m , " 0x%px-0x%px \t %ps \n " , ( void * ) ent - > start_addr ,
( void * ) ent - > end_addr , ( void * ) ent - > start_addr ) ;
2014-04-17 12:18:49 +04:00
return 0 ;
}
2020-03-26 17:49:36 +03:00
/*
 * seq_file ->stop callback for the blacklist listing: releases the
 * kprobe_mutex taken in kprobe_blacklist_seq_start().
 */
static void kprobe_blacklist_seq_stop ( struct seq_file * f , void * v )
{
mutex_unlock ( & kprobe_mutex ) ;
}
2020-06-05 02:51:11 +03:00
/*
 * seq_file iterator for the blacklist listing. start/stop bracket the walk
 * of kprobe_blacklist with kprobe_mutex held.
 */
static const struct seq_operations kprobe_blacklist_sops = {
2014-04-17 12:18:49 +04:00
. start = kprobe_blacklist_seq_start ,
. next = kprobe_blacklist_seq_next ,
2020-03-26 17:49:36 +03:00
. stop = kprobe_blacklist_seq_stop ,
2014-04-17 12:18:49 +04:00
. show = kprobe_blacklist_seq_show ,
} ;
2020-06-05 02:51:11 +03:00
/*
 * Presumably generates kprobe_blacklist_fops (used by debugfs_kprobe_init())
 * from kprobe_blacklist_sops -- confirm against DEFINE_SEQ_ATTRIBUTE.
 */
DEFINE_SEQ_ATTRIBUTE ( kprobe_blacklist ) ;
2014-04-17 12:18:49 +04:00
2018-01-10 02:51:23 +03:00
/*
 * Globally arm kprobes: clear kprobes_all_disarmed and call arm_kprobe()
 * on every probe in kprobe_table that is not individually disabled.
 *
 * Runs with kprobe_mutex held. If kprobes are already armed it does
 * nothing and returns 0. Arming is best-effort: a failure on one probe
 * does not stop the walk. The failure count is logged and the error code
 * of the last failing probe is returned; 0 means no arm_kprobe() call
 * failed.
 */
static int arm_all_kprobes ( void )
2007-05-08 11:34:16 +04:00
{
struct hlist_head * head ;
struct kprobe * p ;
2018-01-10 02:51:23 +03:00
/* total counts probes we tried to arm; errors counts the failed ones. */
unsigned int i , total = 0 , errors = 0 ;
int err , ret = 0 ;
2007-05-08 11:34:16 +04:00
mutex_lock ( & kprobe_mutex ) ;
2009-04-07 06:01:01 +04:00
/* If kprobes are armed, just return */
if ( ! kprobes_all_disarmed )
2007-05-08 11:34:16 +04:00
goto already_enabled ;
2015-02-14 01:40:24 +03:00
/*
* optimize_kprobe ( ) called by arm_kprobe ( ) checks
* kprobes_all_disarmed , so set kprobes_all_disarmed before
* arm_kprobe .
*/
kprobes_all_disarmed = false ;
2010-02-25 16:34:07 +03:00
/* Arming kprobes doesn't optimize kprobe itself */
2007-05-08 11:34:16 +04:00
for ( i = 0 ; i < KPROBE_TABLE_SIZE ; i + + ) {
head = & kprobe_table [ i ] ;
2018-01-10 02:51:23 +03:00
/* Arm all kprobes on a best-effort basis */
2020-05-12 11:02:44 +03:00
hlist_for_each_entry ( p , head , hlist ) {
2018-01-10 02:51:23 +03:00
if ( ! kprobe_disabled ( p ) ) {
err = arm_kprobe ( p ) ;
if ( err ) {
errors + + ;
/* Remember only the most recent failure for the return value. */
ret = err ;
}
total + + ;
}
}
2007-05-08 11:34:16 +04:00
}
2018-01-10 02:51:23 +03:00
/*
 * NOTE(review): errors and total are unsigned int but are printed with
 * %d below; %u would match the declared types.
 */
if ( errors )
2021-09-14 17:39:25 +03:00
pr_warn ( " Kprobes globally enabled, but failed to enable %d out of %d probes. Please check which kprobes are kept disabled via debugfs. \n " ,
2018-01-10 02:51:23 +03:00
errors , total ) ;
else
pr_info ( " Kprobes globally enabled \n " ) ;
2007-05-08 11:34:16 +04:00
already_enabled :
mutex_unlock ( & kprobe_mutex ) ;
2018-01-10 02:51:23 +03:00
return ret ;
2007-05-08 11:34:16 +04:00
}
2018-01-10 02:51:24 +03:00
/*
 * Globally disarm kprobes: set kprobes_all_disarmed and call
 * disarm_kprobe() on every probe in kprobe_table that is neither an arch
 * trampoline probe nor individually disabled.
 *
 * The table walk runs with kprobe_mutex held. If kprobes are already
 * disarmed it returns 0 immediately. Disarming is best-effort: a failure
 * on one probe does not stop the walk. The failure count is logged and
 * the error code of the last failing probe is returned; 0 means no
 * disarm_kprobe() call failed. After dropping the mutex it calls
 * wait_for_kprobe_optimizer() before returning.
 */
static int disarm_all_kprobes ( void )
2007-05-08 11:34:16 +04:00
{
struct hlist_head * head ;
struct kprobe * p ;
2018-01-10 02:51:24 +03:00
/* total counts probes we tried to disarm; errors counts the failed ones. */
unsigned int i , total = 0 , errors = 0 ;
int err , ret = 0 ;
2007-05-08 11:34:16 +04:00
mutex_lock ( & kprobe_mutex ) ;
2009-04-07 06:01:01 +04:00
/* If kprobes are already disarmed, just return */
2010-12-03 12:54:09 +03:00
if ( kprobes_all_disarmed ) {
mutex_unlock ( & kprobe_mutex ) ;
2018-01-10 02:51:24 +03:00
return 0 ;
2010-12-03 12:54:09 +03:00
}
2007-05-08 11:34:16 +04:00
2009-04-07 06:01:01 +04:00
kprobes_all_disarmed = true ;
2010-02-25 16:34:07 +03:00
2007-05-08 11:34:16 +04:00
for ( i = 0 ; i < KPROBE_TABLE_SIZE ; i + + ) {
head = & kprobe_table [ i ] ;
2018-01-10 02:51:24 +03:00
/* Disarm all kprobes on a best-effort basis */
2020-05-12 11:02:44 +03:00
hlist_for_each_entry ( p , head , hlist ) {
2018-01-10 02:51:24 +03:00
if ( ! arch_trampoline_kprobe ( p ) & & ! kprobe_disabled ( p ) ) {
err = disarm_kprobe ( p , false ) ;
if ( err ) {
errors + + ;
/* Remember only the most recent failure for the return value. */
ret = err ;
}
total + + ;
}
2007-05-08 11:34:16 +04:00
}
}
2018-01-10 02:51:24 +03:00
/*
 * NOTE(review): errors and total are unsigned int but are printed with
 * %d below; %u would match the declared types.
 */
if ( errors )
2021-09-14 17:39:25 +03:00
pr_warn ( " Kprobes globally disabled, but failed to disable %d out of %d probes. Please check which kprobes are kept enabled via debugfs. \n " ,
2018-01-10 02:51:24 +03:00
errors , total ) ;
else
pr_info ( " Kprobes globally disabled \n " ) ;
2007-05-08 11:34:16 +04:00
mutex_unlock ( & kprobe_mutex ) ;
2010-12-03 12:54:09 +03:00
/* Wait for disarming all kprobes by optimizer */
wait_for_kprobe_optimizer ( ) ;
2018-01-10 02:51:24 +03:00
return ret ;
2007-05-08 11:34:16 +04:00
}
/*
* XXX : The debugfs bool file interface doesn ' t allow for callbacks
* when the bool state is switched . We can reuse that facility when
* available
*/
/*
 * ->read handler reporting the global kprobes state.
 *
 * Builds a two-character reply -- the digit 1 when kprobes are not
 * globally disarmed, 0 otherwise, followed by a newline -- and hands it
 * to simple_read_from_buffer(), whose return value is returned unchanged.
 */
static ssize_t read_enabled_file_bool ( struct file * file ,
char __user * user_buf , size_t count , loff_t * ppos )
{
char buf [ 3 ] ;
2009-04-07 06:01:01 +04:00
if ( ! kprobes_all_disarmed )
2007-05-08 11:34:16 +04:00
buf [ 0 ] = ' 1 ' ;
else
buf [ 0 ] = ' 0 ' ;
buf [ 1 ] = ' \n ' ;
buf [ 2 ] = 0x00 ;
/* Only the digit and the newline are exposed; the NUL is not copied out. */
return simple_read_from_buffer ( user_buf , count , ppos , buf , 2 ) ;
}
/*
 * ->write handler toggling the global kprobes state.
 *
 * Parses the user buffer as a boolean with kstrtobool_from_user(), then
 * calls arm_all_kprobes() for true or disarm_all_kprobes() for false.
 * Returns the non-zero result of whichever of those steps fails, or
 * @count on success.
 */
static ssize_t write_enabled_file_bool ( struct file * file ,
const char __user * user_buf , size_t count , loff_t * ppos )
{
2021-09-14 17:38:46 +03:00
bool enable ;
int ret ;
2007-05-08 11:34:16 +04:00
2021-09-14 17:38:46 +03:00
ret = kstrtobool_from_user ( user_buf , count , & enable ) ;
if ( ret )
return ret ;
2007-05-08 11:34:16 +04:00
2021-09-14 17:38:46 +03:00
ret = enable ? arm_all_kprobes ( ) : disarm_all_kprobes ( ) ;
2018-01-10 02:51:23 +03:00
if ( ret )
return ret ;
2007-05-08 11:34:16 +04:00
/* Report the whole write as consumed. */
return count ;
}
2009-10-02 02:43:56 +04:00
/*
 * File operations for the kprobes "enabled" control file created in
 * debugfs_kprobe_init(): reads report the global armed state, writes arm
 * or disarm all kprobes.
 */
static const struct file_operations fops_kp = {
2007-05-08 11:34:16 +04:00
. read = read_enabled_file_bool ,
. write = write_enabled_file_bool ,
llseek: automatically add .llseek fop
All file_operations should get a .llseek operation so we can make
nonseekable_open the default for future file operations without a
.llseek pointer.
The three cases that we can automatically detect are no_llseek, seq_lseek
and default_llseek. For cases where we can automatically prove that
the file offset is always ignored, we use noop_llseek, which maintains
the current behavior of not returning an error from a seek.
New drivers should normally not use noop_llseek but instead use no_llseek
and call nonseekable_open at open time. Existing drivers can be converted
to do the same when the maintainer knows for certain that no user code
relies on calling seek on the device file.
The generated code is often incorrectly indented and right now contains
comments that clarify for each added line why a specific variant was
chosen. In the version that gets submitted upstream, the comments will
be gone and I will manually fix the indentation, because there does not
seem to be a way to do that using coccinelle.
Some amount of new code is currently sitting in linux-next that should get
the same modifications, which I will do at the end of the merge window.
Many thanks to Julia Lawall for helping me learn to write a semantic
patch that does all this.
===== begin semantic patch =====
// This adds an llseek= method to all file operations,
// as a preparation for making no_llseek the default.
//
// The rules are
// - use no_llseek explicitly if we do nonseekable_open
// - use seq_lseek for sequential files
// - use default_llseek if we know we access f_pos
// - use noop_llseek if we know we don't access f_pos,
// but we still want to allow users to call lseek
//
@ open1 exists @
identifier nested_open;
@@
nested_open(...)
{
<+...
nonseekable_open(...)
...+>
}
@ open exists@
identifier open_f;
identifier i, f;
identifier open1.nested_open;
@@
int open_f(struct inode *i, struct file *f)
{
<+...
(
nonseekable_open(...)
|
nested_open(...)
)
...+>
}
@ read disable optional_qualifier exists @
identifier read_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
expression E;
identifier func;
@@
ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off)
{
<+...
(
*off = E
|
*off += E
|
func(..., off, ...)
|
E = *off
)
...+>
}
@ read_no_fpos disable optional_qualifier exists @
identifier read_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
@@
ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off)
{
... when != off
}
@ write @
identifier write_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
expression E;
identifier func;
@@
ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off)
{
<+...
(
*off = E
|
*off += E
|
func(..., off, ...)
|
E = *off
)
...+>
}
@ write_no_fpos @
identifier write_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
@@
ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off)
{
... when != off
}
@ fops0 @
identifier fops;
@@
struct file_operations fops = {
...
};
@ has_llseek depends on fops0 @
identifier fops0.fops;
identifier llseek_f;
@@
struct file_operations fops = {
...
.llseek = llseek_f,
...
};
@ has_read depends on fops0 @
identifier fops0.fops;
identifier read_f;
@@
struct file_operations fops = {
...
.read = read_f,
...
};
@ has_write depends on fops0 @
identifier fops0.fops;
identifier write_f;
@@
struct file_operations fops = {
...
.write = write_f,
...
};
@ has_open depends on fops0 @
identifier fops0.fops;
identifier open_f;
@@
struct file_operations fops = {
...
.open = open_f,
...
};
// use no_llseek if we call nonseekable_open
////////////////////////////////////////////
@ nonseekable1 depends on !has_llseek && has_open @
identifier fops0.fops;
identifier nso ~= "nonseekable_open";
@@
struct file_operations fops = {
... .open = nso, ...
+.llseek = no_llseek, /* nonseekable */
};
@ nonseekable2 depends on !has_llseek @
identifier fops0.fops;
identifier open.open_f;
@@
struct file_operations fops = {
... .open = open_f, ...
+.llseek = no_llseek, /* open uses nonseekable */
};
// use seq_lseek for sequential files
/////////////////////////////////////
@ seq depends on !has_llseek @
identifier fops0.fops;
identifier sr ~= "seq_read";
@@
struct file_operations fops = {
... .read = sr, ...
+.llseek = seq_lseek, /* we have seq_read */
};
// use default_llseek if there is a readdir
///////////////////////////////////////////
@ fops1 depends on !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier readdir_e;
@@
// any other fop is used that changes pos
struct file_operations fops = {
... .readdir = readdir_e, ...
+.llseek = default_llseek, /* readdir is present */
};
// use default_llseek if at least one of read/write touches f_pos
/////////////////////////////////////////////////////////////////
@ fops2 depends on !fops1 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read.read_f;
@@
// read fops use offset
struct file_operations fops = {
... .read = read_f, ...
+.llseek = default_llseek, /* read accesses f_pos */
};
@ fops3 depends on !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier write.write_f;
@@
// write fops use offset
struct file_operations fops = {
... .write = write_f, ...
+ .llseek = default_llseek, /* write accesses f_pos */
};
// Use noop_llseek if neither read nor write accesses f_pos
///////////////////////////////////////////////////////////
@ fops4 depends on !fops1 && !fops2 && !fops3 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read_no_fpos.read_f;
identifier write_no_fpos.write_f;
@@
// write fops use offset
struct file_operations fops = {
...
.write = write_f,
.read = read_f,
...
+.llseek = noop_llseek, /* read and write both use no f_pos */
};
@ depends on has_write && !has_read && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier write_no_fpos.write_f;
@@
struct file_operations fops = {
... .write = write_f, ...
+.llseek = noop_llseek, /* write uses no f_pos */
};
@ depends on has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read_no_fpos.read_f;
@@
struct file_operations fops = {
... .read = read_f, ...
+.llseek = noop_llseek, /* read uses no f_pos */
};
@ depends on !has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
@@
struct file_operations fops = {
...
+.llseek = noop_llseek, /* no read or write fn */
};
===== End semantic patch =====
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Julia Lawall <julia@diku.dk>
Cc: Christoph Hellwig <hch@infradead.org>
2010-08-15 20:52:59 +04:00
. llseek = default_llseek ,
2007-05-08 11:34:16 +04:00
} ;
2014-04-17 12:17:54 +04:00
/*
 * Create the kprobes debugfs interface: a kprobes directory (created with
 * a NULL parent) holding three files:
 *   list      (mode 0400, kprobes_fops)
 *   enabled   (mode 0600, fops_kp)
 *   blacklist (mode 0400, kprobe_blacklist_fops)
 *
 * The results of the debugfs_create_*() calls are not checked; the
 * function always returns 0. Registered as a late initcall after the body.
 */
static int __init debugfs_kprobe_init ( void )
2007-02-21 00:57:54 +03:00
{
2019-01-22 18:21:46 +03:00
struct dentry * dir ;
2007-02-21 00:57:54 +03:00
dir = debugfs_create_dir ( " kprobes " , NULL ) ;
2020-06-05 02:51:11 +03:00
debugfs_create_file ( " list " , 0400 , dir , NULL , & kprobes_fops ) ;
2007-02-21 00:57:54 +03:00
2021-09-14 17:38:37 +03:00
debugfs_create_file ( " enabled " , 0600 , dir , NULL , & fops_kp ) ;
2014-04-17 12:18:49 +04:00
2019-01-22 18:21:46 +03:00
debugfs_create_file ( " blacklist " , 0400 , dir , NULL ,
2020-06-05 02:51:11 +03:00
& kprobe_blacklist_fops ) ;
2007-05-08 11:34:16 +04:00
2007-02-21 00:57:54 +03:00
return 0 ;
}
late_initcall ( debugfs_kprobe_init ) ;
# endif /* CONFIG_DEBUG_FS */