/*
 * trace event based perf event profiling / tracing
 *
 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
 * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
 */

#include <linux/module.h>
#include <linux/kprobes.h>
#include "trace.h"

static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];

/*
 * Force it to be aligned to unsigned long to avoid misaligned access
 * surprises
 */
typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
	perf_trace_t;

/* Count the events in use (per event id, not per instance) */
static int	total_ref_count;
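
/*
 * Check whether the calling task may attach a perf event to this trace
 * event.  Raw tracepoint samples and the function trace event can expose
 * kernel internals, so they are limited to privileged users (subject to
 * perf_event_paranoid and TRACE_EVENT_FL_CAP_ANY).
 */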
static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
				 struct perf_event *p_event)
{
	if (tp_event->perf_perm) {
		int ret = tp_event->perf_perm(tp_event, p_event);
		if (ret)
			return ret;
	}

	/*
	 * We checked and allowed the parent to be created,
	 * so allow its children without checking again.
	 */
	if (p_event->parent)
		return 0;

	/*
	 * It's ok to check the current process (owner) permissions here,
	 * because the code below is reached only via the perf_event_open
	 * syscall.
	 */

	/* The ftrace function trace is allowed only for root. */
	if (ftrace_event_is_function(tp_event)) {
		if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
			return -EPERM;

		/*
		 * We don't allow user space callchains for the function
		 * trace event, due to issues with page faults while tracing
		 * the page fault handler and its overall trickiness.
		 */
		if (!p_event->attr.exclude_callchain_user)
			return -EINVAL;

		/*
		 * Disable user stack dumps for the same reason as user
		 * space callchains above.
		 */
		if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER)
			return -EINVAL;
	}

	/* No tracing, just counting, so no obvious leak. */
	if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
		return 0;

	/* Some events are ok to be traced by non-root users... */
	if (p_event->attach_state == PERF_ATTACH_TASK) {
		if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
			return 0;
	}

	/*
	 * ...otherwise raw tracepoint data can be a severe data leak,
	 * so only allow root to have these.
	 */
	if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	return 0;
}
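
/*
 * Take a reference on the trace event.  The first perf event attached to
 * a given trace event allocates the per-CPU hlist heads used to find the
 * active events from the trace callback; the first trace event used by
 * perf at all also allocates the shared per-context scratch buffers.
 */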
static int perf_trace_event_reg(struct ftrace_event_call *tp_event,
				struct perf_event *p_event)
{
	struct hlist_head __percpu *list;
	int ret = -ENOMEM;
	int cpu;

	p_event->tp_event = tp_event;
	if (tp_event->perf_refcount++ > 0)
		return 0;

	list = alloc_percpu(struct hlist_head);
	if (!list)
		goto fail;

	for_each_possible_cpu(cpu)
		INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));

	tp_event->perf_events = list;

	if (!total_ref_count) {
		char __percpu *buf;
		int i;

		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
			buf = (char __percpu *)alloc_percpu(perf_trace_t);
			if (!buf)
				goto fail;

			perf_trace_buf[i] = buf;
		}
	}

	ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
	if (ret)
		goto fail;

	total_ref_count++;
	return 0;

fail:
	if (!total_ref_count) {
		int i;

		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
			free_percpu(perf_trace_buf[i]);
			perf_trace_buf[i] = NULL;
		}
	}

	if (!--tp_event->perf_refcount) {
		free_percpu(tp_event->perf_events);
		tp_event->perf_events = NULL;
	}

	return ret;
}
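
/*
 * Drop the reference taken by perf_trace_event_reg().  When the last perf
 * event on this trace event goes away, unregister the perf callback and
 * free the per-CPU hlist heads; once no trace event is used by perf at
 * all anymore, free the shared scratch buffers as well.
 */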
static void perf_trace_event_unreg(struct perf_event *p_event)
{
	struct ftrace_event_call *tp_event = p_event->tp_event;
	int i;

	if (--tp_event->perf_refcount > 0)
		goto out;

	tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);

	/*
	 * Ensure our callback won't be called anymore. The buffers
	 * will be freed after that.
	 */
	tracepoint_synchronize_unregister();

	free_percpu(tp_event->perf_events);
	tp_event->perf_events = NULL;

	if (!--total_ref_count) {
		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
			free_percpu(perf_trace_buf[i]);
			perf_trace_buf[i] = NULL;
		}
	}
out:
	module_put(tp_event->mod);
}

static int perf_trace_event_open(struct perf_event *p_event)
{
	struct ftrace_event_call *tp_event = p_event->tp_event;
	return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
}

static void perf_trace_event_close(struct perf_event *p_event)
{
	struct ftrace_event_call *tp_event = p_event->tp_event;
	tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
}

static int perf_trace_event_init(struct ftrace_event_call *tp_event,
				 struct perf_event *p_event)
{
	int ret;

	ret = perf_trace_event_perm(tp_event, p_event);
	if (ret)
		return ret;

	ret = perf_trace_event_reg(tp_event, p_event);
	if (ret)
		return ret;

	ret = perf_trace_event_open(p_event);
	if (ret) {
		perf_trace_event_unreg(p_event);
		return ret;
	}

	return 0;
}
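
/*
 * Bind a newly created perf event to the trace event identified by
 * attr.config.  Called by the perf core when a tracepoint event is opened.
 */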
int perf_trace_init(struct perf_event *p_event)
{
	struct ftrace_event_call *tp_event;
	u64 event_id = p_event->attr.config;
	int ret = -EINVAL;

	mutex_lock(&event_mutex);
	list_for_each_entry(tp_event, &ftrace_events, list) {
		if (tp_event->event.type == event_id &&
		    tp_event->class && tp_event->class->reg &&
		    try_module_get(tp_event->mod)) {
			ret = perf_trace_event_init(tp_event, p_event);
			if (ret)
				module_put(tp_event->mod);
			break;
		}
	}
	mutex_unlock(&event_mutex);

	return ret;
}

void perf_trace_destroy(struct perf_event *p_event)
{
	mutex_lock(&event_mutex);
	perf_trace_event_close(p_event);
	perf_trace_event_unreg(p_event);
	mutex_unlock(&event_mutex);
}
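
/*
 * pmu::add callback: queue the event on this CPU's hlist so the trace
 * event's perf callback will see it, honoring PERF_EF_START.
 */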
int perf_trace_add(struct perf_event *p_event, int flags)
{
	struct ftrace_event_call *tp_event = p_event->tp_event;
	struct hlist_head __percpu *pcpu_list;
	struct hlist_head *list;

	pcpu_list = tp_event->perf_events;
	if (WARN_ON_ONCE(!pcpu_list))
		return -EINVAL;

	if (!(flags & PERF_EF_START))
		p_event->hw.state = PERF_HES_STOPPED;

	list = this_cpu_ptr(pcpu_list);
	hlist_add_head_rcu(&p_event->hlist_entry, list);

	return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
}
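
/*
 * pmu::del callback: unlink the event from the per-CPU hlist and give the
 * trace event class a chance to do per-event cleanup.
 */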
void perf_trace_del(struct perf_event *p_event, int flags)
{
	struct ftrace_event_call *tp_event = p_event->tp_event;

	hlist_del_rcu(&p_event->hlist_entry);
	tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
}
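
/*
 * Set up a per-context scratch buffer for building a sample: grab a
 * recursion context, optionally hand back a pt_regs for the caller to
 * fill in, and initialize the common trace_entry header at the start of
 * the buffer.  Returns NULL if the requested size is too large or
 * recursion is detected.
 */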
void *perf_trace_buf_prepare(int size, unsigned short type,
			     struct pt_regs **regs, int *rctxp)
{
	struct trace_entry *entry;
	unsigned long flags;
	char *raw_data;
	int pc;

	BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));

	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		      "perf buffer not large enough"))
		return NULL;

	pc = preempt_count();

	*rctxp = perf_swevent_get_recursion_context();
	if (*rctxp < 0)
		return NULL;

	if (regs)
		*regs = this_cpu_ptr(&__perf_regs[*rctxp]);
	raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);

	/* zero the dead bytes from alignment to not leak stack to user */
	memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));

	entry = (struct trace_entry *)raw_data;
	local_save_flags(flags);
	tracing_generic_entry_update(entry, flags, pc);
	entry->type = type;

	return raw_data;
}
EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
NOKPROBE_SYMBOL(perf_trace_buf_prepare);

#ifdef CONFIG_FUNCTION_TRACER
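/*
 * ftrace callback for perf function-trace events: build a struct
 * ftrace_entry in the perf scratch buffer and submit it to the perf
 * events hanging off this CPU's list for the function event.
 */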
static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
			  struct ftrace_ops *ops, struct pt_regs *pt_regs)
{
	struct ftrace_entry *entry;
	struct hlist_head *head;
	struct pt_regs regs;
	int rctx;

	head = this_cpu_ptr(event_function.perf_events);
	if (hlist_empty(head))
		return;

#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
		    sizeof(u64)) - sizeof(u32))

	BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);

	perf_fetch_caller_regs(&regs);

	entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
	if (!entry)
		return;

	entry->ip = ip;
	entry->parent_ip = parent_ip;
	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
			      1, &regs, head, NULL);

#undef ENTRY_SIZE
}
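
/*
 * Each function-trace perf event owns a private ftrace_ops.
 * FTRACE_OPS_FL_CONTROL marks it as controllable so it can be switched
 * on and off per CPU from the pmu add/del path (see
 * perf_ftrace_function_enable/disable below).
 */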
static int perf_ftrace_function_register(struct perf_event *event)
{
	struct ftrace_ops *ops = &event->ftrace_ops;

	ops->flags |= FTRACE_OPS_FL_CONTROL;
	ops->func = perf_ftrace_function_call;
	return register_ftrace_function(ops);
}

static int perf_ftrace_function_unregister(struct perf_event *event)
{
	struct ftrace_ops *ops = &event->ftrace_ops;
	int ret = unregister_ftrace_function(ops);

	ftrace_free_filter(ops);
	return ret;
}

static void perf_ftrace_function_enable(struct perf_event *event)
{
	ftrace_function_local_enable(&event->ftrace_ops);
}

static void perf_ftrace_function_disable(struct perf_event *event)
{
	ftrace_function_local_disable(&event->ftrace_ops);
}
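
/*
 * reg() callback used by the ftrace "function" trace event: translate the
 * generic trace_reg operations into registering, unregistering, enabling
 * and disabling the event's private ftrace_ops.
 */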
int perf_ftrace_event_register(struct ftrace_event_call *call,
			       enum trace_reg type, void *data)
{
	switch (type) {
	case TRACE_REG_REGISTER:
	case TRACE_REG_UNREGISTER:
		break;
	case TRACE_REG_PERF_REGISTER:
	case TRACE_REG_PERF_UNREGISTER:
		return 0;
	case TRACE_REG_PERF_OPEN:
		return perf_ftrace_function_register(data);
	case TRACE_REG_PERF_CLOSE:
		return perf_ftrace_function_unregister(data);
	case TRACE_REG_PERF_ADD:
		perf_ftrace_function_enable(data);
		return 0;
	case TRACE_REG_PERF_DEL:
		perf_ftrace_function_disable(data);
		return 0;
	}

	return -EINVAL;
}
#endif /* CONFIG_FUNCTION_TRACER */