2011-05-19 19:55:04 +02:00
# ifndef _KERNEL_EVENTS_INTERNAL_H
# define _KERNEL_EVENTS_INTERNAL_H
2011-10-16 17:15:04 +02:00
# include <linux/hardirq.h>
2012-08-07 15:20:38 +02:00
# include <linux/uaccess.h>
2011-10-16 17:15:04 +02:00
/* Buffer handling */
2011-05-19 19:55:04 +02:00
# define RING_BUFFER_WRITABLE 0x01
struct ring_buffer {
atomic_t refcount ;
struct rcu_head rcu_head ;
# ifdef CONFIG_PERF_USE_VMALLOC
struct work_struct work ;
int page_order ; /* allocation order */
# endif
int nr_pages ; /* nr of data pages */
2013-03-18 14:33:28 +01:00
int overwrite ; /* can overwrite itself */
2011-05-19 19:55:04 +02:00
atomic_t poll ; /* POLL_ for wakeups */
local_t head ; /* write position */
local_t nest ; /* nested writers */
local_t events ; /* event limit */
local_t wakeup ; /* wakeup stamp */
local_t lost ; /* nr records lost */
long watermark ; /* wakeup watermark */
perf: Fix loss of notification with multi-event
When you do:
$ perf record -e cycles,cycles,cycles noploop 10
You expect about 10,000 samples for each event, i.e., 10s at
1000samples/sec. However, this is not what's happening. You
get much fewer samples, maybe 3700 samples/event:
$ perf report -D | tail -15
Aggregated stats:
TOTAL events: 10998
MMAP events: 66
COMM events: 2
SAMPLE events: 10930
cycles stats:
TOTAL events: 3644
SAMPLE events: 3644
cycles stats:
TOTAL events: 3642
SAMPLE events: 3642
cycles stats:
TOTAL events: 3644
SAMPLE events: 3644
On a Intel Nehalem or even AMD64, there are 4 counters capable
of measuring cycles, so there is plenty of space to measure those
events without multiplexing (even with the NMI watchdog active).
And even with multiplexing, we'd expect roughly the same number
of samples per event.
The root of the problem was that when the event that caused the buffer
to become full was not the first event passed on the cmdline, the user
notification would get lost. The notification was sent to the file
descriptor of the overflowed event but the perf tool was not polling
on it. The perf tool aggregates all samples into a single buffer,
i.e., the buffer of the first event. Consequently, it assumes
notifications for any event will come via that descriptor.
The seemingly straight forward solution of moving the waitq into the
ringbuffer object doesn't work because of life-time issues. One could
perf_event_set_output() on a fd that you're also blocking on and cause
the old rb object to be freed while its waitq would still be
referenced by the blocked thread -> FAIL.
Therefore link all events to the ringbuffer and broadcast the wakeup
from the ringbuffer object to all possible events that could be waited
upon. This is rather ugly, and we're open to better solutions but it
works for now.
Reported-by: Stephane Eranian <eranian@google.com>
Finished-by: Stephane Eranian <eranian@google.com>
Reviewed-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20111126014731.GA7030@quad
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-11-26 02:47:31 +01:00
/* poll crap */
spinlock_t event_lock ;
struct list_head event_list ;
2011-05-19 19:55:04 +02:00
2013-06-04 10:44:21 +02:00
atomic_t mmap_count ;
unsigned long mmap_locked ;
2013-05-28 10:55:48 +02:00
struct user_struct * mmap_user ;
2011-05-19 19:55:04 +02:00
struct perf_event_mmap_page * user_page ;
void * data_pages [ 0 ] ;
} ;
/*
 * Ring-buffer allocation and release. Only declared here; the definitions
 * live outside this header.
 * NOTE(review): rb_free() presumably releases a buffer obtained from
 * rb_alloc() - confirm against the implementation.
 */
extern void rb_free ( struct ring_buffer * rb ) ;
extern struct ring_buffer *
rb_alloc ( int nr_pages , long watermark , int cpu , int flags ) ;
/* Wakeup entry point for an event; defined outside this header. */
extern void perf_event_wakeup ( struct perf_event * event ) ;
/*
 * Sample-id helpers operating on an event header / output handle.
 * NOTE(review): semantics are not visible from this header; see the
 * definitions before relying on them.
 */
extern void
perf_event_header__init_id ( struct perf_event_header * header ,
struct perf_sample_data * data ,
struct perf_event * event ) ;
extern void
perf_event__output_id_sample ( struct perf_event * event ,
struct perf_output_handle * handle ,
struct perf_sample_data * sample ) ;
/*
 * Look up the page behind offset @pgoff of buffer @rb.
 * NOTE(review): the unit of @pgoff (presumably pages) should be confirmed
 * against the definition.
 */
extern struct page *
perf_mmap_to_page ( struct ring_buffer * rb , unsigned long pgoff ) ;
/*
 * Allocation order of each data-page chunk of @rb.
 *
 * With CONFIG_PERF_USE_VMALLOC the buffer records its own order; that
 * configuration backs perf_mmap() with vmalloc memory, which is required
 * for architectures that have d-cache aliasing issues. Without it the
 * order is always 0.
 */
static inline int page_order(struct ring_buffer *rb)
{
#ifdef CONFIG_PERF_USE_VMALLOC
	return rb->page_order;
#else
	return 0;
#endif
}
2011-10-16 17:15:04 +02:00
static inline unsigned long perf_data_size ( struct ring_buffer * rb )
2011-05-19 19:55:04 +02:00
{
return rb - > nr_pages < < ( PAGE_SHIFT + page_order ( rb ) ) ;
}
2012-08-07 15:20:38 +02:00
/*
 * DEFINE_OUTPUT_COPY(func_name, memcpy_func)
 *
 * Generates
 *
 *	unsigned long func_name(struct perf_output_handle *handle,
 *				const void *buf, unsigned long len);
 *
 * which moves up to @len bytes from @buf to the current write position of
 * @handle using memcpy_func(dst, src, n). memcpy_func() must return the
 * number of bytes it did NOT copy (0 on complete success).
 *
 * The copy proceeds chunk by chunk, never past the space left in the
 * current data page (handle->size). When that space is used up the handle
 * moves on to the next data page, wrapping with a nr_pages - 1 mask (so
 * nr_pages is expected to be a power of two). The loop ends once everything
 * has been copied or memcpy_func() came up short.
 *
 * Returns the number of bytes of @buf that were not copied.
 */
#define DEFINE_OUTPUT_COPY(func_name, memcpy_func)			\
static inline unsigned long						\
func_name(struct perf_output_handle *handle,				\
	  const void *buf, unsigned long len)				\
{									\
	unsigned long chunk, copied;					\
									\
	for (;;) {							\
		chunk  = min(handle->size, len);			\
		copied = chunk - memcpy_func(handle->addr, buf, chunk);	\
									\
		len -= copied;						\
		handle->addr += copied;					\
		buf += copied;						\
		handle->size -= copied;					\
									\
		if (handle->size == 0) {				\
			struct ring_buffer *rb = handle->rb;		\
									\
			/* current page is full: step to the next */	\
			handle->page++;					\
			handle->page &= rb->nr_pages - 1;		\
			handle->addr = rb->data_pages[handle->page];	\
			handle->size = PAGE_SIZE << page_order(rb);	\
		}							\
									\
		/* done, or memcpy_func() stopped short */		\
		if (len == 0 || copied != chunk)			\
			break;						\
	}								\
									\
	return len;							\
}
2013-10-30 21:16:22 +01:00
/*
 * Copy callback for DEFINE_OUTPUT_COPY(): plain memcpy() of @n bytes from
 * @src to @dst.
 *
 * Returns the number of bytes that were NOT copied, which is always 0
 * because memcpy() cannot fail part-way.
 */
static inline unsigned long
memcpy_common(void *dst, const void *src, unsigned long n)
{
	memcpy(dst, src, n);
	return 0;
}
2012-08-07 15:20:38 +02:00
/* Instantiate __output_copy(): the ring-buffer copy loop driven by memcpy_common(). */
DEFINE_OUTPUT_COPY ( __output_copy , memcpy_common )
2013-10-30 21:16:22 +01:00
/*
 * Copy callback for DEFINE_OUTPUT_COPY() that moves no data at all.
 *
 * It reports 0 bytes left uncopied, so the generated function advances the
 * output position by @n without touching @dst or reading @src.
 */
static inline unsigned long
memcpy_skip(void *dst, const void *src, unsigned long n)
{
	(void)dst;
	(void)src;
	(void)n;

	return 0UL;
}
2012-08-07 15:20:39 +02:00
2013-10-30 21:16:22 +01:00
/* Instantiate __output_skip(): same loop, but memcpy_skip() writes nothing, so it only advances. */
DEFINE_OUTPUT_COPY ( __output_skip , memcpy_skip )
2012-08-07 15:20:39 +02:00
2012-08-07 15:20:38 +02:00
# ifndef arch_perf_out_copy_user
2013-10-30 21:16:22 +01:00
# define arch_perf_out_copy_user arch_perf_out_copy_user
static inline unsigned long
arch_perf_out_copy_user ( void * dst , const void * src , unsigned long n )
{
unsigned long ret ;
pagefault_disable ( ) ;
ret = __copy_from_user_inatomic ( dst , src , n ) ;
pagefault_enable ( ) ;
return ret ;
}
2012-08-07 15:20:38 +02:00
# endif
DEFINE_OUTPUT_COPY ( __output_copy_user , arch_perf_out_copy_user )
2011-10-16 17:15:04 +02:00
/* Callchain handling */
2012-07-11 18:14:58 +04:00
/*
 * Declared here, defined outside this header.
 * NOTE(review): presumably builds the callchain for @event from @regs;
 * ownership of the returned entry is not visible here - confirm.
 */
extern struct perf_callchain_entry *
perf_callchain ( struct perf_event * event , struct pt_regs * regs ) ;
2011-10-16 17:15:04 +02:00
/*
 * Get/put pair for the callchain buffers; defined outside this header.
 * NOTE(review): the int result of get_callchain_buffers() is presumably
 * 0 or a negative errno - confirm against the definition.
 */
extern int get_callchain_buffers ( void ) ;
extern void put_callchain_buffers ( void ) ;
/*
 * Claim the recursion slot that matches the current execution context.
 *
 * The slot index is 3 when in_nmi(), 2 when in_irq(), 1 when in_softirq()
 * and 0 otherwise, so @recursion must provide at least four counters.
 *
 * Returns the slot index after incrementing its counter, or -1 if that
 * counter was already non-zero. A successful call is undone with
 * put_recursion_context().
 */
static inline int get_recursion_context(int *recursion)
{
	int *slot;
	int rctx;

	/* Tested in this order; the first context that matches wins. */
	rctx = in_nmi() ? 3 : in_irq() ? 2 : in_softirq() ? 1 : 0;

	slot = &recursion[rctx];
	if (*slot)
		return -1;

	(*slot)++;
	barrier();

	return rctx;
}
/*
 * Release the slot claimed by get_recursion_context(): issue barrier(),
 * then decrement recursion[@rctx]. @rctx must be the index that
 * get_recursion_context() returned.
 */
static inline void put_recursion_context(int *recursion, int rctx)
{
	barrier();
	recursion[rctx] -= 1;
}
2012-08-07 15:20:40 +02:00
/*
 * User stack dump support.
 *
 * perf_user_stack_pointer(regs) maps to user_stack_pointer(regs) when
 * CONFIG_HAVE_PERF_USER_STACK_DUMP is set and to the constant 0 otherwise.
 */
#ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP
# define perf_user_stack_pointer(regs)	user_stack_pointer(regs)
#else
# define perf_user_stack_pointer(regs)	0
#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */

/* True exactly when CONFIG_HAVE_PERF_USER_STACK_DUMP is set. */
static inline bool arch_perf_have_user_stack_dump(void)
{
#ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP
	return true;
#else
	return false;
#endif
}
2011-05-19 19:55:04 +02:00
# endif /* _KERNEL_EVENTS_INTERNAL_H */