e369517ce5
The current collapse stage has a scalability problem which can be reproduced easily with a parallel kernel build. This is because it needs to traverse all children of a callchain node linearly during the collapse/merge stage. Converting the children list to an rbtree reduced the overhead significantly. On my 400MB perf.data file, which was recorded with a make -j32 kernel build: $ time perf --no-pager report --stdio > /dev/null before: real 6m22.073s user 6m18.683s sys 0m0.706s after: real 0m20.780s user 0m19.962s sys 0m0.689s During the perf report the overhead on append_chain_children went down from 96.69% to 18.16%: - 18.16% perf perf [.] append_chain_children - append_chain_children - 77.48% append_chain_children + 69.79% merge_chain_branch - 22.96% append_chain_children + 67.44% merge_chain_branch + 30.15% append_chain_children + 2.41% callchain_append + 7.25% callchain_append + 12.26% callchain_append + 10.22% merge_chain_branch + 11.58% perf perf [.] dso__find_symbol + 8.02% perf perf [.] sort__comm_cmp + 5.48% perf libc-2.17.so [.] malloc_consolidate Reported-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Paul Mackerras <paulus@samba.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1381468543-25334-2-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
152 lines
3.4 KiB
C
152 lines
3.4 KiB
C
#ifndef __PERF_CALLCHAIN_H
|
|
#define __PERF_CALLCHAIN_H
|
|
|
|
#include "../perf.h"
|
|
#include <linux/list.h>
|
|
#include <linux/rbtree.h>
|
|
#include "event.h"
|
|
#include "symbol.h"
|
|
|
|
/*
 * Callchain mode selector; stored in callchain_param.mode.
 * Enumerator values are implicit: CHAIN_NONE is 0 and the rest follow
 * consecutively.
 */
enum chain_mode {
	CHAIN_NONE,
	CHAIN_FLAT,
	CHAIN_GRAPH_ABS,
	CHAIN_GRAPH_REL
};
|
|
|
|
/*
 * Callchain ordering selector; stored in callchain_param.order.
 * ORDER_CALLER is 0, ORDER_CALLEE is 1.
 */
enum chain_order {
	ORDER_CALLER,
	ORDER_CALLEE
};
|
|
|
|
struct callchain_node {
|
|
struct callchain_node *parent;
|
|
struct list_head val;
|
|
struct rb_node rb_node_in; /* to insert nodes in an rbtree */
|
|
struct rb_node rb_node; /* to sort nodes in an output tree */
|
|
struct rb_root rb_root_in; /* input tree of children */
|
|
struct rb_root rb_root; /* sorted output tree of children */
|
|
unsigned int val_nr;
|
|
u64 hit;
|
|
u64 children_hit;
|
|
};
|
|
|
|
struct callchain_root {
|
|
u64 max_depth;
|
|
struct callchain_node node;
|
|
};
|
|
|
|
struct callchain_param;
|
|
|
|
typedef void (*sort_chain_func_t)(struct rb_root *, struct callchain_root *,
|
|
u64, struct callchain_param *);
|
|
|
|
/*
 * Callchain key selector; stored in callchain_param.key.
 * CCKEY_FUNCTION is 0, CCKEY_ADDRESS is 1.
 */
enum chain_key {
	CCKEY_FUNCTION,
	CCKEY_ADDRESS
};
|
|
|
|
struct callchain_param {
|
|
enum chain_mode mode;
|
|
u32 print_limit;
|
|
double min_percent;
|
|
sort_chain_func_t sort;
|
|
enum chain_order order;
|
|
enum chain_key key;
|
|
};
|
|
|
|
struct callchain_list {
|
|
u64 ip;
|
|
struct map_symbol ms;
|
|
struct list_head list;
|
|
};
|
|
|
|
/*
|
|
* A callchain cursor is a single linked list that
|
|
* let one feed a callchain progressively.
|
|
* It keeps persistent allocated entries to minimize
|
|
* allocations.
|
|
*/
|
|
struct callchain_cursor_node {
|
|
u64 ip;
|
|
struct map *map;
|
|
struct symbol *sym;
|
|
struct callchain_cursor_node *next;
|
|
};
|
|
|
|
struct callchain_cursor {
|
|
u64 nr;
|
|
struct callchain_cursor_node *first;
|
|
struct callchain_cursor_node **last;
|
|
u64 pos;
|
|
struct callchain_cursor_node *curr;
|
|
};
|
|
|
|
/*
 * Shared cursor object, declared with __thread storage so each thread
 * sees its own instance. The definition lives outside this header.
 */
extern __thread struct callchain_cursor callchain_cursor;
|
|
|
|
static inline void callchain_init(struct callchain_root *root)
|
|
{
|
|
INIT_LIST_HEAD(&root->node.val);
|
|
|
|
root->node.parent = NULL;
|
|
root->node.hit = 0;
|
|
root->node.children_hit = 0;
|
|
root->node.rb_root_in = RB_ROOT;
|
|
root->max_depth = 0;
|
|
}
|
|
|
|
static inline u64 callchain_cumul_hits(struct callchain_node *node)
|
|
{
|
|
return node->hit + node->children_hit;
|
|
}
|
|
|
|
/*
 * Register the callchain parameter block @param.
 * NOTE(review): defined outside this header; presumably validates @param
 * and returns 0 on success / negative on error - confirm in callchain.c.
 */
int callchain_register_param(struct callchain_param *param);
|
|
int callchain_append(struct callchain_root *root,
|
|
struct callchain_cursor *cursor,
|
|
u64 period);
|
|
|
|
/*
 * Merge the callchain tree @src into @dst, using @cursor.
 * NOTE(review): defined outside this header; presumably @cursor serves as
 * scratch space and the return value is an error code - confirm in
 * callchain.c.
 */
int callchain_merge(struct callchain_cursor *cursor,
		    struct callchain_root *dst, struct callchain_root *src);
|
|
|
|
/*
|
|
* Initialize a cursor before adding entries inside, but keep
|
|
* the previously allocated entries as a cache.
|
|
*/
|
|
static inline void callchain_cursor_reset(struct callchain_cursor *cursor)
|
|
{
|
|
cursor->nr = 0;
|
|
cursor->last = &cursor->first;
|
|
}
|
|
|
|
/*
 * Append one entry (@ip with its @map and @sym) to @cursor.
 * NOTE(review): defined outside this header; presumably returns 0 on
 * success and a negative value on allocation failure - confirm in
 * callchain.c.
 */
int callchain_cursor_append(struct callchain_cursor *cursor, u64 ip,
			    struct map *map, struct symbol *sym);
|
|
|
|
/* Close a cursor writing session. Initialize for the reader */
|
|
static inline void callchain_cursor_commit(struct callchain_cursor *cursor)
|
|
{
|
|
cursor->curr = cursor->first;
|
|
cursor->pos = 0;
|
|
}
|
|
|
|
/* Cursor reading iteration helpers */
|
|
static inline struct callchain_cursor_node *
|
|
callchain_cursor_current(struct callchain_cursor *cursor)
|
|
{
|
|
if (cursor->pos == cursor->nr)
|
|
return NULL;
|
|
|
|
return cursor->curr;
|
|
}
|
|
|
|
static inline void callchain_cursor_advance(struct callchain_cursor *cursor)
|
|
{
|
|
cursor->curr = cursor->curr->next;
|
|
cursor->pos++;
|
|
}
|
|
|
|
/* Forward declaration: only a pointer to it is needed below. */
struct option;

/*
 * Callchain option parser for the record command.
 * NOTE(review): defined outside this header; the (opt, arg, unset)
 * signature looks like an option-parsing callback - confirm the return
 * convention at the definition.
 */
int record_parse_callchain_opt(const struct option *opt, const char *arg, int unset);

/* Help text for the callchain option; defined outside this header. */
extern const char record_callchain_help[];
|
|
#endif /* __PERF_CALLCHAIN_H */
|