diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c
index 7fc70ae21185..dc7d0a95bd36 100644
--- a/arch/arm/kernel/hw_breakpoint.c
+++ b/arch/arm/kernel/hw_breakpoint.c
@@ -648,7 +648,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
 		 * Per-cpu breakpoints are not supported by our stepping
 		 * mechanism.
 		 */
-		if (!bp->hw.bp_target)
+		if (!bp->hw.target)
 			return -EINVAL;
 
 		/*
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index 98bbe06e469c..e7d934d3afe0 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -527,7 +527,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
 	 * Disallow per-task kernel breakpoints since these would
 	 * complicate the stepping code.
 	 */
-	if (info->ctrl.privilege == AARCH64_BREAKPOINT_EL1 && bp->hw.bp_target)
+	if (info->ctrl.privilege == AARCH64_BREAKPOINT_EL1 && bp->hw.target)
 		return -EINVAL;
 
 	return 0;
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 7c4f6690533a..7fd60dcb2cb0 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -124,7 +124,7 @@ static unsigned long ebb_switch_in(bool ebb, struct cpu_hw_events *cpuhw)
 
 static inline void power_pmu_bhrb_enable(struct perf_event *event) {}
 static inline void power_pmu_bhrb_disable(struct perf_event *event) {}
-static void power_pmu_flush_branch_stack(void) {}
+static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) {}
 static inline void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) {}
 static void pmao_restore_workaround(bool ebb) { }
 #endif /* CONFIG_PPC32 */
@@ -350,6 +350,7 @@ static void power_pmu_bhrb_enable(struct perf_event *event)
 		cpuhw->bhrb_context = event->ctx;
 	}
 	cpuhw->bhrb_users++;
+	perf_sched_cb_inc(event->ctx->pmu);
 }
 
 static void power_pmu_bhrb_disable(struct perf_event *event)
@@ -361,6 +362,7 @@ static void power_pmu_bhrb_disable(struct perf_event *event)
 
 	cpuhw->bhrb_users--;
 	WARN_ON_ONCE(cpuhw->bhrb_users < 0);
+	perf_sched_cb_dec(event->ctx->pmu);
 
 	if (!cpuhw->disabled && !cpuhw->bhrb_users) {
 		/* BHRB cannot be turned off when other
@@ -375,9 +377,12 @@ static void power_pmu_bhrb_disable(struct perf_event *event)
 /* Called from ctxsw to prevent one process's branch entries to
  * mingle with the other process's entries during context switch.
  */
-static void power_pmu_flush_branch_stack(void)
+static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
 {
-	if (ppmu->bhrb_nr)
+	if (!ppmu->bhrb_nr)
+		return;
+
+	if (sched_in)
 		power_pmu_bhrb_reset();
 }
 /* Calculate the to address for a branch */
@@ -1901,7 +1906,7 @@ static struct pmu power_pmu = {
 	.cancel_txn	= power_pmu_cancel_txn,
 	.commit_txn	= power_pmu_commit_txn,
 	.event_idx	= power_pmu_event_idx,
-	.flush_branch_stack = power_pmu_flush_branch_stack,
+	.sched_task	= power_pmu_sched_task,
 };
 
 /*
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 854c04b3c9c2..7ee9b94d9921 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -12,7 +12,7 @@
 #include <asm/disabled-features.h>
 #endif
 
-#define NCAPINTS	11	/* N 32-bit words worth of info */
+#define NCAPINTS	13	/* N 32-bit words worth of info */
 #define NBUGINTS	1	/* N 32-bit bug flags */
 
 /*
@@ -195,6 +195,7 @@
 #define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */
 #define X86_FEATURE_HWP_EPP	( 7*32+13) /* Intel HWP_EPP */
 #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */
+#define X86_FEATURE_INTEL_PT	( 7*32+15) /* Intel Processor Trace */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  ( 8*32+ 0) /* Intel TPR Shadow */
@@ -226,6 +227,7 @@
 #define X86_FEATURE_ERMS	( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */
 #define X86_FEATURE_INVPCID	( 9*32+10) /* Invalidate Processor Context ID */
 #define X86_FEATURE_RTM		( 9*32+11) /* Restricted Transactional Memory */
+#define X86_FEATURE_CQM		( 9*32+12) /* Cache QoS Monitoring */
 #define X86_FEATURE_MPX		( 9*32+14) /* Memory Protection Extension */
 #define X86_FEATURE_AVX512F	( 9*32+16) /* AVX-512 Foundation */
 #define X86_FEATURE_RDSEED	( 9*32+18) /* The RDSEED instruction */
@@ -244,6 +246,12 @@
 #define X86_FEATURE_XGETBV1	(10*32+ 2) /* XGETBV with ECX = 1 */
 #define X86_FEATURE_XSAVES	(10*32+ 3) /* XSAVES/XRSTORS */
 
+/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */
+#define X86_FEATURE_CQM_LLC	(11*32+ 1) /* LLC QoS if 1 */
+
+/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
+#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
+
 /*
  * BUG word(s)
  */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index d2203b5d9538..23ba6765b718 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -109,6 +109,9 @@ struct cpuinfo_x86 {
 	/* in KB - valid for CPUS which support this call: */
 	int			x86_cache_size;
 	int			x86_cache_alignment;	/* In bytes */
+	/* Cache QoS architectural values: */
+	int			x86_cache_max_rmid;	/* max index */
+	int			x86_cache_occ_scale;	/* scale to bytes */
 	int			x86_power;
 	unsigned long		loops_per_jiffy;
 	/* cpuid returned max cores value: */
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index 3ce079136c11..1a4eae695ca8 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -74,6 +74,24 @@
 #define MSR_IA32_PERF_CAPABILITIES	0x00000345
 #define MSR_PEBS_LD_LAT_THRESHOLD	0x000003f6
 
+#define MSR_IA32_RTIT_CTL		0x00000570
+#define RTIT_CTL_TRACEEN		BIT(0)
+#define RTIT_CTL_OS			BIT(2)
+#define RTIT_CTL_USR			BIT(3)
+#define RTIT_CTL_CR3EN			BIT(7)
+#define RTIT_CTL_TOPA			BIT(8)
+#define RTIT_CTL_TSC_EN			BIT(10)
+#define RTIT_CTL_DISRETC		BIT(11)
+#define RTIT_CTL_BRANCH_EN		BIT(13)
+#define MSR_IA32_RTIT_STATUS		0x00000571
+#define RTIT_STATUS_CONTEXTEN		BIT(1)
+#define RTIT_STATUS_TRIGGEREN		BIT(2)
+#define RTIT_STATUS_ERROR		BIT(4)
+#define RTIT_STATUS_STOPPED		BIT(5)
+#define MSR_IA32_RTIT_CR3_MATCH		0x00000572
+#define MSR_IA32_RTIT_OUTPUT_BASE	0x00000560
+#define MSR_IA32_RTIT_OUTPUT_MASK	0x00000561
+
 #define MSR_MTRRfix64K_00000		0x00000250
 #define MSR_MTRRfix16K_80000		0x00000258
 #define MSR_MTRRfix16K_A0000		0x00000259
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 80091ae54c2b..9bff68798836 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -39,7 +39,8 @@ obj-$(CONFIG_CPU_SUP_AMD)		+= perf_event_amd_iommu.o
 endif
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_p6.o perf_event_knc.o perf_event_p4.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_rapl.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_rapl.o perf_event_intel_cqm.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_pt.o perf_event_intel_bts.o
 
 obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE)	+= perf_event_intel_uncore.o \
 					   perf_event_intel_uncore_snb.o \
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 3f70538012e2..a62cf04dac8a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -646,6 +646,30 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
 		c->x86_capability[10] = eax;
 	}
 
+	/* Additional Intel-defined flags: level 0x0000000F */
+	if (c->cpuid_level >= 0x0000000F) {
+		u32 eax, ebx, ecx, edx;
+
+		/* QoS sub-leaf, EAX=0Fh, ECX=0 */
+		cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx);
+		c->x86_capability[11] = edx;
+		if (cpu_has(c, X86_FEATURE_CQM_LLC)) {
+			/* will be overridden if occupancy monitoring exists */
+			c->x86_cache_max_rmid = ebx;
+
+			/* QoS sub-leaf, EAX=0Fh, ECX=1 */
+			cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
+			c->x86_capability[12] = edx;
+			if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) {
+				c->x86_cache_max_rmid = ecx;
+				c->x86_cache_occ_scale = ebx;
+			}
+		} else {
+			c->x86_cache_max_rmid = -1;
+			c->x86_cache_occ_scale = -1;
+		}
+	}
+
 	/* AMD-defined flags: level 0x80000001 */
 	xlvl = cpuid_eax(0x80000000);
 	c->extended_cpuid_level = xlvl;
@@ -834,6 +858,20 @@ static void generic_identify(struct cpuinfo_x86 *c)
 	detect_nopl(c);
 }
 
+static void x86_init_cache_qos(struct cpuinfo_x86 *c)
+{
+	/*
+	 * The heavy lifting of max_rmid and cache_occ_scale are handled
+	 * in get_cpu_cap().  Here we just set the max_rmid for the boot_cpu
+	 * in case CQM bits really aren't there in this CPU.
+	 */
+	if (c != &boot_cpu_data) {
+		boot_cpu_data.x86_cache_max_rmid =
+			min(boot_cpu_data.x86_cache_max_rmid,
+			    c->x86_cache_max_rmid);
+	}
+}
+
 /*
  * This does the hard work of actually picking apart the CPU stuff...
  */
@@ -923,6 +961,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 
 	init_hypervisor(c);
 	x86_init_rdrand(c);
+	x86_init_cache_qos(c);
 
 	/*
 	 * Clear/Set all flags overriden by options, need do it
diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h
new file mode 100644
index 000000000000..1c338b0eba05
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_pt.h
@@ -0,0 +1,131 @@
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#ifndef __INTEL_PT_H__
+#define __INTEL_PT_H__
+
+/*
+ * Single-entry ToPA: when this close to region boundary, switch
+ * buffers to avoid losing data.
+ */
+#define TOPA_PMI_MARGIN 512
+
+/*
+ * Table of Physical Addresses bits
+ */
+enum topa_sz {
+	TOPA_4K	= 0,
+	TOPA_8K,
+	TOPA_16K,
+	TOPA_32K,
+	TOPA_64K,
+	TOPA_128K,
+	TOPA_256K,
+	TOPA_512K,
+	TOPA_1MB,
+	TOPA_2MB,
+	TOPA_4MB,
+	TOPA_8MB,
+	TOPA_16MB,
+	TOPA_32MB,
+	TOPA_64MB,
+	TOPA_128MB,
+	TOPA_SZ_END,
+};
+
+static inline unsigned int sizes(enum topa_sz tsz)
+{
+	return 1 << (tsz + 12);
+};
+
+struct topa_entry {
+	u64	end	: 1;
+	u64	rsvd0	: 1;
+	u64	intr	: 1;
+	u64	rsvd1	: 1;
+	u64	stop	: 1;
+	u64	rsvd2	: 1;
+	u64	size	: 4;
+	u64	rsvd3	: 2;
+	u64	base	: 36;
+	u64	rsvd4	: 16;
+};
+
+#define TOPA_SHIFT 12
+#define PT_CPUID_LEAVES 2
+
+enum pt_capabilities {
+	PT_CAP_max_subleaf = 0,
+	PT_CAP_cr3_filtering,
+	PT_CAP_topa_output,
+	PT_CAP_topa_multiple_entries,
+	PT_CAP_payloads_lip,
+};
+
+struct pt_pmu {
+	struct pmu		pmu;
+	u32			caps[4 * PT_CPUID_LEAVES];
+};
+
+/**
+ * struct pt_buffer - buffer configuration; one buffer per task_struct or
+ *		cpu, depending on perf event configuration
+ * @cpu:	cpu for per-cpu allocation
+ * @tables:	list of ToPA tables in this buffer
+ * @first:	shorthand for first topa table
+ * @last:	shorthand for last topa table
+ * @cur:	current topa table
+ * @nr_pages:	buffer size in pages
+ * @cur_idx:	current output region's index within @cur table
+ * @output_off:	offset within the current output region
+ * @data_size:	running total of the amount of data in this buffer
+ * @lost:	if data was lost/truncated
+ * @head:	logical write offset inside the buffer
+ * @snapshot:	if this is for a snapshot/overwrite counter
+ * @stop_pos:	STOP topa entry in the buffer
+ * @intr_pos:	INT topa entry in the buffer
+ * @data_pages:	array of pages from perf
+ * @topa_index:	table of topa entries indexed by page offset
+ */
+struct pt_buffer {
+	int			cpu;
+	struct list_head	tables;
+	struct topa		*first, *last, *cur;
+	unsigned int		cur_idx;
+	size_t			output_off;
+	unsigned long		nr_pages;
+	local_t			data_size;
+	local_t			lost;
+	local64_t		head;
+	bool			snapshot;
+	unsigned long		stop_pos, intr_pos;
+	void			**data_pages;
+	struct topa_entry	*topa_index[0];
+};
+
+/**
+ * struct pt - per-cpu pt context
+ * @handle:	perf output handle
+ * @handle_nmi:	do handle PT PMI on this cpu, there's an active event
+ */
+struct pt {
+	struct perf_output_handle handle;
+	int			handle_nmi;
+};
+
+#endif /* __INTEL_PT_H__ */
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e2888a3ad1e3..87848ebe2bb7 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -263,6 +263,14 @@ static void hw_perf_event_destroy(struct perf_event *event)
 	}
 }
 
+void hw_perf_lbr_event_destroy(struct perf_event *event)
+{
+	hw_perf_event_destroy(event);
+
+	/* undo the lbr/bts event accounting */
+	x86_del_exclusive(x86_lbr_exclusive_lbr);
+}
+
 static inline int x86_pmu_initialized(void)
 {
 	return x86_pmu.handle_irq != NULL;
@@ -302,6 +310,35 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
 	return x86_pmu_extra_regs(val, event);
 }
 
+/*
+ * Check if we can create event of a certain type (that no conflicting events
+ * are present).
+ */
+int x86_add_exclusive(unsigned int what)
+{
+	int ret = -EBUSY, i;
+
+	if (atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what]))
+		return 0;
+
+	mutex_lock(&pmc_reserve_mutex);
+	for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++)
+		if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
+			goto out;
+
+	atomic_inc(&x86_pmu.lbr_exclusive[what]);
+	ret = 0;
+
+out:
+	mutex_unlock(&pmc_reserve_mutex);
+	return ret;
+}
+
+void x86_del_exclusive(unsigned int what)
+{
+	atomic_dec(&x86_pmu.lbr_exclusive[what]);
+}
+
 int x86_setup_perfctr(struct perf_event *event)
 {
 	struct perf_event_attr *attr = &event->attr;
@@ -346,6 +383,12 @@ int x86_setup_perfctr(struct perf_event *event)
 		/* BTS is currently only allowed for user-mode. */
 		if (!attr->exclude_kernel)
 			return -EOPNOTSUPP;
+
+		/* disallow bts if conflicting events are present */
+		if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+			return -EBUSY;
+
+		event->destroy = hw_perf_lbr_event_destroy;
 	}
 
 	hwc->config |= config;
@@ -399,39 +442,41 @@ int x86_pmu_hw_config(struct perf_event *event)
 
 		if (event->attr.precise_ip > precise)
 			return -EOPNOTSUPP;
-		/*
-		 * check that PEBS LBR correction does not conflict with
-		 * whatever the user is asking with attr->branch_sample_type
-		 */
-		if (event->attr.precise_ip > 1 &&
-		    x86_pmu.intel_cap.pebs_format < 2) {
-			u64 *br_type = &event->attr.branch_sample_type;
+	}
+	/*
+	 * check that PEBS LBR correction does not conflict with
+	 * whatever the user is asking with attr->branch_sample_type
+	 */
+	if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
+		u64 *br_type = &event->attr.branch_sample_type;
 
-			if (has_branch_stack(event)) {
-				if (!precise_br_compat(event))
-					return -EOPNOTSUPP;
+		if (has_branch_stack(event)) {
+			if (!precise_br_compat(event))
+				return -EOPNOTSUPP;
 
-				/* branch_sample_type is compatible */
+			/* branch_sample_type is compatible */
 
-			} else {
-				/*
-				 * user did not specify  branch_sample_type
-				 *
-				 * For PEBS fixups, we capture all
-				 * the branches at the priv level of the
-				 * event.
-				 */
-				*br_type = PERF_SAMPLE_BRANCH_ANY;
+		} else {
+			/*
+			 * user did not specify  branch_sample_type
+			 *
+			 * For PEBS fixups, we capture all
+			 * the branches at the priv level of the
+			 * event.
+			 */
+			*br_type = PERF_SAMPLE_BRANCH_ANY;
 
-				if (!event->attr.exclude_user)
-					*br_type |= PERF_SAMPLE_BRANCH_USER;
+			if (!event->attr.exclude_user)
+				*br_type |= PERF_SAMPLE_BRANCH_USER;
 
-				if (!event->attr.exclude_kernel)
-					*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
-			}
+			if (!event->attr.exclude_kernel)
+				*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
 		}
 	}
 
+	if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
+		event->attach_state |= PERF_ATTACH_TASK_DATA;
+
 	/*
 	 * Generate PMC IRQs:
 	 * (keep 'enabled' bit clear for now)
@@ -449,6 +494,12 @@ int x86_pmu_hw_config(struct perf_event *event)
 	if (event->attr.type == PERF_TYPE_RAW)
 		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
 
+	if (event->attr.sample_period && x86_pmu.limit_period) {
+		if (x86_pmu.limit_period(event, event->attr.sample_period) >
+				event->attr.sample_period)
+			return -EINVAL;
+	}
+
 	return x86_setup_perfctr(event);
 }
 
@@ -728,14 +779,17 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 	struct event_constraint *c;
 	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	struct perf_event *e;
-	int i, wmin, wmax, num = 0;
+	int i, wmin, wmax, unsched = 0;
 	struct hw_perf_event *hwc;
 
 	bitmap_zero(used_mask, X86_PMC_IDX_MAX);
 
+	if (x86_pmu.start_scheduling)
+		x86_pmu.start_scheduling(cpuc);
+
 	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
 		hwc = &cpuc->event_list[i]->hw;
-		c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
+		c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
 		hwc->constraint = c;
 
 		wmin = min(wmin, c->weight);
@@ -768,24 +822,30 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 
 	/* slow path */
 	if (i != n)
-		num = perf_assign_events(cpuc->event_list, n, wmin,
-					 wmax, assign);
+		unsched = perf_assign_events(cpuc->event_list, n, wmin,
+					     wmax, assign);
 
 	/*
-	 * Mark the event as committed, so we do not put_constraint()
-	 * in case new events are added and fail scheduling.
+	 * In case of success (unsched = 0), mark events as committed,
+	 * so we do not put_constraint() in case new events are added
+	 * and fail to be scheduled
+	 *
+	 * We invoke the lower level commit callback to lock the resource
+	 *
+	 * We do not need to do all of this in case we are called to
+	 * validate an event group (assign == NULL)
 	 */
-	if (!num && assign) {
+	if (!unsched && assign) {
 		for (i = 0; i < n; i++) {
 			e = cpuc->event_list[i];
 			e->hw.flags |= PERF_X86_EVENT_COMMITTED;
+			if (x86_pmu.commit_scheduling)
+				x86_pmu.commit_scheduling(cpuc, e, assign[i]);
 		}
 	}
-	/*
-	 * scheduling failed or is just a simulation,
-	 * free resources if necessary
-	 */
-	if (!assign || num) {
+
+	if (!assign || unsched) {
+
 		for (i = 0; i < n; i++) {
 			e = cpuc->event_list[i];
 			/*
@@ -795,11 +855,18 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 			if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
 				continue;
 
+			/*
+			 * release events that failed scheduling
+			 */
 			if (x86_pmu.put_event_constraints)
 				x86_pmu.put_event_constraints(cpuc, e);
 		}
 	}
-	return num ? -EINVAL : 0;
+
+	if (x86_pmu.stop_scheduling)
+		x86_pmu.stop_scheduling(cpuc);
+
+	return unsched ? -EINVAL : 0;
 }
 
 /*
@@ -986,6 +1053,9 @@ int x86_perf_event_set_period(struct perf_event *event)
 	if (left > x86_pmu.max_period)
 		left = x86_pmu.max_period;
 
+	if (x86_pmu.limit_period)
+		left = x86_pmu.limit_period(event, left);
+
 	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
 
 	/*
@@ -1033,7 +1103,6 @@ static int x86_pmu_add(struct perf_event *event, int flags)
 
 	hwc = &event->hw;
 
-	perf_pmu_disable(event->pmu);
 	n0 = cpuc->n_events;
 	ret = n = collect_events(cpuc, event, false);
 	if (ret < 0)
@@ -1071,7 +1140,6 @@ done_collect:
 
 	ret = 0;
 out:
-	perf_pmu_enable(event->pmu);
 	return ret;
 }
 
@@ -1103,7 +1171,7 @@ static void x86_pmu_start(struct perf_event *event, int flags)
 void perf_event_print_debug(void)
 {
 	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
-	u64 pebs;
+	u64 pebs, debugctl;
 	struct cpu_hw_events *cpuc;
 	unsigned long flags;
 	int cpu, idx;
@@ -1121,14 +1189,20 @@ void perf_event_print_debug(void)
 		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
 		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
-		rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
 
 		pr_info("\n");
 		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
 		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
 		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
 		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
-		pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
+		if (x86_pmu.pebs_constraints) {
+			rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
+			pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
+		}
+		if (x86_pmu.lbr_nr) {
+			rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+			pr_info("CPU#%d: debugctl:   %016llx\n", cpu, debugctl);
+		}
 	}
 	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
 
@@ -1321,11 +1395,12 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (long)hcpu;
 	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-	int ret = NOTIFY_OK;
+	int i, ret = NOTIFY_OK;
 
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_UP_PREPARE:
-		cpuc->kfree_on_online = NULL;
+		for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
+			cpuc->kfree_on_online[i] = NULL;
 		if (x86_pmu.cpu_prepare)
 			ret = x86_pmu.cpu_prepare(cpu);
 		break;
@@ -1336,7 +1411,10 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
 		break;
 
 	case CPU_ONLINE:
-		kfree(cpuc->kfree_on_online);
+		for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
+			kfree(cpuc->kfree_on_online[i]);
+			cpuc->kfree_on_online[i] = NULL;
+		}
 		break;
 
 	case CPU_DYING:
@@ -1712,7 +1790,7 @@ static int validate_event(struct perf_event *event)
 	if (IS_ERR(fake_cpuc))
 		return PTR_ERR(fake_cpuc);
 
-	c = x86_pmu.get_event_constraints(fake_cpuc, event);
+	c = x86_pmu.get_event_constraints(fake_cpuc, -1, event);
 
 	if (!c || !c->weight)
 		ret = -EINVAL;
@@ -1914,10 +1992,10 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
 	NULL,
 };
 
-static void x86_pmu_flush_branch_stack(void)
+static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
 {
-	if (x86_pmu.flush_branch_stack)
-		x86_pmu.flush_branch_stack();
+	if (x86_pmu.sched_task)
+		x86_pmu.sched_task(ctx, sched_in);
 }
 
 void perf_check_microcode(void)
@@ -1949,7 +2027,8 @@ static struct pmu pmu = {
 	.commit_txn		= x86_pmu_commit_txn,
 
 	.event_idx		= x86_pmu_event_idx,
-	.flush_branch_stack	= x86_pmu_flush_branch_stack,
+	.sched_task		= x86_pmu_sched_task,
+	.task_ctx_size          = sizeof(struct x86_perf_task_context),
 };
 
 void arch_perf_update_userpage(struct perf_event *event,
@@ -1968,13 +2047,23 @@ void arch_perf_update_userpage(struct perf_event *event,
 
 	data = cyc2ns_read_begin();
 
+	/*
+	 * Internal timekeeping for enabled/running/stopped times
+	 * is always in the local_clock domain.
+	 */
 	userpg->cap_user_time = 1;
 	userpg->time_mult = data->cyc2ns_mul;
 	userpg->time_shift = data->cyc2ns_shift;
 	userpg->time_offset = data->cyc2ns_offset - now;
 
-	userpg->cap_user_time_zero = 1;
-	userpg->time_zero = data->cyc2ns_offset;
+	/*
+	 * cap_user_time_zero doesn't make sense when we're using a different
+	 * time base for the records.
+	 */
+	if (event->clock == &local_clock) {
+		userpg->cap_user_time_zero = 1;
+		userpg->time_zero = data->cyc2ns_offset;
+	}
 
 	cyc2ns_read_end(data);
 }
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index df525d2be1e8..329f0356ad4a 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -71,6 +71,8 @@ struct event_constraint {
 #define PERF_X86_EVENT_COMMITTED	0x8 /* event passed commit_txn */
 #define PERF_X86_EVENT_PEBS_LD_HSW	0x10 /* haswell style datala, load */
 #define PERF_X86_EVENT_PEBS_NA_HSW	0x20 /* haswell style datala, unknown */
+#define PERF_X86_EVENT_EXCL		0x40 /* HT exclusivity on counter */
+#define PERF_X86_EVENT_DYNAMIC		0x80 /* dynamic alloc'd constraint */
 #define PERF_X86_EVENT_RDPMC_ALLOWED	0x40 /* grant rdpmc permission */
 
 
@@ -123,8 +125,37 @@ struct intel_shared_regs {
 	unsigned                core_id;	/* per-core: core id */
 };
 
+enum intel_excl_state_type {
+	INTEL_EXCL_UNUSED    = 0, /* counter is unused */
+	INTEL_EXCL_SHARED    = 1, /* counter can be used by both threads */
+	INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */
+};
+
+struct intel_excl_states {
+	enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
+	enum intel_excl_state_type state[X86_PMC_IDX_MAX];
+	int  num_alloc_cntrs;/* #counters allocated */
+	int  max_alloc_cntrs;/* max #counters allowed */
+	bool sched_started; /* true if scheduling has started */
+};
+
+struct intel_excl_cntrs {
+	raw_spinlock_t	lock;
+
+	struct intel_excl_states states[2];
+
+	int		refcnt;		/* per-core: #HT threads */
+	unsigned	core_id;	/* per-core: core id */
+};
+
 #define MAX_LBR_ENTRIES		16
 
+enum {
+	X86_PERF_KFREE_SHARED = 0,
+	X86_PERF_KFREE_EXCL   = 1,
+	X86_PERF_KFREE_MAX
+};
+
 struct cpu_hw_events {
 	/*
 	 * Generic x86 PMC bits
@@ -179,6 +210,12 @@ struct cpu_hw_events {
 	 * used on Intel NHM/WSM/SNB
 	 */
 	struct intel_shared_regs	*shared_regs;
+	/*
+	 * manage exclusive counter access between hyperthread
+	 */
+	struct event_constraint *constraint_list; /* in enable order */
+	struct intel_excl_cntrs		*excl_cntrs;
+	int excl_thread_id; /* 0 or 1 */
 
 	/*
 	 * AMD specific bits
@@ -187,7 +224,7 @@ struct cpu_hw_events {
 	/* Inverted mask of bits to clear in the perf_ctr ctrl registers */
 	u64				perf_ctr_virt_mask;
 
-	void				*kfree_on_online;
+	void				*kfree_on_online[X86_PERF_KFREE_MAX];
 };
 
 #define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\
@@ -202,6 +239,10 @@ struct cpu_hw_events {
 #define EVENT_CONSTRAINT(c, n, m)	\
 	__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)
 
+#define INTEL_EXCLEVT_CONSTRAINT(c, n)	\
+	__EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\
+			   0, PERF_X86_EVENT_EXCL)
+
 /*
  * The overlap flag marks event constraints with overlapping counter
  * masks. This is the case if the counter mask of such an event is not
@@ -259,6 +300,10 @@ struct cpu_hw_events {
 #define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n)	\
 	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
 
+#define INTEL_EXCLUEVT_CONSTRAINT(c, n)	\
+	__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+			   HWEIGHT(n), 0, PERF_X86_EVENT_EXCL)
+
 #define INTEL_PLD_CONSTRAINT(c, n)	\
 	__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
 			   HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
@@ -283,22 +328,40 @@ struct cpu_hw_events {
 
 /* Check flags and event code, and set the HSW load flag */
 #define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \
-	__EVENT_CONSTRAINT(code, n, 			\
+	__EVENT_CONSTRAINT(code, n,			\
 			  ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
 			  HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
 
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \
+	__EVENT_CONSTRAINT(code, n,			\
+			  ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+			  HWEIGHT(n), 0, \
+			  PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
 /* Check flags and event code/umask, and set the HSW store flag */
 #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \
 	__EVENT_CONSTRAINT(code, n, 			\
 			  INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
 			  HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
 
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \
+	__EVENT_CONSTRAINT(code, n,			\
+			  INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+			  HWEIGHT(n), 0, \
+			  PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL)
+
 /* Check flags and event code/umask, and set the HSW load flag */
 #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \
 	__EVENT_CONSTRAINT(code, n, 			\
 			  INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
 			  HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
 
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \
+	__EVENT_CONSTRAINT(code, n,			\
+			  INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+			  HWEIGHT(n), 0, \
+			  PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
 /* Check flags and event code/umask, and set the HSW N/A flag */
 #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
 	__EVENT_CONSTRAINT(code, n, 			\
@@ -408,6 +471,13 @@ union x86_pmu_config {
 
 #define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
 
+enum {
+	x86_lbr_exclusive_lbr,
+	x86_lbr_exclusive_bts,
+	x86_lbr_exclusive_pt,
+	x86_lbr_exclusive_max,
+};
+
 /*
  * struct x86_pmu - generic x86 pmu
  */
@@ -443,14 +513,25 @@ struct x86_pmu {
 	u64		max_period;
 	struct event_constraint *
 			(*get_event_constraints)(struct cpu_hw_events *cpuc,
+						 int idx,
 						 struct perf_event *event);
 
 	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,
 						 struct perf_event *event);
+
+	void		(*commit_scheduling)(struct cpu_hw_events *cpuc,
+					     struct perf_event *event,
+					     int cntr);
+
+	void		(*start_scheduling)(struct cpu_hw_events *cpuc);
+
+	void		(*stop_scheduling)(struct cpu_hw_events *cpuc);
+
 	struct event_constraint *event_constraints;
 	struct x86_pmu_quirk *quirks;
 	int		perfctr_second_write;
 	bool		late_ack;
+	unsigned	(*limit_period)(struct perf_event *event, unsigned l);
 
 	/*
 	 * sysfs attrs
@@ -472,7 +553,8 @@ struct x86_pmu {
 	void		(*cpu_dead)(int cpu);
 
 	void		(*check_microcode)(void);
-	void		(*flush_branch_stack)(void);
+	void		(*sched_task)(struct perf_event_context *ctx,
+				      bool sched_in);
 
 	/*
 	 * Intel Arch Perfmon v2+
@@ -503,11 +585,16 @@ struct x86_pmu {
 	const int	*lbr_sel_map;		   /* lbr_select mappings */
 	bool		lbr_double_abort;	   /* duplicated lbr aborts */
 
+	/*
+	 * Intel PT/LBR/BTS are exclusive
+	 */
+	atomic_t	lbr_exclusive[x86_lbr_exclusive_max];
+
 	/*
 	 * Extra registers for events
 	 */
 	struct extra_reg *extra_regs;
-	unsigned int er_flags;
+	unsigned int flags;
 
 	/*
 	 * Intel host/guest support (KVM)
@@ -515,6 +602,13 @@ struct x86_pmu {
 	struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
 };
 
+struct x86_perf_task_context {
+	u64 lbr_from[MAX_LBR_ENTRIES];
+	u64 lbr_to[MAX_LBR_ENTRIES];
+	int lbr_callstack_users;
+	int lbr_stack_state;
+};
+
 #define x86_add_quirk(func_)						\
 do {									\
 	static struct x86_pmu_quirk __quirk __initdata = {		\
@@ -524,8 +618,13 @@ do {									\
 	x86_pmu.quirks = &__quirk;					\
 } while (0)
 
-#define ERF_NO_HT_SHARING	1
-#define ERF_HAS_RSP_1		2
+/*
+ * x86_pmu flags
+ */
+#define PMU_FL_NO_HT_SHARING	0x1 /* no hyper-threading resource sharing */
+#define PMU_FL_HAS_RSP_1	0x2 /* has 2 equivalent offcore_rsp regs   */
+#define PMU_FL_EXCL_CNTRS	0x4 /* has exclusive counter requirements  */
+#define PMU_FL_EXCL_ENABLED	0x8 /* exclusive counter active */
 
 #define EVENT_VAR(_id)  event_attr_##_id
 #define EVENT_PTR(_id) &event_attr_##_id.attr.attr
@@ -546,6 +645,12 @@ static struct perf_pmu_events_attr event_attr_##v = {			\
 
 extern struct x86_pmu x86_pmu __read_mostly;
 
+static inline bool x86_pmu_has_lbr_callstack(void)
+{
+	return  x86_pmu.lbr_sel_map &&
+		x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
+}
+
 DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
 
 int x86_perf_event_set_period(struct perf_event *event);
@@ -588,6 +693,12 @@ static inline int x86_pmu_rdpmc_index(int index)
 	return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
 }
 
+int x86_add_exclusive(unsigned int what);
+
+void x86_del_exclusive(unsigned int what);
+
+void hw_perf_lbr_event_destroy(struct perf_event *event);
+
 int x86_setup_perfctr(struct perf_event *event);
 
 int x86_pmu_hw_config(struct perf_event *event);
@@ -674,10 +785,34 @@ static inline int amd_pmu_init(void)
 
 #ifdef CONFIG_CPU_SUP_INTEL
 
+static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
+{
+	/* user explicitly requested branch sampling */
+	if (has_branch_stack(event))
+		return true;
+
+	/* implicit branch sampling to correct PEBS skid */
+	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
+	    x86_pmu.intel_cap.pebs_format < 2)
+		return true;
+
+	return false;
+}
+
+static inline bool intel_pmu_has_bts(struct perf_event *event)
+{
+	if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
+	    !event->attr.freq && event->hw.sample_period == 1)
+		return true;
+
+	return false;
+}
+
 int intel_pmu_save_and_restart(struct perf_event *event);
 
 struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event);
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			  struct perf_event *event);
 
 struct intel_shared_regs *allocate_shared_regs(int cpu);
 
@@ -727,13 +862,15 @@ void intel_pmu_pebs_disable_all(void);
 
 void intel_ds_init(void);
 
+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
+
 void intel_pmu_lbr_reset(void);
 
 void intel_pmu_lbr_enable(struct perf_event *event);
 
 void intel_pmu_lbr_disable(struct perf_event *event);
 
-void intel_pmu_lbr_enable_all(void);
+void intel_pmu_lbr_enable_all(bool pmi);
 
 void intel_pmu_lbr_disable_all(void);
 
@@ -747,8 +884,18 @@ void intel_pmu_lbr_init_atom(void);
 
 void intel_pmu_lbr_init_snb(void);
 
+void intel_pmu_lbr_init_hsw(void);
+
 int intel_pmu_setup_lbr_filter(struct perf_event *event);
 
+void intel_pt_interrupt(void);
+
+int intel_bts_interrupt(void);
+
+void intel_bts_enable_local(void);
+
+void intel_bts_disable_local(void);
+
 int p4_pmu_init(void);
 
 int p6_pmu_init(void);
@@ -758,6 +905,10 @@ int knc_pmu_init(void);
 ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
 			  char *page);
 
+static inline int is_ht_workaround_enabled(void)
+{
+	return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED);
+}
 #else /* CONFIG_CPU_SUP_INTEL */
 
 static inline void reserve_ds_buffers(void)
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 28926311aac1..1cee5d2d7ece 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -382,6 +382,7 @@ static int amd_pmu_cpu_prepare(int cpu)
 static void amd_pmu_cpu_starting(int cpu)
 {
 	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+	void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
 	struct amd_nb *nb;
 	int i, nb_id;
 
@@ -399,7 +400,7 @@ static void amd_pmu_cpu_starting(int cpu)
 			continue;
 
 		if (nb->nb_id == nb_id) {
-			cpuc->kfree_on_online = cpuc->amd_nb;
+			*onln = cpuc->amd_nb;
 			cpuc->amd_nb = nb;
 			break;
 		}
@@ -429,7 +430,8 @@ static void amd_pmu_cpu_dead(int cpu)
 }
 
 static struct event_constraint *
-amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			  struct perf_event *event)
 {
 	/*
 	 * if not NB event or no NB, then no constraints
@@ -537,7 +539,8 @@ static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
 static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
 
 static struct event_constraint *
-amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
+amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx,
+			       struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	unsigned int event_code = amd_get_event_code(hwc);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index a61f5c6911da..989d3c215d2b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -796,7 +796,7 @@ static int setup_ibs_ctl(int ibs_eilvt_off)
  * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that
  * is using the new offset.
  */
-static int force_ibs_eilvt_setup(void)
+static void force_ibs_eilvt_setup(void)
 {
 	int offset;
 	int ret;
@@ -811,26 +811,24 @@ static int force_ibs_eilvt_setup(void)
 
 	if (offset == APIC_EILVT_NR_MAX) {
 		printk(KERN_DEBUG "No EILVT entry available\n");
-		return -EBUSY;
+		return;
 	}
 
 	ret = setup_ibs_ctl(offset);
 	if (ret)
 		goto out;
 
-	if (!ibs_eilvt_valid()) {
-		ret = -EFAULT;
+	if (!ibs_eilvt_valid())
 		goto out;
-	}
 
 	pr_info("IBS: LVT offset %d assigned\n", offset);
 
-	return 0;
+	return;
 out:
 	preempt_disable();
 	put_eilvt(offset);
 	preempt_enable();
-	return ret;
+	return;
 }
 
 static void ibs_eilvt_setup(void)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 258990688a5e..9da2400c2ec3 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/export.h>
+#include <linux/watchdog.h>
 
 #include <asm/cpufeature.h>
 #include <asm/hardirq.h>
@@ -113,6 +114,12 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
 	INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
 	INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
 	INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+
+	INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
 	EVENT_CONSTRAINT_END
 };
 
@@ -131,15 +138,12 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
 	INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
 	INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
 	INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
-	/*
-	 * Errata BV98 -- MEM_*_RETIRED events can leak between counters of SMT
-	 * siblings; disable these events because they can corrupt unrelated
-	 * counters.
-	 */
-	INTEL_EVENT_CONSTRAINT(0xd0, 0x0), /* MEM_UOPS_RETIRED.* */
-	INTEL_EVENT_CONSTRAINT(0xd1, 0x0), /* MEM_LOAD_UOPS_RETIRED.* */
-	INTEL_EVENT_CONSTRAINT(0xd2, 0x0), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
-	INTEL_EVENT_CONSTRAINT(0xd3, 0x0), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+	INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
 	EVENT_CONSTRAINT_END
 };
 
@@ -217,6 +221,21 @@ static struct event_constraint intel_hsw_event_constraints[] = {
 	INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
 	/* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
 	INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),
+
+	INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+	EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_bdw_event_constraints[] = {
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY */
+	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE */
+	FIXED_EVENT_CONSTRAINT(0x0300, 2),	/* CPU_CLK_UNHALTED.REF */
+	INTEL_UEVENT_CONSTRAINT(0x148, 0x4),	/* L1D_PEND_MISS.PENDING */
+	INTEL_EVENT_CONSTRAINT(0xa3, 0x4),	/* CYCLE_ACTIVITY.* */
 	EVENT_CONSTRAINT_END
 };
 
@@ -415,6 +434,202 @@ static __initconst const u64 snb_hw_cache_event_ids
 
 };
 
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts because they are not
+ *   reliably counted.
+ */
+
+#define HSW_DEMAND_DATA_RD		BIT_ULL(0)
+#define HSW_DEMAND_RFO			BIT_ULL(1)
+#define HSW_ANY_RESPONSE		BIT_ULL(16)
+#define HSW_SUPPLIER_NONE		BIT_ULL(17)
+#define HSW_L3_MISS_LOCAL_DRAM		BIT_ULL(22)
+#define HSW_L3_MISS_REMOTE_HOP0		BIT_ULL(27)
+#define HSW_L3_MISS_REMOTE_HOP1		BIT_ULL(28)
+#define HSW_L3_MISS_REMOTE_HOP2P	BIT_ULL(29)
+#define HSW_L3_MISS			(HSW_L3_MISS_LOCAL_DRAM| \
+					 HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+					 HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_SNOOP_NONE			BIT_ULL(31)
+#define HSW_SNOOP_NOT_NEEDED		BIT_ULL(32)
+#define HSW_SNOOP_MISS			BIT_ULL(33)
+#define HSW_SNOOP_HIT_NO_FWD		BIT_ULL(34)
+#define HSW_SNOOP_HIT_WITH_FWD		BIT_ULL(35)
+#define HSW_SNOOP_HITM			BIT_ULL(36)
+#define HSW_SNOOP_NON_DRAM		BIT_ULL(37)
+#define HSW_ANY_SNOOP			(HSW_SNOOP_NONE| \
+					 HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
+					 HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
+					 HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
+#define HSW_SNOOP_DRAM			(HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)
+#define HSW_DEMAND_READ			HSW_DEMAND_DATA_RD
+#define HSW_DEMAND_WRITE		HSW_DEMAND_RFO
+#define HSW_L3_MISS_REMOTE		(HSW_L3_MISS_REMOTE_HOP0|\
+					 HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_LLC_ACCESS			HSW_ANY_RESPONSE
+
+#define BDW_L3_MISS_LOCAL		BIT(26)
+#define BDW_L3_MISS			(BDW_L3_MISS_LOCAL| \
+					 HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+					 HSW_L3_MISS_REMOTE_HOP2P)
+
+
+static __initconst const u64 hsw_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_UOPS_RETIRED.ALL_LOADS */
+		[ C(RESULT_MISS)   ] = 0x151,	/* L1D.REPLACEMENT */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_UOPS_RETIRED.ALL_STORES */
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x280,	/* ICACHE.MISSES */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_UOPS_RETIRED.ALL_LOADS */
+		[ C(RESULT_MISS)   ] = 0x108,	/* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_UOPS_RETIRED.ALL_STORES */
+		[ C(RESULT_MISS)   ] = 0x149,	/* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x6085,	/* ITLB_MISSES.STLB_HIT */
+		[ C(RESULT_MISS)   ] = 0x185,	/* ITLB_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0xc4,	/* BR_INST_RETIRED.ALL_BRANCHES */
+		[ C(RESULT_MISS)   ] = 0xc5,	/* BR_MISP_RETIRED.ALL_BRANCHES */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+};
+
+static __initconst const u64 hsw_hw_cache_extra_regs
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+				       HSW_LLC_ACCESS,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
+				       HSW_L3_MISS|HSW_ANY_SNOOP,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+				       HSW_LLC_ACCESS,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
+				       HSW_L3_MISS|HSW_ANY_SNOOP,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+				       HSW_L3_MISS_LOCAL_DRAM|
+				       HSW_SNOOP_DRAM,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
+				       HSW_L3_MISS_REMOTE|
+				       HSW_SNOOP_DRAM,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+				       HSW_L3_MISS_LOCAL_DRAM|
+				       HSW_SNOOP_DRAM,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
+				       HSW_L3_MISS_REMOTE|
+				       HSW_SNOOP_DRAM,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+};
+
 static __initconst const u64 westmere_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -1029,21 +1244,10 @@ static __initconst const u64 slm_hw_cache_event_ids
  },
 };
 
-static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
-{
-	/* user explicitly requested branch sampling */
-	if (has_branch_stack(event))
-		return true;
-
-	/* implicit branch sampling to correct PEBS skid */
-	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
-	    x86_pmu.intel_cap.pebs_format < 2)
-		return true;
-
-	return false;
-}
-
-static void intel_pmu_disable_all(void)
+/*
+ * Use from PMIs where the LBRs are already disabled.
+ */
+static void __intel_pmu_disable_all(void)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
@@ -1051,17 +1255,24 @@ static void intel_pmu_disable_all(void)
 
 	if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
 		intel_pmu_disable_bts();
+	else
+		intel_bts_disable_local();
 
 	intel_pmu_pebs_disable_all();
+}
+
+static void intel_pmu_disable_all(void)
+{
+	__intel_pmu_disable_all();
 	intel_pmu_lbr_disable_all();
 }
 
-static void intel_pmu_enable_all(int added)
+static void __intel_pmu_enable_all(int added, bool pmi)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
 	intel_pmu_pebs_enable_all();
-	intel_pmu_lbr_enable_all();
+	intel_pmu_lbr_enable_all(pmi);
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
 			x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
 
@@ -1073,7 +1284,13 @@ static void intel_pmu_enable_all(int added)
 			return;
 
 		intel_pmu_enable_bts(event->hw.config);
-	}
+	} else
+		intel_bts_enable_local();
+}
+
+static void intel_pmu_enable_all(int added)
+{
+	__intel_pmu_enable_all(added, false);
 }
 
 /*
@@ -1207,7 +1424,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
 	 * must disable before any actual event
 	 * because any event may be combined with LBR
 	 */
-	if (intel_pmu_needs_lbr_smpl(event))
+	if (needs_branch_stack(event))
 		intel_pmu_lbr_disable(event);
 
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
@@ -1268,7 +1485,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
 	 * must enabled before any actual event
 	 * because any event may be combined with LBR
 	 */
-	if (intel_pmu_needs_lbr_smpl(event))
+	if (needs_branch_stack(event))
 		intel_pmu_lbr_enable(event);
 
 	if (event->attr.exclude_host)
@@ -1334,6 +1551,18 @@ static void intel_pmu_reset(void)
 	if (ds)
 		ds->bts_index = ds->bts_buffer_base;
 
+	/* Ack all overflows and disable fixed counters */
+	if (x86_pmu.version >= 2) {
+		intel_pmu_ack_status(intel_pmu_get_status());
+		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+	}
+
+	/* Reset LBRs and LBR freezing */
+	if (x86_pmu.lbr_nr) {
+		update_debugctlmsr(get_debugctlmsr() &
+			~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR));
+	}
+
 	local_irq_restore(flags);
 }
 
@@ -1357,8 +1586,9 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 	 */
 	if (!x86_pmu.late_ack)
 		apic_write(APIC_LVTPC, APIC_DM_NMI);
-	intel_pmu_disable_all();
+	__intel_pmu_disable_all();
 	handled = intel_pmu_drain_bts_buffer();
+	handled += intel_bts_interrupt();
 	status = intel_pmu_get_status();
 	if (!status)
 		goto done;
@@ -1398,6 +1628,14 @@ again:
 		x86_pmu.drain_pebs(regs);
 	}
 
+	/*
+	 * Intel PT
+	 */
+	if (__test_and_clear_bit(55, (unsigned long *)&status)) {
+		handled++;
+		intel_pt_interrupt();
+	}
+
 	/*
 	 * Checkpointed counters can lead to 'spurious' PMIs because the
 	 * rollback caused by the PMI will have cleared the overflow status
@@ -1433,7 +1671,7 @@ again:
 		goto again;
 
 done:
-	intel_pmu_enable_all(0);
+	__intel_pmu_enable_all(0, true);
 	/*
 	 * Only unmask the NMI after the overflow counters
 	 * have been reset. This avoids spurious NMIs on
@@ -1464,7 +1702,7 @@ intel_bts_constraints(struct perf_event *event)
 
 static int intel_alt_er(int idx)
 {
-	if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
+	if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
 		return idx;
 
 	if (idx == EXTRA_REG_RSP_0)
@@ -1624,7 +1862,8 @@ intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
 }
 
 struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			  struct perf_event *event)
 {
 	struct event_constraint *c;
 
@@ -1641,7 +1880,8 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 }
 
 static struct event_constraint *
-intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			    struct perf_event *event)
 {
 	struct event_constraint *c;
 
@@ -1657,7 +1897,278 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
 	if (c)
 		return c;
 
-	return x86_get_event_constraints(cpuc, event);
+	return x86_get_event_constraints(cpuc, idx, event);
+}
+
+static void
+intel_start_scheduling(struct cpu_hw_events *cpuc)
+{
+	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+	struct intel_excl_states *xl, *xlo;
+	int tid = cpuc->excl_thread_id;
+	int o_tid = 1 - tid; /* sibling thread */
+
+	/*
+	 * nothing needed if in group validation mode
+	 */
+	if (cpuc->is_fake || !is_ht_workaround_enabled())
+		return;
+
+	/*
+	 * no exclusion needed
+	 */
+	if (!excl_cntrs)
+		return;
+
+	xlo = &excl_cntrs->states[o_tid];
+	xl = &excl_cntrs->states[tid];
+
+	xl->sched_started = true;
+	xl->num_alloc_cntrs = 0;
+	/*
+	 * lock shared state until we are done scheduling
+	 * in stop_event_scheduling()
+	 * makes scheduling appear as a transaction
+	 */
+	WARN_ON_ONCE(!irqs_disabled());
+	raw_spin_lock(&excl_cntrs->lock);
+
+	/*
+	 * save initial state of sibling thread
+	 */
+	memcpy(xlo->init_state, xlo->state, sizeof(xlo->init_state));
+}
+
+static void
+intel_stop_scheduling(struct cpu_hw_events *cpuc)
+{
+	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+	struct intel_excl_states *xl, *xlo;
+	int tid = cpuc->excl_thread_id;
+	int o_tid = 1 - tid; /* sibling thread */
+
+	/*
+	 * nothing needed if in group validation mode
+	 */
+	if (cpuc->is_fake || !is_ht_workaround_enabled())
+		return;
+	/*
+	 * no exclusion needed
+	 */
+	if (!excl_cntrs)
+		return;
+
+	xlo = &excl_cntrs->states[o_tid];
+	xl = &excl_cntrs->states[tid];
+
+	/*
+	 * make new sibling thread state visible
+	 */
+	memcpy(xlo->state, xlo->init_state, sizeof(xlo->state));
+
+	xl->sched_started = false;
+	/*
+	 * release shared state lock (acquired in intel_start_scheduling())
+	 */
+	raw_spin_unlock(&excl_cntrs->lock);
+}
+
+static struct event_constraint *
+intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+			   int idx, struct event_constraint *c)
+{
+	struct event_constraint *cx;
+	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+	struct intel_excl_states *xl, *xlo;
+	int is_excl, i;
+	int tid = cpuc->excl_thread_id;
+	int o_tid = 1 - tid; /* alternate */
+
+	/*
+	 * validating a group does not require
+	 * enforcing cross-thread  exclusion
+	 */
+	if (cpuc->is_fake || !is_ht_workaround_enabled())
+		return c;
+
+	/*
+	 * no exclusion needed
+	 */
+	if (!excl_cntrs)
+		return c;
+	/*
+	 * event requires exclusive counter access
+	 * across HT threads
+	 */
+	is_excl = c->flags & PERF_X86_EVENT_EXCL;
+
+	/*
+	 * xl = state of current HT
+	 * xlo = state of sibling HT
+	 */
+	xl = &excl_cntrs->states[tid];
+	xlo = &excl_cntrs->states[o_tid];
+
+	/*
+	 * do not allow scheduling of more than max_alloc_cntrs
+	 * which is set to half the available generic counters.
+	 * this helps avoid counter starvation of sibling thread
+	 * by ensuring at most half the counters cannot be in
+	 * exclusive mode. There is not designated counters for the
+	 * limits. Any N/2 counters can be used. This helps with
+	 * events with specifix counter constraints
+	 */
+	if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs)
+		return &emptyconstraint;
+
+	cx = c;
+
+	/*
+	 * because we modify the constraint, we need
+	 * to make a copy. Static constraints come
+	 * from static const tables.
+	 *
+	 * only needed when constraint has not yet
+	 * been cloned (marked dynamic)
+	 */
+	if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
+
+		/* sanity check */
+		if (idx < 0)
+			return &emptyconstraint;
+
+		/*
+		 * grab pre-allocated constraint entry
+		 */
+		cx = &cpuc->constraint_list[idx];
+
+		/*
+		 * initialize dynamic constraint
+		 * with static constraint
+		 */
+		memcpy(cx, c, sizeof(*cx));
+
+		/*
+		 * mark constraint as dynamic, so we
+		 * can free it later on
+		 */
+		cx->flags |= PERF_X86_EVENT_DYNAMIC;
+	}
+
+	/*
+	 * From here on, the constraint is dynamic.
+	 * Either it was just allocated above, or it
+	 * was allocated during a earlier invocation
+	 * of this function
+	 */
+
+	/*
+	 * Modify static constraint with current dynamic
+	 * state of thread
+	 *
+	 * EXCLUSIVE: sibling counter measuring exclusive event
+	 * SHARED   : sibling counter measuring non-exclusive event
+	 * UNUSED   : sibling counter unused
+	 */
+	for_each_set_bit(i, cx->idxmsk, X86_PMC_IDX_MAX) {
+		/*
+		 * exclusive event in sibling counter
+		 * our corresponding counter cannot be used
+		 * regardless of our event
+		 */
+		if (xl->state[i] == INTEL_EXCL_EXCLUSIVE)
+			__clear_bit(i, cx->idxmsk);
+		/*
+		 * if measuring an exclusive event, sibling
+		 * measuring non-exclusive, then counter cannot
+		 * be used
+		 */
+		if (is_excl && xl->state[i] == INTEL_EXCL_SHARED)
+			__clear_bit(i, cx->idxmsk);
+	}
+
+	/*
+	 * recompute actual bit weight for scheduling algorithm
+	 */
+	cx->weight = hweight64(cx->idxmsk64);
+
+	/*
+	 * if we return an empty mask, then switch
+	 * back to static empty constraint to avoid
+	 * the cost of freeing later on
+	 */
+	if (cx->weight == 0)
+		cx = &emptyconstraint;
+
+	return cx;
+}
+
+static struct event_constraint *
+intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			    struct perf_event *event)
+{
+	struct event_constraint *c1 = event->hw.constraint;
+	struct event_constraint *c2;
+
+	/*
+	 * first time only
+	 * - static constraint: no change across incremental scheduling calls
+	 * - dynamic constraint: handled by intel_get_excl_constraints()
+	 */
+	c2 = __intel_get_event_constraints(cpuc, idx, event);
+	if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) {
+		bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX);
+		c1->weight = c2->weight;
+		c2 = c1;
+	}
+
+	if (cpuc->excl_cntrs)
+		return intel_get_excl_constraints(cpuc, event, idx, c2);
+
+	return c2;
+}
+
+static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
+		struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+	struct intel_excl_states *xlo, *xl;
+	unsigned long flags = 0; /* keep compiler happy */
+	int tid = cpuc->excl_thread_id;
+	int o_tid = 1 - tid;
+
+	/*
+	 * nothing needed if in group validation mode
+	 */
+	if (cpuc->is_fake)
+		return;
+
+	WARN_ON_ONCE(!excl_cntrs);
+
+	if (!excl_cntrs)
+		return;
+
+	xl = &excl_cntrs->states[tid];
+	xlo = &excl_cntrs->states[o_tid];
+
+	/*
+	 * put_constraint may be called from x86_schedule_events()
+	 * which already has the lock held so here make locking
+	 * conditional
+	 */
+	if (!xl->sched_started)
+		raw_spin_lock_irqsave(&excl_cntrs->lock, flags);
+
+	/*
+	 * if event was actually assigned, then mark the
+	 * counter state as unused now
+	 */
+	if (hwc->idx >= 0)
+		xlo->state[hwc->idx] = INTEL_EXCL_UNUSED;
+
+	if (!xl->sched_started)
+		raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags);
 }
 
 static void
@@ -1678,7 +2189,57 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
 static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
 					struct perf_event *event)
 {
+	struct event_constraint *c = event->hw.constraint;
+
 	intel_put_shared_regs_event_constraints(cpuc, event);
+
+	/*
+	 * is PMU has exclusive counter restrictions, then
+	 * all events are subject to and must call the
+	 * put_excl_constraints() routine
+	 */
+	if (c && cpuc->excl_cntrs)
+		intel_put_excl_constraints(cpuc, event);
+
+	/* cleanup dynamic constraint */
+	if (c && (c->flags & PERF_X86_EVENT_DYNAMIC))
+		event->hw.constraint = NULL;
+}
+
+static void intel_commit_scheduling(struct cpu_hw_events *cpuc,
+				    struct perf_event *event, int cntr)
+{
+	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+	struct event_constraint *c = event->hw.constraint;
+	struct intel_excl_states *xlo, *xl;
+	int tid = cpuc->excl_thread_id;
+	int o_tid = 1 - tid;
+	int is_excl;
+
+	if (cpuc->is_fake || !c)
+		return;
+
+	is_excl = c->flags & PERF_X86_EVENT_EXCL;
+
+	if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
+		return;
+
+	WARN_ON_ONCE(!excl_cntrs);
+
+	if (!excl_cntrs)
+		return;
+
+	xl = &excl_cntrs->states[tid];
+	xlo = &excl_cntrs->states[o_tid];
+
+	WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock));
+
+	if (cntr >= 0) {
+		if (is_excl)
+			xlo->init_state[cntr] = INTEL_EXCL_EXCLUSIVE;
+		else
+			xlo->init_state[cntr] = INTEL_EXCL_SHARED;
+	}
 }
 
 static void intel_pebs_aliases_core2(struct perf_event *event)
@@ -1747,10 +2308,21 @@ static int intel_pmu_hw_config(struct perf_event *event)
 	if (event->attr.precise_ip && x86_pmu.pebs_aliases)
 		x86_pmu.pebs_aliases(event);
 
-	if (intel_pmu_needs_lbr_smpl(event)) {
+	if (needs_branch_stack(event)) {
 		ret = intel_pmu_setup_lbr_filter(event);
 		if (ret)
 			return ret;
+
+		/*
+		 * BTS is set up earlier in this path, so don't account twice
+		 */
+		if (!intel_pmu_has_bts(event)) {
+			/* disallow lbr if conflicting events are present */
+			if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+				return -EBUSY;
+
+			event->destroy = hw_perf_lbr_event_destroy;
+		}
 	}
 
 	if (event->attr.type != PERF_TYPE_RAW)
@@ -1891,9 +2463,12 @@ static struct event_constraint counter2_constraint =
 			EVENT_CONSTRAINT(0, 0x4, 0);
 
 static struct event_constraint *
-hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			  struct perf_event *event)
 {
-	struct event_constraint *c = intel_get_event_constraints(cpuc, event);
+	struct event_constraint *c;
+
+	c = intel_get_event_constraints(cpuc, idx, event);
 
 	/* Handle special quirk on in_tx_checkpointed only in counter 2 */
 	if (event->hw.config & HSW_IN_TX_CHECKPOINTED) {
@@ -1905,6 +2480,32 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 	return c;
 }
 
+/*
+ * Broadwell:
+ *
+ * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared
+ * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
+ * the two to enforce a minimum period of 128 (the smallest value that has bits
+ * 0-5 cleared and >= 100).
+ *
+ * Because of how the code in x86_perf_event_set_period() works, the truncation
+ * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
+ * to make up for the 'lost' events due to carrying the 'error' in period_left.
+ *
+ * Therefore the effective (average) period matches the requested period,
+ * despite coarser hardware granularity.
+ */
+static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
+{
+	if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+			X86_CONFIG(.event=0xc0, .umask=0x01)) {
+		if (left < 128)
+			left = 128;
+		left &= ~0x3fu;
+	}
+	return left;
+}
+
 PMU_FORMAT_ATTR(event,	"config:0-7"	);
 PMU_FORMAT_ATTR(umask,	"config:8-15"	);
 PMU_FORMAT_ATTR(edge,	"config:18"	);
@@ -1979,16 +2580,52 @@ struct intel_shared_regs *allocate_shared_regs(int cpu)
 	return regs;
 }
 
+static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
+{
+	struct intel_excl_cntrs *c;
+	int i;
+
+	c = kzalloc_node(sizeof(struct intel_excl_cntrs),
+			 GFP_KERNEL, cpu_to_node(cpu));
+	if (c) {
+		raw_spin_lock_init(&c->lock);
+		for (i = 0; i < X86_PMC_IDX_MAX; i++) {
+			c->states[0].state[i] = INTEL_EXCL_UNUSED;
+			c->states[0].init_state[i] = INTEL_EXCL_UNUSED;
+
+			c->states[1].state[i] = INTEL_EXCL_UNUSED;
+			c->states[1].init_state[i] = INTEL_EXCL_UNUSED;
+		}
+		c->core_id = -1;
+	}
+	return c;
+}
+
 static int intel_pmu_cpu_prepare(int cpu)
 {
 	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
 
-	if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map))
-		return NOTIFY_OK;
+	if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
+		cpuc->shared_regs = allocate_shared_regs(cpu);
+		if (!cpuc->shared_regs)
+			return NOTIFY_BAD;
+	}
 
-	cpuc->shared_regs = allocate_shared_regs(cpu);
-	if (!cpuc->shared_regs)
-		return NOTIFY_BAD;
+	if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+		size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
+
+		cpuc->constraint_list = kzalloc(sz, GFP_KERNEL);
+		if (!cpuc->constraint_list)
+			return NOTIFY_BAD;
+
+		cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
+		if (!cpuc->excl_cntrs) {
+			kfree(cpuc->constraint_list);
+			kfree(cpuc->shared_regs);
+			return NOTIFY_BAD;
+		}
+		cpuc->excl_thread_id = 0;
+	}
 
 	return NOTIFY_OK;
 }
@@ -2010,13 +2647,15 @@ static void intel_pmu_cpu_starting(int cpu)
 	if (!cpuc->shared_regs)
 		return;
 
-	if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) {
+	if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
+		void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
+
 		for_each_cpu(i, topology_thread_cpumask(cpu)) {
 			struct intel_shared_regs *pc;
 
 			pc = per_cpu(cpu_hw_events, i).shared_regs;
 			if (pc && pc->core_id == core_id) {
-				cpuc->kfree_on_online = cpuc->shared_regs;
+				*onln = cpuc->shared_regs;
 				cpuc->shared_regs = pc;
 				break;
 			}
@@ -2027,6 +2666,44 @@ static void intel_pmu_cpu_starting(int cpu)
 
 	if (x86_pmu.lbr_sel_map)
 		cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
+
+	if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+		int h = x86_pmu.num_counters >> 1;
+
+		for_each_cpu(i, topology_thread_cpumask(cpu)) {
+			struct intel_excl_cntrs *c;
+
+			c = per_cpu(cpu_hw_events, i).excl_cntrs;
+			if (c && c->core_id == core_id) {
+				cpuc->kfree_on_online[1] = cpuc->excl_cntrs;
+				cpuc->excl_cntrs = c;
+				cpuc->excl_thread_id = 1;
+				break;
+			}
+		}
+		cpuc->excl_cntrs->core_id = core_id;
+		cpuc->excl_cntrs->refcnt++;
+		/*
+		 * set hard limit to half the number of generic counters
+		 */
+		cpuc->excl_cntrs->states[0].max_alloc_cntrs = h;
+		cpuc->excl_cntrs->states[1].max_alloc_cntrs = h;
+	}
+}
+
+static void free_excl_cntrs(int cpu)
+{
+	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+	struct intel_excl_cntrs *c;
+
+	c = cpuc->excl_cntrs;
+	if (c) {
+		if (c->core_id == -1 || --c->refcnt == 0)
+			kfree(c);
+		cpuc->excl_cntrs = NULL;
+		kfree(cpuc->constraint_list);
+		cpuc->constraint_list = NULL;
+	}
 }
 
 static void intel_pmu_cpu_dying(int cpu)
@@ -2041,19 +2718,9 @@ static void intel_pmu_cpu_dying(int cpu)
 		cpuc->shared_regs = NULL;
 	}
 
-	fini_debug_store_on_cpu(cpu);
-}
+	free_excl_cntrs(cpu);
 
-static void intel_pmu_flush_branch_stack(void)
-{
-	/*
-	 * Intel LBR does not tag entries with the
-	 * PID of the current task, then we need to
-	 * flush it on ctxsw
-	 * For now, we simply reset it
-	 */
-	if (x86_pmu.lbr_nr)
-		intel_pmu_lbr_reset();
+	fini_debug_store_on_cpu(cpu);
 }
 
 PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
@@ -2107,7 +2774,7 @@ static __initconst const struct x86_pmu intel_pmu = {
 	.cpu_starting		= intel_pmu_cpu_starting,
 	.cpu_dying		= intel_pmu_cpu_dying,
 	.guest_get_msrs		= intel_guest_get_msrs,
-	.flush_branch_stack	= intel_pmu_flush_branch_stack,
+	.sched_task		= intel_pmu_lbr_sched_task,
 };
 
 static __init void intel_clovertown_quirk(void)
@@ -2264,6 +2931,27 @@ static __init void intel_nehalem_quirk(void)
 	}
 }
 
+/*
+ * enable software workaround for errata:
+ * SNB: BJ122
+ * IVB: BV98
+ * HSW: HSD29
+ *
+ * Only needed when HT is enabled. However detecting
+ * if HT is enabled is difficult (model specific). So instead,
+ * we enable the workaround in the early boot, and verify if
+ * it is needed in a later initcall phase once we have valid
+ * topology information to check if HT is actually enabled
+ */
+static __init void intel_ht_bug(void)
+{
+	x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
+
+	x86_pmu.commit_scheduling = intel_commit_scheduling;
+	x86_pmu.start_scheduling = intel_start_scheduling;
+	x86_pmu.stop_scheduling = intel_stop_scheduling;
+}
+
 EVENT_ATTR_STR(mem-loads,	mem_ld_hsw,	"event=0xcd,umask=0x1,ldlat=3");
 EVENT_ATTR_STR(mem-stores,	mem_st_hsw,	"event=0xd0,umask=0x82")
 
@@ -2443,7 +3131,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.event_constraints = intel_slm_event_constraints;
 		x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
 		x86_pmu.extra_regs = intel_slm_extra_regs;
-		x86_pmu.er_flags |= ERF_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 		pr_cont("Silvermont events, ");
 		break;
 
@@ -2461,7 +3149,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.enable_all = intel_pmu_nhm_enable_all;
 		x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
 		x86_pmu.extra_regs = intel_westmere_extra_regs;
-		x86_pmu.er_flags |= ERF_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 
 		x86_pmu.cpu_events = nhm_events_attrs;
 
@@ -2478,6 +3166,7 @@ __init int intel_pmu_init(void)
 	case 42: /* 32nm SandyBridge         */
 	case 45: /* 32nm SandyBridge-E/EN/EP */
 		x86_add_quirk(intel_sandybridge_quirk);
+		x86_add_quirk(intel_ht_bug);
 		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
@@ -2492,9 +3181,11 @@ __init int intel_pmu_init(void)
 			x86_pmu.extra_regs = intel_snbep_extra_regs;
 		else
 			x86_pmu.extra_regs = intel_snb_extra_regs;
+
+
 		/* all extra regs are per-cpu when HT is on */
-		x86_pmu.er_flags |= ERF_HAS_RSP_1;
-		x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
 
 		x86_pmu.cpu_events = snb_events_attrs;
 
@@ -2510,6 +3201,7 @@ __init int intel_pmu_init(void)
 
 	case 58: /* 22nm IvyBridge       */
 	case 62: /* 22nm IvyBridge-EP/EX */
+		x86_add_quirk(intel_ht_bug);
 		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 		/* dTLB-load-misses on IVB is different than SNB */
@@ -2528,8 +3220,8 @@ __init int intel_pmu_init(void)
 		else
 			x86_pmu.extra_regs = intel_snb_extra_regs;
 		/* all extra regs are per-cpu when HT is on */
-		x86_pmu.er_flags |= ERF_HAS_RSP_1;
-		x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
 
 		x86_pmu.cpu_events = snb_events_attrs;
 
@@ -2545,19 +3237,20 @@ __init int intel_pmu_init(void)
 	case 63: /* 22nm Haswell Server */
 	case 69: /* 22nm Haswell ULT */
 	case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+		x86_add_quirk(intel_ht_bug);
 		x86_pmu.late_ack = true;
-		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
-		memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+		memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
 
-		intel_pmu_lbr_init_snb();
+		intel_pmu_lbr_init_hsw();
 
 		x86_pmu.event_constraints = intel_hsw_event_constraints;
 		x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
 		x86_pmu.extra_regs = intel_snbep_extra_regs;
 		x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
 		/* all extra regs are per-cpu when HT is on */
-		x86_pmu.er_flags |= ERF_HAS_RSP_1;
-		x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
 
 		x86_pmu.hw_config = hsw_hw_config;
 		x86_pmu.get_event_constraints = hsw_get_event_constraints;
@@ -2566,6 +3259,39 @@ __init int intel_pmu_init(void)
 		pr_cont("Haswell events, ");
 		break;
 
+	case 61: /* 14nm Broadwell Core-M */
+	case 86: /* 14nm Broadwell Xeon D */
+		x86_pmu.late_ack = true;
+		memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+		/* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
+		hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
+									 BDW_L3_MISS|HSW_SNOOP_DRAM;
+		hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
+									  HSW_SNOOP_DRAM;
+		hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
+									     BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+		hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
+									      BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+
+		intel_pmu_lbr_init_snb();
+
+		x86_pmu.event_constraints = intel_bdw_event_constraints;
+		x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
+		x86_pmu.extra_regs = intel_snbep_extra_regs;
+		x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+		/* all extra regs are per-cpu when HT is on */
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+		x86_pmu.hw_config = hsw_hw_config;
+		x86_pmu.get_event_constraints = hsw_get_event_constraints;
+		x86_pmu.cpu_events = hsw_events_attrs;
+		x86_pmu.limit_period = bdw_limit_period;
+		pr_cont("Broadwell events, ");
+		break;
+
 	default:
 		switch (x86_pmu.version) {
 		case 1:
@@ -2651,3 +3377,47 @@ __init int intel_pmu_init(void)
 
 	return 0;
 }
+
+/*
+ * HT bug: phase 2 init
+ * Called once we have valid topology information to check
+ * whether or not HT is enabled
+ * If HT is off, then we disable the workaround
+ */
+static __init int fixup_ht_bug(void)
+{
+	int cpu = smp_processor_id();
+	int w, c;
+	/*
+	 * problem not present on this CPU model, nothing to do
+	 */
+	if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
+		return 0;
+
+	w = cpumask_weight(topology_thread_cpumask(cpu));
+	if (w > 1) {
+		pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
+		return 0;
+	}
+
+	watchdog_nmi_disable_all();
+
+	x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
+
+	x86_pmu.commit_scheduling = NULL;
+	x86_pmu.start_scheduling = NULL;
+	x86_pmu.stop_scheduling = NULL;
+
+	watchdog_nmi_enable_all();
+
+	get_online_cpus();
+
+	for_each_online_cpu(c) {
+		free_excl_cntrs(c);
+	}
+
+	put_online_cpus();
+	pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
+	return 0;
+}
+subsys_initcall(fixup_ht_bug)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c
new file mode 100644
index 000000000000..ac1f0c55f379
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c
@@ -0,0 +1,525 @@
+/*
+ * BTS PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/coredump.h>
+
+#include <asm-generic/sizes.h>
+#include <asm/perf_event.h>
+
+#include "perf_event.h"
+
+struct bts_ctx {
+	struct perf_output_handle	handle;
+	struct debug_store		ds_back;
+	int				started;
+};
+
+static DEFINE_PER_CPU(struct bts_ctx, bts_ctx);
+
+#define BTS_RECORD_SIZE		24
+#define BTS_SAFETY_MARGIN	4080
+
+struct bts_phys {
+	struct page	*page;
+	unsigned long	size;
+	unsigned long	offset;
+	unsigned long	displacement;
+};
+
+struct bts_buffer {
+	size_t		real_size;	/* multiple of BTS_RECORD_SIZE */
+	unsigned int	nr_pages;
+	unsigned int	nr_bufs;
+	unsigned int	cur_buf;
+	bool		snapshot;
+	local_t		data_size;
+	local_t		lost;
+	local_t		head;
+	unsigned long	end;
+	void		**data_pages;
+	struct bts_phys	buf[0];
+};
+
+struct pmu bts_pmu;
+
+void intel_pmu_enable_bts(u64 config);
+void intel_pmu_disable_bts(void);
+
+static size_t buf_size(struct page *page)
+{
+	return 1 << (PAGE_SHIFT + page_private(page));
+}
+
+static void *
+bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite)
+{
+	struct bts_buffer *buf;
+	struct page *page;
+	int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+	unsigned long offset;
+	size_t size = nr_pages << PAGE_SHIFT;
+	int pg, nbuf, pad;
+
+	/* count all the high order buffers */
+	for (pg = 0, nbuf = 0; pg < nr_pages;) {
+		page = virt_to_page(pages[pg]);
+		if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1))
+			return NULL;
+		pg += 1 << page_private(page);
+		nbuf++;
+	}
+
+	/*
+	 * to avoid interrupts in overwrite mode, only allow one physical
+	 */
+	if (overwrite && nbuf > 1)
+		return NULL;
+
+	buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node);
+	if (!buf)
+		return NULL;
+
+	buf->nr_pages = nr_pages;
+	buf->nr_bufs = nbuf;
+	buf->snapshot = overwrite;
+	buf->data_pages = pages;
+	buf->real_size = size - size % BTS_RECORD_SIZE;
+
+	for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) {
+		unsigned int __nr_pages;
+
+		page = virt_to_page(pages[pg]);
+		__nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1;
+		buf->buf[nbuf].page = page;
+		buf->buf[nbuf].offset = offset;
+		buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
+		buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement;
+		pad = buf->buf[nbuf].size % BTS_RECORD_SIZE;
+		buf->buf[nbuf].size -= pad;
+
+		pg += __nr_pages;
+		offset += __nr_pages << PAGE_SHIFT;
+	}
+
+	return buf;
+}
+
+static void bts_buffer_free_aux(void *data)
+{
+	kfree(data);
+}
+
+static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx)
+{
+	return buf->buf[idx].offset + buf->buf[idx].displacement;
+}
+
+static void
+bts_config_buffer(struct bts_buffer *buf)
+{
+	int cpu = raw_smp_processor_id();
+	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	struct bts_phys *phys = &buf->buf[buf->cur_buf];
+	unsigned long index, thresh = 0, end = phys->size;
+	struct page *page = phys->page;
+
+	index = local_read(&buf->head);
+
+	if (!buf->snapshot) {
+		if (buf->end < phys->offset + buf_size(page))
+			end = buf->end - phys->offset - phys->displacement;
+
+		index -= phys->offset + phys->displacement;
+
+		if (end - index > BTS_SAFETY_MARGIN)
+			thresh = end - BTS_SAFETY_MARGIN;
+		else if (end - index > BTS_RECORD_SIZE)
+			thresh = end - BTS_RECORD_SIZE;
+		else
+			thresh = end;
+	}
+
+	ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
+	ds->bts_index = ds->bts_buffer_base + index;
+	ds->bts_absolute_maximum = ds->bts_buffer_base + end;
+	ds->bts_interrupt_threshold = !buf->snapshot
+		? ds->bts_buffer_base + thresh
+		: ds->bts_absolute_maximum + BTS_RECORD_SIZE;
+}
+
+static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
+{
+	unsigned long index = head - phys->offset;
+
+	memset(page_address(phys->page) + index, 0, phys->size - index);
+}
+
+static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts)
+{
+	if (buf->snapshot)
+		return false;
+
+	if (local_read(&buf->data_size) >= bts->handle.size ||
+	    bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE)
+		return true;
+
+	return false;
+}
+
+static void bts_update(struct bts_ctx *bts)
+{
+	int cpu = raw_smp_processor_id();
+	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	struct bts_buffer *buf = perf_get_aux(&bts->handle);
+	unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;
+
+	if (!buf)
+		return;
+
+	head = index + bts_buffer_offset(buf, buf->cur_buf);
+	old = local_xchg(&buf->head, head);
+
+	if (!buf->snapshot) {
+		if (old == head)
+			return;
+
+		if (ds->bts_index >= ds->bts_absolute_maximum)
+			local_inc(&buf->lost);
+
+		/*
+		 * old and head are always in the same physical buffer, so we
+		 * can subtract them to get the data size.
+		 */
+		local_add(head - old, &buf->data_size);
+	} else {
+		local_set(&buf->data_size, head);
+	}
+}
+
+static void __bts_event_start(struct perf_event *event)
+{
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+	struct bts_buffer *buf = perf_get_aux(&bts->handle);
+	u64 config = 0;
+
+	if (!buf || bts_buffer_is_full(buf, bts))
+		return;
+
+	event->hw.state = 0;
+
+	if (!buf->snapshot)
+		config |= ARCH_PERFMON_EVENTSEL_INT;
+	if (!event->attr.exclude_kernel)
+		config |= ARCH_PERFMON_EVENTSEL_OS;
+	if (!event->attr.exclude_user)
+		config |= ARCH_PERFMON_EVENTSEL_USR;
+
+	bts_config_buffer(buf);
+
+	/*
+	 * local barrier to make sure that ds configuration made it
+	 * before we enable BTS
+	 */
+	wmb();
+
+	intel_pmu_enable_bts(config);
+}
+
+static void bts_event_start(struct perf_event *event, int flags)
+{
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+	__bts_event_start(event);
+
+	/* PMI handler: this counter is running and likely generating PMIs */
+	ACCESS_ONCE(bts->started) = 1;
+}
+
+static void __bts_event_stop(struct perf_event *event)
+{
+	/*
+	 * No extra synchronization is mandated by the documentation to have
+	 * BTS data stores globally visible.
+	 */
+	intel_pmu_disable_bts();
+
+	if (event->hw.state & PERF_HES_STOPPED)
+		return;
+
+	ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED;
+}
+
+static void bts_event_stop(struct perf_event *event, int flags)
+{
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+	/* PMI handler: don't restart this counter */
+	ACCESS_ONCE(bts->started) = 0;
+
+	__bts_event_stop(event);
+
+	if (flags & PERF_EF_UPDATE)
+		bts_update(bts);
+}
+
+void intel_bts_enable_local(void)
+{
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+	if (bts->handle.event && bts->started)
+		__bts_event_start(bts->handle.event);
+}
+
+void intel_bts_disable_local(void)
+{
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+	if (bts->handle.event)
+		__bts_event_stop(bts->handle.event);
+}
+
+static int
+bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
+{
+	unsigned long head, space, next_space, pad, gap, skip, wakeup;
+	unsigned int next_buf;
+	struct bts_phys *phys, *next_phys;
+	int ret;
+
+	if (buf->snapshot)
+		return 0;
+
+	head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1);
+	if (WARN_ON_ONCE(head != local_read(&buf->head)))
+		return -EINVAL;
+
+	phys = &buf->buf[buf->cur_buf];
+	space = phys->offset + phys->displacement + phys->size - head;
+	pad = space;
+	if (space > handle->size) {
+		space = handle->size;
+		space -= space % BTS_RECORD_SIZE;
+	}
+	if (space <= BTS_SAFETY_MARGIN) {
+		/* See if next phys buffer has more space */
+		next_buf = buf->cur_buf + 1;
+		if (next_buf >= buf->nr_bufs)
+			next_buf = 0;
+		next_phys = &buf->buf[next_buf];
+		gap = buf_size(phys->page) - phys->displacement - phys->size +
+		      next_phys->displacement;
+		skip = pad + gap;
+		if (handle->size >= skip) {
+			next_space = next_phys->size;
+			if (next_space + skip > handle->size) {
+				next_space = handle->size - skip;
+				next_space -= next_space % BTS_RECORD_SIZE;
+			}
+			if (next_space > space || !space) {
+				if (pad)
+					bts_buffer_pad_out(phys, head);
+				ret = perf_aux_output_skip(handle, skip);
+				if (ret)
+					return ret;
+				/* Advance to next phys buffer */
+				phys = next_phys;
+				space = next_space;
+				head = phys->offset + phys->displacement;
+				/*
+				 * After this, cur_buf and head won't match ds
+				 * anymore, so we must not be racing with
+				 * bts_update().
+				 */
+				buf->cur_buf = next_buf;
+				local_set(&buf->head, head);
+			}
+		}
+	}
+
+	/* Don't go far beyond wakeup watermark */
+	wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
+		 handle->head;
+	if (space > wakeup) {
+		space = wakeup;
+		space -= space % BTS_RECORD_SIZE;
+	}
+
+	buf->end = head + space;
+
+	/*
+	 * If we have no space, the lost notification would have been sent when
+	 * we hit absolute_maximum - see bts_update()
+	 */
+	if (!space)
+		return -ENOSPC;
+
+	return 0;
+}
+
+int intel_bts_interrupt(void)
+{
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+	struct perf_event *event = bts->handle.event;
+	struct bts_buffer *buf;
+	s64 old_head;
+	int err;
+
+	if (!event || !bts->started)
+		return 0;
+
+	buf = perf_get_aux(&bts->handle);
+	/*
+	 * Skip snapshot counters: they don't use the interrupt, but
+	 * there's no other way of telling, because the pointer will
+	 * keep moving
+	 */
+	if (!buf || buf->snapshot)
+		return 0;
+
+	old_head = local_read(&buf->head);
+	bts_update(bts);
+
+	/* no new data */
+	if (old_head == local_read(&buf->head))
+		return 0;
+
+	perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
+			    !!local_xchg(&buf->lost, 0));
+
+	buf = perf_aux_output_begin(&bts->handle, event);
+	if (!buf)
+		return 1;
+
+	err = bts_buffer_reset(buf, &bts->handle);
+	if (err)
+		perf_aux_output_end(&bts->handle, 0, false);
+
+	return 1;
+}
+
+static void bts_event_del(struct perf_event *event, int mode)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+	struct bts_buffer *buf = perf_get_aux(&bts->handle);
+
+	bts_event_stop(event, PERF_EF_UPDATE);
+
+	if (buf) {
+		if (buf->snapshot)
+			bts->handle.head =
+				local_xchg(&buf->data_size,
+					   buf->nr_pages << PAGE_SHIFT);
+		perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
+				    !!local_xchg(&buf->lost, 0));
+	}
+
+	cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
+	cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
+	cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
+	cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
+}
+
+static int bts_event_add(struct perf_event *event, int mode)
+{
+	struct bts_buffer *buf;
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	int ret = -EBUSY;
+
+	event->hw.state = PERF_HES_STOPPED;
+
+	if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+		return -EBUSY;
+
+	if (bts->handle.event)
+		return -EBUSY;
+
+	buf = perf_aux_output_begin(&bts->handle, event);
+	if (!buf)
+		return -EINVAL;
+
+	ret = bts_buffer_reset(buf, &bts->handle);
+	if (ret) {
+		perf_aux_output_end(&bts->handle, 0, false);
+		return ret;
+	}
+
+	bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
+	bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
+	bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;
+
+	if (mode & PERF_EF_START) {
+		bts_event_start(event, 0);
+		if (hwc->state & PERF_HES_STOPPED) {
+			bts_event_del(event, 0);
+			return -EBUSY;
+		}
+	}
+
+	return 0;
+}
+
+static void bts_event_destroy(struct perf_event *event)
+{
+	x86_del_exclusive(x86_lbr_exclusive_bts);
+}
+
+static int bts_event_init(struct perf_event *event)
+{
+	if (event->attr.type != bts_pmu.type)
+		return -ENOENT;
+
+	if (x86_add_exclusive(x86_lbr_exclusive_bts))
+		return -EBUSY;
+
+	event->destroy = bts_event_destroy;
+
+	return 0;
+}
+
+static void bts_event_read(struct perf_event *event)
+{
+}
+
+static __init int bts_init(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
+		return -ENODEV;
+
+	bts_pmu.capabilities	= PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE;
+	bts_pmu.task_ctx_nr	= perf_sw_context;
+	bts_pmu.event_init	= bts_event_init;
+	bts_pmu.add		= bts_event_add;
+	bts_pmu.del		= bts_event_del;
+	bts_pmu.start		= bts_event_start;
+	bts_pmu.stop		= bts_event_stop;
+	bts_pmu.read		= bts_event_read;
+	bts_pmu.setup_aux	= bts_buffer_setup_aux;
+	bts_pmu.free_aux	= bts_buffer_free_aux;
+
+	return perf_pmu_register(&bts_pmu, "intel_bts", -1);
+}
+
+module_init(bts_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
new file mode 100644
index 000000000000..e4d1b8b738fa
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -0,0 +1,1379 @@
+/*
+ * Intel Cache Quality-of-Service Monitoring (CQM) support.
+ *
+ * Based very, very heavily on work by Peter Zijlstra.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/slab.h>
+#include <asm/cpu_device_id.h>
+#include "perf_event.h"
+
+#define MSR_IA32_PQR_ASSOC	0x0c8f
+#define MSR_IA32_QM_CTR		0x0c8e
+#define MSR_IA32_QM_EVTSEL	0x0c8d
+
+static unsigned int cqm_max_rmid = -1;
+static unsigned int cqm_l3_scale; /* supposedly cacheline size */
+
+struct intel_cqm_state {
+	raw_spinlock_t		lock;
+	int			rmid;
+	int			cnt;
+};
+
+static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);
+
+/*
+ * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
+ * Also protects event->hw.cqm_rmid
+ *
+ * Hold either for stability, both for modification of ->hw.cqm_rmid.
+ */
+static DEFINE_MUTEX(cache_mutex);
+static DEFINE_RAW_SPINLOCK(cache_lock);
+
+/*
+ * Groups of events that have the same target(s), one RMID per group.
+ */
+static LIST_HEAD(cache_groups);
+
+/*
+ * Mask of CPUs for reading CQM values. We only need one per-socket.
+ */
+static cpumask_t cqm_cpumask;
+
+#define RMID_VAL_ERROR		(1ULL << 63)
+#define RMID_VAL_UNAVAIL	(1ULL << 62)
+
+#define QOS_L3_OCCUP_EVENT_ID	(1 << 0)
+
+#define QOS_EVENT_MASK	QOS_L3_OCCUP_EVENT_ID
+
+/*
+ * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
+ *
+ * This rmid is always free and is guaranteed to have an associated
+ * near-zero occupancy value, i.e. no cachelines are tagged with this
+ * RMID, once __intel_cqm_rmid_rotate() returns.
+ */
+static unsigned int intel_cqm_rotation_rmid;
+
+#define INVALID_RMID		(-1)
+
+/*
+ * Is @rmid valid for programming the hardware?
+ *
+ * rmid 0 is reserved by the hardware for all non-monitored tasks, which
+ * means that we should never come across an rmid with that value.
+ * Likewise, an rmid value of -1 is used to indicate "no rmid currently
+ * assigned" and is used as part of the rotation code.
+ */
+static inline bool __rmid_valid(unsigned int rmid)
+{
+	if (!rmid || rmid == INVALID_RMID)
+		return false;
+
+	return true;
+}
+
+static u64 __rmid_read(unsigned int rmid)
+{
+	u64 val;
+
+	/*
+	 * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
+	 * it just says that to increase confusion.
+	 */
+	wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
+	rdmsrl(MSR_IA32_QM_CTR, val);
+
+	/*
+	 * Aside from the ERROR and UNAVAIL bits, assume this thing returns
+	 * the number of cachelines tagged with @rmid.
+	 */
+	return val;
+}
+
+enum rmid_recycle_state {
+	RMID_YOUNG = 0,
+	RMID_AVAILABLE,
+	RMID_DIRTY,
+};
+
+struct cqm_rmid_entry {
+	unsigned int rmid;
+	enum rmid_recycle_state state;
+	struct list_head list;
+	unsigned long queue_time;
+};
+
+/*
+ * cqm_rmid_free_lru - A least recently used list of RMIDs.
+ *
+ * Oldest entry at the head, newest (most recently used) entry at the
+ * tail. This list is never traversed, it's only used to keep track of
+ * the lru order. That is, we only pick entries of the head or insert
+ * them on the tail.
+ *
+ * All entries on the list are 'free', and their RMIDs are not currently
+ * in use. To mark an RMID as in use, remove its entry from the lru
+ * list.
+ *
+ *
+ * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
+ *
+ * This list is contains RMIDs that no one is currently using but that
+ * may have a non-zero occupancy value associated with them. The
+ * rotation worker moves RMIDs from the limbo list to the free list once
+ * the occupancy value drops below __intel_cqm_threshold.
+ *
+ * Both lists are protected by cache_mutex.
+ */
+static LIST_HEAD(cqm_rmid_free_lru);
+static LIST_HEAD(cqm_rmid_limbo_lru);
+
+/*
+ * We use a simple array of pointers so that we can lookup a struct
+ * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
+ * and __put_rmid() from having to worry about dealing with struct
+ * cqm_rmid_entry - they just deal with rmids, i.e. integers.
+ *
+ * Once this array is initialized it is read-only. No locks are required
+ * to access it.
+ *
+ * All entries for all RMIDs can be looked up in the this array at all
+ * times.
+ */
+static struct cqm_rmid_entry **cqm_rmid_ptrs;
+
+static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
+{
+	struct cqm_rmid_entry *entry;
+
+	entry = cqm_rmid_ptrs[rmid];
+	WARN_ON(entry->rmid != rmid);
+
+	return entry;
+}
+
+/*
+ * Returns < 0 on fail.
+ *
+ * We expect to be called with cache_mutex held.
+ */
+static int __get_rmid(void)
+{
+	struct cqm_rmid_entry *entry;
+
+	lockdep_assert_held(&cache_mutex);
+
+	if (list_empty(&cqm_rmid_free_lru))
+		return INVALID_RMID;
+
+	entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
+	list_del(&entry->list);
+
+	return entry->rmid;
+}
+
+static void __put_rmid(unsigned int rmid)
+{
+	struct cqm_rmid_entry *entry;
+
+	lockdep_assert_held(&cache_mutex);
+
+	WARN_ON(!__rmid_valid(rmid));
+	entry = __rmid_entry(rmid);
+
+	entry->queue_time = jiffies;
+	entry->state = RMID_YOUNG;
+
+	list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
+}
+
+static int intel_cqm_setup_rmid_cache(void)
+{
+	struct cqm_rmid_entry *entry;
+	unsigned int nr_rmids;
+	int r = 0;
+
+	nr_rmids = cqm_max_rmid + 1;
+	cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
+				nr_rmids, GFP_KERNEL);
+	if (!cqm_rmid_ptrs)
+		return -ENOMEM;
+
+	for (; r <= cqm_max_rmid; r++) {
+		struct cqm_rmid_entry *entry;
+
+		entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+		if (!entry)
+			goto fail;
+
+		INIT_LIST_HEAD(&entry->list);
+		entry->rmid = r;
+		cqm_rmid_ptrs[r] = entry;
+
+		list_add_tail(&entry->list, &cqm_rmid_free_lru);
+	}
+
+	/*
+	 * RMID 0 is special and is always allocated. It's used for all
+	 * tasks that are not monitored.
+	 */
+	entry = __rmid_entry(0);
+	list_del(&entry->list);
+
+	mutex_lock(&cache_mutex);
+	intel_cqm_rotation_rmid = __get_rmid();
+	mutex_unlock(&cache_mutex);
+
+	return 0;
+fail:
+	while (r--)
+		kfree(cqm_rmid_ptrs[r]);
+
+	kfree(cqm_rmid_ptrs);
+	return -ENOMEM;
+}
+
+/*
+ * Determine if @a and @b measure the same set of tasks.
+ *
+ * If @a and @b measure the same set of tasks then we want to share a
+ * single RMID.
+ */
+static bool __match_event(struct perf_event *a, struct perf_event *b)
+{
+	/* Per-cpu and task events don't mix */
+	if ((a->attach_state & PERF_ATTACH_TASK) !=
+	    (b->attach_state & PERF_ATTACH_TASK))
+		return false;
+
+#ifdef CONFIG_CGROUP_PERF
+	if (a->cgrp != b->cgrp)
+		return false;
+#endif
+
+	/* If not task event, we're machine wide */
+	if (!(b->attach_state & PERF_ATTACH_TASK))
+		return true;
+
+	/*
+	 * Events that target same task are placed into the same cache group.
+	 */
+	if (a->hw.target == b->hw.target)
+		return true;
+
+	/*
+	 * Are we an inherited event?
+	 */
+	if (b->parent == a)
+		return true;
+
+	return false;
+}
+
+#ifdef CONFIG_CGROUP_PERF
+static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
+{
+	if (event->attach_state & PERF_ATTACH_TASK)
+		return perf_cgroup_from_task(event->hw.target);
+
+	return event->cgrp;
+}
+#endif
+
+/*
+ * Determine if @a's tasks intersect with @b's tasks
+ *
+ * There are combinations of events that we explicitly prohibit,
+ *
+ *		   PROHIBITS
+ *     system-wide    -> 	cgroup and task
+ *     cgroup 	      ->	system-wide
+ *     		      ->	task in cgroup
+ *     task 	      -> 	system-wide
+ *     		      ->	task in cgroup
+ *
+ * Call this function before allocating an RMID.
+ */
+static bool __conflict_event(struct perf_event *a, struct perf_event *b)
+{
+#ifdef CONFIG_CGROUP_PERF
+	/*
+	 * We can have any number of cgroups but only one system-wide
+	 * event at a time.
+	 */
+	if (a->cgrp && b->cgrp) {
+		struct perf_cgroup *ac = a->cgrp;
+		struct perf_cgroup *bc = b->cgrp;
+
+		/*
+		 * This condition should have been caught in
+		 * __match_event() and we should be sharing an RMID.
+		 */
+		WARN_ON_ONCE(ac == bc);
+
+		if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+		    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+			return true;
+
+		return false;
+	}
+
+	if (a->cgrp || b->cgrp) {
+		struct perf_cgroup *ac, *bc;
+
+		/*
+		 * cgroup and system-wide events are mutually exclusive
+		 */
+		if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
+		    (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
+			return true;
+
+		/*
+		 * Ensure neither event is part of the other's cgroup
+		 */
+		ac = event_to_cgroup(a);
+		bc = event_to_cgroup(b);
+		if (ac == bc)
+			return true;
+
+		/*
+		 * Must have cgroup and non-intersecting task events.
+		 */
+		if (!ac || !bc)
+			return false;
+
+		/*
+		 * We have cgroup and task events, and the task belongs
+		 * to a cgroup. Check for for overlap.
+		 */
+		if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+		    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+			return true;
+
+		return false;
+	}
+#endif
+	/*
+	 * If one of them is not a task, same story as above with cgroups.
+	 */
+	if (!(a->attach_state & PERF_ATTACH_TASK) ||
+	    !(b->attach_state & PERF_ATTACH_TASK))
+		return true;
+
+	/*
+	 * Must be non-overlapping.
+	 */
+	return false;
+}
+
+struct rmid_read {
+	unsigned int rmid;
+	atomic64_t value;
+};
+
+static void __intel_cqm_event_count(void *info);
+
+/*
+ * Exchange the RMID of a group of events.
+ */
+static unsigned int
+intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid)
+{
+	struct perf_event *event;
+	unsigned int old_rmid = group->hw.cqm_rmid;
+	struct list_head *head = &group->hw.cqm_group_entry;
+
+	lockdep_assert_held(&cache_mutex);
+
+	/*
+	 * If our RMID is being deallocated, perform a read now.
+	 */
+	if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
+		struct rmid_read rr = {
+			.value = ATOMIC64_INIT(0),
+			.rmid = old_rmid,
+		};
+
+		on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
+				 &rr, 1);
+		local64_set(&group->count, atomic64_read(&rr.value));
+	}
+
+	raw_spin_lock_irq(&cache_lock);
+
+	group->hw.cqm_rmid = rmid;
+	list_for_each_entry(event, head, hw.cqm_group_entry)
+		event->hw.cqm_rmid = rmid;
+
+	raw_spin_unlock_irq(&cache_lock);
+
+	return old_rmid;
+}
+
+/*
+ * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
+ * cachelines are still tagged with RMIDs in limbo, we progressively
+ * increment the threshold until we find an RMID in limbo with <=
+ * __intel_cqm_threshold lines tagged. This is designed to mitigate the
+ * problem where cachelines tagged with an RMID are not steadily being
+ * evicted.
+ *
+ * On successful rotations we decrease the threshold back towards zero.
+ *
+ * __intel_cqm_max_threshold provides an upper bound on the threshold,
+ * and is measured in bytes because it's exposed to userland.
+ */
+static unsigned int __intel_cqm_threshold;
+static unsigned int __intel_cqm_max_threshold;
+
+/*
+ * Test whether an RMID has a zero occupancy value on this cpu.
+ */
+static void intel_cqm_stable(void *arg)
+{
+	struct cqm_rmid_entry *entry;
+
+	list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
+		if (entry->state != RMID_AVAILABLE)
+			break;
+
+		if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
+			entry->state = RMID_DIRTY;
+	}
+}
+
+/*
+ * If we have group events waiting for an RMID that don't conflict with
+ * events already running, assign @rmid.
+ */
+static bool intel_cqm_sched_in_event(unsigned int rmid)
+{
+	struct perf_event *leader, *event;
+
+	lockdep_assert_held(&cache_mutex);
+
+	leader = list_first_entry(&cache_groups, struct perf_event,
+				  hw.cqm_groups_entry);
+	event = leader;
+
+	list_for_each_entry_continue(event, &cache_groups,
+				     hw.cqm_groups_entry) {
+		if (__rmid_valid(event->hw.cqm_rmid))
+			continue;
+
+		if (__conflict_event(event, leader))
+			continue;
+
+		intel_cqm_xchg_rmid(event, rmid);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Initially use this constant for both the limbo queue time and the
+ * rotation timer interval, pmu::hrtimer_interval_ms.
+ *
+ * They don't need to be the same, but the two are related since if you
+ * rotate faster than you recycle RMIDs, you may run out of available
+ * RMIDs.
+ */
+#define RMID_DEFAULT_QUEUE_TIME 250	/* ms */
+
+static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
+
+/*
+ * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
+ * @nr_available: number of freeable RMIDs on the limbo list
+ *
+ * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
+ * cachelines are tagged with those RMIDs. After this we can reuse them
+ * and know that the current set of active RMIDs is stable.
+ *
+ * Return %true or %false depending on whether stabilization needs to be
+ * reattempted.
+ *
+ * If we return %true then @nr_available is updated to indicate the
+ * number of RMIDs on the limbo list that have been queued for the
+ * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
+ * are above __intel_cqm_threshold.
+ */
+static bool intel_cqm_rmid_stabilize(unsigned int *available)
+{
+	struct cqm_rmid_entry *entry, *tmp;
+
+	lockdep_assert_held(&cache_mutex);
+
+	*available = 0;
+	list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
+		unsigned long min_queue_time;
+		unsigned long now = jiffies;
+
+		/*
+		 * We hold RMIDs placed into limbo for a minimum queue
+		 * time. Before the minimum queue time has elapsed we do
+		 * not recycle RMIDs.
+		 *
+		 * The reasoning is that until a sufficient time has
+		 * passed since we stopped using an RMID, any RMID
+		 * placed onto the limbo list will likely still have
+		 * data tagged in the cache, which means we'll probably
+		 * fail to recycle it anyway.
+		 *
+		 * We can save ourselves an expensive IPI by skipping
+		 * any RMIDs that have not been queued for the minimum
+		 * time.
+		 */
+		min_queue_time = entry->queue_time +
+			msecs_to_jiffies(__rmid_queue_time_ms);
+
+		if (time_after(min_queue_time, now))
+			break;
+
+		entry->state = RMID_AVAILABLE;
+		(*available)++;
+	}
+
+	/*
+	 * Fast return if none of the RMIDs on the limbo list have been
+	 * sitting on the queue for the minimum queue time.
+	 */
+	if (!*available)
+		return false;
+
+	/*
+	 * Test whether an RMID is free for each package.
+	 */
+	on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
+
+	list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
+		/*
+		 * Exhausted all RMIDs that have waited min queue time.
+		 */
+		if (entry->state == RMID_YOUNG)
+			break;
+
+		if (entry->state == RMID_DIRTY)
+			continue;
+
+		list_del(&entry->list);	/* remove from limbo */
+
+		/*
+		 * The rotation RMID gets priority if it's
+		 * currently invalid. In which case, skip adding
+		 * the RMID to the the free lru.
+		 */
+		if (!__rmid_valid(intel_cqm_rotation_rmid)) {
+			intel_cqm_rotation_rmid = entry->rmid;
+			continue;
+		}
+
+		/*
+		 * If we have groups waiting for RMIDs, hand
+		 * them one now provided they don't conflict.
+		 */
+		if (intel_cqm_sched_in_event(entry->rmid))
+			continue;
+
+		/*
+		 * Otherwise place it onto the free list.
+		 */
+		list_add_tail(&entry->list, &cqm_rmid_free_lru);
+	}
+
+
+	return __rmid_valid(intel_cqm_rotation_rmid);
+}
+
+/*
+ * Pick a victim group and move it to the tail of the group list.
+ * @next: The first group without an RMID
+ */
+static void __intel_cqm_pick_and_rotate(struct perf_event *next)
+{
+	struct perf_event *rotor;
+	unsigned int rmid;
+
+	lockdep_assert_held(&cache_mutex);
+
+	rotor = list_first_entry(&cache_groups, struct perf_event,
+				 hw.cqm_groups_entry);
+
+	/*
+	 * The group at the front of the list should always have a valid
+	 * RMID. If it doesn't then no groups have RMIDs assigned and we
+	 * don't need to rotate the list.
+	 */
+	if (next == rotor)
+		return;
+
+	rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
+	__put_rmid(rmid);
+
+	list_rotate_left(&cache_groups);
+}
+
+/*
+ * Deallocate the RMIDs from any events that conflict with @event, and
+ * place them on the back of the group list.
+ */
+static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
+{
+	struct perf_event *group, *g;
+	unsigned int rmid;
+
+	lockdep_assert_held(&cache_mutex);
+
+	list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
+		if (group == event)
+			continue;
+
+		rmid = group->hw.cqm_rmid;
+
+		/*
+		 * Skip events that don't have a valid RMID.
+		 */
+		if (!__rmid_valid(rmid))
+			continue;
+
+		/*
+		 * No conflict? No problem! Leave the event alone.
+		 */
+		if (!__conflict_event(group, event))
+			continue;
+
+		intel_cqm_xchg_rmid(group, INVALID_RMID);
+		__put_rmid(rmid);
+	}
+}
+
+/*
+ * Attempt to rotate the groups and assign new RMIDs.
+ *
+ * We rotate for two reasons,
+ *   1. To handle the scheduling of conflicting events
+ *   2. To recycle RMIDs
+ *
+ * Rotating RMIDs is complicated because the hardware doesn't give us
+ * any clues.
+ *
+ * There's problems with the hardware interface; when you change the
+ * task:RMID map cachelines retain their 'old' tags, giving a skewed
+ * picture. In order to work around this, we must always keep one free
+ * RMID - intel_cqm_rotation_rmid.
+ *
+ * Rotation works by taking away an RMID from a group (the old RMID),
+ * and assigning the free RMID to another group (the new RMID). We must
+ * then wait for the old RMID to not be used (no cachelines tagged).
+ * This ensure that all cachelines are tagged with 'active' RMIDs. At
+ * this point we can start reading values for the new RMID and treat the
+ * old RMID as the free RMID for the next rotation.
+ *
+ * Return %true or %false depending on whether we did any rotating.
+ */
+static bool __intel_cqm_rmid_rotate(void)
+{
+	struct perf_event *group, *start = NULL;
+	unsigned int threshold_limit;
+	unsigned int nr_needed = 0;
+	unsigned int nr_available;
+	bool rotated = false;
+
+	mutex_lock(&cache_mutex);
+
+again:
+	/*
+	 * Fast path through this function if there are no groups and no
+	 * RMIDs that need cleaning.
+	 */
+	if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
+		goto out;
+
+	list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
+		if (!__rmid_valid(group->hw.cqm_rmid)) {
+			if (!start)
+				start = group;
+			nr_needed++;
+		}
+	}
+
+	/*
+	 * We have some event groups, but they all have RMIDs assigned
+	 * and no RMIDs need cleaning.
+	 */
+	if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
+		goto out;
+
+	if (!nr_needed)
+		goto stabilize;
+
+	/*
+	 * We have more event groups without RMIDs than available RMIDs,
+	 * or we have event groups that conflict with the ones currently
+	 * scheduled.
+	 *
+	 * We force deallocate the rmid of the group at the head of
+	 * cache_groups. The first event group without an RMID then gets
+	 * assigned intel_cqm_rotation_rmid. This ensures we always make
+	 * forward progress.
+	 *
+	 * Rotate the cache_groups list so the previous head is now the
+	 * tail.
+	 */
+	__intel_cqm_pick_and_rotate(start);
+
+	/*
+	 * If the rotation is going to succeed, reduce the threshold so
+	 * that we don't needlessly reuse dirty RMIDs.
+	 */
+	if (__rmid_valid(intel_cqm_rotation_rmid)) {
+		intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
+		intel_cqm_rotation_rmid = __get_rmid();
+
+		intel_cqm_sched_out_conflicting_events(start);
+
+		if (__intel_cqm_threshold)
+			__intel_cqm_threshold--;
+	}
+
+	rotated = true;
+
+stabilize:
+	/*
+	 * We now need to stablize the RMID we freed above (if any) to
+	 * ensure that the next time we rotate we have an RMID with zero
+	 * occupancy value.
+	 *
+	 * Alternatively, if we didn't need to perform any rotation,
+	 * we'll have a bunch of RMIDs in limbo that need stabilizing.
+	 */
+	threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;
+
+	while (intel_cqm_rmid_stabilize(&nr_available) &&
+	       __intel_cqm_threshold < threshold_limit) {
+		unsigned int steal_limit;
+
+		/*
+		 * Don't spin if nobody is actively waiting for an RMID,
+		 * the rotation worker will be kicked as soon as an
+		 * event needs an RMID anyway.
+		 */
+		if (!nr_needed)
+			break;
+
+		/* Allow max 25% of RMIDs to be in limbo. */
+		steal_limit = (cqm_max_rmid + 1) / 4;
+
+		/*
+		 * We failed to stabilize any RMIDs so our rotation
+		 * logic is now stuck. In order to make forward progress
+		 * we have a few options:
+		 *
+		 *   1. rotate ("steal") another RMID
+		 *   2. increase the threshold
+		 *   3. do nothing
+		 *
+		 * We do both of 1. and 2. until we hit the steal limit.
+		 *
+		 * The steal limit prevents all RMIDs ending up on the
+		 * limbo list. This can happen if every RMID has a
+		 * non-zero occupancy above threshold_limit, and the
+		 * occupancy values aren't dropping fast enough.
+		 *
+		 * Note that there is prioritisation at work here - we'd
+		 * rather increase the number of RMIDs on the limbo list
+		 * than increase the threshold, because increasing the
+		 * threshold skews the event data (because we reuse
+		 * dirty RMIDs) - threshold bumps are a last resort.
+		 */
+		if (nr_available < steal_limit)
+			goto again;
+
+		__intel_cqm_threshold++;
+	}
+
+out:
+	mutex_unlock(&cache_mutex);
+	return rotated;
+}
+
+static void intel_cqm_rmid_rotate(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);
+
+static struct pmu intel_cqm_pmu;
+
+static void intel_cqm_rmid_rotate(struct work_struct *work)
+{
+	unsigned long delay;
+
+	__intel_cqm_rmid_rotate();
+
+	delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
+	schedule_delayed_work(&intel_cqm_rmid_work, delay);
+}
+
+/*
+ * Find a group and setup RMID.
+ *
+ * If we're part of a group, we use the group's RMID.
+ */
+static void intel_cqm_setup_event(struct perf_event *event,
+				  struct perf_event **group)
+{
+	struct perf_event *iter;
+	unsigned int rmid;
+	bool conflict = false;
+
+	list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
+		rmid = iter->hw.cqm_rmid;
+
+		if (__match_event(iter, event)) {
+			/* All tasks in a group share an RMID */
+			event->hw.cqm_rmid = rmid;
+			*group = iter;
+			return;
+		}
+
+		/*
+		 * We only care about conflicts for events that are
+		 * actually scheduled in (and hence have a valid RMID).
+		 */
+		if (__conflict_event(iter, event) && __rmid_valid(rmid))
+			conflict = true;
+	}
+
+	if (conflict)
+		rmid = INVALID_RMID;
+	else
+		rmid = __get_rmid();
+
+	event->hw.cqm_rmid = rmid;
+}
+
+static void intel_cqm_event_read(struct perf_event *event)
+{
+	unsigned long flags;
+	unsigned int rmid;
+	u64 val;
+
+	/*
+	 * Task events are handled by intel_cqm_event_count().
+	 */
+	if (event->cpu == -1)
+		return;
+
+	raw_spin_lock_irqsave(&cache_lock, flags);
+	rmid = event->hw.cqm_rmid;
+
+	if (!__rmid_valid(rmid))
+		goto out;
+
+	val = __rmid_read(rmid);
+
+	/*
+	 * Ignore this reading on error states and do not update the value.
+	 */
+	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+		goto out;
+
+	local64_set(&event->count, val);
+out:
+	raw_spin_unlock_irqrestore(&cache_lock, flags);
+}
+
+static void __intel_cqm_event_count(void *info)
+{
+	struct rmid_read *rr = info;
+	u64 val;
+
+	val = __rmid_read(rr->rmid);
+
+	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+		return;
+
+	atomic64_add(val, &rr->value);
+}
+
+static inline bool cqm_group_leader(struct perf_event *event)
+{
+	return !list_empty(&event->hw.cqm_groups_entry);
+}
+
+static u64 intel_cqm_event_count(struct perf_event *event)
+{
+	unsigned long flags;
+	struct rmid_read rr = {
+		.value = ATOMIC64_INIT(0),
+	};
+
+	/*
+	 * We only need to worry about task events. System-wide events
+	 * are handled like usual, i.e. entirely with
+	 * intel_cqm_event_read().
+	 */
+	if (event->cpu != -1)
+		return __perf_event_count(event);
+
+	/*
+	 * Only the group leader gets to report values. This stops us
+	 * reporting duplicate values to userspace, and gives us a clear
+	 * rule for which task gets to report the values.
+	 *
+	 * Note that it is impossible to attribute these values to
+	 * specific packages - we forfeit that ability when we create
+	 * task events.
+	 */
+	if (!cqm_group_leader(event))
+		return 0;
+
+	/*
+	 * Notice that we don't perform the reading of an RMID
+	 * atomically, because we can't hold a spin lock across the
+	 * IPIs.
+	 *
+	 * Speculatively perform the read, since @event might be
+	 * assigned a different (possibly invalid) RMID while we're
+	 * busying performing the IPI calls. It's therefore necessary to
+	 * check @event's RMID afterwards, and if it has changed,
+	 * discard the result of the read.
+	 */
+	rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);
+
+	if (!__rmid_valid(rr.rmid))
+		goto out;
+
+	on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+
+	raw_spin_lock_irqsave(&cache_lock, flags);
+	if (event->hw.cqm_rmid == rr.rmid)
+		local64_set(&event->count, atomic64_read(&rr.value));
+	raw_spin_unlock_irqrestore(&cache_lock, flags);
+out:
+	return __perf_event_count(event);
+}
+
+static void intel_cqm_event_start(struct perf_event *event, int mode)
+{
+	struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
+	unsigned int rmid = event->hw.cqm_rmid;
+	unsigned long flags;
+
+	if (!(event->hw.cqm_state & PERF_HES_STOPPED))
+		return;
+
+	event->hw.cqm_state &= ~PERF_HES_STOPPED;
+
+	raw_spin_lock_irqsave(&state->lock, flags);
+
+	if (state->cnt++)
+		WARN_ON_ONCE(state->rmid != rmid);
+	else
+		WARN_ON_ONCE(state->rmid);
+
+	state->rmid = rmid;
+	wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid);
+
+	raw_spin_unlock_irqrestore(&state->lock, flags);
+}
+
+static void intel_cqm_event_stop(struct perf_event *event, int mode)
+{
+	struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
+	unsigned long flags;
+
+	if (event->hw.cqm_state & PERF_HES_STOPPED)
+		return;
+
+	event->hw.cqm_state |= PERF_HES_STOPPED;
+
+	raw_spin_lock_irqsave(&state->lock, flags);
+	intel_cqm_event_read(event);
+
+	if (!--state->cnt) {
+		state->rmid = 0;
+		wrmsrl(MSR_IA32_PQR_ASSOC, 0);
+	} else {
+		WARN_ON_ONCE(!state->rmid);
+	}
+
+	raw_spin_unlock_irqrestore(&state->lock, flags);
+}
+
+static int intel_cqm_event_add(struct perf_event *event, int mode)
+{
+	unsigned long flags;
+	unsigned int rmid;
+
+	raw_spin_lock_irqsave(&cache_lock, flags);
+
+	event->hw.cqm_state = PERF_HES_STOPPED;
+	rmid = event->hw.cqm_rmid;
+
+	if (__rmid_valid(rmid) && (mode & PERF_EF_START))
+		intel_cqm_event_start(event, mode);
+
+	raw_spin_unlock_irqrestore(&cache_lock, flags);
+
+	return 0;
+}
+
+static void intel_cqm_event_del(struct perf_event *event, int mode)
+{
+	intel_cqm_event_stop(event, mode);
+}
+
+static void intel_cqm_event_destroy(struct perf_event *event)
+{
+	struct perf_event *group_other = NULL;
+
+	mutex_lock(&cache_mutex);
+
+	/*
+	 * If there's another event in this group...
+	 */
+	if (!list_empty(&event->hw.cqm_group_entry)) {
+		group_other = list_first_entry(&event->hw.cqm_group_entry,
+					       struct perf_event,
+					       hw.cqm_group_entry);
+		list_del(&event->hw.cqm_group_entry);
+	}
+
+	/*
+	 * And we're the group leader..
+	 */
+	if (cqm_group_leader(event)) {
+		/*
+		 * If there was a group_other, make that leader, otherwise
+		 * destroy the group and return the RMID.
+		 */
+		if (group_other) {
+			list_replace(&event->hw.cqm_groups_entry,
+				     &group_other->hw.cqm_groups_entry);
+		} else {
+			unsigned int rmid = event->hw.cqm_rmid;
+
+			if (__rmid_valid(rmid))
+				__put_rmid(rmid);
+			list_del(&event->hw.cqm_groups_entry);
+		}
+	}
+
+	mutex_unlock(&cache_mutex);
+}
+
+static int intel_cqm_event_init(struct perf_event *event)
+{
+	struct perf_event *group = NULL;
+	bool rotate = false;
+
+	if (event->attr.type != intel_cqm_pmu.type)
+		return -ENOENT;
+
+	if (event->attr.config & ~QOS_EVENT_MASK)
+		return -EINVAL;
+
+	/* unsupported modes and filters */
+	if (event->attr.exclude_user   ||
+	    event->attr.exclude_kernel ||
+	    event->attr.exclude_hv     ||
+	    event->attr.exclude_idle   ||
+	    event->attr.exclude_host   ||
+	    event->attr.exclude_guest  ||
+	    event->attr.sample_period) /* no sampling */
+		return -EINVAL;
+
+	INIT_LIST_HEAD(&event->hw.cqm_group_entry);
+	INIT_LIST_HEAD(&event->hw.cqm_groups_entry);
+
+	event->destroy = intel_cqm_event_destroy;
+
+	mutex_lock(&cache_mutex);
+
+	/* Will also set rmid */
+	intel_cqm_setup_event(event, &group);
+
+	if (group) {
+		list_add_tail(&event->hw.cqm_group_entry,
+			      &group->hw.cqm_group_entry);
+	} else {
+		list_add_tail(&event->hw.cqm_groups_entry,
+			      &cache_groups);
+
+		/*
+		 * All RMIDs are either in use or have recently been
+		 * used. Kick the rotation worker to clean/free some.
+		 *
+		 * We only do this for the group leader, rather than for
+		 * every event in a group to save on needless work.
+		 */
+		if (!__rmid_valid(event->hw.cqm_rmid))
+			rotate = true;
+	}
+
+	mutex_unlock(&cache_mutex);
+
+	if (rotate)
+		schedule_delayed_work(&intel_cqm_rmid_work, 0);
+
+	return 0;
+}
+
+EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
+EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
+EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
+EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
+EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
+
+static struct attribute *intel_cqm_events_attr[] = {
+	EVENT_PTR(intel_cqm_llc),
+	EVENT_PTR(intel_cqm_llc_pkg),
+	EVENT_PTR(intel_cqm_llc_unit),
+	EVENT_PTR(intel_cqm_llc_scale),
+	EVENT_PTR(intel_cqm_llc_snapshot),
+	NULL,
+};
+
+static struct attribute_group intel_cqm_events_group = {
+	.name = "events",
+	.attrs = intel_cqm_events_attr,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+static struct attribute *intel_cqm_formats_attr[] = {
+	&format_attr_event.attr,
+	NULL,
+};
+
+static struct attribute_group intel_cqm_format_group = {
+	.name = "format",
+	.attrs = intel_cqm_formats_attr,
+};
+
+static ssize_t
+max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
+			   char *page)
+{
+	ssize_t rv;
+
+	mutex_lock(&cache_mutex);
+	rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
+	mutex_unlock(&cache_mutex);
+
+	return rv;
+}
+
+static ssize_t
+max_recycle_threshold_store(struct device *dev,
+			    struct device_attribute *attr,
+			    const char *buf, size_t count)
+{
+	unsigned int bytes, cachelines;
+	int ret;
+
+	ret = kstrtouint(buf, 0, &bytes);
+	if (ret)
+		return ret;
+
+	mutex_lock(&cache_mutex);
+
+	__intel_cqm_max_threshold = bytes;
+	cachelines = bytes / cqm_l3_scale;
+
+	/*
+	 * The new maximum takes effect immediately.
+	 */
+	if (__intel_cqm_threshold > cachelines)
+		__intel_cqm_threshold = cachelines;
+
+	mutex_unlock(&cache_mutex);
+
+	return count;
+}
+
+static DEVICE_ATTR_RW(max_recycle_threshold);
+
+static struct attribute *intel_cqm_attrs[] = {
+	&dev_attr_max_recycle_threshold.attr,
+	NULL,
+};
+
+static const struct attribute_group intel_cqm_group = {
+	.attrs = intel_cqm_attrs,
+};
+
+static const struct attribute_group *intel_cqm_attr_groups[] = {
+	&intel_cqm_events_group,
+	&intel_cqm_format_group,
+	&intel_cqm_group,
+	NULL,
+};
+
+static struct pmu intel_cqm_pmu = {
+	.hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
+	.attr_groups	     = intel_cqm_attr_groups,
+	.task_ctx_nr	     = perf_sw_context,
+	.event_init	     = intel_cqm_event_init,
+	.add		     = intel_cqm_event_add,
+	.del		     = intel_cqm_event_del,
+	.start		     = intel_cqm_event_start,
+	.stop		     = intel_cqm_event_stop,
+	.read		     = intel_cqm_event_read,
+	.count		     = intel_cqm_event_count,
+};
+
+static inline void cqm_pick_event_reader(int cpu)
+{
+	int phys_id = topology_physical_package_id(cpu);
+	int i;
+
+	for_each_cpu(i, &cqm_cpumask) {
+		if (phys_id == topology_physical_package_id(i))
+			return;	/* already got reader for this socket */
+	}
+
+	cpumask_set_cpu(cpu, &cqm_cpumask);
+}
+
+static void intel_cqm_cpu_prepare(unsigned int cpu)
+{
+	struct intel_cqm_state *state = &per_cpu(cqm_state, cpu);
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+	raw_spin_lock_init(&state->lock);
+	state->rmid = 0;
+	state->cnt  = 0;
+
+	WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
+	WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
+}
+
+static void intel_cqm_cpu_exit(unsigned int cpu)
+{
+	int phys_id = topology_physical_package_id(cpu);
+	int i;
+
+	/*
+	 * Is @cpu a designated cqm reader?
+	 */
+	if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
+		return;
+
+	for_each_online_cpu(i) {
+		if (i == cpu)
+			continue;
+
+		if (phys_id == topology_physical_package_id(i)) {
+			cpumask_set_cpu(i, &cqm_cpumask);
+			break;
+		}
+	}
+}
+
+static int intel_cqm_cpu_notifier(struct notifier_block *nb,
+				  unsigned long action, void *hcpu)
+{
+	unsigned int cpu  = (unsigned long)hcpu;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_UP_PREPARE:
+		intel_cqm_cpu_prepare(cpu);
+		break;
+	case CPU_DOWN_PREPARE:
+		intel_cqm_cpu_exit(cpu);
+		break;
+	case CPU_STARTING:
+		cqm_pick_event_reader(cpu);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static const struct x86_cpu_id intel_cqm_match[] = {
+	{ .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
+	{}
+};
+
+static int __init intel_cqm_init(void)
+{
+	char *str, scale[20];
+	int i, cpu, ret;
+
+	if (!x86_match_cpu(intel_cqm_match))
+		return -ENODEV;
+
+	cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
+
+	/*
+	 * It's possible that not all resources support the same number
+	 * of RMIDs. Instead of making scheduling much more complicated
+	 * (where we have to match a task's RMID to a cpu that supports
+	 * that many RMIDs) just find the minimum RMIDs supported across
+	 * all cpus.
+	 *
+	 * Also, check that the scales match on all cpus.
+	 */
+	cpu_notifier_register_begin();
+
+	for_each_online_cpu(cpu) {
+		struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+		if (c->x86_cache_max_rmid < cqm_max_rmid)
+			cqm_max_rmid = c->x86_cache_max_rmid;
+
+		if (c->x86_cache_occ_scale != cqm_l3_scale) {
+			pr_err("Multiple LLC scale values, disabling\n");
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	/*
+	 * A reasonable upper limit on the max threshold is the number
+	 * of lines tagged per RMID if all RMIDs have the same number of
+	 * lines tagged in the LLC.
+	 *
+	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+	 */
+	__intel_cqm_max_threshold =
+		boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
+
+	snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
+	str = kstrdup(scale, GFP_KERNEL);
+	if (!str) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	event_attr_intel_cqm_llc_scale.event_str = str;
+
+	ret = intel_cqm_setup_rmid_cache();
+	if (ret)
+		goto out;
+
+	for_each_online_cpu(i) {
+		intel_cqm_cpu_prepare(i);
+		cqm_pick_event_reader(i);
+	}
+
+	__perf_cpu_notifier(intel_cqm_cpu_notifier);
+
+	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
+	if (ret)
+		pr_err("Intel CQM perf registration failed: %d\n", ret);
+	else
+		pr_info("Intel CQM monitoring enabled\n");
+
+out:
+	cpu_notifier_register_done();
+
+	return ret;
+}
+device_initcall(intel_cqm_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 073983398364..ca69ea56c712 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -461,7 +461,8 @@ void intel_pmu_enable_bts(u64 config)
 
 	debugctlmsr |= DEBUGCTLMSR_TR;
 	debugctlmsr |= DEBUGCTLMSR_BTS;
-	debugctlmsr |= DEBUGCTLMSR_BTINT;
+	if (config & ARCH_PERFMON_EVENTSEL_INT)
+		debugctlmsr |= DEBUGCTLMSR_BTINT;
 
 	if (!(config & ARCH_PERFMON_EVENTSEL_OS))
 		debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
@@ -611,6 +612,10 @@ struct event_constraint intel_snb_pebs_event_constraints[] = {
 	INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
 	/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+        INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
+        INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
+        INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+        INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
 	/* Allow all events as PEBS with no flags */
 	INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
 	EVENT_CONSTRAINT_END
@@ -622,6 +627,10 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = {
 	INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
 	/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+	INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
 	/* Allow all events as PEBS with no flags */
 	INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
         EVENT_CONSTRAINT_END
@@ -633,16 +642,16 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
 	/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
 	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
-	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
-	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
-	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
-	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
-	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
-	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
-	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
-	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
-	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf),    /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
-	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf),    /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
+	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
+	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf),    /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
+	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf),    /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
 	/* Allow all events as PEBS with no flags */
 	INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
 	EVENT_CONSTRAINT_END
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 58f1a94beaf0..94e5b506caa6 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -39,6 +39,7 @@ static enum {
 #define LBR_IND_JMP_BIT		6 /* do not capture indirect jumps */
 #define LBR_REL_JMP_BIT		7 /* do not capture relative jumps */
 #define LBR_FAR_BIT		8 /* do not capture far branches */
+#define LBR_CALL_STACK_BIT	9 /* enable call stack */
 
 #define LBR_KERNEL	(1 << LBR_KERNEL_BIT)
 #define LBR_USER	(1 << LBR_USER_BIT)
@@ -49,6 +50,7 @@ static enum {
 #define LBR_REL_JMP	(1 << LBR_REL_JMP_BIT)
 #define LBR_IND_JMP	(1 << LBR_IND_JMP_BIT)
 #define LBR_FAR		(1 << LBR_FAR_BIT)
+#define LBR_CALL_STACK	(1 << LBR_CALL_STACK_BIT)
 
 #define LBR_PLM (LBR_KERNEL | LBR_USER)
 
@@ -69,33 +71,31 @@ static enum {
 #define LBR_FROM_FLAG_IN_TX    (1ULL << 62)
 #define LBR_FROM_FLAG_ABORT    (1ULL << 61)
 
-#define for_each_branch_sample_type(x) \
-	for ((x) = PERF_SAMPLE_BRANCH_USER; \
-	     (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
-
 /*
  * x86control flow change classification
  * x86control flow changes include branches, interrupts, traps, faults
  */
 enum {
-	X86_BR_NONE     = 0,      /* unknown */
+	X86_BR_NONE		= 0,      /* unknown */
 
-	X86_BR_USER     = 1 << 0, /* branch target is user */
-	X86_BR_KERNEL   = 1 << 1, /* branch target is kernel */
+	X86_BR_USER		= 1 << 0, /* branch target is user */
+	X86_BR_KERNEL		= 1 << 1, /* branch target is kernel */
 
-	X86_BR_CALL     = 1 << 2, /* call */
-	X86_BR_RET      = 1 << 3, /* return */
-	X86_BR_SYSCALL  = 1 << 4, /* syscall */
-	X86_BR_SYSRET   = 1 << 5, /* syscall return */
-	X86_BR_INT      = 1 << 6, /* sw interrupt */
-	X86_BR_IRET     = 1 << 7, /* return from interrupt */
-	X86_BR_JCC      = 1 << 8, /* conditional */
-	X86_BR_JMP      = 1 << 9, /* jump */
-	X86_BR_IRQ      = 1 << 10,/* hw interrupt or trap or fault */
-	X86_BR_IND_CALL = 1 << 11,/* indirect calls */
-	X86_BR_ABORT    = 1 << 12,/* transaction abort */
-	X86_BR_IN_TX    = 1 << 13,/* in transaction */
-	X86_BR_NO_TX    = 1 << 14,/* not in transaction */
+	X86_BR_CALL		= 1 << 2, /* call */
+	X86_BR_RET		= 1 << 3, /* return */
+	X86_BR_SYSCALL		= 1 << 4, /* syscall */
+	X86_BR_SYSRET		= 1 << 5, /* syscall return */
+	X86_BR_INT		= 1 << 6, /* sw interrupt */
+	X86_BR_IRET		= 1 << 7, /* return from interrupt */
+	X86_BR_JCC		= 1 << 8, /* conditional */
+	X86_BR_JMP		= 1 << 9, /* jump */
+	X86_BR_IRQ		= 1 << 10,/* hw interrupt or trap or fault */
+	X86_BR_IND_CALL		= 1 << 11,/* indirect calls */
+	X86_BR_ABORT		= 1 << 12,/* transaction abort */
+	X86_BR_IN_TX		= 1 << 13,/* in transaction */
+	X86_BR_NO_TX		= 1 << 14,/* not in transaction */
+	X86_BR_ZERO_CALL	= 1 << 15,/* zero length call */
+	X86_BR_CALL_STACK	= 1 << 16,/* call stack */
 };
 
 #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -112,13 +112,15 @@ enum {
 	 X86_BR_JMP	 |\
 	 X86_BR_IRQ	 |\
 	 X86_BR_ABORT	 |\
-	 X86_BR_IND_CALL)
+	 X86_BR_IND_CALL |\
+	 X86_BR_ZERO_CALL)
 
 #define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
 
 #define X86_BR_ANY_CALL		 \
 	(X86_BR_CALL		|\
 	 X86_BR_IND_CALL	|\
+	 X86_BR_ZERO_CALL	|\
 	 X86_BR_SYSCALL		|\
 	 X86_BR_IRQ		|\
 	 X86_BR_INT)
@@ -130,17 +132,32 @@ static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
  * otherwise it becomes near impossible to get a reliable stack.
  */
 
-static void __intel_pmu_lbr_enable(void)
+static void __intel_pmu_lbr_enable(bool pmi)
 {
-	u64 debugctl;
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	u64 debugctl, lbr_select = 0, orig_debugctl;
 
-	if (cpuc->lbr_sel)
-		wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
+	/*
+	 * No need to reprogram LBR_SELECT in a PMI, as it
+	 * did not change.
+	 */
+	if (cpuc->lbr_sel && !pmi) {
+		lbr_select = cpuc->lbr_sel->config;
+		wrmsrl(MSR_LBR_SELECT, lbr_select);
+	}
 
 	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-	debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
-	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+	orig_debugctl = debugctl;
+	debugctl |= DEBUGCTLMSR_LBR;
+	/*
+	 * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
+	 * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions
+	 * may cause superfluous increase/decrease of LBR_TOS.
+	 */
+	if (!(lbr_select & LBR_CALL_STACK))
+		debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+	if (orig_debugctl != debugctl)
+		wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 }
 
 static void __intel_pmu_lbr_disable(void)
@@ -181,9 +198,116 @@ void intel_pmu_lbr_reset(void)
 		intel_pmu_lbr_reset_64();
 }
 
+/*
+ * TOS = most recently recorded branch
+ */
+static inline u64 intel_pmu_lbr_tos(void)
+{
+	u64 tos;
+
+	rdmsrl(x86_pmu.lbr_tos, tos);
+	return tos;
+}
+
+enum {
+	LBR_NONE,
+	LBR_VALID,
+};
+
+static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
+{
+	int i;
+	unsigned lbr_idx, mask;
+	u64 tos;
+
+	if (task_ctx->lbr_callstack_users == 0 ||
+	    task_ctx->lbr_stack_state == LBR_NONE) {
+		intel_pmu_lbr_reset();
+		return;
+	}
+
+	mask = x86_pmu.lbr_nr - 1;
+	tos = intel_pmu_lbr_tos();
+	for (i = 0; i < x86_pmu.lbr_nr; i++) {
+		lbr_idx = (tos - i) & mask;
+		wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+		wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+	}
+	task_ctx->lbr_stack_state = LBR_NONE;
+}
+
+static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
+{
+	int i;
+	unsigned lbr_idx, mask;
+	u64 tos;
+
+	if (task_ctx->lbr_callstack_users == 0) {
+		task_ctx->lbr_stack_state = LBR_NONE;
+		return;
+	}
+
+	mask = x86_pmu.lbr_nr - 1;
+	tos = intel_pmu_lbr_tos();
+	for (i = 0; i < x86_pmu.lbr_nr; i++) {
+		lbr_idx = (tos - i) & mask;
+		rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+		rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+	}
+	task_ctx->lbr_stack_state = LBR_VALID;
+}
+
+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct x86_perf_task_context *task_ctx;
+
+	if (!x86_pmu.lbr_nr)
+		return;
+
+	/*
+	 * If LBR callstack feature is enabled and the stack was saved when
+	 * the task was scheduled out, restore the stack. Otherwise flush
+	 * the LBR stack.
+	 */
+	task_ctx = ctx ? ctx->task_ctx_data : NULL;
+	if (task_ctx) {
+		if (sched_in) {
+			__intel_pmu_lbr_restore(task_ctx);
+			cpuc->lbr_context = ctx;
+		} else {
+			__intel_pmu_lbr_save(task_ctx);
+		}
+		return;
+	}
+
+	/*
+	 * When sampling the branck stack in system-wide, it may be
+	 * necessary to flush the stack on context switch. This happens
+	 * when the branch stack does not tag its entries with the pid
+	 * of the current task. Otherwise it becomes impossible to
+	 * associate a branch entry with a task. This ambiguity is more
+	 * likely to appear when the branch stack supports priv level
+	 * filtering and the user sets it to monitor only at the user
+	 * level (which could be a useful measurement in system-wide
+	 * mode). In that case, the risk is high of having a branch
+	 * stack with branch from multiple tasks.
+ 	 */
+	if (sched_in) {
+		intel_pmu_lbr_reset();
+		cpuc->lbr_context = ctx;
+	}
+}
+
+static inline bool branch_user_callstack(unsigned br_sel)
+{
+	return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
+}
+
 void intel_pmu_lbr_enable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct x86_perf_task_context *task_ctx;
 
 	if (!x86_pmu.lbr_nr)
 		return;
@@ -198,18 +322,33 @@ void intel_pmu_lbr_enable(struct perf_event *event)
 	}
 	cpuc->br_sel = event->hw.branch_reg.reg;
 
+	if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
+					event->ctx->task_ctx_data) {
+		task_ctx = event->ctx->task_ctx_data;
+		task_ctx->lbr_callstack_users++;
+	}
+
 	cpuc->lbr_users++;
+	perf_sched_cb_inc(event->ctx->pmu);
 }
 
 void intel_pmu_lbr_disable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct x86_perf_task_context *task_ctx;
 
 	if (!x86_pmu.lbr_nr)
 		return;
 
+	if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
+					event->ctx->task_ctx_data) {
+		task_ctx = event->ctx->task_ctx_data;
+		task_ctx->lbr_callstack_users--;
+	}
+
 	cpuc->lbr_users--;
 	WARN_ON_ONCE(cpuc->lbr_users < 0);
+	perf_sched_cb_dec(event->ctx->pmu);
 
 	if (cpuc->enabled && !cpuc->lbr_users) {
 		__intel_pmu_lbr_disable();
@@ -218,12 +357,12 @@ void intel_pmu_lbr_disable(struct perf_event *event)
 	}
 }
 
-void intel_pmu_lbr_enable_all(void)
+void intel_pmu_lbr_enable_all(bool pmi)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
 	if (cpuc->lbr_users)
-		__intel_pmu_lbr_enable();
+		__intel_pmu_lbr_enable(pmi);
 }
 
 void intel_pmu_lbr_disable_all(void)
@@ -234,18 +373,6 @@ void intel_pmu_lbr_disable_all(void)
 		__intel_pmu_lbr_disable();
 }
 
-/*
- * TOS = most recently recorded branch
- */
-static inline u64 intel_pmu_lbr_tos(void)
-{
-	u64 tos;
-
-	rdmsrl(x86_pmu.lbr_tos, tos);
-
-	return tos;
-}
-
 static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
 {
 	unsigned long mask = x86_pmu.lbr_nr - 1;
@@ -350,7 +477,7 @@ void intel_pmu_lbr_read(void)
  * - in case there is no HW filter
  * - in case the HW filter has errata or limitations
  */
-static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
+static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
 {
 	u64 br_type = event->attr.branch_sample_type;
 	int mask = 0;
@@ -387,11 +514,21 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
 	if (br_type & PERF_SAMPLE_BRANCH_COND)
 		mask |= X86_BR_JCC;
 
+	if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) {
+		if (!x86_pmu_has_lbr_callstack())
+			return -EOPNOTSUPP;
+		if (mask & ~(X86_BR_USER | X86_BR_KERNEL))
+			return -EINVAL;
+		mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET |
+			X86_BR_CALL_STACK;
+	}
+
 	/*
 	 * stash actual user request into reg, it may
 	 * be used by fixup code for some CPU
 	 */
 	event->hw.branch_reg.reg = mask;
+	return 0;
 }
 
 /*
@@ -403,14 +540,14 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
 {
 	struct hw_perf_event_extra *reg;
 	u64 br_type = event->attr.branch_sample_type;
-	u64 mask = 0, m;
-	u64 v;
+	u64 mask = 0, v;
+	int i;
 
-	for_each_branch_sample_type(m) {
-		if (!(br_type & m))
+	for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) {
+		if (!(br_type & (1ULL << i)))
 			continue;
 
-		v = x86_pmu.lbr_sel_map[m];
+		v = x86_pmu.lbr_sel_map[i];
 		if (v == LBR_NOT_SUPP)
 			return -EOPNOTSUPP;
 
@@ -420,8 +557,12 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
 	reg = &event->hw.branch_reg;
 	reg->idx = EXTRA_REG_LBR;
 
-	/* LBR_SELECT operates in suppress mode so invert mask */
-	reg->config = ~mask & x86_pmu.lbr_sel_mask;
+	/*
+	 * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
+	 * in suppress mode. So LBR_SELECT should be set to
+	 * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
+	 */
+	reg->config = mask ^ x86_pmu.lbr_sel_mask;
 
 	return 0;
 }
@@ -439,7 +580,9 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event)
 	/*
 	 * setup SW LBR filter
 	 */
-	intel_pmu_setup_sw_lbr_filter(event);
+	ret = intel_pmu_setup_sw_lbr_filter(event);
+	if (ret)
+		return ret;
 
 	/*
 	 * setup HW LBR filter, if any
@@ -568,6 +711,12 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
 		ret = X86_BR_INT;
 		break;
 	case 0xe8: /* call near rel */
+		insn_get_immediate(&insn);
+		if (insn.immediate1.value == 0) {
+			/* zero length call */
+			ret = X86_BR_ZERO_CALL;
+			break;
+		}
 	case 0x9a: /* call far absolute */
 		ret = X86_BR_CALL;
 		break;
@@ -678,35 +827,49 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 /*
  * Map interface branch filters onto LBR filters
  */
-static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
-	[PERF_SAMPLE_BRANCH_ANY]	= LBR_ANY,
-	[PERF_SAMPLE_BRANCH_USER]	= LBR_USER,
-	[PERF_SAMPLE_BRANCH_KERNEL]	= LBR_KERNEL,
-	[PERF_SAMPLE_BRANCH_HV]		= LBR_IGN,
-	[PERF_SAMPLE_BRANCH_ANY_RETURN]	= LBR_RETURN | LBR_REL_JMP
-					| LBR_IND_JMP | LBR_FAR,
+static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+	[PERF_SAMPLE_BRANCH_ANY_SHIFT]		= LBR_ANY,
+	[PERF_SAMPLE_BRANCH_USER_SHIFT]		= LBR_USER,
+	[PERF_SAMPLE_BRANCH_KERNEL_SHIFT]	= LBR_KERNEL,
+	[PERF_SAMPLE_BRANCH_HV_SHIFT]		= LBR_IGN,
+	[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]	= LBR_RETURN | LBR_REL_JMP
+						| LBR_IND_JMP | LBR_FAR,
 	/*
 	 * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
 	 */
-	[PERF_SAMPLE_BRANCH_ANY_CALL] =
+	[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] =
 	 LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
 	/*
 	 * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
 	 */
-	[PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
-	[PERF_SAMPLE_BRANCH_COND]     = LBR_JCC,
+	[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
+	[PERF_SAMPLE_BRANCH_COND_SHIFT]     = LBR_JCC,
 };
 
-static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
-	[PERF_SAMPLE_BRANCH_ANY]	= LBR_ANY,
-	[PERF_SAMPLE_BRANCH_USER]	= LBR_USER,
-	[PERF_SAMPLE_BRANCH_KERNEL]	= LBR_KERNEL,
-	[PERF_SAMPLE_BRANCH_HV]		= LBR_IGN,
-	[PERF_SAMPLE_BRANCH_ANY_RETURN]	= LBR_RETURN | LBR_FAR,
-	[PERF_SAMPLE_BRANCH_ANY_CALL]	= LBR_REL_CALL | LBR_IND_CALL
-					| LBR_FAR,
-	[PERF_SAMPLE_BRANCH_IND_CALL]	= LBR_IND_CALL,
-	[PERF_SAMPLE_BRANCH_COND]       = LBR_JCC,
+static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+	[PERF_SAMPLE_BRANCH_ANY_SHIFT]		= LBR_ANY,
+	[PERF_SAMPLE_BRANCH_USER_SHIFT]		= LBR_USER,
+	[PERF_SAMPLE_BRANCH_KERNEL_SHIFT]	= LBR_KERNEL,
+	[PERF_SAMPLE_BRANCH_HV_SHIFT]		= LBR_IGN,
+	[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]	= LBR_RETURN | LBR_FAR,
+	[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]	= LBR_REL_CALL | LBR_IND_CALL
+						| LBR_FAR,
+	[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]	= LBR_IND_CALL,
+	[PERF_SAMPLE_BRANCH_COND_SHIFT]		= LBR_JCC,
+};
+
+static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+	[PERF_SAMPLE_BRANCH_ANY_SHIFT]		= LBR_ANY,
+	[PERF_SAMPLE_BRANCH_USER_SHIFT]		= LBR_USER,
+	[PERF_SAMPLE_BRANCH_KERNEL_SHIFT]	= LBR_KERNEL,
+	[PERF_SAMPLE_BRANCH_HV_SHIFT]		= LBR_IGN,
+	[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]	= LBR_RETURN | LBR_FAR,
+	[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]	= LBR_REL_CALL | LBR_IND_CALL
+						| LBR_FAR,
+	[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]	= LBR_IND_CALL,
+	[PERF_SAMPLE_BRANCH_COND_SHIFT]		= LBR_JCC,
+	[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT]	= LBR_REL_CALL | LBR_IND_CALL
+						| LBR_RETURN | LBR_CALL_STACK,
 };
 
 /* core */
@@ -765,6 +928,20 @@ void __init intel_pmu_lbr_init_snb(void)
 	pr_cont("16-deep LBR, ");
 }
 
+/* haswell */
+void intel_pmu_lbr_init_hsw(void)
+{
+	x86_pmu.lbr_nr	 = 16;
+	x86_pmu.lbr_tos	 = MSR_LBR_TOS;
+	x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+	x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
+
+	x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+	x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
+
+	pr_cont("16-deep LBR, ");
+}
+
 /* atom */
 void __init intel_pmu_lbr_init_atom(void)
 {
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
new file mode 100644
index 000000000000..f2770641c0fd
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -0,0 +1,1103 @@
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+
+#include <asm/perf_event.h>
+#include <asm/insn.h>
+#include <asm/io.h>
+
+#include "perf_event.h"
+#include "intel_pt.h"
+
+static DEFINE_PER_CPU(struct pt, pt_ctx);
+
+static struct pt_pmu pt_pmu;
+
+enum cpuid_regs {
+	CR_EAX = 0,
+	CR_ECX,
+	CR_EDX,
+	CR_EBX
+};
+
+/*
+ * Capabilities of Intel PT hardware, such as number of address bits or
+ * supported output schemes, are cached and exported to userspace as "caps"
+ * attribute group of pt pmu device
+ * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
+ * relevant bits together with intel_pt traces.
+ *
+ * These are necessary for both trace decoding (payloads_lip, contains address
+ * width encoded in IP-related packets), and event configuration (bitmasks with
+ * permitted values for certain bit fields).
+ */
+#define PT_CAP(_n, _l, _r, _m)						\
+	[PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l,	\
+			    .reg = _r, .mask = _m }
+
+static struct pt_cap_desc {
+	const char	*name;
+	u32		leaf;
+	u8		reg;
+	u32		mask;
+} pt_caps[] = {
+	PT_CAP(max_subleaf,		0, CR_EAX, 0xffffffff),
+	PT_CAP(cr3_filtering,		0, CR_EBX, BIT(0)),
+	PT_CAP(topa_output,		0, CR_ECX, BIT(0)),
+	PT_CAP(topa_multiple_entries,	0, CR_ECX, BIT(1)),
+	PT_CAP(payloads_lip,		0, CR_ECX, BIT(31)),
+};
+
+static u32 pt_cap_get(enum pt_capabilities cap)
+{
+	struct pt_cap_desc *cd = &pt_caps[cap];
+	u32 c = pt_pmu.caps[cd->leaf * 4 + cd->reg];
+	unsigned int shift = __ffs(cd->mask);
+
+	return (c & cd->mask) >> shift;
+}
+
+static ssize_t pt_cap_show(struct device *cdev,
+			   struct device_attribute *attr,
+			   char *buf)
+{
+	struct dev_ext_attribute *ea =
+		container_of(attr, struct dev_ext_attribute, attr);
+	enum pt_capabilities cap = (long)ea->var;
+
+	return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
+}
+
+static struct attribute_group pt_cap_group = {
+	.name	= "caps",
+};
+
+PMU_FORMAT_ATTR(tsc,		"config:10"	);
+PMU_FORMAT_ATTR(noretcomp,	"config:11"	);
+
+static struct attribute *pt_formats_attr[] = {
+	&format_attr_tsc.attr,
+	&format_attr_noretcomp.attr,
+	NULL,
+};
+
+static struct attribute_group pt_format_group = {
+	.name	= "format",
+	.attrs	= pt_formats_attr,
+};
+
+static const struct attribute_group *pt_attr_groups[] = {
+	&pt_cap_group,
+	&pt_format_group,
+	NULL,
+};
+
+static int __init pt_pmu_hw_init(void)
+{
+	struct dev_ext_attribute *de_attrs;
+	struct attribute **attrs;
+	size_t size;
+	int ret;
+	long i;
+
+	attrs = NULL;
+	ret = -ENODEV;
+	if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT))
+		goto fail;
+
+	for (i = 0; i < PT_CPUID_LEAVES; i++) {
+		cpuid_count(20, i,
+			    &pt_pmu.caps[CR_EAX + i*4],
+			    &pt_pmu.caps[CR_EBX + i*4],
+			    &pt_pmu.caps[CR_ECX + i*4],
+			    &pt_pmu.caps[CR_EDX + i*4]);
+	}
+
+	ret = -ENOMEM;
+	size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
+	attrs = kzalloc(size, GFP_KERNEL);
+	if (!attrs)
+		goto fail;
+
+	size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
+	de_attrs = kzalloc(size, GFP_KERNEL);
+	if (!de_attrs)
+		goto fail;
+
+	for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
+		struct dev_ext_attribute *de_attr = de_attrs + i;
+
+		de_attr->attr.attr.name = pt_caps[i].name;
+
+		sysfs_attr_init(&de_attrs->attr.attr);
+
+		de_attr->attr.attr.mode		= S_IRUGO;
+		de_attr->attr.show		= pt_cap_show;
+		de_attr->var			= (void *)i;
+
+		attrs[i] = &de_attr->attr.attr;
+	}
+
+	pt_cap_group.attrs = attrs;
+
+	return 0;
+
+fail:
+	kfree(attrs);
+
+	return ret;
+}
+
+#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC)
+
+static bool pt_event_valid(struct perf_event *event)
+{
+	u64 config = event->attr.config;
+
+	if ((config & PT_CONFIG_MASK) != config)
+		return false;
+
+	return true;
+}
+
+/*
+ * PT configuration helpers
+ * These all are cpu affine and operate on a local PT
+ */
+
+static bool pt_is_running(void)
+{
+	u64 ctl;
+
+	rdmsrl(MSR_IA32_RTIT_CTL, ctl);
+
+	return !!(ctl & RTIT_CTL_TRACEEN);
+}
+
+static void pt_config(struct perf_event *event)
+{
+	u64 reg;
+
+	reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
+
+	if (!event->attr.exclude_kernel)
+		reg |= RTIT_CTL_OS;
+	if (!event->attr.exclude_user)
+		reg |= RTIT_CTL_USR;
+
+	reg |= (event->attr.config & PT_CONFIG_MASK);
+
+	wrmsrl(MSR_IA32_RTIT_CTL, reg);
+}
+
+static void pt_config_start(bool start)
+{
+	u64 ctl;
+
+	rdmsrl(MSR_IA32_RTIT_CTL, ctl);
+	if (start)
+		ctl |= RTIT_CTL_TRACEEN;
+	else
+		ctl &= ~RTIT_CTL_TRACEEN;
+	wrmsrl(MSR_IA32_RTIT_CTL, ctl);
+
+	/*
+	 * A wrmsr that disables trace generation serializes other PT
+	 * registers and causes all data packets to be written to memory,
+	 * but a fence is required for the data to become globally visible.
+	 *
+	 * The below WMB, separating data store and aux_head store matches
+	 * the consumer's RMB that separates aux_head load and data load.
+	 */
+	if (!start)
+		wmb();
+}
+
+static void pt_config_buffer(void *buf, unsigned int topa_idx,
+			     unsigned int output_off)
+{
+	u64 reg;
+
+	wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));
+
+	reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);
+
+	wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
+}
+
+/*
+ * Keep ToPA table-related metadata on the same page as the actual table,
+ * taking up a few words from the top
+ */
+
+#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1)
+
+/**
+ * struct topa - page-sized ToPA table with metadata at the top
+ * @table:	actual ToPA table entries, as understood by PT hardware
+ * @list:	linkage to struct pt_buffer's list of tables
+ * @phys:	physical address of this page
+ * @offset:	offset of the first entry in this table in the buffer
+ * @size:	total size of all entries in this table
+ * @last:	index of the last initialized entry in this table
+ */
+struct topa {
+	struct topa_entry	table[TENTS_PER_PAGE];
+	struct list_head	list;
+	u64			phys;
+	u64			offset;
+	size_t			size;
+	int			last;
+};
+
+/* make -1 stand for the last table entry */
+#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)])
+
+/**
+ * topa_alloc() - allocate page-sized ToPA table
+ * @cpu:	CPU on which to allocate.
+ * @gfp:	Allocation flags.
+ *
+ * Return:	On success, return the pointer to ToPA table page.
+ */
+static struct topa *topa_alloc(int cpu, gfp_t gfp)
+{
+	int node = cpu_to_node(cpu);
+	struct topa *topa;
+	struct page *p;
+
+	p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
+	if (!p)
+		return NULL;
+
+	topa = page_address(p);
+	topa->last = 0;
+	topa->phys = page_to_phys(p);
+
+	/*
+	 * In case of singe-entry ToPA, always put the self-referencing END
+	 * link as the 2nd entry in the table
+	 */
+	if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
+		TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
+		TOPA_ENTRY(topa, 1)->end = 1;
+	}
+
+	return topa;
+}
+
+/**
+ * topa_free() - free a page-sized ToPA table
+ * @topa:	Table to deallocate.
+ */
+static void topa_free(struct topa *topa)
+{
+	free_page((unsigned long)topa);
+}
+
+/**
+ * topa_insert_table() - insert a ToPA table into a buffer
+ * @buf:	 PT buffer that's being extended.
+ * @topa:	 New topa table to be inserted.
+ *
+ * If it's the first table in this buffer, set up buffer's pointers
+ * accordingly; otherwise, add a END=1 link entry to @topa to the current
+ * "last" table and adjust the last table pointer to @topa.
+ */
+static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
+{
+	struct topa *last = buf->last;
+
+	list_add_tail(&topa->list, &buf->tables);
+
+	if (!buf->first) {
+		buf->first = buf->last = buf->cur = topa;
+		return;
+	}
+
+	topa->offset = last->offset + last->size;
+	buf->last = topa;
+
+	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+		return;
+
+	BUG_ON(last->last != TENTS_PER_PAGE - 1);
+
+	TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT;
+	TOPA_ENTRY(last, -1)->end = 1;
+}
+
+/**
+ * topa_table_full() - check if a ToPA table is filled up
+ * @topa:	ToPA table.
+ */
+static bool topa_table_full(struct topa *topa)
+{
+	/* single-entry ToPA is a special case */
+	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+		return !!topa->last;
+
+	return topa->last == TENTS_PER_PAGE - 1;
+}
+
+/**
+ * topa_insert_pages() - create a list of ToPA tables
+ * @buf:	PT buffer being initialized.
+ * @gfp:	Allocation flags.
+ *
+ * This initializes a list of ToPA tables with entries from
+ * the data_pages provided by rb_alloc_aux().
+ *
+ * Return:	0 on success or error code.
+ */
+static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
+{
+	struct topa *topa = buf->last;
+	int order = 0;
+	struct page *p;
+
+	p = virt_to_page(buf->data_pages[buf->nr_pages]);
+	if (PagePrivate(p))
+		order = page_private(p);
+
+	if (topa_table_full(topa)) {
+		topa = topa_alloc(buf->cpu, gfp);
+		if (!topa)
+			return -ENOMEM;
+
+		topa_insert_table(buf, topa);
+	}
+
+	TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
+	TOPA_ENTRY(topa, -1)->size = order;
+	if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) {
+		TOPA_ENTRY(topa, -1)->intr = 1;
+		TOPA_ENTRY(topa, -1)->stop = 1;
+	}
+
+	topa->last++;
+	topa->size += sizes(order);
+
+	buf->nr_pages += 1ul << order;
+
+	return 0;
+}
+
+/**
+ * pt_topa_dump() - print ToPA tables and their entries
+ * @buf:	PT buffer.
+ */
+static void pt_topa_dump(struct pt_buffer *buf)
+{
+	struct topa *topa;
+
+	list_for_each_entry(topa, &buf->tables, list) {
+		int i;
+
+		pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table,
+			 topa->phys, topa->offset, topa->size);
+		for (i = 0; i < TENTS_PER_PAGE; i++) {
+			pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
+				 &topa->table[i],
+				 (unsigned long)topa->table[i].base << TOPA_SHIFT,
+				 sizes(topa->table[i].size),
+				 topa->table[i].end ?  'E' : ' ',
+				 topa->table[i].intr ? 'I' : ' ',
+				 topa->table[i].stop ? 'S' : ' ',
+				 *(u64 *)&topa->table[i]);
+			if ((pt_cap_get(PT_CAP_topa_multiple_entries) &&
+			     topa->table[i].stop) ||
+			    topa->table[i].end)
+				break;
+		}
+	}
+}
+
+/**
+ * pt_buffer_advance() - advance to the next output region
+ * @buf:	PT buffer.
+ *
+ * Advance the current pointers in the buffer to the next ToPA entry.
+ */
+static void pt_buffer_advance(struct pt_buffer *buf)
+{
+	buf->output_off = 0;
+	buf->cur_idx++;
+
+	if (buf->cur_idx == buf->cur->last) {
+		if (buf->cur == buf->last)
+			buf->cur = buf->first;
+		else
+			buf->cur = list_entry(buf->cur->list.next, struct topa,
+					      list);
+		buf->cur_idx = 0;
+	}
+}
+
+/**
+ * pt_update_head() - calculate current offsets and sizes
+ * @pt:		Per-cpu pt context.
+ *
+ * Update buffer's current write pointer position and data size.
+ */
+static void pt_update_head(struct pt *pt)
+{
+	struct pt_buffer *buf = perf_get_aux(&pt->handle);
+	u64 topa_idx, base, old;
+
+	/* offset of the first region in this table from the beginning of buf */
+	base = buf->cur->offset + buf->output_off;
+
+	/* offset of the current output region within this table */
+	for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
+		base += sizes(buf->cur->table[topa_idx].size);
+
+	if (buf->snapshot) {
+		local_set(&buf->data_size, base);
+	} else {
+		old = (local64_xchg(&buf->head, base) &
+		       ((buf->nr_pages << PAGE_SHIFT) - 1));
+		if (base < old)
+			base += buf->nr_pages << PAGE_SHIFT;
+
+		local_add(base - old, &buf->data_size);
+	}
+}
+
+/**
+ * pt_buffer_region() - obtain current output region's address
+ * @buf:	PT buffer.
+ */
+static void *pt_buffer_region(struct pt_buffer *buf)
+{
+	return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT);
+}
+
+/**
+ * pt_buffer_region_size() - obtain current output region's size
+ * @buf:	PT buffer.
+ */
+static size_t pt_buffer_region_size(struct pt_buffer *buf)
+{
+	return sizes(buf->cur->table[buf->cur_idx].size);
+}
+
+/**
+ * pt_handle_status() - take care of possible status conditions
+ * @pt:		Per-cpu pt context.
+ */
+static void pt_handle_status(struct pt *pt)
+{
+	struct pt_buffer *buf = perf_get_aux(&pt->handle);
+	int advance = 0;
+	u64 status;
+
+	rdmsrl(MSR_IA32_RTIT_STATUS, status);
+
+	if (status & RTIT_STATUS_ERROR) {
+		pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
+		pt_topa_dump(buf);
+		status &= ~RTIT_STATUS_ERROR;
+	}
+
+	if (status & RTIT_STATUS_STOPPED) {
+		status &= ~RTIT_STATUS_STOPPED;
+
+		/*
+		 * On systems that only do single-entry ToPA, hitting STOP
+		 * means we are already losing data; need to let the decoder
+		 * know.
+		 */
+		if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
+		    buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
+			local_inc(&buf->lost);
+			advance++;
+		}
+	}
+
+	/*
+	 * Also on single-entry ToPA implementations, interrupt will come
+	 * before the output reaches its output region's boundary.
+	 */
+	if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot &&
+	    pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
+		void *head = pt_buffer_region(buf);
+
+		/* everything within this margin needs to be zeroed out */
+		memset(head + buf->output_off, 0,
+		       pt_buffer_region_size(buf) -
+		       buf->output_off);
+		advance++;
+	}
+
+	if (advance)
+		pt_buffer_advance(buf);
+
+	wrmsrl(MSR_IA32_RTIT_STATUS, status);
+}
+
+/**
+ * pt_read_offset() - translate registers into buffer pointers
+ * @buf:	PT buffer.
+ *
+ * Set buffer's output pointers from MSR values.
+ */
+static void pt_read_offset(struct pt_buffer *buf)
+{
+	u64 offset, base_topa;
+
+	rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
+	buf->cur = phys_to_virt(base_topa);
+
+	rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
+	/* offset within current output region */
+	buf->output_off = offset >> 32;
+	/* index of current output region within this table */
+	buf->cur_idx = (offset & 0xffffff80) >> 7;
+}
+
+/**
+ * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry
+ * @buf:	PT buffer.
+ * @pg:		Page offset in the buffer.
+ *
+ * When advancing to the next output region (ToPA entry), given a page offset
+ * into the buffer, we need to find the offset of the first page in the next
+ * region.
+ */
+static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
+{
+	struct topa_entry *te = buf->topa_index[pg];
+
+	/* one region */
+	if (buf->first == buf->last && buf->first->last == 1)
+		return pg;
+
+	do {
+		pg++;
+		pg &= buf->nr_pages - 1;
+	} while (buf->topa_index[pg] == te);
+
+	return pg;
+}
+
+/**
+ * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
+ * @buf:	PT buffer.
+ * @handle:	Current output handle.
+ *
+ * Place INT and STOP marks to prevent overwriting old data that the consumer
+ * hasn't yet collected.
+ */
+static int pt_buffer_reset_markers(struct pt_buffer *buf,
+				   struct perf_output_handle *handle)
+
+{
+	unsigned long idx, npages, end;
+
+	if (buf->snapshot)
+		return 0;
+
+	/* can't stop in the middle of an output region */
+	if (buf->output_off + handle->size + 1 <
+	    sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
+		return -EINVAL;
+
+
+	/* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
+	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+		return 0;
+
+	/* clear STOP and INT from current entry */
+	buf->topa_index[buf->stop_pos]->stop = 0;
+	buf->topa_index[buf->intr_pos]->intr = 0;
+
+	if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
+		npages = (handle->size + 1) >> PAGE_SHIFT;
+		end = (local64_read(&buf->head) >> PAGE_SHIFT) + npages;
+		/*if (end > handle->wakeup >> PAGE_SHIFT)
+		  end = handle->wakeup >> PAGE_SHIFT;*/
+		idx = end & (buf->nr_pages - 1);
+		buf->stop_pos = idx;
+		idx = (local64_read(&buf->head) >> PAGE_SHIFT) + npages - 1;
+		idx &= buf->nr_pages - 1;
+		buf->intr_pos = idx;
+	}
+
+	buf->topa_index[buf->stop_pos]->stop = 1;
+	buf->topa_index[buf->intr_pos]->intr = 1;
+
+	return 0;
+}
+
+/**
+ * pt_buffer_setup_topa_index() - build topa_index[] table of regions
+ * @buf:	PT buffer.
+ *
+ * topa_index[] references output regions indexed by offset into the
+ * buffer for purposes of quick reverse lookup.
+ */
+static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
+{
+	struct topa *cur = buf->first, *prev = buf->last;
+	struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
+		*te_prev = TOPA_ENTRY(prev, prev->last - 1);
+	int pg = 0, idx = 0, ntopa = 0;
+
+	while (pg < buf->nr_pages) {
+		int tidx;
+
+		/* pages within one topa entry */
+		for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++)
+			buf->topa_index[pg] = te_prev;
+
+		te_prev = te_cur;
+
+		if (idx == cur->last - 1) {
+			/* advance to next topa table */
+			idx = 0;
+			cur = list_entry(cur->list.next, struct topa, list);
+			ntopa++;
+		} else
+			idx++;
+		te_cur = TOPA_ENTRY(cur, idx);
+	}
+
+}
+
+/**
+ * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
+ * @buf:	PT buffer.
+ * @head:	Write pointer (aux_head) from AUX buffer.
+ *
+ * Find the ToPA table and entry corresponding to given @head and set buffer's
+ * "current" pointers accordingly.
+ */
+static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
+{
+	int pg;
+
+	if (buf->snapshot)
+		head &= (buf->nr_pages << PAGE_SHIFT) - 1;
+
+	pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
+	pg = pt_topa_next_entry(buf, pg);
+
+	buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK);
+	buf->cur_idx = ((unsigned long)buf->topa_index[pg] -
+			(unsigned long)buf->cur) / sizeof(struct topa_entry);
+	buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1);
+
+	local64_set(&buf->head, head);
+	local_set(&buf->data_size, 0);
+}
+
+/**
+ * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
+ * @buf:	PT buffer.
+ */
+static void pt_buffer_fini_topa(struct pt_buffer *buf)
+{
+	struct topa *topa, *iter;
+
+	list_for_each_entry_safe(topa, iter, &buf->tables, list) {
+		/*
+		 * right now, this is in free_aux() path only, so
+		 * no need to unlink this table from the list
+		 */
+		topa_free(topa);
+	}
+}
+
+/**
+ * pt_buffer_init_topa() - initialize ToPA table for pt buffer
+ * @buf:	PT buffer.
+ * @size:	Total size of all regions within this ToPA.
+ * @gfp:	Allocation flags.
+ */
+static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
+			       gfp_t gfp)
+{
+	struct topa *topa;
+	int err;
+
+	topa = topa_alloc(buf->cpu, gfp);
+	if (!topa)
+		return -ENOMEM;
+
+	topa_insert_table(buf, topa);
+
+	while (buf->nr_pages < nr_pages) {
+		err = topa_insert_pages(buf, gfp);
+		if (err) {
+			pt_buffer_fini_topa(buf);
+			return -ENOMEM;
+		}
+	}
+
+	pt_buffer_setup_topa_index(buf);
+
+	/* link last table to the first one, unless we're double buffering */
+	if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
+		TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT;
+		TOPA_ENTRY(buf->last, -1)->end = 1;
+	}
+
+	pt_topa_dump(buf);
+	return 0;
+}
+
+/**
+ * pt_buffer_setup_aux() - set up topa tables for a PT buffer
+ * @cpu:	Cpu on which to allocate, -1 means current.
+ * @pages:	Array of pointers to buffer pages passed from perf core.
+ * @nr_pages:	Number of pages in the buffer.
+ * @snapshot:	If this is a snapshot/overwrite counter.
+ *
+ * This is a pmu::setup_aux callback that sets up ToPA tables and all the
+ * bookkeeping for an AUX buffer.
+ *
+ * Return:	Our private PT buffer structure.
+ */
+static void *
+pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot)
+{
+	struct pt_buffer *buf;
+	int node, ret;
+
+	if (!nr_pages)
+		return NULL;
+
+	if (cpu == -1)
+		cpu = raw_smp_processor_id();
+	node = cpu_to_node(cpu);
+
+	buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]),
+			   GFP_KERNEL, node);
+	if (!buf)
+		return NULL;
+
+	buf->cpu = cpu;
+	buf->snapshot = snapshot;
+	buf->data_pages = pages;
+
+	INIT_LIST_HEAD(&buf->tables);
+
+	ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL);
+	if (ret) {
+		kfree(buf);
+		return NULL;
+	}
+
+	return buf;
+}
+
+/**
+ * pt_buffer_free_aux() - perf AUX deallocation path callback
+ * @data:	PT buffer.
+ */
+static void pt_buffer_free_aux(void *data)
+{
+	struct pt_buffer *buf = data;
+
+	pt_buffer_fini_topa(buf);
+	kfree(buf);
+}
+
+/**
+ * pt_buffer_is_full() - check if the buffer is full
+ * @buf:	PT buffer.
+ * @pt:		Per-cpu pt handle.
+ *
+ * If the user hasn't read data from the output region that aux_head
+ * points to, the buffer is considered full: the user needs to read at
+ * least this region and update aux_tail to point past it.
+ */
+static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt)
+{
+	if (buf->snapshot)
+		return false;
+
+	if (local_read(&buf->data_size) >= pt->handle.size)
+		return true;
+
+	return false;
+}
+
+/**
+ * intel_pt_interrupt() - PT PMI handler
+ */
+void intel_pt_interrupt(void)
+{
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
+	struct pt_buffer *buf;
+	struct perf_event *event = pt->handle.event;
+
+	/*
+	 * There may be a dangling PT bit in the interrupt status register
+	 * after PT has been disabled by pt_event_stop(). Make sure we don't
+	 * do anything (particularly, re-enable) for this event here.
+	 */
+	if (!ACCESS_ONCE(pt->handle_nmi))
+		return;
+
+	pt_config_start(false);
+
+	if (!event)
+		return;
+
+	buf = perf_get_aux(&pt->handle);
+	if (!buf)
+		return;
+
+	pt_read_offset(buf);
+
+	pt_handle_status(pt);
+
+	pt_update_head(pt);
+
+	perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
+			    local_xchg(&buf->lost, 0));
+
+	if (!event->hw.state) {
+		int ret;
+
+		buf = perf_aux_output_begin(&pt->handle, event);
+		if (!buf) {
+			event->hw.state = PERF_HES_STOPPED;
+			return;
+		}
+
+		pt_buffer_reset_offsets(buf, pt->handle.head);
+		ret = pt_buffer_reset_markers(buf, &pt->handle);
+		if (ret) {
+			perf_aux_output_end(&pt->handle, 0, true);
+			return;
+		}
+
+		pt_config_buffer(buf->cur->table, buf->cur_idx,
+				 buf->output_off);
+		wrmsrl(MSR_IA32_RTIT_STATUS, 0);
+		pt_config(event);
+	}
+}
+
+/*
+ * PMU callbacks
+ */
+
+static void pt_event_start(struct perf_event *event, int mode)
+{
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
+	struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+	if (pt_is_running() || !buf || pt_buffer_is_full(buf, pt)) {
+		event->hw.state = PERF_HES_STOPPED;
+		return;
+	}
+
+	ACCESS_ONCE(pt->handle_nmi) = 1;
+	event->hw.state = 0;
+
+	pt_config_buffer(buf->cur->table, buf->cur_idx,
+			 buf->output_off);
+	wrmsrl(MSR_IA32_RTIT_STATUS, 0);
+	pt_config(event);
+}
+
+static void pt_event_stop(struct perf_event *event, int mode)
+{
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
+
+	/*
+	 * Protect against the PMI racing with disabling wrmsr,
+	 * see comment in intel_pt_interrupt().
+	 */
+	ACCESS_ONCE(pt->handle_nmi) = 0;
+	pt_config_start(false);
+
+	if (event->hw.state == PERF_HES_STOPPED)
+		return;
+
+	event->hw.state = PERF_HES_STOPPED;
+
+	if (mode & PERF_EF_UPDATE) {
+		struct pt *pt = this_cpu_ptr(&pt_ctx);
+		struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+		if (!buf)
+			return;
+
+		if (WARN_ON_ONCE(pt->handle.event != event))
+			return;
+
+		pt_read_offset(buf);
+
+		pt_handle_status(pt);
+
+		pt_update_head(pt);
+	}
+}
+
+static void pt_event_del(struct perf_event *event, int mode)
+{
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
+	struct pt_buffer *buf;
+
+	pt_event_stop(event, PERF_EF_UPDATE);
+
+	buf = perf_get_aux(&pt->handle);
+
+	if (buf) {
+		if (buf->snapshot)
+			pt->handle.head =
+				local_xchg(&buf->data_size,
+					   buf->nr_pages << PAGE_SHIFT);
+		perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
+				    local_xchg(&buf->lost, 0));
+	}
+}
+
+static int pt_event_add(struct perf_event *event, int mode)
+{
+	struct pt_buffer *buf;
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
+	struct hw_perf_event *hwc = &event->hw;
+	int ret = -EBUSY;
+
+	if (pt->handle.event)
+		goto out;
+
+	buf = perf_aux_output_begin(&pt->handle, event);
+	if (!buf) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	pt_buffer_reset_offsets(buf, pt->handle.head);
+	if (!buf->snapshot) {
+		ret = pt_buffer_reset_markers(buf, &pt->handle);
+		if (ret) {
+			perf_aux_output_end(&pt->handle, 0, true);
+			goto out;
+		}
+	}
+
+	if (mode & PERF_EF_START) {
+		pt_event_start(event, 0);
+		if (hwc->state == PERF_HES_STOPPED) {
+			pt_event_del(event, 0);
+			ret = -EBUSY;
+		}
+	} else {
+		hwc->state = PERF_HES_STOPPED;
+	}
+
+	ret = 0;
+out:
+
+	if (ret)
+		hwc->state = PERF_HES_STOPPED;
+
+	return ret;
+}
+
+static void pt_event_read(struct perf_event *event)
+{
+}
+
+static void pt_event_destroy(struct perf_event *event)
+{
+	x86_del_exclusive(x86_lbr_exclusive_pt);
+}
+
+static int pt_event_init(struct perf_event *event)
+{
+	if (event->attr.type != pt_pmu.pmu.type)
+		return -ENOENT;
+
+	if (!pt_event_valid(event))
+		return -EINVAL;
+
+	if (x86_add_exclusive(x86_lbr_exclusive_pt))
+		return -EBUSY;
+
+	event->destroy = pt_event_destroy;
+
+	return 0;
+}
+
+static __init int pt_init(void)
+{
+	int ret, cpu, prior_warn = 0;
+
+	BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		u64 ctl;
+
+		ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
+		if (!ret && (ctl & RTIT_CTL_TRACEEN))
+			prior_warn++;
+	}
+	put_online_cpus();
+
+	if (prior_warn) {
+		x86_add_exclusive(x86_lbr_exclusive_pt);
+		pr_warn("PT is enabled at boot time, doing nothing\n");
+
+		return -EBUSY;
+	}
+
+	ret = pt_pmu_hw_init();
+	if (ret)
+		return ret;
+
+	if (!pt_cap_get(PT_CAP_topa_output)) {
+		pr_warn("ToPA output is not supported on this CPU\n");
+		return -ENODEV;
+	}
+
+	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+		pt_pmu.pmu.capabilities =
+			PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;
+
+	pt_pmu.pmu.capabilities	|= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
+	pt_pmu.pmu.attr_groups	= pt_attr_groups;
+	pt_pmu.pmu.task_ctx_nr	= perf_sw_context;
+	pt_pmu.pmu.event_init	= pt_event_init;
+	pt_pmu.pmu.add		= pt_event_add;
+	pt_pmu.pmu.del		= pt_event_del;
+	pt_pmu.pmu.start	= pt_event_start;
+	pt_pmu.pmu.stop		= pt_event_stop;
+	pt_pmu.pmu.read		= pt_event_read;
+	pt_pmu.pmu.setup_aux	= pt_buffer_setup_aux;
+	pt_pmu.pmu.free_aux	= pt_buffer_free_aux;
+	ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
+
+	return ret;
+}
+
+module_init(pt_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
index 21af6149edf2..12d9548457e7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -1132,8 +1132,7 @@ static int snbep_pci2phy_map_init(int devid)
 		}
 	}
 
-	if (ubox_dev)
-		pci_dev_put(ubox_dev);
+	pci_dev_put(ubox_dev);
 
 	return err ? pcibios_err_to_errno(err) : 0;
 }
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 60639093d536..3d423a101fae 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -41,6 +41,7 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 		{ X86_FEATURE_HWP_ACT_WINDOW,	CR_EAX, 9, 0x00000006, 0 },
 		{ X86_FEATURE_HWP_EPP,		CR_EAX,10, 0x00000006, 0 },
 		{ X86_FEATURE_HWP_PKG_REQ,	CR_EAX,11, 0x00000006, 0 },
+		{ X86_FEATURE_INTEL_PT,		CR_EBX,25, 0x00000007, 0 },
 		{ X86_FEATURE_APERFMPERF,	CR_ECX, 0, 0x00000006, 0 },
 		{ X86_FEATURE_EPB,		CR_ECX, 3, 0x00000006, 0 },
 		{ X86_FEATURE_HW_PSTATE,	CR_EDX, 7, 0x80000007, 0 },
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 24d079604fd5..1deffe6cc873 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -354,6 +354,7 @@ int __copy_instruction(u8 *dest, u8 *src)
 {
 	struct insn insn;
 	kprobe_opcode_t buf[MAX_INSN_SIZE];
+	int length;
 	unsigned long recovered_insn =
 		recover_probed_instruction(buf, (unsigned long)src);
 
@@ -361,16 +362,18 @@ int __copy_instruction(u8 *dest, u8 *src)
 		return 0;
 	kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
 	insn_get_length(&insn);
+	length = insn.length;
+
 	/* Another subsystem puts a breakpoint, failed to recover */
 	if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
 		return 0;
-	memcpy(dest, insn.kaddr, insn.length);
+	memcpy(dest, insn.kaddr, length);
 
 #ifdef CONFIG_X86_64
 	if (insn_rip_relative(&insn)) {
 		s64 newdisp;
 		u8 *disp;
-		kernel_insn_init(&insn, dest, insn.length);
+		kernel_insn_init(&insn, dest, length);
 		insn_get_displacement(&insn);
 		/*
 		 * The copied instruction uses the %rip-relative addressing
@@ -394,7 +397,7 @@ int __copy_instruction(u8 *dest, u8 *src)
 		*(s32 *) disp = (s32) newdisp;
 	}
 #endif
-	return insn.length;
+	return length;
 }
 
 static int arch_copy_kprobe(struct kprobe *p)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bbfceb756452..c2e21113ecc0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -113,8 +113,6 @@ struct bpf_prog_type_list {
 	enum bpf_prog_type type;
 };
 
-void bpf_register_prog_type(struct bpf_prog_type_list *tl);
-
 struct bpf_prog;
 
 struct bpf_prog_aux {
@@ -129,11 +127,25 @@ struct bpf_prog_aux {
 };
 
 #ifdef CONFIG_BPF_SYSCALL
+void bpf_register_prog_type(struct bpf_prog_type_list *tl);
+
 void bpf_prog_put(struct bpf_prog *prog);
-#else
-static inline void bpf_prog_put(struct bpf_prog *prog) {}
-#endif
 struct bpf_prog *bpf_prog_get(u32 ufd);
+#else
+static inline void bpf_register_prog_type(struct bpf_prog_type_list *tl)
+{
+}
+
+static inline struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void bpf_prog_put(struct bpf_prog *prog)
+{
+}
+#endif
+
 /* verify correctness of eBPF program */
 int bpf_check(struct bpf_prog *fp, union bpf_attr *attr);
 
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 112cf49d9576..46e83c2156c6 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -13,6 +13,7 @@ struct trace_array;
 struct trace_buffer;
 struct tracer;
 struct dentry;
+struct bpf_prog;
 
 struct trace_print_flags {
 	unsigned long		mask;
@@ -252,6 +253,7 @@ enum {
 	TRACE_EVENT_FL_WAS_ENABLED_BIT,
 	TRACE_EVENT_FL_USE_CALL_FILTER_BIT,
 	TRACE_EVENT_FL_TRACEPOINT_BIT,
+	TRACE_EVENT_FL_KPROBE_BIT,
 };
 
 /*
@@ -265,6 +267,7 @@ enum {
  *                     it is best to clear the buffers that used it).
  *  USE_CALL_FILTER - For ftrace internal events, don't use file filter
  *  TRACEPOINT    - Event is a tracepoint
+ *  KPROBE        - Event is a kprobe
  */
 enum {
 	TRACE_EVENT_FL_FILTERED		= (1 << TRACE_EVENT_FL_FILTERED_BIT),
@@ -274,6 +277,7 @@ enum {
 	TRACE_EVENT_FL_WAS_ENABLED	= (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT),
 	TRACE_EVENT_FL_USE_CALL_FILTER	= (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT),
 	TRACE_EVENT_FL_TRACEPOINT	= (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
+	TRACE_EVENT_FL_KPROBE		= (1 << TRACE_EVENT_FL_KPROBE_BIT),
 };
 
 struct ftrace_event_call {
@@ -303,6 +307,7 @@ struct ftrace_event_call {
 #ifdef CONFIG_PERF_EVENTS
 	int				perf_refcount;
 	struct hlist_head __percpu	*perf_events;
+	struct bpf_prog			*prog;
 
 	int	(*perf_perm)(struct ftrace_event_call *,
 			     struct perf_event *);
@@ -548,6 +553,15 @@ event_trigger_unlock_commit_regs(struct ftrace_event_file *file,
 		event_triggers_post_call(file, tt);
 }
 
+#ifdef CONFIG_BPF_SYSCALL
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx);
+#else
+static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+	return 1;
+}
+#endif
+
 enum {
 	FILTER_OTHER = 0,
 	FILTER_STATIC_STRING,
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2b621982938d..61992cf2e977 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -53,6 +53,7 @@ struct perf_guest_info_callbacks {
 #include <linux/sysfs.h>
 #include <linux/perf_regs.h>
 #include <linux/workqueue.h>
+#include <linux/cgroup.h>
 #include <asm/local.h>
 
 struct perf_callchain_entry {
@@ -118,10 +119,19 @@ struct hw_perf_event {
 			struct hrtimer	hrtimer;
 		};
 		struct { /* tracepoint */
-			struct task_struct	*tp_target;
 			/* for tp_event->class */
 			struct list_head	tp_list;
 		};
+		struct { /* intel_cqm */
+			int			cqm_state;
+			int			cqm_rmid;
+			struct list_head	cqm_events_entry;
+			struct list_head	cqm_groups_entry;
+			struct list_head	cqm_group_entry;
+		};
+		struct { /* itrace */
+			int			itrace_started;
+		};
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		struct { /* breakpoint */
 			/*
@@ -129,12 +139,12 @@ struct hw_perf_event {
 			 * problem hw_breakpoint has with context
 			 * creation and event initalization.
 			 */
-			struct task_struct		*bp_target;
 			struct arch_hw_breakpoint	info;
 			struct list_head		bp_list;
 		};
 #endif
 	};
+	struct task_struct		*target;
 	int				state;
 	local64_t			prev_count;
 	u64				sample_period;
@@ -166,6 +176,11 @@ struct perf_event;
  * pmu::capabilities flags
  */
 #define PERF_PMU_CAP_NO_INTERRUPT		0x01
+#define PERF_PMU_CAP_NO_NMI			0x02
+#define PERF_PMU_CAP_AUX_NO_SG			0x04
+#define PERF_PMU_CAP_AUX_SW_DOUBLEBUF		0x08
+#define PERF_PMU_CAP_EXCLUSIVE			0x10
+#define PERF_PMU_CAP_ITRACE			0x20
 
 /**
  * struct pmu - generic performance monitoring unit
@@ -186,6 +201,7 @@ struct pmu {
 
 	int * __percpu			pmu_disable_count;
 	struct perf_cpu_context * __percpu pmu_cpu_context;
+	atomic_t			exclusive_cnt; /* < 0: cpu; > 0: tsk */
 	int				task_ctx_nr;
 	int				hrtimer_interval_ms;
 
@@ -262,9 +278,32 @@ struct pmu {
 	int (*event_idx)		(struct perf_event *event); /*optional */
 
 	/*
-	 * flush branch stack on context-switches (needed in cpu-wide mode)
+	 * context-switches callback
 	 */
-	void (*flush_branch_stack)	(void);
+	void (*sched_task)		(struct perf_event_context *ctx,
+					bool sched_in);
+	/*
+	 * PMU specific data size
+	 */
+	size_t				task_ctx_size;
+
+
+	/*
+	 * Return the count value for a counter.
+	 */
+	u64 (*count)			(struct perf_event *event); /*optional*/
+
+	/*
+	 * Set up pmu-private data structures for an AUX area
+	 */
+	void *(*setup_aux)		(int cpu, void **pages,
+					 int nr_pages, bool overwrite);
+					/* optional */
+
+	/*
+	 * Free pmu-private AUX data structures
+	 */
+	void (*free_aux)		(void *aux); /* optional */
 };
 
 /**
@@ -300,6 +339,7 @@ struct swevent_hlist {
 #define PERF_ATTACH_CONTEXT	0x01
 #define PERF_ATTACH_GROUP	0x02
 #define PERF_ATTACH_TASK	0x04
+#define PERF_ATTACH_TASK_DATA	0x08
 
 struct perf_cgroup;
 struct ring_buffer;
@@ -438,6 +478,7 @@ struct perf_event {
 	struct pid_namespace		*ns;
 	u64				id;
 
+	u64				(*clock)(void);
 	perf_overflow_handler_t		overflow_handler;
 	void				*overflow_handler_context;
 
@@ -504,7 +545,7 @@ struct perf_event_context {
 	u64				generation;
 	int				pin_count;
 	int				nr_cgroups;	 /* cgroup evts */
-	int				nr_branch_stack; /* branch_stack evt */
+	void				*task_ctx_data; /* pmu specific data */
 	struct rcu_head			rcu_head;
 
 	struct delayed_work		orphans_remove;
@@ -536,12 +577,52 @@ struct perf_output_handle {
 	struct ring_buffer		*rb;
 	unsigned long			wakeup;
 	unsigned long			size;
-	void				*addr;
+	union {
+		void			*addr;
+		unsigned long		head;
+	};
 	int				page;
 };
 
+#ifdef CONFIG_CGROUP_PERF
+
+/*
+ * perf_cgroup_info keeps track of time_enabled for a cgroup.
+ * This is a per-cpu dynamically allocated data structure.
+ */
+struct perf_cgroup_info {
+	u64				time;
+	u64				timestamp;
+};
+
+struct perf_cgroup {
+	struct cgroup_subsys_state	css;
+	struct perf_cgroup_info	__percpu *info;
+};
+
+/*
+ * Must ensure cgroup is pinned (css_get) before calling
+ * this function. In other words, we cannot call this function
+ * if there is no cgroup event for the current CPU context.
+ */
+static inline struct perf_cgroup *
+perf_cgroup_from_task(struct task_struct *task)
+{
+	return container_of(task_css(task, perf_event_cgrp_id),
+			    struct perf_cgroup, css);
+}
+#endif /* CONFIG_CGROUP_PERF */
+
 #ifdef CONFIG_PERF_EVENTS
 
+extern void *perf_aux_output_begin(struct perf_output_handle *handle,
+				   struct perf_event *event);
+extern void perf_aux_output_end(struct perf_output_handle *handle,
+				unsigned long size, bool truncated);
+extern int perf_aux_output_skip(struct perf_output_handle *handle,
+				unsigned long size);
+extern void *perf_get_aux(struct perf_output_handle *handle);
+
 extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
 extern void perf_pmu_unregister(struct pmu *pmu);
 
@@ -558,6 +639,8 @@ extern void perf_event_delayed_put(struct task_struct *task);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
 extern void perf_pmu_enable(struct pmu *pmu);
+extern void perf_sched_cb_dec(struct pmu *pmu);
+extern void perf_sched_cb_inc(struct pmu *pmu);
 extern int perf_event_task_disable(void);
 extern int perf_event_task_enable(void);
 extern int perf_event_refresh(struct perf_event *event, int refresh);
@@ -731,6 +814,11 @@ static inline void perf_event_task_sched_out(struct task_struct *prev,
 		__perf_event_task_sched_out(prev, next);
 }
 
+static inline u64 __perf_event_count(struct perf_event *event)
+{
+	return local64_read(&event->count) + atomic64_read(&event->child_count);
+}
+
 extern void perf_event_mmap(struct vm_area_struct *vma);
 extern struct perf_guest_info_callbacks *perf_guest_cbs;
 extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
@@ -800,6 +888,16 @@ static inline bool has_branch_stack(struct perf_event *event)
 	return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
 }
 
+static inline bool needs_branch_stack(struct perf_event *event)
+{
+	return event->attr.branch_sample_type != 0;
+}
+
+static inline bool has_aux(struct perf_event *event)
+{
+	return event->pmu->setup_aux;
+}
+
 extern int perf_output_begin(struct perf_output_handle *handle,
 			     struct perf_event *event, unsigned int size);
 extern void perf_output_end(struct perf_output_handle *handle);
@@ -815,6 +913,17 @@ extern void perf_event_disable(struct perf_event *event);
 extern int __perf_event_disable(void *info);
 extern void perf_event_task_tick(void);
 #else /* !CONFIG_PERF_EVENTS: */
+static inline void *
+perf_aux_output_begin(struct perf_output_handle *handle,
+		      struct perf_event *event)				{ return NULL; }
+static inline void
+perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
+		    bool truncated)					{ }
+static inline int
+perf_aux_output_skip(struct perf_output_handle *handle,
+		     unsigned long size)				{ return -EINVAL; }
+static inline void *
+perf_get_aux(struct perf_output_handle *handle)				{ return NULL; }
 static inline void
 perf_event_task_sched_in(struct task_struct *prev,
 			 struct task_struct *task)			{ }
diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h
index 395b70e0eccf..a746bf5216f8 100644
--- a/include/linux/watchdog.h
+++ b/include/linux/watchdog.h
@@ -137,4 +137,12 @@ extern int watchdog_init_timeout(struct watchdog_device *wdd,
 extern int watchdog_register_device(struct watchdog_device *);
 extern void watchdog_unregister_device(struct watchdog_device *);
 
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+void watchdog_nmi_disable_all(void);
+void watchdog_nmi_enable_all(void);
+#else
+static inline void watchdog_nmi_disable_all(void) {}
+static inline void watchdog_nmi_enable_all(void) {}
+#endif
+
 #endif  /* ifndef _LINUX_WATCHDOG_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 45da7ec7d274..cc47ef41076a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -118,6 +118,7 @@ enum bpf_map_type {
 enum bpf_prog_type {
 	BPF_PROG_TYPE_UNSPEC,
 	BPF_PROG_TYPE_SOCKET_FILTER,
+	BPF_PROG_TYPE_KPROBE,
 };
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
@@ -151,6 +152,7 @@ union bpf_attr {
 		__u32		log_level;	/* verbosity level of verifier */
 		__u32		log_size;	/* size of user buffer */
 		__aligned_u64	log_buf;	/* user supplied buffer */
+		__u32		kern_version;	/* checked when prog_type=kprobe */
 	};
 } __attribute__((aligned(8)));
 
@@ -162,6 +164,9 @@ enum bpf_func_id {
 	BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */
 	BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
 	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
+	BPF_FUNC_probe_read,      /* int bpf_probe_read(void *dst, int size, void *src) */
+	BPF_FUNC_ktime_get_ns,    /* u64 bpf_ktime_get_ns(void) */
+	BPF_FUNC_trace_printk,    /* int bpf_trace_printk(const char *fmt, int fmt_size, ...) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 9b79abbd1ab8..309211b3eb67 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -152,21 +152,42 @@ enum perf_event_sample_format {
  * The branch types can be combined, however BRANCH_ANY covers all types
  * of branches and therefore it supersedes all the other types.
  */
+enum perf_branch_sample_type_shift {
+	PERF_SAMPLE_BRANCH_USER_SHIFT		= 0, /* user branches */
+	PERF_SAMPLE_BRANCH_KERNEL_SHIFT		= 1, /* kernel branches */
+	PERF_SAMPLE_BRANCH_HV_SHIFT		= 2, /* hypervisor branches */
+
+	PERF_SAMPLE_BRANCH_ANY_SHIFT		= 3, /* any branch types */
+	PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT	= 4, /* any call branch */
+	PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT	= 5, /* any return branch */
+	PERF_SAMPLE_BRANCH_IND_CALL_SHIFT	= 6, /* indirect calls */
+	PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT	= 7, /* transaction aborts */
+	PERF_SAMPLE_BRANCH_IN_TX_SHIFT		= 8, /* in transaction */
+	PERF_SAMPLE_BRANCH_NO_TX_SHIFT		= 9, /* not in transaction */
+	PERF_SAMPLE_BRANCH_COND_SHIFT		= 10, /* conditional branches */
+
+	PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT	= 11, /* call/ret stack */
+
+	PERF_SAMPLE_BRANCH_MAX_SHIFT		/* non-ABI */
+};
+
 enum perf_branch_sample_type {
-	PERF_SAMPLE_BRANCH_USER		= 1U << 0, /* user branches */
-	PERF_SAMPLE_BRANCH_KERNEL	= 1U << 1, /* kernel branches */
-	PERF_SAMPLE_BRANCH_HV		= 1U << 2, /* hypervisor branches */
+	PERF_SAMPLE_BRANCH_USER		= 1U << PERF_SAMPLE_BRANCH_USER_SHIFT,
+	PERF_SAMPLE_BRANCH_KERNEL	= 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT,
+	PERF_SAMPLE_BRANCH_HV		= 1U << PERF_SAMPLE_BRANCH_HV_SHIFT,
 
-	PERF_SAMPLE_BRANCH_ANY		= 1U << 3, /* any branch types */
-	PERF_SAMPLE_BRANCH_ANY_CALL	= 1U << 4, /* any call branch */
-	PERF_SAMPLE_BRANCH_ANY_RETURN	= 1U << 5, /* any return branch */
-	PERF_SAMPLE_BRANCH_IND_CALL	= 1U << 6, /* indirect calls */
-	PERF_SAMPLE_BRANCH_ABORT_TX	= 1U << 7, /* transaction aborts */
-	PERF_SAMPLE_BRANCH_IN_TX	= 1U << 8, /* in transaction */
-	PERF_SAMPLE_BRANCH_NO_TX	= 1U << 9, /* not in transaction */
-	PERF_SAMPLE_BRANCH_COND		= 1U << 10, /* conditional branches */
+	PERF_SAMPLE_BRANCH_ANY		= 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT,
+	PERF_SAMPLE_BRANCH_ANY_CALL	= 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT,
+	PERF_SAMPLE_BRANCH_ANY_RETURN	= 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT,
+	PERF_SAMPLE_BRANCH_IND_CALL	= 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT,
+	PERF_SAMPLE_BRANCH_ABORT_TX	= 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT,
+	PERF_SAMPLE_BRANCH_IN_TX	= 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT,
+	PERF_SAMPLE_BRANCH_NO_TX	= 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT,
+	PERF_SAMPLE_BRANCH_COND		= 1U << PERF_SAMPLE_BRANCH_COND_SHIFT,
 
-	PERF_SAMPLE_BRANCH_MAX		= 1U << 11, /* non-ABI */
+	PERF_SAMPLE_BRANCH_CALL_STACK	= 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT,
+
+	PERF_SAMPLE_BRANCH_MAX		= 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT,
 };
 
 #define PERF_SAMPLE_BRANCH_PLM_ALL \
@@ -240,6 +261,7 @@ enum perf_event_read_format {
 #define PERF_ATTR_SIZE_VER3	96	/* add: sample_regs_user */
 					/* add: sample_stack_user */
 #define PERF_ATTR_SIZE_VER4	104	/* add: sample_regs_intr */
+#define PERF_ATTR_SIZE_VER5	112	/* add: aux_watermark */
 
 /*
  * Hardware event_id to monitor via a performance monitoring event:
@@ -305,7 +327,8 @@ struct perf_event_attr {
 				exclude_callchain_user   : 1, /* exclude user callchains */
 				mmap2          :  1, /* include mmap with inode data     */
 				comm_exec      :  1, /* flag comm events that are due to an exec */
-				__reserved_1   : 39;
+				use_clockid    :  1, /* use @clockid for time fields */
+				__reserved_1   : 38;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -334,8 +357,7 @@ struct perf_event_attr {
 	 */
 	__u32	sample_stack_user;
 
-	/* Align to u64. */
-	__u32	__reserved_2;
+	__s32	clockid;
 	/*
 	 * Defines set of regs to dump for each sample
 	 * state captured on:
@@ -345,6 +367,12 @@ struct perf_event_attr {
 	 * See asm/perf_regs.h for details.
 	 */
 	__u64	sample_regs_intr;
+
+	/*
+	 * Wakeup watermark for AUX area
+	 */
+	__u32	aux_watermark;
+	__u32	__reserved_2;	/* align to __u64 */
 };
 
 #define perf_flags(attr)	(*(&(attr)->read_format + 1))
@@ -360,6 +388,7 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
 #define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)
 #define PERF_EVENT_IOC_ID		_IOR('$', 7, __u64 *)
+#define PERF_EVENT_IOC_SET_BPF		_IOW('$', 8, __u32)
 
 enum perf_event_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,
@@ -500,9 +529,30 @@ struct perf_event_mmap_page {
 	 * In this case the kernel will not over-write unread data.
 	 *
 	 * See perf_output_put_handle() for the data ordering.
+	 *
+	 * data_{offset,size} indicate the location and size of the perf record
+	 * buffer within the mmapped area.
 	 */
 	__u64   data_head;		/* head in the data section */
 	__u64	data_tail;		/* user-space written tail */
+	__u64	data_offset;		/* where the buffer starts */
+	__u64	data_size;		/* data buffer size */
+
+	/*
+	 * AUX area is defined by aux_{offset,size} fields that should be set
+	 * by the userspace, so that
+	 *
+	 *   aux_offset >= data_offset + data_size
+	 *
+	 * prior to mmap()ing it. Size of the mmap()ed area should be aux_size.
+	 *
+	 * Ring buffer pointers aux_{head,tail} have the same semantics as
+	 * data_{head,tail} and same ordering rules apply.
+	 */
+	__u64	aux_head;
+	__u64	aux_tail;
+	__u64	aux_offset;
+	__u64	aux_size;
 };
 
 #define PERF_RECORD_MISC_CPUMODE_MASK		(7 << 0)
@@ -725,6 +775,31 @@ enum perf_event_type {
 	 */
 	PERF_RECORD_MMAP2			= 10,
 
+	/*
+	 * Records that new data landed in the AUX buffer part.
+	 *
+	 * struct {
+	 * 	struct perf_event_header	header;
+	 *
+	 * 	u64				aux_offset;
+	 * 	u64				aux_size;
+	 *	u64				flags;
+	 * 	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_AUX				= 11,
+
+	/*
+	 * Indicates that instruction trace has started
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u32				pid;
+	 *	u32				tid;
+	 * };
+	 */
+	PERF_RECORD_ITRACE_START		= 12,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
@@ -742,6 +817,12 @@ enum perf_callchain_context {
 	PERF_CONTEXT_MAX		= (__u64)-4095,
 };
 
+/**
+ * PERF_RECORD_AUX::flags bits
+ */
+#define PERF_AUX_FLAG_TRUNCATED		0x01	/* record was truncated to fit */
+#define PERF_AUX_FLAG_OVERWRITE		0x02	/* snapshot from overwrite mode */
+
 #define PERF_FLAG_FD_NO_GROUP		(1UL << 0)
 #define PERF_FLAG_FD_OUTPUT		(1UL << 1)
 #define PERF_FLAG_PID_CGROUP		(1UL << 2) /* pid=cgroup id, per-cpu mode only */
diff --git a/init/Kconfig b/init/Kconfig
index 9a0592516f48..a905b7301e10 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1526,7 +1526,7 @@ config EVENTFD
 
 # syscall, maps, verifier
 config BPF_SYSCALL
-	bool "Enable bpf() system call" if EXPERT
+	bool "Enable bpf() system call"
 	select ANON_INODES
 	select BPF
 	default n
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 536edc2be307..504c10b990ef 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -16,6 +16,7 @@
 #include <linux/file.h>
 #include <linux/license.h>
 #include <linux/filter.h>
+#include <linux/version.h>
 
 static LIST_HEAD(bpf_map_types);
 
@@ -467,7 +468,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD log_buf
+#define	BPF_PROG_LOAD_LAST_FIELD kern_version
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
@@ -492,6 +493,10 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (attr->insn_cnt >= BPF_MAXINSNS)
 		return -EINVAL;
 
+	if (type == BPF_PROG_TYPE_KPROBE &&
+	    attr->kern_version != LINUX_VERSION_CODE)
+		return -EINVAL;
+
 	/* plain bpf_prog allocation */
 	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
 	if (!prog)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2fabc0627165..06917d537302 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -34,14 +34,16 @@
 #include <linux/syscalls.h>
 #include <linux/anon_inodes.h>
 #include <linux/kernel_stat.h>
+#include <linux/cgroup.h>
 #include <linux/perf_event.h>
 #include <linux/ftrace_event.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/mm_types.h>
-#include <linux/cgroup.h>
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/compat.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
 
 #include "internal.h"
 
@@ -153,7 +155,7 @@ enum event_type_t {
  */
 struct static_key_deferred perf_sched_events __read_mostly;
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -327,6 +329,11 @@ static inline u64 perf_clock(void)
 	return local_clock();
 }
 
+static inline u64 perf_event_clock(struct perf_event *event)
+{
+	return event->clock();
+}
+
 static inline struct perf_cpu_context *
 __get_cpu_context(struct perf_event_context *ctx)
 {
@@ -351,32 +358,6 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
 
 #ifdef CONFIG_CGROUP_PERF
 
-/*
- * perf_cgroup_info keeps track of time_enabled for a cgroup.
- * This is a per-cpu dynamically allocated data structure.
- */
-struct perf_cgroup_info {
-	u64				time;
-	u64				timestamp;
-};
-
-struct perf_cgroup {
-	struct cgroup_subsys_state	css;
-	struct perf_cgroup_info	__percpu *info;
-};
-
-/*
- * Must ensure cgroup is pinned (css_get) before calling
- * this function. In other words, we cannot call this function
- * if there is no cgroup event for the current CPU context.
- */
-static inline struct perf_cgroup *
-perf_cgroup_from_task(struct task_struct *task)
-{
-	return container_of(task_css(task, perf_event_cgrp_id),
-			    struct perf_cgroup, css);
-}
-
 static inline bool
 perf_cgroup_match(struct perf_event *event)
 {
@@ -905,6 +886,15 @@ static void get_ctx(struct perf_event_context *ctx)
 	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
 }
 
+static void free_ctx(struct rcu_head *head)
+{
+	struct perf_event_context *ctx;
+
+	ctx = container_of(head, struct perf_event_context, rcu_head);
+	kfree(ctx->task_ctx_data);
+	kfree(ctx);
+}
+
 static void put_ctx(struct perf_event_context *ctx)
 {
 	if (atomic_dec_and_test(&ctx->refcount)) {
@@ -912,7 +902,7 @@ static void put_ctx(struct perf_event_context *ctx)
 			put_ctx(ctx->parent_ctx);
 		if (ctx->task)
 			put_task_struct(ctx->task);
-		kfree_rcu(ctx, rcu_head);
+		call_rcu(&ctx->rcu_head, free_ctx);
 	}
 }
 
@@ -1239,9 +1229,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	if (is_cgroup_event(event))
 		ctx->nr_cgroups++;
 
-	if (has_branch_stack(event))
-		ctx->nr_branch_stack++;
-
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	ctx->nr_events++;
 	if (event->attr.inherit_stat)
@@ -1408,9 +1395,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 			cpuctx->cgrp = NULL;
 	}
 
-	if (has_branch_stack(event))
-		ctx->nr_branch_stack--;
-
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat--;
@@ -1847,6 +1831,7 @@ static void perf_set_shadow_time(struct perf_event *event,
 #define MAX_INTERRUPTS (~0ULL)
 
 static void perf_log_throttle(struct perf_event *event, int enable);
+static void perf_log_itrace_start(struct perf_event *event);
 
 static int
 event_sched_in(struct perf_event *event,
@@ -1881,6 +1866,12 @@ event_sched_in(struct perf_event *event,
 
 	perf_pmu_disable(event->pmu);
 
+	event->tstamp_running += tstamp - event->tstamp_stopped;
+
+	perf_set_shadow_time(event, ctx, tstamp);
+
+	perf_log_itrace_start(event);
+
 	if (event->pmu->add(event, PERF_EF_START)) {
 		event->state = PERF_EVENT_STATE_INACTIVE;
 		event->oncpu = -1;
@@ -1888,10 +1879,6 @@ event_sched_in(struct perf_event *event,
 		goto out;
 	}
 
-	event->tstamp_running += tstamp - event->tstamp_stopped;
-
-	perf_set_shadow_time(event, ctx, tstamp);
-
 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
 	if (!ctx->nr_active++)
@@ -2559,6 +2546,9 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 			next->perf_event_ctxp[ctxn] = ctx;
 			ctx->task = next;
 			next_ctx->task = task;
+
+			swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+
 			do_switch = 0;
 
 			perf_event_sync_stat(ctx, next_ctx);
@@ -2577,6 +2567,56 @@ unlock:
 	}
 }
 
+void perf_sched_cb_dec(struct pmu *pmu)
+{
+	this_cpu_dec(perf_sched_cb_usages);
+}
+
+void perf_sched_cb_inc(struct pmu *pmu)
+{
+	this_cpu_inc(perf_sched_cb_usages);
+}
+
+/*
+ * This function provides the context switch callback to the lower code
+ * layer. It is invoked ONLY when the context switch callback is enabled.
+ */
+static void perf_pmu_sched_task(struct task_struct *prev,
+				struct task_struct *next,
+				bool sched_in)
+{
+	struct perf_cpu_context *cpuctx;
+	struct pmu *pmu;
+	unsigned long flags;
+
+	if (prev == next)
+		return;
+
+	local_irq_save(flags);
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		if (pmu->sched_task) {
+			cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+			perf_pmu_disable(pmu);
+
+			pmu->sched_task(cpuctx->task_ctx, sched_in);
+
+			perf_pmu_enable(pmu);
+
+			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+		}
+	}
+
+	rcu_read_unlock();
+
+	local_irq_restore(flags);
+}
+
 #define for_each_task_context_nr(ctxn)					\
 	for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
 
@@ -2596,6 +2636,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
 {
 	int ctxn;
 
+	if (__this_cpu_read(perf_sched_cb_usages))
+		perf_pmu_sched_task(task, next, false);
+
 	for_each_task_context_nr(ctxn)
 		perf_event_context_sched_out(task, ctxn, next);
 
@@ -2754,64 +2797,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 	perf_ctx_unlock(cpuctx, ctx);
 }
 
-/*
- * When sampling the branck stack in system-wide, it may be necessary
- * to flush the stack on context switch. This happens when the branch
- * stack does not tag its entries with the pid of the current task.
- * Otherwise it becomes impossible to associate a branch entry with a
- * task. This ambiguity is more likely to appear when the branch stack
- * supports priv level filtering and the user sets it to monitor only
- * at the user level (which could be a useful measurement in system-wide
- * mode). In that case, the risk is high of having a branch stack with
- * branch from multiple tasks. Flushing may mean dropping the existing
- * entries or stashing them somewhere in the PMU specific code layer.
- *
- * This function provides the context switch callback to the lower code
- * layer. It is invoked ONLY when there is at least one system-wide context
- * with at least one active event using taken branch sampling.
- */
-static void perf_branch_stack_sched_in(struct task_struct *prev,
-				       struct task_struct *task)
-{
-	struct perf_cpu_context *cpuctx;
-	struct pmu *pmu;
-	unsigned long flags;
-
-	/* no need to flush branch stack if not changing task */
-	if (prev == task)
-		return;
-
-	local_irq_save(flags);
-
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
-		/*
-		 * check if the context has at least one
-		 * event using PERF_SAMPLE_BRANCH_STACK
-		 */
-		if (cpuctx->ctx.nr_branch_stack > 0
-		    && pmu->flush_branch_stack) {
-
-			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-
-			perf_pmu_disable(pmu);
-
-			pmu->flush_branch_stack();
-
-			perf_pmu_enable(pmu);
-
-			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-		}
-	}
-
-	rcu_read_unlock();
-
-	local_irq_restore(flags);
-}
-
 /*
  * Called from scheduler to add the events of the current task
  * with interrupts disabled.
@@ -2844,9 +2829,8 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
 		perf_cgroup_sched_in(prev, task);
 
-	/* check for system-wide branch_stack events */
-	if (atomic_read(this_cpu_ptr(&perf_branch_stack_events)))
-		perf_branch_stack_sched_in(prev, task);
+	if (__this_cpu_read(perf_sched_cb_usages))
+		perf_pmu_sched_task(prev, task, true);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -3220,7 +3204,10 @@ static void __perf_event_read(void *info)
 
 static inline u64 perf_event_count(struct perf_event *event)
 {
-	return local64_read(&event->count) + atomic64_read(&event->child_count);
+	if (event->pmu->count)
+		return event->pmu->count(event);
+
+	return __perf_event_count(event);
 }
 
 static u64 perf_event_read(struct perf_event *event)
@@ -3321,12 +3308,15 @@ errout:
  * Returns a matching context with refcount and pincount.
  */
 static struct perf_event_context *
-find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
+find_get_context(struct pmu *pmu, struct task_struct *task,
+		struct perf_event *event)
 {
 	struct perf_event_context *ctx, *clone_ctx = NULL;
 	struct perf_cpu_context *cpuctx;
+	void *task_ctx_data = NULL;
 	unsigned long flags;
 	int ctxn, err;
+	int cpu = event->cpu;
 
 	if (!task) {
 		/* Must be root to operate on a CPU event: */
@@ -3354,11 +3344,24 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
 	if (ctxn < 0)
 		goto errout;
 
+	if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+		task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
+		if (!task_ctx_data) {
+			err = -ENOMEM;
+			goto errout;
+		}
+	}
+
 retry:
 	ctx = perf_lock_task_context(task, ctxn, &flags);
 	if (ctx) {
 		clone_ctx = unclone_ctx(ctx);
 		++ctx->pin_count;
+
+		if (task_ctx_data && !ctx->task_ctx_data) {
+			ctx->task_ctx_data = task_ctx_data;
+			task_ctx_data = NULL;
+		}
 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
 
 		if (clone_ctx)
@@ -3369,6 +3372,11 @@ retry:
 		if (!ctx)
 			goto errout;
 
+		if (task_ctx_data) {
+			ctx->task_ctx_data = task_ctx_data;
+			task_ctx_data = NULL;
+		}
+
 		err = 0;
 		mutex_lock(&task->perf_event_mutex);
 		/*
@@ -3395,13 +3403,16 @@ retry:
 		}
 	}
 
+	kfree(task_ctx_data);
 	return ctx;
 
 errout:
+	kfree(task_ctx_data);
 	return ERR_PTR(err);
 }
 
 static void perf_event_free_filter(struct perf_event *event);
+static void perf_event_free_bpf_prog(struct perf_event *event);
 
 static void free_event_rcu(struct rcu_head *head)
 {
@@ -3411,10 +3422,10 @@ static void free_event_rcu(struct rcu_head *head)
 	if (event->ns)
 		put_pid_ns(event->ns);
 	perf_event_free_filter(event);
+	perf_event_free_bpf_prog(event);
 	kfree(event);
 }
 
-static void ring_buffer_put(struct ring_buffer *rb);
 static void ring_buffer_attach(struct perf_event *event,
 			       struct ring_buffer *rb);
 
@@ -3423,10 +3434,6 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu)
 	if (event->parent)
 		return;
 
-	if (has_branch_stack(event)) {
-		if (!(event->attach_state & PERF_ATTACH_TASK))
-			atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
-	}
 	if (is_cgroup_event(event))
 		atomic_dec(&per_cpu(perf_cgroup_events, cpu));
 }
@@ -3454,6 +3461,91 @@ static void unaccount_event(struct perf_event *event)
 	unaccount_event_cpu(event, event->cpu);
 }
 
+/*
+ * The following implement mutual exclusion of events on "exclusive" pmus
+ * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
+ * at a time, so we disallow creating events that might conflict, namely:
+ *
+ *  1) cpu-wide events in the presence of per-task events,
+ *  2) per-task events in the presence of cpu-wide events,
+ *  3) two matching events on the same context.
+ *
+ * The former two cases are handled in the allocation path (perf_event_alloc(),
+ * __free_event()), the latter -- before the first perf_install_in_context().
+ */
+static int exclusive_event_init(struct perf_event *event)
+{
+	struct pmu *pmu = event->pmu;
+
+	if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+		return 0;
+
+	/*
+	 * Prevent co-existence of per-task and cpu-wide events on the
+	 * same exclusive pmu.
+	 *
+	 * Negative pmu::exclusive_cnt means there are cpu-wide
+	 * events on this "exclusive" pmu, positive means there are
+	 * per-task events.
+	 *
+	 * Since this is called in perf_event_alloc() path, event::ctx
+	 * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
+	 * to mean "per-task event", because unlike other attach states it
+	 * never gets cleared.
+	 */
+	if (event->attach_state & PERF_ATTACH_TASK) {
+		if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
+			return -EBUSY;
+	} else {
+		if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
+			return -EBUSY;
+	}
+
+	return 0;
+}
+
+static void exclusive_event_destroy(struct perf_event *event)
+{
+	struct pmu *pmu = event->pmu;
+
+	if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+		return;
+
+	/* see comment in exclusive_event_init() */
+	if (event->attach_state & PERF_ATTACH_TASK)
+		atomic_dec(&pmu->exclusive_cnt);
+	else
+		atomic_inc(&pmu->exclusive_cnt);
+}
+
+static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
+{
+	if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
+	    (e1->cpu == e2->cpu ||
+	     e1->cpu == -1 ||
+	     e2->cpu == -1))
+		return true;
+	return false;
+}
+
+/* Called under the same ctx::mutex as perf_install_in_context() */
+static bool exclusive_event_installable(struct perf_event *event,
+					struct perf_event_context *ctx)
+{
+	struct perf_event *iter_event;
+	struct pmu *pmu = event->pmu;
+
+	if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+		return true;
+
+	list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
+		if (exclusive_event_match(iter_event, event))
+			return false;
+	}
+
+	return true;
+}
+
 static void __free_event(struct perf_event *event)
 {
 	if (!event->parent) {
@@ -3467,8 +3559,10 @@ static void __free_event(struct perf_event *event)
 	if (event->ctx)
 		put_ctx(event->ctx);
 
-	if (event->pmu)
+	if (event->pmu) {
+		exclusive_event_destroy(event);
 		module_put(event->pmu->module);
+	}
 
 	call_rcu(&event->rcu_head, free_event_rcu);
 }
@@ -3927,6 +4021,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
 static int perf_event_set_output(struct perf_event *event,
 				 struct perf_event *output_event);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
 
 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
 {
@@ -3980,6 +4075,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
 	case PERF_EVENT_IOC_SET_FILTER:
 		return perf_event_set_filter(event, (void __user *)arg);
 
+	case PERF_EVENT_IOC_SET_BPF:
+		return perf_event_set_bpf_prog(event, arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -4096,6 +4194,8 @@ static void perf_event_init_userpage(struct perf_event *event)
 	/* Allow new userspace to detect that bit 0 is deprecated */
 	userpg->cap_bit0_is_deprecated = 1;
 	userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
+	userpg->data_offset = PAGE_SIZE;
+	userpg->data_size = perf_data_size(rb);
 
 unlock:
 	rcu_read_unlock();
@@ -4263,7 +4363,7 @@ static void rb_free_rcu(struct rcu_head *rcu_head)
 	rb_free(rb);
 }
 
-static struct ring_buffer *ring_buffer_get(struct perf_event *event)
+struct ring_buffer *ring_buffer_get(struct perf_event *event)
 {
 	struct ring_buffer *rb;
 
@@ -4278,7 +4378,7 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
 	return rb;
 }
 
-static void ring_buffer_put(struct ring_buffer *rb)
+void ring_buffer_put(struct ring_buffer *rb)
 {
 	if (!atomic_dec_and_test(&rb->refcount))
 		return;
@@ -4295,6 +4395,9 @@ static void perf_mmap_open(struct vm_area_struct *vma)
 	atomic_inc(&event->mmap_count);
 	atomic_inc(&event->rb->mmap_count);
 
+	if (vma->vm_pgoff)
+		atomic_inc(&event->rb->aux_mmap_count);
+
 	if (event->pmu->event_mapped)
 		event->pmu->event_mapped(event);
 }
@@ -4319,6 +4422,20 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	if (event->pmu->event_unmapped)
 		event->pmu->event_unmapped(event);
 
+	/*
+	 * rb->aux_mmap_count will always drop before rb->mmap_count and
+	 * event->mmap_count, so it is ok to use event->mmap_mutex to
+	 * serialize with perf_mmap here.
+	 */
+	if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
+	    atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+		atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
+		vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
+
+		rb_free_aux(rb);
+		mutex_unlock(&event->mmap_mutex);
+	}
+
 	atomic_dec(&rb->mmap_count);
 
 	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
@@ -4392,7 +4509,7 @@ out_put:
 
 static const struct vm_operations_struct perf_mmap_vmops = {
 	.open		= perf_mmap_open,
-	.close		= perf_mmap_close,
+	.close		= perf_mmap_close, /* non mergable */
 	.fault		= perf_mmap_fault,
 	.page_mkwrite	= perf_mmap_fault,
 };
@@ -4403,10 +4520,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	unsigned long user_locked, user_lock_limit;
 	struct user_struct *user = current_user();
 	unsigned long locked, lock_limit;
-	struct ring_buffer *rb;
+	struct ring_buffer *rb = NULL;
 	unsigned long vma_size;
 	unsigned long nr_pages;
-	long user_extra, extra;
+	long user_extra = 0, extra = 0;
 	int ret = 0, flags = 0;
 
 	/*
@@ -4421,7 +4538,66 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 
 	vma_size = vma->vm_end - vma->vm_start;
-	nr_pages = (vma_size / PAGE_SIZE) - 1;
+
+	if (vma->vm_pgoff == 0) {
+		nr_pages = (vma_size / PAGE_SIZE) - 1;
+	} else {
+		/*
+		 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
+		 * mapped, all subsequent mappings should have the same size
+		 * and offset. Must be above the normal perf buffer.
+		 */
+		u64 aux_offset, aux_size;
+
+		if (!event->rb)
+			return -EINVAL;
+
+		nr_pages = vma_size / PAGE_SIZE;
+
+		mutex_lock(&event->mmap_mutex);
+		ret = -EINVAL;
+
+		rb = event->rb;
+		if (!rb)
+			goto aux_unlock;
+
+		aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
+		aux_size = ACCESS_ONCE(rb->user_page->aux_size);
+
+		if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
+			goto aux_unlock;
+
+		if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
+			goto aux_unlock;
+
+		/* already mapped with a different offset */
+		if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
+			goto aux_unlock;
+
+		if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
+			goto aux_unlock;
+
+		/* already mapped with a different size */
+		if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
+			goto aux_unlock;
+
+		if (!is_power_of_2(nr_pages))
+			goto aux_unlock;
+
+		if (!atomic_inc_not_zero(&rb->mmap_count))
+			goto aux_unlock;
+
+		if (rb_has_aux(rb)) {
+			atomic_inc(&rb->aux_mmap_count);
+			ret = 0;
+			goto unlock;
+		}
+
+		atomic_set(&rb->aux_mmap_count, 1);
+		user_extra = nr_pages;
+
+		goto accounting;
+	}
 
 	/*
 	 * If we have rb pages ensure they're a power-of-two number, so we
@@ -4433,9 +4609,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	if (vma_size != PAGE_SIZE * (1 + nr_pages))
 		return -EINVAL;
 
-	if (vma->vm_pgoff != 0)
-		return -EINVAL;
-
 	WARN_ON_ONCE(event->ctx->parent_ctx);
 again:
 	mutex_lock(&event->mmap_mutex);
@@ -4459,6 +4632,8 @@ again:
 	}
 
 	user_extra = nr_pages + 1;
+
+accounting:
 	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
 
 	/*
@@ -4468,7 +4643,6 @@ again:
 
 	user_locked = atomic_long_read(&user->locked_vm) + user_extra;
 
-	extra = 0;
 	if (user_locked > user_lock_limit)
 		extra = user_locked - user_lock_limit;
 
@@ -4482,35 +4656,46 @@ again:
 		goto unlock;
 	}
 
-	WARN_ON(event->rb);
+	WARN_ON(!rb && event->rb);
 
 	if (vma->vm_flags & VM_WRITE)
 		flags |= RING_BUFFER_WRITABLE;
 
-	rb = rb_alloc(nr_pages, 
-		event->attr.watermark ? event->attr.wakeup_watermark : 0,
-		event->cpu, flags);
-
 	if (!rb) {
-		ret = -ENOMEM;
-		goto unlock;
+		rb = rb_alloc(nr_pages,
+			      event->attr.watermark ? event->attr.wakeup_watermark : 0,
+			      event->cpu, flags);
+
+		if (!rb) {
+			ret = -ENOMEM;
+			goto unlock;
+		}
+
+		atomic_set(&rb->mmap_count, 1);
+		rb->mmap_user = get_current_user();
+		rb->mmap_locked = extra;
+
+		ring_buffer_attach(event, rb);
+
+		perf_event_init_userpage(event);
+		perf_event_update_userpage(event);
+	} else {
+		ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
+				   event->attr.aux_watermark, flags);
+		if (!ret)
+			rb->aux_mmap_locked = extra;
 	}
 
-	atomic_set(&rb->mmap_count, 1);
-	rb->mmap_locked = extra;
-	rb->mmap_user = get_current_user();
-
-	atomic_long_add(user_extra, &user->locked_vm);
-	vma->vm_mm->pinned_vm += extra;
-
-	ring_buffer_attach(event, rb);
-
-	perf_event_init_userpage(event);
-	perf_event_update_userpage(event);
-
 unlock:
-	if (!ret)
+	if (!ret) {
+		atomic_long_add(user_extra, &user->locked_vm);
+		vma->vm_mm->pinned_vm += extra;
+
 		atomic_inc(&event->mmap_count);
+	} else if (rb) {
+		atomic_dec(&rb->mmap_count);
+	}
+aux_unlock:
 	mutex_unlock(&event->mmap_mutex);
 
 	/*
@@ -4766,7 +4951,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
 	}
 
 	if (sample_type & PERF_SAMPLE_TIME)
-		data->time = perf_clock();
+		data->time = perf_event_clock(event);
 
 	if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
 		data->id = primary_event_id(event);
@@ -5344,6 +5529,8 @@ static void perf_event_task_output(struct perf_event *event,
 	task_event->event_id.tid = perf_event_tid(event, task);
 	task_event->event_id.ptid = perf_event_tid(event, current);
 
+	task_event->event_id.time = perf_event_clock(event);
+
 	perf_output_put(&handle, task_event->event_id);
 
 	perf_event__output_id_sample(event, &handle, &sample);
@@ -5377,7 +5564,7 @@ static void perf_event_task(struct task_struct *task,
 			/* .ppid */
 			/* .tid  */
 			/* .ptid */
-			.time = perf_clock(),
+			/* .time */
 		},
 	};
 
@@ -5732,6 +5919,40 @@ void perf_event_mmap(struct vm_area_struct *vma)
 	perf_event_mmap_event(&mmap_event);
 }
 
+void perf_event_aux_event(struct perf_event *event, unsigned long head,
+			  unsigned long size, u64 flags)
+{
+	struct perf_output_handle handle;
+	struct perf_sample_data sample;
+	struct perf_aux_event {
+		struct perf_event_header	header;
+		u64				offset;
+		u64				size;
+		u64				flags;
+	} rec = {
+		.header = {
+			.type = PERF_RECORD_AUX,
+			.misc = 0,
+			.size = sizeof(rec),
+		},
+		.offset		= head,
+		.size		= size,
+		.flags		= flags,
+	};
+	int ret;
+
+	perf_event_header__init_id(&rec.header, &sample, event);
+	ret = perf_output_begin(&handle, event, rec.header.size);
+
+	if (ret)
+		return;
+
+	perf_output_put(&handle, rec);
+	perf_event__output_id_sample(event, &handle, &sample);
+
+	perf_output_end(&handle);
+}
+
 /*
  * IRQ throttle logging
  */
@@ -5753,7 +5974,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
 			.misc = 0,
 			.size = sizeof(throttle_event),
 		},
-		.time		= perf_clock(),
+		.time		= perf_event_clock(event),
 		.id		= primary_event_id(event),
 		.stream_id	= event->id,
 	};
@@ -5773,6 +5994,44 @@ static void perf_log_throttle(struct perf_event *event, int enable)
 	perf_output_end(&handle);
 }
 
+static void perf_log_itrace_start(struct perf_event *event)
+{
+	struct perf_output_handle handle;
+	struct perf_sample_data sample;
+	struct perf_aux_event {
+		struct perf_event_header        header;
+		u32				pid;
+		u32				tid;
+	} rec;
+	int ret;
+
+	if (event->parent)
+		event = event->parent;
+
+	if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
+	    event->hw.itrace_started)
+		return;
+
+	event->hw.itrace_started = 1;
+
+	rec.header.type	= PERF_RECORD_ITRACE_START;
+	rec.header.misc	= 0;
+	rec.header.size	= sizeof(rec);
+	rec.pid	= perf_event_pid(event, current);
+	rec.tid	= perf_event_tid(event, current);
+
+	perf_event_header__init_id(&rec.header, &sample, event);
+	ret = perf_output_begin(&handle, event, rec.header.size);
+
+	if (ret)
+		return;
+
+	perf_output_put(&handle, rec);
+	perf_event__output_id_sample(event, &handle, &sample);
+
+	perf_output_end(&handle);
+}
+
 /*
  * Generic event overflow handling, sampling.
  */
@@ -6133,6 +6392,7 @@ static int perf_swevent_add(struct perf_event *event, int flags)
 	}
 
 	hlist_add_head_rcu(&event->hlist_entry, head);
+	perf_event_update_userpage(event);
 
 	return 0;
 }
@@ -6296,6 +6556,8 @@ static int perf_swevent_init(struct perf_event *event)
 static struct pmu perf_swevent = {
 	.task_ctx_nr	= perf_sw_context,
 
+	.capabilities	= PERF_PMU_CAP_NO_NMI,
+
 	.event_init	= perf_swevent_init,
 	.add		= perf_swevent_add,
 	.del		= perf_swevent_del,
@@ -6449,6 +6711,49 @@ static void perf_event_free_filter(struct perf_event *event)
 	ftrace_profile_free_filter(event);
 }
 
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+	struct bpf_prog *prog;
+
+	if (event->attr.type != PERF_TYPE_TRACEPOINT)
+		return -EINVAL;
+
+	if (event->tp_event->prog)
+		return -EEXIST;
+
+	if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
+		/* bpf programs can only be attached to kprobes */
+		return -EINVAL;
+
+	prog = bpf_prog_get(prog_fd);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	if (prog->aux->prog_type != BPF_PROG_TYPE_KPROBE) {
+		/* valid fd, but invalid bpf program type */
+		bpf_prog_put(prog);
+		return -EINVAL;
+	}
+
+	event->tp_event->prog = prog;
+
+	return 0;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+	struct bpf_prog *prog;
+
+	if (!event->tp_event)
+		return;
+
+	prog = event->tp_event->prog;
+	if (prog) {
+		event->tp_event->prog = NULL;
+		bpf_prog_put(prog);
+	}
+}
+
 #else
 
 static inline void perf_tp_register(void)
@@ -6464,6 +6769,14 @@ static void perf_event_free_filter(struct perf_event *event)
 {
 }
 
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+	return -ENOENT;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+}
 #endif /* CONFIG_EVENT_TRACING */
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
@@ -6602,6 +6915,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags)
 {
 	if (flags & PERF_EF_START)
 		cpu_clock_event_start(event, flags);
+	perf_event_update_userpage(event);
 
 	return 0;
 }
@@ -6638,6 +6952,8 @@ static int cpu_clock_event_init(struct perf_event *event)
 static struct pmu perf_cpu_clock = {
 	.task_ctx_nr	= perf_sw_context,
 
+	.capabilities	= PERF_PMU_CAP_NO_NMI,
+
 	.event_init	= cpu_clock_event_init,
 	.add		= cpu_clock_event_add,
 	.del		= cpu_clock_event_del,
@@ -6676,6 +6992,7 @@ static int task_clock_event_add(struct perf_event *event, int flags)
 {
 	if (flags & PERF_EF_START)
 		task_clock_event_start(event, flags);
+	perf_event_update_userpage(event);
 
 	return 0;
 }
@@ -6716,6 +7033,8 @@ static int task_clock_event_init(struct perf_event *event)
 static struct pmu perf_task_clock = {
 	.task_ctx_nr	= perf_sw_context,
 
+	.capabilities	= PERF_PMU_CAP_NO_NMI,
+
 	.event_init	= task_clock_event_init,
 	.add		= task_clock_event_add,
 	.del		= task_clock_event_del,
@@ -6993,6 +7312,7 @@ got_cpu_context:
 		pmu->event_idx = perf_event_idx_default;
 
 	list_add_rcu(&pmu->entry, &pmus);
+	atomic_set(&pmu->exclusive_cnt, 0);
 	ret = 0;
 unlock:
 	mutex_unlock(&pmus_lock);
@@ -7037,12 +7357,23 @@ EXPORT_SYMBOL_GPL(perf_pmu_unregister);
 
 static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
 {
+	struct perf_event_context *ctx = NULL;
 	int ret;
 
 	if (!try_module_get(pmu->module))
 		return -ENODEV;
+
+	if (event->group_leader != event) {
+		ctx = perf_event_ctx_lock(event->group_leader);
+		BUG_ON(!ctx);
+	}
+
 	event->pmu = pmu;
 	ret = pmu->event_init(event);
+
+	if (ctx)
+		perf_event_ctx_unlock(event->group_leader, ctx);
+
 	if (ret)
 		module_put(pmu->module);
 
@@ -7089,10 +7420,6 @@ static void account_event_cpu(struct perf_event *event, int cpu)
 	if (event->parent)
 		return;
 
-	if (has_branch_stack(event)) {
-		if (!(event->attach_state & PERF_ATTACH_TASK))
-			atomic_inc(&per_cpu(perf_branch_stack_events, cpu));
-	}
 	if (is_cgroup_event(event))
 		atomic_inc(&per_cpu(perf_cgroup_events, cpu));
 }
@@ -7131,7 +7458,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		 struct perf_event *group_leader,
 		 struct perf_event *parent_event,
 		 perf_overflow_handler_t overflow_handler,
-		 void *context)
+		 void *context, int cgroup_fd)
 {
 	struct pmu *pmu;
 	struct perf_event *event;
@@ -7186,18 +7513,18 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 	if (task) {
 		event->attach_state = PERF_ATTACH_TASK;
-
-		if (attr->type == PERF_TYPE_TRACEPOINT)
-			event->hw.tp_target = task;
-#ifdef CONFIG_HAVE_HW_BREAKPOINT
 		/*
-		 * hw_breakpoint is a bit difficult here..
+		 * XXX pmu::event_init needs to know what task to account to
+		 * and we cannot use the ctx information because we need the
+		 * pmu before we get a ctx.
 		 */
-		else if (attr->type == PERF_TYPE_BREAKPOINT)
-			event->hw.bp_target = task;
-#endif
+		event->hw.target = task;
 	}
 
+	event->clock = &local_clock;
+	if (parent_event)
+		event->clock = parent_event->clock;
+
 	if (!overflow_handler && parent_event) {
 		overflow_handler = parent_event->overflow_handler;
 		context = parent_event->overflow_handler_context;
@@ -7224,6 +7551,15 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
 		goto err_ns;
 
+	if (!has_branch_stack(event))
+		event->attr.branch_sample_type = 0;
+
+	if (cgroup_fd != -1) {
+		err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
+		if (err)
+			goto err_ns;
+	}
+
 	pmu = perf_init_event(event);
 	if (!pmu)
 		goto err_ns;
@@ -7232,21 +7568,30 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		goto err_ns;
 	}
 
+	err = exclusive_event_init(event);
+	if (err)
+		goto err_pmu;
+
 	if (!event->parent) {
 		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
 			err = get_callchain_buffers();
 			if (err)
-				goto err_pmu;
+				goto err_per_task;
 		}
 	}
 
 	return event;
 
+err_per_task:
+	exclusive_event_destroy(event);
+
 err_pmu:
 	if (event->destroy)
 		event->destroy(event);
 	module_put(pmu->module);
 err_ns:
+	if (is_cgroup_event(event))
+		perf_detach_cgroup(event);
 	if (event->ns)
 		put_pid_ns(event->ns);
 	kfree(event);
@@ -7409,6 +7754,19 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 	if (output_event->cpu == -1 && output_event->ctx != event->ctx)
 		goto out;
 
+	/*
+	 * Mixing clocks in the same buffer is trouble you don't need.
+	 */
+	if (output_event->clock != event->clock)
+		goto out;
+
+	/*
+	 * If both events generate aux data, they must be on the same PMU
+	 */
+	if (has_aux(event) && has_aux(output_event) &&
+	    event->pmu != output_event->pmu)
+		goto out;
+
 set:
 	mutex_lock(&event->mmap_mutex);
 	/* Can't redirect output if we've got an active mmap() */
@@ -7441,6 +7799,43 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b)
 	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
 }
 
+static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
+{
+	bool nmi_safe = false;
+
+	switch (clk_id) {
+	case CLOCK_MONOTONIC:
+		event->clock = &ktime_get_mono_fast_ns;
+		nmi_safe = true;
+		break;
+
+	case CLOCK_MONOTONIC_RAW:
+		event->clock = &ktime_get_raw_fast_ns;
+		nmi_safe = true;
+		break;
+
+	case CLOCK_REALTIME:
+		event->clock = &ktime_get_real_ns;
+		break;
+
+	case CLOCK_BOOTTIME:
+		event->clock = &ktime_get_boot_ns;
+		break;
+
+	case CLOCK_TAI:
+		event->clock = &ktime_get_tai_ns;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
+		return -EINVAL;
+
+	return 0;
+}
+
 /**
  * sys_perf_event_open - open a performance event, associate it to a task/cpu
  *
@@ -7465,6 +7860,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	int move_group = 0;
 	int err;
 	int f_flags = O_RDWR;
+	int cgroup_fd = -1;
 
 	/* for future expandability... */
 	if (flags & ~PERF_FLAG_ALL)
@@ -7530,21 +7926,16 @@ SYSCALL_DEFINE5(perf_event_open,
 
 	get_online_cpus();
 
+	if (flags & PERF_FLAG_PID_CGROUP)
+		cgroup_fd = pid;
+
 	event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
-				 NULL, NULL);
+				 NULL, NULL, cgroup_fd);
 	if (IS_ERR(event)) {
 		err = PTR_ERR(event);
 		goto err_cpus;
 	}
 
-	if (flags & PERF_FLAG_PID_CGROUP) {
-		err = perf_cgroup_connect(pid, event, &attr, group_leader);
-		if (err) {
-			__free_event(event);
-			goto err_cpus;
-		}
-	}
-
 	if (is_sampling_event(event)) {
 		if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
 			err = -ENOTSUPP;
@@ -7560,6 +7951,12 @@ SYSCALL_DEFINE5(perf_event_open,
 	 */
 	pmu = event->pmu;
 
+	if (attr.use_clockid) {
+		err = perf_event_set_clock(event, attr.clockid);
+		if (err)
+			goto err_alloc;
+	}
+
 	if (group_leader &&
 	    (is_software_event(event) != is_software_event(group_leader))) {
 		if (is_software_event(event)) {
@@ -7586,12 +7983,17 @@ SYSCALL_DEFINE5(perf_event_open,
 	/*
 	 * Get the target context (task or percpu):
 	 */
-	ctx = find_get_context(pmu, task, event->cpu);
+	ctx = find_get_context(pmu, task, event);
 	if (IS_ERR(ctx)) {
 		err = PTR_ERR(ctx);
 		goto err_alloc;
 	}
 
+	if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
+		err = -EBUSY;
+		goto err_context;
+	}
+
 	if (task) {
 		put_task_struct(task);
 		task = NULL;
@@ -7609,6 +8011,11 @@ SYSCALL_DEFINE5(perf_event_open,
 		 */
 		if (group_leader->group_leader != group_leader)
 			goto err_context;
+
+		/* All events in a group should have the same clock */
+		if (group_leader->clock != event->clock)
+			goto err_context;
+
 		/*
 		 * Do not allow to attach to a group in a different
 		 * task or CPU context:
@@ -7709,6 +8116,13 @@ SYSCALL_DEFINE5(perf_event_open,
 		get_ctx(ctx);
 	}
 
+	if (!exclusive_event_installable(event, ctx)) {
+		err = -EBUSY;
+		mutex_unlock(&ctx->mutex);
+		fput(event_file);
+		goto err_context;
+	}
+
 	perf_install_in_context(ctx, event, event->cpu);
 	perf_unpin_context(ctx);
 
@@ -7781,7 +8195,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 	 */
 
 	event = perf_event_alloc(attr, cpu, task, NULL, NULL,
-				 overflow_handler, context);
+				 overflow_handler, context, -1);
 	if (IS_ERR(event)) {
 		err = PTR_ERR(event);
 		goto err;
@@ -7792,7 +8206,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 
 	account_event(event);
 
-	ctx = find_get_context(event->pmu, task, cpu);
+	ctx = find_get_context(event->pmu, task, event);
 	if (IS_ERR(ctx)) {
 		err = PTR_ERR(ctx);
 		goto err_free;
@@ -7800,6 +8214,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
+	if (!exclusive_event_installable(event, ctx)) {
+		mutex_unlock(&ctx->mutex);
+		perf_unpin_context(ctx);
+		put_ctx(ctx);
+		err = -EBUSY;
+		goto err_free;
+	}
+
 	perf_install_in_context(ctx, event, cpu);
 	perf_unpin_context(ctx);
 	mutex_unlock(&ctx->mutex);
@@ -8142,7 +8564,7 @@ inherit_event(struct perf_event *parent_event,
 					   parent_event->cpu,
 					   child,
 					   group_leader, parent_event,
-				           NULL, NULL);
+					   NULL, NULL, -1);
 	if (IS_ERR(child_event))
 		return child_event;
 
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 9803a6600d49..92ce5f4ccc26 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -116,12 +116,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
  */
 static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
 {
-	struct task_struct *tsk = bp->hw.bp_target;
+	struct task_struct *tsk = bp->hw.target;
 	struct perf_event *iter;
 	int count = 0;
 
 	list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
-		if (iter->hw.bp_target == tsk &&
+		if (iter->hw.target == tsk &&
 		    find_slot_idx(iter) == type &&
 		    (iter->cpu < 0 || cpu == iter->cpu))
 			count += hw_breakpoint_weight(iter);
@@ -153,7 +153,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
 		int nr;
 
 		nr = info->cpu_pinned;
-		if (!bp->hw.bp_target)
+		if (!bp->hw.target)
 			nr += max_task_bp_pinned(cpu, type);
 		else
 			nr += task_bp_pinned(cpu, bp, type);
@@ -210,7 +210,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
 		weight = -weight;
 
 	/* Pinned counter cpu profiling */
-	if (!bp->hw.bp_target) {
+	if (!bp->hw.target) {
 		get_bp_info(bp->cpu, type)->cpu_pinned += weight;
 		return;
 	}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 569b218782ad..9f6ce9ba4a04 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -27,6 +27,7 @@ struct ring_buffer {
 	local_t				lost;		/* nr records lost   */
 
 	long				watermark;	/* wakeup watermark  */
+	long				aux_watermark;
 	/* poll crap */
 	spinlock_t			event_lock;
 	struct list_head		event_list;
@@ -35,6 +36,20 @@ struct ring_buffer {
 	unsigned long			mmap_locked;
 	struct user_struct		*mmap_user;
 
+	/* AUX area */
+	local_t				aux_head;
+	local_t				aux_nest;
+	local_t				aux_wakeup;
+	unsigned long			aux_pgoff;
+	int				aux_nr_pages;
+	int				aux_overwrite;
+	atomic_t			aux_mmap_count;
+	unsigned long			aux_mmap_locked;
+	void				(*free_aux)(void *);
+	atomic_t			aux_refcount;
+	void				**aux_pages;
+	void				*aux_priv;
+
 	struct perf_event_mmap_page	*user_page;
 	void				*data_pages[0];
 };
@@ -43,6 +58,19 @@ extern void rb_free(struct ring_buffer *rb);
 extern struct ring_buffer *
 rb_alloc(int nr_pages, long watermark, int cpu, int flags);
 extern void perf_event_wakeup(struct perf_event *event);
+extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
+			pgoff_t pgoff, int nr_pages, long watermark, int flags);
+extern void rb_free_aux(struct ring_buffer *rb);
+extern struct ring_buffer *ring_buffer_get(struct perf_event *event);
+extern void ring_buffer_put(struct ring_buffer *rb);
+
+static inline bool rb_has_aux(struct ring_buffer *rb)
+{
+	return !!rb->aux_nr_pages;
+}
+
+void perf_event_aux_event(struct perf_event *event, unsigned long head,
+			  unsigned long size, u64 flags);
 
 extern void
 perf_event_header__init_id(struct perf_event_header *header,
@@ -81,6 +109,11 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
 	return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
 }
 
+static inline unsigned long perf_aux_size(struct ring_buffer *rb)
+{
+	return rb->aux_nr_pages << PAGE_SHIFT;
+}
+
 #define DEFINE_OUTPUT_COPY(func_name, memcpy_func)			\
 static inline unsigned long						\
 func_name(struct perf_output_handle *handle,				\
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index eadb95ce7aac..232f00f273cb 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -243,14 +243,317 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
 	spin_lock_init(&rb->event_lock);
 }
 
+/*
+ * This is called before hardware starts writing to the AUX area to
+ * obtain an output handle and make sure there's room in the buffer.
+ * When the capture completes, call perf_aux_output_end() to commit
+ * the recorded data to the buffer.
+ *
+ * The ordering is similar to that of perf_output_{begin,end}, with
+ * the exception of (B), which should be taken care of by the pmu
+ * driver, since ordering rules will differ depending on hardware.
+ */
+void *perf_aux_output_begin(struct perf_output_handle *handle,
+			    struct perf_event *event)
+{
+	struct perf_event *output_event = event;
+	unsigned long aux_head, aux_tail;
+	struct ring_buffer *rb;
+
+	if (output_event->parent)
+		output_event = output_event->parent;
+
+	/*
+	 * Since this will typically be open across pmu::add/pmu::del, we
+	 * grab ring_buffer's refcount instead of holding rcu read lock
+	 * to make sure it doesn't disappear under us.
+	 */
+	rb = ring_buffer_get(output_event);
+	if (!rb)
+		return NULL;
+
+	if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount))
+		goto err;
+
+	/*
+	 * Nesting is not supported for AUX area, make sure nested
+	 * writers are caught early
+	 */
+	if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1)))
+		goto err_put;
+
+	aux_head = local_read(&rb->aux_head);
+
+	handle->rb = rb;
+	handle->event = event;
+	handle->head = aux_head;
+	handle->size = 0;
+
+	/*
+	 * In overwrite mode, AUX data stores do not depend on aux_tail,
+	 * therefore (A) control dependency barrier does not exist. The
+	 * (B) <-> (C) ordering is still observed by the pmu driver.
+	 */
+	if (!rb->aux_overwrite) {
+		aux_tail = ACCESS_ONCE(rb->user_page->aux_tail);
+		handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark;
+		if (aux_head - aux_tail < perf_aux_size(rb))
+			handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
+
+		/*
+		 * handle->size computation depends on aux_tail load; this forms a
+		 * control dependency barrier separating aux_tail load from aux data
+		 * store that will be enabled on successful return
+		 */
+		if (!handle->size) { /* A, matches D */
+			event->pending_disable = 1;
+			perf_output_wakeup(handle);
+			local_set(&rb->aux_nest, 0);
+			goto err_put;
+		}
+	}
+
+	return handle->rb->aux_priv;
+
+err_put:
+	rb_free_aux(rb);
+
+err:
+	ring_buffer_put(rb);
+	handle->event = NULL;
+
+	return NULL;
+}
+
+/*
+ * Commit the data written by hardware into the ring buffer by adjusting
+ * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
+ * pmu driver's responsibility to observe ordering rules of the hardware,
+ * so that all the data is externally visible before this is called.
+ */
+void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
+			 bool truncated)
+{
+	struct ring_buffer *rb = handle->rb;
+	unsigned long aux_head;
+	u64 flags = 0;
+
+	if (truncated)
+		flags |= PERF_AUX_FLAG_TRUNCATED;
+
+	/* in overwrite mode, driver provides aux_head via handle */
+	if (rb->aux_overwrite) {
+		flags |= PERF_AUX_FLAG_OVERWRITE;
+
+		aux_head = handle->head;
+		local_set(&rb->aux_head, aux_head);
+	} else {
+		aux_head = local_read(&rb->aux_head);
+		local_add(size, &rb->aux_head);
+	}
+
+	if (size || flags) {
+		/*
+		 * Only send RECORD_AUX if we have something useful to communicate
+		 */
+
+		perf_event_aux_event(handle->event, aux_head, size, flags);
+	}
+
+	aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
+
+	if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+		perf_output_wakeup(handle);
+		local_add(rb->aux_watermark, &rb->aux_wakeup);
+	}
+	handle->event = NULL;
+
+	local_set(&rb->aux_nest, 0);
+	rb_free_aux(rb);
+	ring_buffer_put(rb);
+}
+
+/*
+ * Skip over a given number of bytes in the AUX buffer, due to, for example,
+ * hardware's alignment constraints.
+ */
+int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
+{
+	struct ring_buffer *rb = handle->rb;
+	unsigned long aux_head;
+
+	if (size > handle->size)
+		return -ENOSPC;
+
+	local_add(size, &rb->aux_head);
+
+	aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
+	if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+		perf_output_wakeup(handle);
+		local_add(rb->aux_watermark, &rb->aux_wakeup);
+		handle->wakeup = local_read(&rb->aux_wakeup) +
+				 rb->aux_watermark;
+	}
+
+	handle->head = aux_head;
+	handle->size -= size;
+
+	return 0;
+}
+
+void *perf_get_aux(struct perf_output_handle *handle)
+{
+	/* this is only valid between perf_aux_output_begin and *_end */
+	if (!handle->event)
+		return NULL;
+
+	return handle->rb->aux_priv;
+}
+
+#define PERF_AUX_GFP	(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
+
+static struct page *rb_alloc_aux_page(int node, int order)
+{
+	struct page *page;
+
+	if (order > MAX_ORDER)
+		order = MAX_ORDER;
+
+	do {
+		page = alloc_pages_node(node, PERF_AUX_GFP, order);
+	} while (!page && order--);
+
+	if (page && order) {
+		/*
+		 * Communicate the allocation size to the driver
+		 */
+		split_page(page, order);
+		SetPagePrivate(page);
+		set_page_private(page, order);
+	}
+
+	return page;
+}
+
+static void rb_free_aux_page(struct ring_buffer *rb, int idx)
+{
+	struct page *page = virt_to_page(rb->aux_pages[idx]);
+
+	ClearPagePrivate(page);
+	page->mapping = NULL;
+	__free_page(page);
+}
+
+int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
+		 pgoff_t pgoff, int nr_pages, long watermark, int flags)
+{
+	bool overwrite = !(flags & RING_BUFFER_WRITABLE);
+	int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
+	int ret = -ENOMEM, max_order = 0;
+
+	if (!has_aux(event))
+		return -ENOTSUPP;
+
+	if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) {
+		/*
+		 * We need to start with the max_order that fits in nr_pages,
+		 * not the other way around, hence ilog2() and not get_order.
+		 */
+		max_order = ilog2(nr_pages);
+
+		/*
+		 * PMU requests more than one contiguous chunks of memory
+		 * for SW double buffering
+		 */
+		if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) &&
+		    !overwrite) {
+			if (!max_order)
+				return -EINVAL;
+
+			max_order--;
+		}
+	}
+
+	rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node);
+	if (!rb->aux_pages)
+		return -ENOMEM;
+
+	rb->free_aux = event->pmu->free_aux;
+	for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) {
+		struct page *page;
+		int last, order;
+
+		order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages));
+		page = rb_alloc_aux_page(node, order);
+		if (!page)
+			goto out;
+
+		for (last = rb->aux_nr_pages + (1 << page_private(page));
+		     last > rb->aux_nr_pages; rb->aux_nr_pages++)
+			rb->aux_pages[rb->aux_nr_pages] = page_address(page++);
+	}
+
+	rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages,
+					     overwrite);
+	if (!rb->aux_priv)
+		goto out;
+
+	ret = 0;
+
+	/*
+	 * aux_pages (and pmu driver's private data, aux_priv) will be
+	 * referenced in both producer's and consumer's contexts, thus
+	 * we keep a refcount here to make sure either of the two can
+	 * reference them safely.
+	 */
+	atomic_set(&rb->aux_refcount, 1);
+
+	rb->aux_overwrite = overwrite;
+	rb->aux_watermark = watermark;
+
+	if (!rb->aux_watermark && !rb->aux_overwrite)
+		rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1);
+
+out:
+	if (!ret)
+		rb->aux_pgoff = pgoff;
+	else
+		rb_free_aux(rb);
+
+	return ret;
+}
+
+static void __rb_free_aux(struct ring_buffer *rb)
+{
+	int pg;
+
+	if (rb->aux_priv) {
+		rb->free_aux(rb->aux_priv);
+		rb->free_aux = NULL;
+		rb->aux_priv = NULL;
+	}
+
+	for (pg = 0; pg < rb->aux_nr_pages; pg++)
+		rb_free_aux_page(rb, pg);
+
+	kfree(rb->aux_pages);
+	rb->aux_nr_pages = 0;
+}
+
+void rb_free_aux(struct ring_buffer *rb)
+{
+	if (atomic_dec_and_test(&rb->aux_refcount))
+		__rb_free_aux(rb);
+}
+
 #ifndef CONFIG_PERF_USE_VMALLOC
 
 /*
  * Back perf_mmap() with regular GFP_KERNEL-0 pages.
  */
 
-struct page *
-perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+static struct page *
+__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
 {
 	if (pgoff > rb->nr_pages)
 		return NULL;
@@ -340,8 +643,8 @@ static int data_page_nr(struct ring_buffer *rb)
 	return rb->nr_pages << page_order(rb);
 }
 
-struct page *
-perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+static struct page *
+__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
 {
 	/* The '>' counts in the user page. */
 	if (pgoff > data_page_nr(rb))
@@ -416,3 +719,19 @@ fail:
 }
 
 #endif
+
+struct page *
+perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+{
+	if (rb->aux_nr_pages) {
+		/* above AUX space */
+		if (pgoff > rb->aux_pgoff + rb->aux_nr_pages)
+			return NULL;
+
+		/* AUX space */
+		if (pgoff >= rb->aux_pgoff)
+			return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]);
+	}
+
+	return __perf_mmap_to_page(rb, pgoff);
+}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index fedbdd7d5d1e..3b9a48ae153a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -432,6 +432,14 @@ config UPROBE_EVENT
 	  This option is required if you plan to use perf-probe subcommand
 	  of perf tools on user space applications.
 
+config BPF_EVENTS
+	depends on BPF_SYSCALL
+	depends on KPROBE_EVENT
+	bool
+	default y
+	help
+	  This allows the user to attach BPF programs to kprobe events.
+
 config PROBE_EVENTS
 	def_bool n
 
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 98f26588255e..9b1044e936a6 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
 obj-$(CONFIG_TRACEPOINTS) += power-traces.o
 ifeq ($(CONFIG_PM),y)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
new file mode 100644
index 000000000000..2d56ce501632
--- /dev/null
+++ b/kernel/trace/bpf_trace.c
@@ -0,0 +1,222 @@
+/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/uaccess.h>
+#include <linux/ctype.h>
+#include "trace.h"
+
+static DEFINE_PER_CPU(int, bpf_prog_active);
+
+/**
+ * trace_call_bpf - invoke BPF program
+ * @prog: BPF program
+ * @ctx: opaque context pointer
+ *
+ * kprobe handlers execute BPF programs via this helper.
+ * Can be used from static tracepoints in the future.
+ *
+ * Return: BPF programs always return an integer which is interpreted by
+ * kprobe handler as:
+ * 0 - return from kprobe (event is filtered out)
+ * 1 - store kprobe event into ring buffer
+ * Other values are reserved and currently alias to 1
+ */
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+	unsigned int ret;
+
+	if (in_nmi()) /* not supported yet */
+		return 1;
+
+	preempt_disable();
+
+	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
+		/*
+		 * since some bpf program is already running on this cpu,
+		 * don't call into another bpf program (same or different)
+		 * and don't send kprobe event into ring-buffer,
+		 * so return zero here
+		 */
+		ret = 0;
+		goto out;
+	}
+
+	rcu_read_lock();
+	ret = BPF_PROG_RUN(prog, ctx);
+	rcu_read_unlock();
+
+ out:
+	__this_cpu_dec(bpf_prog_active);
+	preempt_enable();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(trace_call_bpf);
+
+static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	void *dst = (void *) (long) r1;
+	int size = (int) r2;
+	void *unsafe_ptr = (void *) (long) r3;
+
+	return probe_kernel_read(dst, unsafe_ptr, size);
+}
+
+static const struct bpf_func_proto bpf_probe_read_proto = {
+	.func		= bpf_probe_read,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_STACK,
+	.arg2_type	= ARG_CONST_STACK_SIZE,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	/* NMI safe access to clock monotonic */
+	return ktime_get_mono_fast_ns();
+}
+
+static const struct bpf_func_proto bpf_ktime_get_ns_proto = {
+	.func		= bpf_ktime_get_ns,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+};
+
+/*
+ * limited trace_printk()
+ * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed
+ */
+static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
+{
+	char *fmt = (char *) (long) r1;
+	int mod[3] = {};
+	int fmt_cnt = 0;
+	int i;
+
+	/*
+	 * bpf_check()->check_func_arg()->check_stack_boundary()
+	 * guarantees that fmt points to bpf program stack,
+	 * fmt_size bytes of it were initialized and fmt_size > 0
+	 */
+	if (fmt[--fmt_size] != 0)
+		return -EINVAL;
+
+	/* check format string for allowed specifiers */
+	for (i = 0; i < fmt_size; i++) {
+		if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
+			return -EINVAL;
+
+		if (fmt[i] != '%')
+			continue;
+
+		if (fmt_cnt >= 3)
+			return -EINVAL;
+
+		/* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
+		i++;
+		if (fmt[i] == 'l') {
+			mod[fmt_cnt]++;
+			i++;
+		} else if (fmt[i] == 'p') {
+			mod[fmt_cnt]++;
+			i++;
+			if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0)
+				return -EINVAL;
+			fmt_cnt++;
+			continue;
+		}
+
+		if (fmt[i] == 'l') {
+			mod[fmt_cnt]++;
+			i++;
+		}
+
+		if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x')
+			return -EINVAL;
+		fmt_cnt++;
+	}
+
+	return __trace_printk(1/* fake ip will not be printed */, fmt,
+			      mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3,
+			      mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4,
+			      mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5);
+}
+
+static const struct bpf_func_proto bpf_trace_printk_proto = {
+	.func		= bpf_trace_printk,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_STACK,
+	.arg2_type	= ARG_CONST_STACK_SIZE,
+};
+
+static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	case BPF_FUNC_probe_read:
+		return &bpf_probe_read_proto;
+	case BPF_FUNC_ktime_get_ns:
+		return &bpf_ktime_get_ns_proto;
+
+	case BPF_FUNC_trace_printk:
+		/*
+		 * this program might be calling bpf_trace_printk,
+		 * so allocate per-cpu printk buffers
+		 */
+		trace_printk_init_buffers();
+
+		return &bpf_trace_printk_proto;
+	default:
+		return NULL;
+	}
+}
+
+/* bpf+kprobe programs can access fields of 'struct pt_regs' */
+static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+	/* check bounds */
+	if (off < 0 || off >= sizeof(struct pt_regs))
+		return false;
+
+	/* only read is allowed */
+	if (type != BPF_READ)
+		return false;
+
+	/* disallow misaligned access */
+	if (off % size != 0)
+		return false;
+
+	return true;
+}
+
+static struct bpf_verifier_ops kprobe_prog_ops = {
+	.get_func_proto  = kprobe_prog_func_proto,
+	.is_valid_access = kprobe_prog_is_valid_access,
+};
+
+static struct bpf_prog_type_list kprobe_tl = {
+	.ops	= &kprobe_prog_ops,
+	.type	= BPF_PROG_TYPE_KPROBE,
+};
+
+static int __init register_kprobe_prog_ops(void)
+{
+	bpf_register_prog_type(&kprobe_tl);
+	return 0;
+}
+late_initcall(register_kprobe_prog_ops);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 9ba3f43f580e..d0ce590f06e1 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1135,11 +1135,15 @@ static void
 kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 {
 	struct ftrace_event_call *call = &tk->tp.call;
+	struct bpf_prog *prog = call->prog;
 	struct kprobe_trace_entry_head *entry;
 	struct hlist_head *head;
 	int size, __size, dsize;
 	int rctx;
 
+	if (prog && !trace_call_bpf(prog, regs))
+		return;
+
 	head = this_cpu_ptr(call->perf_events);
 	if (hlist_empty(head))
 		return;
@@ -1166,11 +1170,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 		    struct pt_regs *regs)
 {
 	struct ftrace_event_call *call = &tk->tp.call;
+	struct bpf_prog *prog = call->prog;
 	struct kretprobe_trace_entry_head *entry;
 	struct hlist_head *head;
 	int size, __size, dsize;
 	int rctx;
 
+	if (prog && !trace_call_bpf(prog, regs))
+		return;
+
 	head = this_cpu_ptr(call->perf_events);
 	if (hlist_empty(head))
 		return;
@@ -1287,7 +1295,7 @@ static int register_kprobe_event(struct trace_kprobe *tk)
 		kfree(call->print_fmt);
 		return -ENODEV;
 	}
-	call->flags = 0;
+	call->flags = TRACE_EVENT_FL_KPROBE;
 	call->class->reg = kprobe_register;
 	call->data = tk;
 	ret = trace_add_event_call(call);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 74865465e0b7..d60fe62ec4fa 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1006,7 +1006,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
 		return true;
 
 	list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
-		if (event->hw.tp_target->mm == mm)
+		if (event->hw.target->mm == mm)
 			return true;
 	}
 
@@ -1016,7 +1016,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
 static inline bool
 uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
 {
-	return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
+	return __uprobe_perf_filter(&tu->filter, event->hw.target->mm);
 }
 
 static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
@@ -1024,10 +1024,10 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
 	bool done;
 
 	write_lock(&tu->filter.rwlock);
-	if (event->hw.tp_target) {
+	if (event->hw.target) {
 		list_del(&event->hw.tp_list);
 		done = tu->filter.nr_systemwide ||
-			(event->hw.tp_target->flags & PF_EXITING) ||
+			(event->hw.target->flags & PF_EXITING) ||
 			uprobe_filter_event(tu, event);
 	} else {
 		tu->filter.nr_systemwide--;
@@ -1047,7 +1047,7 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
 	int err;
 
 	write_lock(&tu->filter.rwlock);
-	if (event->hw.tp_target) {
+	if (event->hw.target) {
 		/*
 		 * event->parent != NULL means copy_process(), we can avoid
 		 * uprobe_apply(). current->mm must be probed and we can rely
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 3174bf8e3538..9a056f5bc02c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -567,9 +567,37 @@ static void watchdog_nmi_disable(unsigned int cpu)
 		cpu0_err = 0;
 	}
 }
+
+void watchdog_nmi_enable_all(void)
+{
+	int cpu;
+
+	if (!watchdog_user_enabled)
+		return;
+
+	get_online_cpus();
+	for_each_online_cpu(cpu)
+		watchdog_nmi_enable(cpu);
+	put_online_cpus();
+}
+
+void watchdog_nmi_disable_all(void)
+{
+	int cpu;
+
+	if (!watchdog_running)
+		return;
+
+	get_online_cpus();
+	for_each_online_cpu(cpu)
+		watchdog_nmi_disable(cpu);
+	put_online_cpus();
+}
 #else
 static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
 static void watchdog_nmi_disable(unsigned int cpu) { return; }
+void watchdog_nmi_enable_all(void) {}
+void watchdog_nmi_disable_all(void) {}
 #endif /* CONFIG_HARDLOCKUP_DETECTOR */
 
 static struct smp_hotplug_thread watchdog_threads = {
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index b5b3600dcdf5..fe98fb226e6e 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -6,23 +6,39 @@ hostprogs-y := test_verifier test_maps
 hostprogs-y += sock_example
 hostprogs-y += sockex1
 hostprogs-y += sockex2
+hostprogs-y += tracex1
+hostprogs-y += tracex2
+hostprogs-y += tracex3
+hostprogs-y += tracex4
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
 sock_example-objs := sock_example.o libbpf.o
 sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
 sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
+tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
+tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
+tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
+tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
 always += sockex1_kern.o
 always += sockex2_kern.o
+always += tracex1_kern.o
+always += tracex2_kern.o
+always += tracex3_kern.o
+always += tracex4_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
 HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
 HOSTLOADLIBES_sockex1 += -lelf
 HOSTLOADLIBES_sockex2 += -lelf
+HOSTLOADLIBES_tracex1 += -lelf
+HOSTLOADLIBES_tracex2 += -lelf
+HOSTLOADLIBES_tracex3 += -lelf
+HOSTLOADLIBES_tracex4 += -lelf -lrt
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index ca0333146006..1c872bcf5a80 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -15,6 +15,12 @@ static int (*bpf_map_update_elem)(void *map, void *key, void *value,
 	(void *) BPF_FUNC_map_update_elem;
 static int (*bpf_map_delete_elem)(void *map, void *key) =
 	(void *) BPF_FUNC_map_delete_elem;
+static int (*bpf_probe_read)(void *dst, int size, void *unsafe_ptr) =
+	(void *) BPF_FUNC_probe_read;
+static unsigned long long (*bpf_ktime_get_ns)(void) =
+	(void *) BPF_FUNC_ktime_get_ns;
+static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
+	(void *) BPF_FUNC_trace_printk;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 1831d236382b..38dac5a53b51 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -8,29 +8,70 @@
 #include <unistd.h>
 #include <string.h>
 #include <stdbool.h>
+#include <stdlib.h>
 #include <linux/bpf.h>
 #include <linux/filter.h>
+#include <linux/perf_event.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <poll.h>
 #include "libbpf.h"
 #include "bpf_helpers.h"
 #include "bpf_load.h"
 
+#define DEBUGFS "/sys/kernel/debug/tracing/"
+
 static char license[128];
+static int kern_version;
 static bool processed_sec[128];
 int map_fd[MAX_MAPS];
 int prog_fd[MAX_PROGS];
+int event_fd[MAX_PROGS];
 int prog_cnt;
 
 static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 {
-	int fd;
 	bool is_socket = strncmp(event, "socket", 6) == 0;
+	bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
+	bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
+	enum bpf_prog_type prog_type;
+	char buf[256];
+	int fd, efd, err, id;
+	struct perf_event_attr attr = {};
 
-	if (!is_socket)
-		/* tracing events tbd */
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+
+	if (is_socket) {
+		prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+	} else if (is_kprobe || is_kretprobe) {
+		prog_type = BPF_PROG_TYPE_KPROBE;
+	} else {
+		printf("Unknown event '%s'\n", event);
 		return -1;
+	}
 
-	fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER,
-			   prog, size, license);
+	if (is_kprobe || is_kretprobe) {
+		if (is_kprobe)
+			event += 7;
+		else
+			event += 10;
+
+		snprintf(buf, sizeof(buf),
+			 "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
+			 is_kprobe ? 'p' : 'r', event, event);
+		err = system(buf);
+		if (err < 0) {
+			printf("failed to create kprobe '%s' error '%s'\n",
+			       event, strerror(errno));
+			return -1;
+		}
+	}
+
+	fd = bpf_prog_load(prog_type, prog, size, license, kern_version);
 
 	if (fd < 0) {
 		printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf);
@@ -39,6 +80,41 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 
 	prog_fd[prog_cnt++] = fd;
 
+	if (is_socket)
+		return 0;
+
+	strcpy(buf, DEBUGFS);
+	strcat(buf, "events/kprobes/");
+	strcat(buf, event);
+	strcat(buf, "/id");
+
+	efd = open(buf, O_RDONLY, 0);
+	if (efd < 0) {
+		printf("failed to open event %s\n", event);
+		return -1;
+	}
+
+	err = read(efd, buf, sizeof(buf));
+	if (err < 0 || err >= sizeof(buf)) {
+		printf("read from '%s' failed '%s'\n", event, strerror(errno));
+		return -1;
+	}
+
+	close(efd);
+
+	buf[err] = 0;
+	id = atoi(buf);
+	attr.config = id;
+
+	efd = perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
+	if (efd < 0) {
+		printf("event %d fd %d err %s\n", id, efd, strerror(errno));
+		return -1;
+	}
+	event_fd[prog_cnt - 1] = efd;
+	ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
+	ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
+
 	return 0;
 }
 
@@ -135,6 +211,9 @@ int load_bpf_file(char *path)
 	if (gelf_getehdr(elf, &ehdr) != &ehdr)
 		return 1;
 
+	/* clear all kprobes */
+	i = system("echo \"\" > /sys/kernel/debug/tracing/kprobe_events");
+
 	/* scan over all elf sections to get license and map info */
 	for (i = 1; i < ehdr.e_shnum; i++) {
 
@@ -149,6 +228,14 @@ int load_bpf_file(char *path)
 		if (strcmp(shname, "license") == 0) {
 			processed_sec[i] = true;
 			memcpy(license, data->d_buf, data->d_size);
+		} else if (strcmp(shname, "version") == 0) {
+			processed_sec[i] = true;
+			if (data->d_size != sizeof(int)) {
+				printf("invalid size of version section %zd\n",
+				       data->d_size);
+				return 1;
+			}
+			memcpy(&kern_version, data->d_buf, sizeof(int));
 		} else if (strcmp(shname, "maps") == 0) {
 			processed_sec[i] = true;
 			if (load_maps(data->d_buf, data->d_size))
@@ -178,7 +265,8 @@ int load_bpf_file(char *path)
 			if (parse_relo_and_apply(data, symbols, &shdr, insns))
 				continue;
 
-			if (memcmp(shname_prog, "events/", 7) == 0 ||
+			if (memcmp(shname_prog, "kprobe/", 7) == 0 ||
+			    memcmp(shname_prog, "kretprobe/", 10) == 0 ||
 			    memcmp(shname_prog, "socket", 6) == 0)
 				load_and_attach(shname_prog, insns, data_prog->d_size);
 		}
@@ -193,7 +281,8 @@ int load_bpf_file(char *path)
 		if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
 			continue;
 
-		if (memcmp(shname, "events/", 7) == 0 ||
+		if (memcmp(shname, "kprobe/", 7) == 0 ||
+		    memcmp(shname, "kretprobe/", 10) == 0 ||
 		    memcmp(shname, "socket", 6) == 0)
 			load_and_attach(shname, data->d_buf, data->d_size);
 	}
@@ -201,3 +290,23 @@ int load_bpf_file(char *path)
 	close(fd);
 	return 0;
 }
+
+void read_trace_pipe(void)
+{
+	int trace_fd;
+
+	trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
+	if (trace_fd < 0)
+		return;
+
+	while (1) {
+		static char buf[4096];
+		ssize_t sz;
+
+		sz = read(trace_fd, buf, sizeof(buf));
+		if (sz > 0) {
+			buf[sz] = 0;
+			puts(buf);
+		}
+	}
+}
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 27789a34f5e6..cbd7c2b532b9 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -6,6 +6,7 @@
 
 extern int map_fd[MAX_MAPS];
 extern int prog_fd[MAX_PROGS];
+extern int event_fd[MAX_PROGS];
 
 /* parses elf file compiled by llvm .c->.o
  * . parses 'maps' section and creates maps via BPF syscall
@@ -21,4 +22,6 @@ extern int prog_fd[MAX_PROGS];
  */
 int load_bpf_file(char *path);
 
+void read_trace_pipe(void);
+
 #endif
diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c
index 46d50b7ddf79..7e1efa7e2ed7 100644
--- a/samples/bpf/libbpf.c
+++ b/samples/bpf/libbpf.c
@@ -81,7 +81,7 @@ char bpf_log_buf[LOG_BUF_SIZE];
 
 int bpf_prog_load(enum bpf_prog_type prog_type,
 		  const struct bpf_insn *insns, int prog_len,
-		  const char *license)
+		  const char *license, int kern_version)
 {
 	union bpf_attr attr = {
 		.prog_type = prog_type,
@@ -93,6 +93,11 @@ int bpf_prog_load(enum bpf_prog_type prog_type,
 		.log_level = 1,
 	};
 
+	/* assign one field outside of struct init to make sure any
+	 * padding is zero initialized
+	 */
+	attr.kern_version = kern_version;
+
 	bpf_log_buf[0] = 0;
 
 	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
@@ -121,3 +126,10 @@ int open_raw_sock(const char *name)
 
 	return sock;
 }
+
+int perf_event_open(struct perf_event_attr *attr, int pid, int cpu,
+		    int group_fd, unsigned long flags)
+{
+	return syscall(__NR_perf_event_open, attr, pid, cpu,
+		       group_fd, flags);
+}
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
index 58c5fe1bdba1..ac7b09672b46 100644
--- a/samples/bpf/libbpf.h
+++ b/samples/bpf/libbpf.h
@@ -13,7 +13,7 @@ int bpf_get_next_key(int fd, void *key, void *next_key);
 
 int bpf_prog_load(enum bpf_prog_type prog_type,
 		  const struct bpf_insn *insns, int insn_len,
-		  const char *license);
+		  const char *license, int kern_version);
 
 #define LOG_BUF_SIZE 65536
 extern char bpf_log_buf[LOG_BUF_SIZE];
@@ -182,4 +182,7 @@ extern char bpf_log_buf[LOG_BUF_SIZE];
 /* create RAW socket and bind to interface 'name' */
 int open_raw_sock(const char *name);
 
+struct perf_event_attr;
+int perf_event_open(struct perf_event_attr *attr, int pid, int cpu,
+		    int group_fd, unsigned long flags);
 #endif
diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c
index c8ad0404416f..a0ce251c5390 100644
--- a/samples/bpf/sock_example.c
+++ b/samples/bpf/sock_example.c
@@ -56,7 +56,7 @@ static int test_sock(void)
 	};
 
 	prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, sizeof(prog),
-				"GPL");
+				"GPL", 0);
 	if (prog_fd < 0) {
 		printf("failed to load prog '%s'\n", strerror(errno));
 		goto cleanup;
diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c
index b96175e90363..740ce97cda5e 100644
--- a/samples/bpf/test_verifier.c
+++ b/samples/bpf/test_verifier.c
@@ -689,7 +689,7 @@ static int test(void)
 
 		prog_fd = bpf_prog_load(BPF_PROG_TYPE_UNSPEC, prog,
 					prog_len * sizeof(struct bpf_insn),
-					"GPL");
+					"GPL", 0);
 
 		if (tests[i].result == ACCEPT) {
 			if (prog_fd < 0) {
diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c
new file mode 100644
index 000000000000..31620463701a
--- /dev/null
+++ b/samples/bpf/tracex1_kern.c
@@ -0,0 +1,50 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+
+#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;})
+
+/* kprobe is NOT a stable ABI
+ * kernel functions can be removed, renamed or completely change semantics.
+ * Number of arguments and their positions can change, etc.
+ * In such case this bpf+kprobe example will no longer be meaningful
+ */
+SEC("kprobe/__netif_receive_skb_core")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	/* attaches to kprobe netif_receive_skb,
+	 * looks for packets on loobpack device and prints them
+	 */
+	char devname[IFNAMSIZ] = {};
+	struct net_device *dev;
+	struct sk_buff *skb;
+	int len;
+
+	/* non-portable! works for the given kernel only */
+	skb = (struct sk_buff *) ctx->di;
+
+	dev = _(skb->dev);
+
+	len = _(skb->len);
+
+	bpf_probe_read(devname, sizeof(devname), dev->name);
+
+	if (devname[0] == 'l' && devname[1] == 'o') {
+		char fmt[] = "skb %p len %d\n";
+		/* using bpf_trace_printk() for DEBUG ONLY */
+		bpf_trace_printk(fmt, sizeof(fmt), skb, len);
+	}
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c
new file mode 100644
index 000000000000..31a48183beea
--- /dev/null
+++ b/samples/bpf/tracex1_user.c
@@ -0,0 +1,25 @@
+#include <stdio.h>
+#include <linux/bpf.h>
+#include <unistd.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int ac, char **argv)
+{
+	FILE *f;
+	char filename[256];
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	f = popen("taskset 1 ping -c5 localhost", "r");
+	(void) f;
+
+	read_trace_pipe();
+
+	return 0;
+}
diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c
new file mode 100644
index 000000000000..19ec1cfc45db
--- /dev/null
+++ b/samples/bpf/tracex2_kern.c
@@ -0,0 +1,86 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(long),
+	.value_size = sizeof(long),
+	.max_entries = 1024,
+};
+
+/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
+ * example will no longer be meaningful
+ */
+SEC("kprobe/kfree_skb")
+int bpf_prog2(struct pt_regs *ctx)
+{
+	long loc = 0;
+	long init_val = 1;
+	long *value;
+
+	/* x64 specific: read ip of kfree_skb caller.
+	 * non-portable version of __builtin_return_address(0)
+	 */
+	bpf_probe_read(&loc, sizeof(loc), (void *)ctx->sp);
+
+	value = bpf_map_lookup_elem(&my_map, &loc);
+	if (value)
+		*value += 1;
+	else
+		bpf_map_update_elem(&my_map, &loc, &init_val, BPF_ANY);
+	return 0;
+}
+
+static unsigned int log2(unsigned int v)
+{
+	unsigned int r;
+	unsigned int shift;
+
+	r = (v > 0xFFFF) << 4; v >>= r;
+	shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+	shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+	shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+	r |= (v >> 1);
+	return r;
+}
+
+static unsigned int log2l(unsigned long v)
+{
+	unsigned int hi = v >> 32;
+	if (hi)
+		return log2(hi) + 32;
+	else
+		return log2(v);
+}
+
+struct bpf_map_def SEC("maps") my_hist_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(long),
+	.max_entries = 64,
+};
+
+SEC("kprobe/sys_write")
+int bpf_prog3(struct pt_regs *ctx)
+{
+	long write_size = ctx->dx; /* arg3 */
+	long init_val = 1;
+	long *value;
+	u32 index = log2l(write_size);
+
+	value = bpf_map_lookup_elem(&my_hist_map, &index);
+	if (value)
+		__sync_fetch_and_add(value, 1);
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c
new file mode 100644
index 000000000000..91b8d0896fbb
--- /dev/null
+++ b/samples/bpf/tracex2_user.c
@@ -0,0 +1,95 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_INDEX	64
+#define MAX_STARS	38
+
+static void stars(char *str, long val, long max, int width)
+{
+	int i;
+
+	for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+		str[i] = '*';
+	if (val > max)
+		str[i - 1] = '+';
+	str[i] = '\0';
+}
+
+static void print_hist(int fd)
+{
+	int key;
+	long value;
+	long data[MAX_INDEX] = {};
+	char starstr[MAX_STARS];
+	int i;
+	int max_ind = -1;
+	long max_value = 0;
+
+	for (key = 0; key < MAX_INDEX; key++) {
+		bpf_lookup_elem(fd, &key, &value);
+		data[key] = value;
+		if (value && key > max_ind)
+			max_ind = key;
+		if (value > max_value)
+			max_value = value;
+	}
+
+	printf("           syscall write() stats\n");
+	printf("     byte_size       : count     distribution\n");
+	for (i = 1; i <= max_ind + 1; i++) {
+		stars(starstr, data[i - 1], max_value, MAX_STARS);
+		printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+		       (1l << i) >> 1, (1l << i) - 1, data[i - 1],
+		       MAX_STARS, starstr);
+	}
+}
+static void int_exit(int sig)
+{
+	print_hist(map_fd[1]);
+	exit(0);
+}
+
+int main(int ac, char **argv)
+{
+	char filename[256];
+	long key, next_key, value;
+	FILE *f;
+	int i;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	signal(SIGINT, int_exit);
+
+	/* start 'ping' in the background to have some kfree_skb events */
+	f = popen("ping -c5 localhost", "r");
+	(void) f;
+
+	/* start 'dd' in the background to have plenty of 'write' syscalls */
+	f = popen("dd if=/dev/zero of=/dev/null count=5000000", "r");
+	(void) f;
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	for (i = 0; i < 5; i++) {
+		key = 0;
+		while (bpf_get_next_key(map_fd[0], &key, &next_key) == 0) {
+			bpf_lookup_elem(map_fd[0], &next_key, &value);
+			printf("location 0x%lx count %ld\n", next_key, value);
+			key = next_key;
+		}
+		if (key)
+			printf("\n");
+		sleep(1);
+	}
+	print_hist(map_fd[1]);
+
+	return 0;
+}
diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c
new file mode 100644
index 000000000000..255ff2792366
--- /dev/null
+++ b/samples/bpf/tracex3_kern.c
@@ -0,0 +1,89 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(long),
+	.value_size = sizeof(u64),
+	.max_entries = 4096,
+};
+
+/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
+ * example will no longer be meaningful
+ */
+SEC("kprobe/blk_mq_start_request")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	long rq = ctx->di;
+	u64 val = bpf_ktime_get_ns();
+
+	bpf_map_update_elem(&my_map, &rq, &val, BPF_ANY);
+	return 0;
+}
+
+static unsigned int log2l(unsigned long long n)
+{
+#define S(k) if (n >= (1ull << k)) { i += k; n >>= k; }
+	int i = -(n == 0);
+	S(32); S(16); S(8); S(4); S(2); S(1);
+	return i;
+#undef S
+}
+
+#define SLOTS 100
+
+struct bpf_map_def SEC("maps") lat_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(u64),
+	.max_entries = SLOTS,
+};
+
+SEC("kprobe/blk_update_request")
+int bpf_prog2(struct pt_regs *ctx)
+{
+	long rq = ctx->di;
+	u64 *value, l, base;
+	u32 index;
+
+	value = bpf_map_lookup_elem(&my_map, &rq);
+	if (!value)
+		return 0;
+
+	u64 cur_time = bpf_ktime_get_ns();
+	u64 delta = cur_time - *value;
+
+	bpf_map_delete_elem(&my_map, &rq);
+
+	/* the lines below are computing index = log10(delta)*10
+	 * using integer arithmetic
+	 * index = 29 ~ 1 usec
+	 * index = 59 ~ 1 msec
+	 * index = 89 ~ 1 sec
+	 * index = 99 ~ 10sec or more
+	 * log10(x)*10 = log2(x)*10/log2(10) = log2(x)*3
+	 */
+	l = log2l(delta);
+	base = 1ll << l;
+	index = (l * 64 + (delta - base) * 64 / base) * 3 / 64;
+
+	if (index >= SLOTS)
+		index = SLOTS - 1;
+
+	value = bpf_map_lookup_elem(&lat_map, &index);
+	if (value)
+		__sync_fetch_and_add((long *)value, 1);
+
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c
new file mode 100644
index 000000000000..0aaa933ab938
--- /dev/null
+++ b/samples/bpf/tracex3_user.c
@@ -0,0 +1,150 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+#define SLOTS 100
+
+static void clear_stats(int fd)
+{
+	__u32 key;
+	__u64 value = 0;
+
+	for (key = 0; key < SLOTS; key++)
+		bpf_update_elem(fd, &key, &value, BPF_ANY);
+}
+
+const char *color[] = {
+	"\033[48;5;255m",
+	"\033[48;5;252m",
+	"\033[48;5;250m",
+	"\033[48;5;248m",
+	"\033[48;5;246m",
+	"\033[48;5;244m",
+	"\033[48;5;242m",
+	"\033[48;5;240m",
+	"\033[48;5;238m",
+	"\033[48;5;236m",
+	"\033[48;5;234m",
+	"\033[48;5;232m",
+};
+const int num_colors = ARRAY_SIZE(color);
+
+const char nocolor[] = "\033[00m";
+
+const char *sym[] = {
+	" ",
+	" ",
+	".",
+	".",
+	"*",
+	"*",
+	"o",
+	"o",
+	"O",
+	"O",
+	"#",
+	"#",
+};
+
+bool full_range = false;
+bool text_only = false;
+
+static void print_banner(void)
+{
+	if (full_range)
+		printf("|1ns     |10ns     |100ns    |1us      |10us     |100us"
+		       "    |1ms      |10ms     |100ms    |1s       |10s\n");
+	else
+		printf("|1us      |10us     |100us    |1ms      |10ms     "
+		       "|100ms    |1s       |10s\n");
+}
+
+static void print_hist(int fd)
+{
+	__u32 key;
+	__u64 value;
+	__u64 cnt[SLOTS];
+	__u64 max_cnt = 0;
+	__u64 total_events = 0;
+
+	for (key = 0; key < SLOTS; key++) {
+		value = 0;
+		bpf_lookup_elem(fd, &key, &value);
+		cnt[key] = value;
+		total_events += value;
+		if (value > max_cnt)
+			max_cnt = value;
+	}
+	clear_stats(fd);
+	for (key = full_range ? 0 : 29; key < SLOTS; key++) {
+		int c = num_colors * cnt[key] / (max_cnt + 1);
+
+		if (text_only)
+			printf("%s", sym[c]);
+		else
+			printf("%s %s", color[c], nocolor);
+	}
+	printf(" # %lld\n", total_events);
+}
+
+int main(int ac, char **argv)
+{
+	char filename[256];
+	int i;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	for (i = 1; i < ac; i++) {
+		if (strcmp(argv[i], "-a") == 0) {
+			full_range = true;
+		} else if (strcmp(argv[i], "-t") == 0) {
+			text_only = true;
+		} else if (strcmp(argv[i], "-h") == 0) {
+			printf("Usage:\n"
+			       "  -a display wider latency range\n"
+			       "  -t text only\n");
+			return 1;
+		}
+	}
+
+	printf("  heatmap of IO latency\n");
+	if (text_only)
+		printf("  %s", sym[num_colors - 1]);
+	else
+		printf("  %s %s", color[num_colors - 1], nocolor);
+	printf(" - many events with this latency\n");
+
+	if (text_only)
+		printf("  %s", sym[0]);
+	else
+		printf("  %s %s", color[0], nocolor);
+	printf(" - few events\n");
+
+	for (i = 0; ; i++) {
+		if (i % 20 == 0)
+			print_banner();
+		print_hist(map_fd[1]);
+		sleep(2);
+	}
+
+	return 0;
+}
diff --git a/samples/bpf/tracex4_kern.c b/samples/bpf/tracex4_kern.c
new file mode 100644
index 000000000000..126b80512228
--- /dev/null
+++ b/samples/bpf/tracex4_kern.c
@@ -0,0 +1,54 @@
+/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/ptrace.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct pair {
+	u64 val;
+	u64 ip;
+};
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(long),
+	.value_size = sizeof(struct pair),
+	.max_entries = 1000000,
+};
+
+/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
+ * example will no longer be meaningful
+ */
+SEC("kprobe/kmem_cache_free")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	long ptr = ctx->si;
+
+	bpf_map_delete_elem(&my_map, &ptr);
+	return 0;
+}
+
+SEC("kretprobe/kmem_cache_alloc_node")
+int bpf_prog2(struct pt_regs *ctx)
+{
+	long ptr = ctx->ax;
+	long ip = 0;
+
+	/* get ip address of kmem_cache_alloc_node() caller */
+	bpf_probe_read(&ip, sizeof(ip), (void *)(ctx->bp + sizeof(ip)));
+
+	struct pair v = {
+		.val = bpf_ktime_get_ns(),
+		.ip = ip,
+	};
+
+	bpf_map_update_elem(&my_map, &ptr, &v, BPF_ANY);
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c
new file mode 100644
index 000000000000..bc4a3bdea6ed
--- /dev/null
+++ b/samples/bpf/tracex4_user.c
@@ -0,0 +1,69 @@
+/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <time.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+struct pair {
+	long long val;
+	__u64 ip;
+};
+
+static __u64 time_get_ns(void)
+{
+	struct timespec ts;
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	return ts.tv_sec * 1000000000ull + ts.tv_nsec;
+}
+
+static void print_old_objects(int fd)
+{
+	long long val = time_get_ns();
+	__u64 key, next_key;
+	struct pair v;
+
+	key = write(1, "\e[1;1H\e[2J", 12); /* clear screen */
+
+	key = -1;
+	while (bpf_get_next_key(map_fd[0], &key, &next_key) == 0) {
+		bpf_lookup_elem(map_fd[0], &next_key, &v);
+		key = next_key;
+		if (val - v.val < 1000000000ll)
+			/* object was allocated more then 1 sec ago */
+			continue;
+		printf("obj 0x%llx is %2lldsec old was allocated at ip %llx\n",
+		       next_key, (val - v.val) / 1000000000ll, v.ip);
+	}
+}
+
+int main(int ac, char **argv)
+{
+	char filename[256];
+	int i;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	for (i = 0; ; i++) {
+		print_old_objects(map_fd[1]);
+		sleep(1);
+	}
+
+	return 0;
+}
diff --git a/tools/build/Build.include b/tools/build/Build.include
new file mode 100644
index 000000000000..4c8daaccb82a
--- /dev/null
+++ b/tools/build/Build.include
@@ -0,0 +1,81 @@
+###
+# build: Generic definitions
+#
+#  Lots of this code have been borrowed or heavily inspired from parts
+#  of kbuild code, which is not credited, but mostly developed by:
+#
+#  Copyright (C) Sam Ravnborg <sam@mars.ravnborg.org>, 2015
+#  Copyright (C) Linus Torvalds <torvalds@linux-foundation.org>, 2015
+#
+
+###
+# Convenient variables
+comma   := ,
+squote  := '
+
+###
+# Name of target with a '.' as filename prefix. foo/bar.o => foo/.bar.o
+dot-target = $(dir $@).$(notdir $@)
+
+###
+# filename of target with directory and extension stripped
+basetarget = $(basename $(notdir $@))
+
+###
+# The temporary file to save gcc -MD generated dependencies must not
+# contain a comma
+depfile = $(subst $(comma),_,$(dot-target).d)
+
+###
+# Check if both arguments has same arguments. Result is empty string if equal.
+arg-check = $(strip $(filter-out $(cmd_$(1)), $(cmd_$@)) \
+                    $(filter-out $(cmd_$@),   $(cmd_$(1))) )
+
+###
+# Escape single quote for use in echo statements
+escsq = $(subst $(squote),'\$(squote)',$1)
+
+# Echo command
+# Short version is used, if $(quiet) equals `quiet_', otherwise full one.
+echo-cmd = $(if $($(quiet)cmd_$(1)),\
+           echo '  $(call escsq,$($(quiet)cmd_$(1)))';)
+
+###
+# Replace >$< with >$$< to preserve $ when reloading the .cmd file
+# (needed for make)
+# Replace >#< with >\#< to avoid starting a comment in the .cmd file
+# (needed for make)
+# Replace >'< with >'\''< to be able to enclose the whole string in '...'
+# (needed for the shell)
+make-cmd = $(call escsq,$(subst \#,\\\#,$(subst $$,$$$$,$(cmd_$(1)))))
+
+###
+# Find any prerequisites that is newer than target or that does not exist.
+# PHONY targets skipped in both cases.
+any-prereq = $(filter-out $(PHONY),$?) $(filter-out $(PHONY) $(wildcard $^),$^)
+
+###
+# if_changed_dep  - execute command if any prerequisite is newer than
+#                   target, or command line has changed and update
+#                   dependencies in the cmd file
+if_changed_dep = $(if $(strip $(any-prereq) $(arg-check)),         \
+	@set -e;                                                   \
+	$(echo-cmd) $(cmd_$(1));                                   \
+	cat $(depfile) > $(dot-target).cmd;                        \
+	printf '%s\n' 'cmd_$@ := $(make-cmd)' >> $(dot-target).cmd)
+
+# if_changed      - execute command if any prerequisite is newer than
+#                   target, or command line has changed
+if_changed = $(if $(strip $(any-prereq) $(arg-check)),             \
+	@set -e;                                                   \
+	$(echo-cmd) $(cmd_$(1));                                   \
+	printf '%s\n' 'cmd_$@ := $(make-cmd)' > $(dot-target).cmd)
+
+###
+# C flags to be used in rule definitions, includes:
+# - depfile generation
+# - global $(CFLAGS)
+# - per target C flags
+# - per object C flags
+# - BUILD_STR macro to allow '-D"$(variable)"' constructs
+c_flags = -Wp,-MD,$(depfile),-MT,$@ $(CFLAGS) -D"BUILD_STR(s)=\#s" $(CFLAGS_$(basetarget).o) $(CFLAGS_$(obj))
diff --git a/tools/build/Documentation/Build.txt b/tools/build/Documentation/Build.txt
new file mode 100644
index 000000000000..00ad2d608727
--- /dev/null
+++ b/tools/build/Documentation/Build.txt
@@ -0,0 +1,139 @@
+Build Framework
+===============
+
+The perf build framework was adopted from the kernel build system, hence the
+idea and the way how objects are built is the same.
+
+Basically the user provides set of 'Build' files that list objects and
+directories to nest for specific target to be build.
+
+Unlike the kernel we don't have a single build object 'obj-y' list that where
+we setup source objects, but we support more. This allows one 'Build' file to
+carry a sources list for multiple build objects.
+
+a) Build framework makefiles
+----------------------------
+
+The build framework consists of 2 Makefiles:
+
+  Build.include
+  Makefile.build
+
+While the 'Build.include' file contains just some generic definitions, the
+'Makefile.build' file is the makefile used from the outside. It's
+interface/usage is following:
+
+  $ make -f tools/build/Makefile srctree=$(KSRC) dir=$(DIR) obj=$(OBJECT)
+
+where:
+
+  KSRC   - is the path to kernel sources
+  DIR    - is the path to the project to be built
+  OBJECT - is the name of the build object
+
+When succefully finished the $(DIR) directory contains the final object file
+called $(OBJECT)-in.o:
+
+  $ ls $(DIR)/$(OBJECT)-in.o
+
+which includes all compiled sources described in 'Build' makefiles.
+
+a) Build makefiles
+------------------
+
+The user supplies 'Build' makefiles that contains a objects list, and connects
+the build to nested directories.
+
+Assume we have the following project structure:
+
+  ex/a.c
+    /b.c
+    /c.c
+    /d.c
+    /arch/e.c
+    /arch/f.c
+
+Out of which you build the 'ex' binary ' and the 'libex.a' library:
+
+  'ex'      - consists of 'a.o', 'b.o' and libex.a
+  'libex.a' - consists of 'c.o', 'd.o', 'e.o' and 'f.o'
+
+The build framework does not create the 'ex' and 'libex.a' binaries for you, it
+only prepares proper objects to be compiled and grouped together.
+
+To follow the above example, the user provides following 'Build' files:
+
+  ex/Build:
+    ex-y += a.o
+    ex-y += b.o
+
+    libex-y += c.o
+    libex-y += d.o
+    libex-y += arch/
+
+  ex/arch/Build:
+    libex-y += e.o
+    libex-y += f.o
+
+and runs:
+
+  $ make -f tools/build/Makefile.build dir=. obj=ex
+  $ make -f tools/build/Makefile.build dir=. obj=libex
+
+which creates the following objects:
+
+  ex/ex-in.o
+  ex/libex-in.o
+
+that contain request objects names in Build files.
+
+It's only a matter of 2 single commands to create the final binaries:
+
+  $ ar  rcs libex.a libex-in.o
+  $ gcc -o ex ex-in.o libex.a
+
+You can check the 'ex' example in 'tools/build/tests/ex' for more details.
+
+b) Rules
+--------
+
+The build framework provides standard compilation rules to handle .S and .c
+compilation.
+
+It's possible to include special rule if needed (like we do for flex or bison
+code generation).
+
+c) CFLAGS
+---------
+
+It's possible to alter the standard object C flags in the following way:
+
+  CFLAGS_perf.o += '...' - alters CFLAGS for perf.o object
+  CFLAGS_gtk += '...'    - alters CFLAGS for gtk build object
+
+This C flags changes has the scope of the Build makefile they are defined in.
+
+
+d) Dependencies
+---------------
+
+For each built object file 'a.o' the '.a.cmd' is created and holds:
+
+  - Command line used to built that object
+    (for each object)
+
+  - Dependency rules generated by 'gcc -Wp,-MD,...'
+    (for compiled object)
+
+All existing '.cmd' files are included in the Build process to follow properly
+the dependencies and trigger a rebuild when necessary.
+
+
+e) Single rules
+---------------
+
+It's possible to build single object file by choice, like:
+
+  $ make util/map.o    # objects
+  $ make util/map.i    # preprocessor
+  $ make util/map.s    # assembly
diff --git a/tools/build/Makefile.build b/tools/build/Makefile.build
new file mode 100644
index 000000000000..10df57237a66
--- /dev/null
+++ b/tools/build/Makefile.build
@@ -0,0 +1,130 @@
+###
+# Main build makefile.
+#
+#  Lots of this code have been borrowed or heavily inspired from parts
+#  of kbuild code, which is not credited, but mostly developed by:
+#
+#  Copyright (C) Sam Ravnborg <sam@mars.ravnborg.org>, 2015
+#  Copyright (C) Linus Torvalds <torvalds@linux-foundation.org>, 2015
+#
+
+PHONY := __build
+__build:
+
+ifeq ($(V),1)
+  quiet =
+  Q =
+else
+  quiet=quiet_
+  Q=@
+endif
+
+build-dir := $(srctree)/tools/build
+
+# Generic definitions
+include $(build-dir)/Build.include
+
+# do not force detected configuration
+-include .config-detected
+
+# Init all relevant variables used in build files so
+# 1) they have correct type
+# 2) they do not inherit any value from the environment
+subdir-y     :=
+obj-y        :=
+subdir-y     :=
+subdir-obj-y :=
+
+# Build definitions
+build-file := $(dir)/Build
+include $(build-file)
+
+quiet_cmd_flex  = FLEX     $@
+quiet_cmd_bison = BISON    $@
+
+# Create directory unless it exists
+quiet_cmd_mkdir = MKDIR    $(dir $@)
+      cmd_mkdir = mkdir -p $(dir $@)
+     rule_mkdir = $(if $(wildcard $(dir $@)),,@$(call echo-cmd,mkdir) $(cmd_mkdir))
+
+# Compile command
+quiet_cmd_cc_o_c = CC       $@
+      cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $<
+
+quiet_cmd_cc_i_c = CPP      $@
+      cmd_cc_i_c = $(CC) $(c_flags) -E -o $@ $<
+
+quiet_cmd_cc_s_c = AS       $@
+      cmd_cc_s_c = $(CC) $(c_flags) -S -o $@ $<
+
+# Link agregate command
+# If there's nothing to link, create empty $@ object.
+quiet_cmd_ld_multi = LD       $@
+      cmd_ld_multi = $(if $(strip $(obj-y)),\
+		       $(LD) -r -o $@ $(obj-y),rm -f $@; $(AR) rcs $@)
+
+# Build rules
+$(OUTPUT)%.o: %.c FORCE
+	$(call rule_mkdir)
+	$(call if_changed_dep,cc_o_c)
+
+$(OUTPUT)%.o: %.S FORCE
+	$(call rule_mkdir)
+	$(call if_changed_dep,cc_o_c)
+
+$(OUTPUT)%.i: %.c FORCE
+	$(call rule_mkdir)
+	$(call if_changed_dep,cc_i_c)
+
+$(OUTPUT)%.i: %.S FORCE
+	$(call rule_mkdir)
+	$(call if_changed_dep,cc_i_c)
+
+$(OUTPUT)%.s: %.c FORCE
+	$(call rule_mkdir)
+	$(call if_changed_dep,cc_s_c)
+
+# Gather build data:
+#   obj-y        - list of build objects
+#   subdir-y     - list of directories to nest
+#   subdir-obj-y - list of directories objects 'dir/$(obj)-in.o'
+obj-y        := $($(obj)-y)
+subdir-y     := $(patsubst %/,%,$(filter %/, $(obj-y)))
+obj-y        := $(patsubst %/, %/$(obj)-in.o, $(obj-y))
+subdir-obj-y := $(filter %/$(obj)-in.o, $(obj-y))
+
+# '$(OUTPUT)/dir' prefix to all objects
+prefix       := $(subst ./,,$(OUTPUT)$(dir)/)
+obj-y        := $(addprefix $(prefix),$(obj-y))
+subdir-obj-y := $(addprefix $(prefix),$(subdir-obj-y))
+
+# Final '$(obj)-in.o' object
+in-target := $(prefix)$(obj)-in.o
+
+PHONY += $(subdir-y)
+
+$(subdir-y):
+	$(Q)$(MAKE) -f $(build-dir)/Makefile.build dir=$(dir)/$@ obj=$(obj)
+
+$(sort $(subdir-obj-y)): $(subdir-y) ;
+
+$(in-target): $(obj-y) FORCE
+	$(call rule_mkdir)
+	$(call if_changed,ld_multi)
+
+__build: $(in-target)
+	@:
+
+PHONY += FORCE
+FORCE:
+
+# Include all cmd files to get all the dependency rules
+# for all objects included
+targets   := $(wildcard $(sort $(obj-y) $(in-target) $(MAKECMDGOALS)))
+cmd_files := $(wildcard $(foreach f,$(targets),$(dir $(f)).$(notdir $(f)).cmd))
+
+ifneq ($(cmd_files),)
+  include $(cmd_files)
+endif
+
+.PHONY: $(PHONY)
diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
new file mode 100644
index 000000000000..3a0b0ca2a28c
--- /dev/null
+++ b/tools/build/Makefile.feature
@@ -0,0 +1,171 @@
+feature_dir := $(srctree)/tools/build/feature
+
+ifneq ($(OUTPUT),)
+  OUTPUT_FEATURES = $(OUTPUT)feature/
+  $(shell mkdir -p $(OUTPUT_FEATURES))
+endif
+
+feature_check = $(eval $(feature_check_code))
+define feature_check_code
+  feature-$(1) := $(shell $(MAKE) OUTPUT=$(OUTPUT_FEATURES) CFLAGS="$(EXTRA_CFLAGS) $(FEATURE_CHECK_CFLAGS-$(1))" LDFLAGS="$(LDFLAGS) $(FEATURE_CHECK_LDFLAGS-$(1))" -C $(feature_dir) test-$1.bin >/dev/null 2>/dev/null && echo 1 || echo 0)
+endef
+
+feature_set = $(eval $(feature_set_code))
+define feature_set_code
+  feature-$(1) := 1
+endef
+
+#
+# Build the feature check binaries in parallel, ignore errors, ignore return value and suppress output:
+#
+
+#
+# Note that this is not a complete list of all feature tests, just
+# those that are typically built on a fully configured system.
+#
+# [ Feature tests not mentioned here have to be built explicitly in
+#   the rule that uses them - an example for that is the 'bionic'
+#   feature check. ]
+#
+FEATURE_TESTS =			\
+	backtrace			\
+	dwarf				\
+	fortify-source			\
+	sync-compare-and-swap		\
+	glibc				\
+	gtk2				\
+	gtk2-infobar			\
+	libaudit			\
+	libbfd				\
+	libelf				\
+	libelf-getphdrnum		\
+	libelf-mmap			\
+	libnuma				\
+	libperl				\
+	libpython			\
+	libpython-version		\
+	libslang			\
+	libunwind			\
+	pthread-attr-setaffinity-np	\
+	stackprotector-all		\
+	timerfd				\
+	libdw-dwarf-unwind		\
+	zlib				\
+	lzma
+
+FEATURE_DISPLAY =			\
+	dwarf				\
+	glibc				\
+	gtk2				\
+	libaudit			\
+	libbfd				\
+	libelf				\
+	libnuma				\
+	libperl				\
+	libpython			\
+	libslang			\
+	libunwind			\
+	libdw-dwarf-unwind		\
+	zlib				\
+	lzma
+
+# Set FEATURE_CHECK_(C|LD)FLAGS-all for all FEATURE_TESTS features.
+# If in the future we need per-feature checks/flags for features not
+# mentioned in this list we need to refactor this ;-).
+set_test_all_flags = $(eval $(set_test_all_flags_code))
+define set_test_all_flags_code
+  FEATURE_CHECK_CFLAGS-all  += $(FEATURE_CHECK_CFLAGS-$(1))
+  FEATURE_CHECK_LDFLAGS-all += $(FEATURE_CHECK_LDFLAGS-$(1))
+endef
+
+$(foreach feat,$(FEATURE_TESTS),$(call set_test_all_flags,$(feat)))
+
+#
+# Special fast-path for the 'all features are available' case:
+#
+$(call feature_check,all,$(MSG))
+
+#
+# Just in case the build freshly failed, make sure we print the
+# feature matrix:
+#
+ifeq ($(feature-all), 1)
+  #
+  # test-all.c passed - just set all the core feature flags to 1:
+  #
+  $(foreach feat,$(FEATURE_TESTS),$(call feature_set,$(feat)))
+else
+  $(shell $(MAKE) OUTPUT=$(OUTPUT_FEATURES) CFLAGS="$(EXTRA_CFLAGS)" LDFLAGS=$(LDFLAGS) -i -j -C $(feature_dir) $(addsuffix .bin,$(FEATURE_TESTS)) >/dev/null 2>&1)
+  $(foreach feat,$(FEATURE_TESTS),$(call feature_check,$(feat)))
+endif
+
+#
+# Print the result of the feature test:
+#
+feature_print_status = $(eval $(feature_print_status_code)) $(info $(MSG))
+
+define feature_print_status_code
+  ifeq ($(feature-$(1)), 1)
+    MSG = $(shell printf '...%30s: [ \033[32mon\033[m  ]' $(1))
+  else
+    MSG = $(shell printf '...%30s: [ \033[31mOFF\033[m ]' $(1))
+  endif
+endef
+
+feature_print_text = $(eval $(feature_print_text_code)) $(info $(MSG))
+define feature_print_text_code
+    MSG = $(shell printf '...%30s: %s' $(1) $(2))
+endef
+
+FEATURE_DUMP := $(foreach feat,$(FEATURE_DISPLAY),feature-$(feat)($(feature-$(feat))))
+FEATURE_DUMP_FILE := $(shell touch $(OUTPUT)FEATURE-DUMP; cat $(OUTPUT)FEATURE-DUMP)
+
+ifeq ($(dwarf-post-unwind),1)
+  FEATURE_DUMP += dwarf-post-unwind($(dwarf-post-unwind-text))
+endif
+
+# The $(feature_display) controls the default detection message
+# output. It's set if:
+# - detected features differes from stored features from
+#   last build (in FEATURE-DUMP file)
+# - one of the $(FEATURE_DISPLAY) is not detected
+# - VF is enabled
+
+ifneq ("$(FEATURE_DUMP)","$(FEATURE_DUMP_FILE)")
+  $(shell echo "$(FEATURE_DUMP)" > $(OUTPUT)FEATURE-DUMP)
+  feature_display := 1
+endif
+
+feature_display_check = $(eval $(feature_check_code))
+define feature_display_check_code
+  ifneq ($(feature-$(1)), 1)
+    feature_display := 1
+  endif
+endef
+
+$(foreach feat,$(FEATURE_DISPLAY),$(call feature_display_check,$(feat)))
+
+ifeq ($(VF),1)
+  feature_display := 1
+  feature_verbose := 1
+endif
+
+ifeq ($(feature_display),1)
+  $(info )
+  $(info Auto-detecting system features:)
+  $(foreach feat,$(FEATURE_DISPLAY),$(call feature_print_status,$(feat),))
+
+  ifeq ($(dwarf-post-unwind),1)
+    $(call feature_print_text,"DWARF post unwind library", $(dwarf-post-unwind-text))
+  endif
+
+  ifneq ($(feature_verbose),1)
+    $(info )
+  endif
+endif
+
+ifeq ($(feature_verbose),1)
+  TMP := $(filter-out $(FEATURE_DISPLAY),$(FEATURE_TESTS))
+  $(foreach feat,$(TMP),$(call feature_print_status,$(feat),))
+  $(info )
+endif
diff --git a/tools/perf/config/feature-checks/.gitignore b/tools/build/feature/.gitignore
similarity index 52%
rename from tools/perf/config/feature-checks/.gitignore
rename to tools/build/feature/.gitignore
index 80f3da0c3515..09b335b98842 100644
--- a/tools/perf/config/feature-checks/.gitignore
+++ b/tools/build/feature/.gitignore
@@ -1,2 +1,3 @@
 *.d
 *.bin
+*.output
diff --git a/tools/perf/config/feature-checks/Makefile b/tools/build/feature/Makefile
similarity index 72%
rename from tools/perf/config/feature-checks/Makefile
rename to tools/build/feature/Makefile
index b32ff3372514..463ed8f2a267 100644
--- a/tools/perf/config/feature-checks/Makefile
+++ b/tools/build/feature/Makefile
@@ -29,33 +29,36 @@ FILES=					\
 	test-stackprotector-all.bin	\
 	test-timerfd.bin		\
 	test-libdw-dwarf-unwind.bin	\
+	test-libbabeltrace.bin		\
 	test-compile-32.bin		\
 	test-compile-x32.bin		\
-	test-zlib.bin
+	test-zlib.bin			\
+	test-lzma.bin
 
 CC := $(CROSS_COMPILE)gcc -MD
 PKG_CONFIG := $(CROSS_COMPILE)pkg-config
 
 all: $(FILES)
 
-BUILD = $(CC) $(CFLAGS) -o $(OUTPUT)$@ $(patsubst %.bin,%.c,$@) $(LDFLAGS)
+__BUILD = $(CC) $(CFLAGS) -Wall -Werror -o $(OUTPUT)$@ $(patsubst %.bin,%.c,$@) $(LDFLAGS)
+  BUILD = $(__BUILD) > $(OUTPUT)$(@:.bin=.make.output) 2>&1
 
 ###############################
 
 test-all.bin:
-	$(BUILD) -Werror -fstack-protector-all -O2 -Werror -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -laudit -I/usr/include/slang -lslang $(shell $(PKG_CONFIG) --libs --cflags gtk+-2.0 2>/dev/null) $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl -lz
+	$(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -laudit -I/usr/include/slang -lslang $(shell $(PKG_CONFIG) --libs --cflags gtk+-2.0 2>/dev/null) $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl -lz -llzma
 
 test-hello.bin:
 	$(BUILD)
 
 test-pthread-attr-setaffinity-np.bin:
-	$(BUILD) -D_GNU_SOURCE -Werror -lpthread
+	$(BUILD) -D_GNU_SOURCE -lpthread
 
 test-stackprotector-all.bin:
-	$(BUILD) -Werror -fstack-protector-all
+	$(BUILD) -fstack-protector-all
 
 test-fortify-source.bin:
-	$(BUILD) -O2 -Werror -D_FORTIFY_SOURCE=2
+	$(BUILD) -O2 -D_FORTIFY_SOURCE=2
 
 test-bionic.bin:
 	$(BUILD)
@@ -118,10 +121,10 @@ test-libbfd.bin:
 	$(BUILD) -DPACKAGE='"perf"' -lbfd -lz -liberty -ldl
 
 test-liberty.bin:
-	$(CC) -o $(OUTPUT)$@ test-libbfd.c -DPACKAGE='"perf"' -lbfd -ldl -liberty
+	$(CC) -Wall -Werror -o $(OUTPUT)$@ test-libbfd.c -DPACKAGE='"perf"' -lbfd -ldl -liberty
 
 test-liberty-z.bin:
-	$(CC) -o $(OUTPUT)$@ test-libbfd.c -DPACKAGE='"perf"' -lbfd -ldl -liberty -lz
+	$(CC) -Wall -Werror -o $(OUTPUT)$@ test-libbfd.c -DPACKAGE='"perf"' -lbfd -ldl -liberty -lz
 
 test-cplus-demangle.bin:
 	$(BUILD) -liberty
@@ -133,10 +136,13 @@ test-timerfd.bin:
 	$(BUILD)
 
 test-libdw-dwarf-unwind.bin:
-	$(BUILD)
+	$(BUILD) # -ldw provided by $(FEATURE_CHECK_LDFLAGS-libdw-dwarf-unwind)
+
+test-libbabeltrace.bin:
+	$(BUILD) # -lbabeltrace provided by $(FEATURE_CHECK_LDFLAGS-libbabeltrace)
 
 test-sync-compare-and-swap.bin:
-	$(BUILD) -Werror
+	$(BUILD)
 
 test-compile-32.bin:
 	$(CC) -m32 -o $(OUTPUT)$@ test-compile.c
@@ -147,9 +153,12 @@ test-compile-x32.bin:
 test-zlib.bin:
 	$(BUILD) -lz
 
+test-lzma.bin:
+	$(BUILD) -llzma
+
 -include *.d
 
 ###############################
 
 clean:
-	rm -f $(FILES) *.d
+	rm -f $(FILES) *.d $(FILES:.bin=.make.output)
diff --git a/tools/perf/config/feature-checks/test-all.c b/tools/build/feature/test-all.c
similarity index 87%
rename from tools/perf/config/feature-checks/test-all.c
rename to tools/build/feature/test-all.c
index 6d4d09323922..84689a67814a 100644
--- a/tools/perf/config/feature-checks/test-all.c
+++ b/tools/build/feature/test-all.c
@@ -98,7 +98,23 @@
 #undef main
 
 #define main main_test_pthread_attr_setaffinity_np
-# include "test-pthread_attr_setaffinity_np.c"
+# include "test-pthread-attr-setaffinity-np.c"
+#undef main
+
+# if 0
+/*
+ * Disable libbabeltrace check for test-all, because the requested
+ * library version is not released yet in most distributions. Will
+ * reenable later.
+ */
+
+#define main main_test_libbabeltrace
+# include "test-libbabeltrace.c"
+#undef main
+#endif
+
+#define main main_test_lzma
+# include "test-lzma.c"
 #undef main
 
 int main(int argc, char *argv[])
@@ -126,6 +142,7 @@ int main(int argc, char *argv[])
 	main_test_sync_compare_and_swap(argc, argv);
 	main_test_zlib();
 	main_test_pthread_attr_setaffinity_np();
+	main_test_lzma();
 
 	return 0;
 }
diff --git a/tools/perf/config/feature-checks/test-backtrace.c b/tools/build/feature/test-backtrace.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-backtrace.c
rename to tools/build/feature/test-backtrace.c
diff --git a/tools/perf/config/feature-checks/test-bionic.c b/tools/build/feature/test-bionic.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-bionic.c
rename to tools/build/feature/test-bionic.c
diff --git a/tools/perf/config/feature-checks/test-compile.c b/tools/build/feature/test-compile.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-compile.c
rename to tools/build/feature/test-compile.c
diff --git a/tools/perf/config/feature-checks/test-cplus-demangle.c b/tools/build/feature/test-cplus-demangle.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-cplus-demangle.c
rename to tools/build/feature/test-cplus-demangle.c
diff --git a/tools/perf/config/feature-checks/test-dwarf.c b/tools/build/feature/test-dwarf.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-dwarf.c
rename to tools/build/feature/test-dwarf.c
diff --git a/tools/perf/config/feature-checks/test-fortify-source.c b/tools/build/feature/test-fortify-source.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-fortify-source.c
rename to tools/build/feature/test-fortify-source.c
diff --git a/tools/perf/config/feature-checks/test-glibc.c b/tools/build/feature/test-glibc.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-glibc.c
rename to tools/build/feature/test-glibc.c
diff --git a/tools/perf/config/feature-checks/test-gtk2-infobar.c b/tools/build/feature/test-gtk2-infobar.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-gtk2-infobar.c
rename to tools/build/feature/test-gtk2-infobar.c
diff --git a/tools/perf/config/feature-checks/test-gtk2.c b/tools/build/feature/test-gtk2.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-gtk2.c
rename to tools/build/feature/test-gtk2.c
diff --git a/tools/perf/config/feature-checks/test-hello.c b/tools/build/feature/test-hello.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-hello.c
rename to tools/build/feature/test-hello.c
diff --git a/tools/perf/config/feature-checks/test-libaudit.c b/tools/build/feature/test-libaudit.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-libaudit.c
rename to tools/build/feature/test-libaudit.c
diff --git a/tools/build/feature/test-libbabeltrace.c b/tools/build/feature/test-libbabeltrace.c
new file mode 100644
index 000000000000..9cf802a04885
--- /dev/null
+++ b/tools/build/feature/test-libbabeltrace.c
@@ -0,0 +1,9 @@
+
+#include <babeltrace/ctf-writer/writer.h>
+#include <babeltrace/ctf-ir/stream-class.h>
+
+int main(void)
+{
+	bt_ctf_stream_class_get_packet_context_type((void *) 0);
+	return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-libbfd.c b/tools/build/feature/test-libbfd.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-libbfd.c
rename to tools/build/feature/test-libbfd.c
diff --git a/tools/perf/config/feature-checks/test-libdw-dwarf-unwind.c b/tools/build/feature/test-libdw-dwarf-unwind.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-libdw-dwarf-unwind.c
rename to tools/build/feature/test-libdw-dwarf-unwind.c
diff --git a/tools/perf/config/feature-checks/test-libelf-getphdrnum.c b/tools/build/feature/test-libelf-getphdrnum.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-libelf-getphdrnum.c
rename to tools/build/feature/test-libelf-getphdrnum.c
diff --git a/tools/perf/config/feature-checks/test-libelf-mmap.c b/tools/build/feature/test-libelf-mmap.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-libelf-mmap.c
rename to tools/build/feature/test-libelf-mmap.c
diff --git a/tools/perf/config/feature-checks/test-libelf.c b/tools/build/feature/test-libelf.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-libelf.c
rename to tools/build/feature/test-libelf.c
diff --git a/tools/perf/config/feature-checks/test-libnuma.c b/tools/build/feature/test-libnuma.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-libnuma.c
rename to tools/build/feature/test-libnuma.c
diff --git a/tools/perf/config/feature-checks/test-libperl.c b/tools/build/feature/test-libperl.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-libperl.c
rename to tools/build/feature/test-libperl.c
diff --git a/tools/perf/config/feature-checks/test-libpython-version.c b/tools/build/feature/test-libpython-version.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-libpython-version.c
rename to tools/build/feature/test-libpython-version.c
diff --git a/tools/perf/config/feature-checks/test-libpython.c b/tools/build/feature/test-libpython.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-libpython.c
rename to tools/build/feature/test-libpython.c
diff --git a/tools/perf/config/feature-checks/test-libslang.c b/tools/build/feature/test-libslang.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-libslang.c
rename to tools/build/feature/test-libslang.c
diff --git a/tools/perf/config/feature-checks/test-libunwind-debug-frame.c b/tools/build/feature/test-libunwind-debug-frame.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-libunwind-debug-frame.c
rename to tools/build/feature/test-libunwind-debug-frame.c
diff --git a/tools/perf/config/feature-checks/test-libunwind.c b/tools/build/feature/test-libunwind.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-libunwind.c
rename to tools/build/feature/test-libunwind.c
diff --git a/tools/build/feature/test-lzma.c b/tools/build/feature/test-lzma.c
new file mode 100644
index 000000000000..95adc8ced3dd
--- /dev/null
+++ b/tools/build/feature/test-lzma.c
@@ -0,0 +1,10 @@
+#include <lzma.h>
+
+int main(void)
+{
+	lzma_stream strm = LZMA_STREAM_INIT;
+	int ret;
+
+	ret = lzma_stream_decoder(&strm, UINT64_MAX, LZMA_CONCATENATED);
+	return ret ? -1 : 0;
+}
diff --git a/tools/perf/config/feature-checks/test-pthread-attr-setaffinity-np.c b/tools/build/feature/test-pthread-attr-setaffinity-np.c
similarity index 77%
rename from tools/perf/config/feature-checks/test-pthread-attr-setaffinity-np.c
rename to tools/build/feature/test-pthread-attr-setaffinity-np.c
index 2b81b72eca23..fdada5e8d454 100644
--- a/tools/perf/config/feature-checks/test-pthread-attr-setaffinity-np.c
+++ b/tools/build/feature/test-pthread-attr-setaffinity-np.c
@@ -1,5 +1,6 @@
 #include <stdint.h>
 #include <pthread.h>
+#include <sched.h>
 
 int main(void)
 {
@@ -8,7 +9,8 @@ int main(void)
 	cpu_set_t cs;
 
 	pthread_attr_init(&thread_attr);
-	/* don't care abt exact args, just the API itself in libpthread */
+	CPU_ZERO(&cs);
+
 	ret = pthread_attr_setaffinity_np(&thread_attr, sizeof(cs), &cs);
 
 	return ret;
diff --git a/tools/perf/config/feature-checks/test-stackprotector-all.c b/tools/build/feature/test-stackprotector-all.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-stackprotector-all.c
rename to tools/build/feature/test-stackprotector-all.c
diff --git a/tools/perf/config/feature-checks/test-sync-compare-and-swap.c b/tools/build/feature/test-sync-compare-and-swap.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-sync-compare-and-swap.c
rename to tools/build/feature/test-sync-compare-and-swap.c
diff --git a/tools/perf/config/feature-checks/test-timerfd.c b/tools/build/feature/test-timerfd.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-timerfd.c
rename to tools/build/feature/test-timerfd.c
diff --git a/tools/perf/config/feature-checks/test-zlib.c b/tools/build/feature/test-zlib.c
similarity index 100%
rename from tools/perf/config/feature-checks/test-zlib.c
rename to tools/build/feature/test-zlib.c
diff --git a/tools/build/tests/ex/Build b/tools/build/tests/ex/Build
new file mode 100644
index 000000000000..0e6c3e6767e6
--- /dev/null
+++ b/tools/build/tests/ex/Build
@@ -0,0 +1,8 @@
+ex-y += ex.o
+ex-y += a.o
+ex-y += b.o
+ex-y += empty/
+
+libex-y += c.o
+libex-y += d.o
+libex-y += arch/
diff --git a/tools/build/tests/ex/Makefile b/tools/build/tests/ex/Makefile
new file mode 100644
index 000000000000..52d2476073a3
--- /dev/null
+++ b/tools/build/tests/ex/Makefile
@@ -0,0 +1,23 @@
+export srctree := ../../../..
+export CC      := gcc
+export LD      := ld
+export AR      := ar
+
+build := -f $(srctree)/tools/build/Makefile.build dir=. obj
+ex: ex-in.o libex-in.o
+	gcc -o $@ $^
+
+ex.%: FORCE
+	make -f $(srctree)/tools/build/Makefile.build dir=. $@
+
+ex-in.o: FORCE
+	make $(build)=ex
+
+libex-in.o: FORCE
+	make $(build)=libex
+
+clean:
+	find . -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
+	rm -f ex ex.i ex.s
+
+.PHONY: FORCE
diff --git a/tools/build/tests/ex/a.c b/tools/build/tests/ex/a.c
new file mode 100644
index 000000000000..851762798c83
--- /dev/null
+++ b/tools/build/tests/ex/a.c
@@ -0,0 +1,5 @@
+
+int a(void)
+{
+	return 0;
+}
diff --git a/tools/build/tests/ex/arch/Build b/tools/build/tests/ex/arch/Build
new file mode 100644
index 000000000000..55506189efae
--- /dev/null
+++ b/tools/build/tests/ex/arch/Build
@@ -0,0 +1,2 @@
+libex-y += e.o
+libex-y += f.o
diff --git a/tools/build/tests/ex/arch/e.c b/tools/build/tests/ex/arch/e.c
new file mode 100644
index 000000000000..beaa4a1d7ba8
--- /dev/null
+++ b/tools/build/tests/ex/arch/e.c
@@ -0,0 +1,5 @@
+
+int e(void)
+{
+	return 0;
+}
diff --git a/tools/build/tests/ex/arch/f.c b/tools/build/tests/ex/arch/f.c
new file mode 100644
index 000000000000..7c3e9e9da5b7
--- /dev/null
+++ b/tools/build/tests/ex/arch/f.c
@@ -0,0 +1,5 @@
+
+int f(void)
+{
+	return 0;
+}
diff --git a/tools/build/tests/ex/b.c b/tools/build/tests/ex/b.c
new file mode 100644
index 000000000000..c24ff9ca9a97
--- /dev/null
+++ b/tools/build/tests/ex/b.c
@@ -0,0 +1,5 @@
+
+int b(void)
+{
+	return 0;
+}
diff --git a/tools/build/tests/ex/c.c b/tools/build/tests/ex/c.c
new file mode 100644
index 000000000000..e216d0217499
--- /dev/null
+++ b/tools/build/tests/ex/c.c
@@ -0,0 +1,5 @@
+
+int c(void)
+{
+	return 0;
+}
diff --git a/tools/build/tests/ex/d.c b/tools/build/tests/ex/d.c
new file mode 100644
index 000000000000..80dc0f06151b
--- /dev/null
+++ b/tools/build/tests/ex/d.c
@@ -0,0 +1,5 @@
+
+int d(void)
+{
+	return 0;
+}
diff --git a/tools/build/tests/ex/empty/Build b/tools/build/tests/ex/empty/Build
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tools/build/tests/ex/ex.c b/tools/build/tests/ex/ex.c
new file mode 100644
index 000000000000..dc42eb2e1a67
--- /dev/null
+++ b/tools/build/tests/ex/ex.c
@@ -0,0 +1,19 @@
+
+int a(void);
+int b(void);
+int c(void);
+int d(void);
+int e(void);
+int f(void);
+
+int main(void)
+{
+	a();
+	b();
+	c();
+	d();
+	e();
+	f();
+
+	return 0;
+}
diff --git a/tools/build/tests/run.sh b/tools/build/tests/run.sh
new file mode 100755
index 000000000000..5494f8ea7567
--- /dev/null
+++ b/tools/build/tests/run.sh
@@ -0,0 +1,42 @@
+#!/bin/sh
+
+function test_ex {
+	make -C ex V=1 clean > ex.out 2>&1
+	make -C ex V=1 >> ex.out 2>&1
+
+	if [ ! -x ./ex/ex ]; then
+	  echo FAILED
+	  exit -1
+	fi
+
+	make -C ex V=1 clean > /dev/null 2>&1
+	rm -f ex.out
+}
+
+function test_ex_suffix {
+	make -C ex V=1 clean > ex.out 2>&1
+
+	# use -rR to disable make's builtin rules
+	make -rR -C ex V=1 ex.o >> ex.out 2>&1
+	make -rR -C ex V=1 ex.i >> ex.out 2>&1
+	make -rR -C ex V=1 ex.s >> ex.out 2>&1
+
+	if [ -x ./ex/ex ]; then
+	  echo FAILED
+	  exit -1
+	fi
+
+	if [ ! -f ./ex/ex.o -o ! -f ./ex/ex.i -o ! -f ./ex/ex.s ]; then
+	  echo FAILED
+	  exit -1
+	fi
+
+	make -C ex V=1 clean > /dev/null 2>&1
+	rm -f ex.out
+}
+echo -n Testing..
+
+test_ex
+test_ex_suffix
+
+echo OK
diff --git a/tools/lib/api/Build b/tools/lib/api/Build
new file mode 100644
index 000000000000..3653965cf481
--- /dev/null
+++ b/tools/lib/api/Build
@@ -0,0 +1,2 @@
+libapi-y += fd/
+libapi-y += fs/
diff --git a/tools/lib/api/Makefile b/tools/lib/api/Makefile
index 36c08b1f4afb..d8fe29fc19a4 100644
--- a/tools/lib/api/Makefile
+++ b/tools/lib/api/Makefile
@@ -1,49 +1,43 @@
 include ../../scripts/Makefile.include
 include ../../perf/config/utilities.mak		# QUIET_CLEAN
 
+ifeq ($(srctree),)
+srctree := $(patsubst %/,%,$(dir $(shell pwd)))
+srctree := $(patsubst %/,%,$(dir $(srctree)))
+srctree := $(patsubst %/,%,$(dir $(srctree)))
+#$(info Determined 'srctree' to be $(srctree))
+endif
+
 CC = $(CROSS_COMPILE)gcc
 AR = $(CROSS_COMPILE)ar
 
-# guard against environment variables
-LIB_H=
-LIB_OBJS=
+MAKEFLAGS += --no-print-directory
 
-LIB_H += fs/debugfs.h
-LIB_H += fs/fs.h
-# See comment below about piggybacking...
-LIB_H += fd/array.h
+LIBFILE = $(OUTPUT)libapi.a
 
-LIB_OBJS += $(OUTPUT)fs/debugfs.o
-LIB_OBJS += $(OUTPUT)fs/fs.o
-# XXX piggybacking here, need to introduce libapikfd, or rename this
-# to plain libapik.a and make it have it all api goodies
-LIB_OBJS += $(OUTPUT)fd/array.o
-
-LIBFILE = libapikfs.a
-
-CFLAGS = -ggdb3 -Wall -Wextra -std=gnu99 -Werror -O6 -D_FORTIFY_SOURCE=2 $(EXTRA_WARNINGS) $(EXTRA_CFLAGS) -fPIC
-EXTLIBS = -lelf -lpthread -lrt -lm
-ALL_CFLAGS = $(CFLAGS) $(BASIC_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64
-ALL_LDFLAGS = $(LDFLAGS)
+CFLAGS := $(EXTRA_WARNINGS) $(EXTRA_CFLAGS)
+CFLAGS += -ggdb3 -Wall -Wextra -std=gnu99 -Werror -O6 -D_FORTIFY_SOURCE=2 -fPIC
+CFLAGS += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64
 
 RM = rm -f
 
-$(LIBFILE): $(LIB_OBJS)
-	$(QUIET_AR)$(RM) $@ && $(AR) rcs $(OUTPUT)$@ $(LIB_OBJS)
+build  := -f $(srctree)/tools/build/Makefile.build dir=. obj
+API_IN := $(OUTPUT)libapi-in.o
 
-$(LIB_OBJS): $(LIB_H)
+export srctree OUTPUT CC LD CFLAGS V
 
-libapi_dirs:
-	$(QUIET_MKDIR)mkdir -p $(OUTPUT)fd $(OUTPUT)fs
+all: $(LIBFILE)
 
-$(OUTPUT)%.o: %.c libapi_dirs
-	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) $<
-$(OUTPUT)%.s: %.c libapi_dirs
-	$(QUIET_CC)$(CC) -S $(ALL_CFLAGS) $<
-$(OUTPUT)%.o: %.S libapi_dirs
-	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) $<
+$(API_IN): FORCE
+	@$(MAKE) $(build)=libapi
+
+$(LIBFILE): $(API_IN)
+	$(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(API_IN)
 
 clean:
-	$(call QUIET_CLEAN, libapi) $(RM) $(LIB_OBJS) $(LIBFILE)
+	$(call QUIET_CLEAN, libapi) $(RM) $(LIBFILE); \
+	find $(if $(OUTPUT),$(OUTPUT),.) -name \*.o | xargs $(RM)
 
-.PHONY: clean
+FORCE:
+
+.PHONY: clean FORCE
diff --git a/tools/lib/api/fd/Build b/tools/lib/api/fd/Build
new file mode 100644
index 000000000000..605d99f6d71a
--- /dev/null
+++ b/tools/lib/api/fd/Build
@@ -0,0 +1 @@
+libapi-y += array.o
diff --git a/tools/lib/api/fs/Build b/tools/lib/api/fs/Build
new file mode 100644
index 000000000000..6de5a4f0b501
--- /dev/null
+++ b/tools/lib/api/fs/Build
@@ -0,0 +1,4 @@
+libapi-y += fs.o
+libapi-y += debugfs.o
+libapi-y += findfs.o
+libapi-y += tracefs.o
diff --git a/tools/lib/api/fs/debugfs.c b/tools/lib/api/fs/debugfs.c
index d2b18e887071..8305b3e9d48e 100644
--- a/tools/lib/api/fs/debugfs.c
+++ b/tools/lib/api/fs/debugfs.c
@@ -3,75 +3,50 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <unistd.h>
 #include <stdbool.h>
 #include <sys/vfs.h>
+#include <sys/types.h>
+#include <sys/stat.h>
 #include <sys/mount.h>
 #include <linux/kernel.h>
 
 #include "debugfs.h"
 
-char debugfs_mountpoint[PATH_MAX + 1] = "/sys/kernel/debug";
+#ifndef DEBUGFS_DEFAULT_PATH
+#define DEBUGFS_DEFAULT_PATH		"/sys/kernel/debug"
+#endif
+
+char debugfs_mountpoint[PATH_MAX + 1] = DEBUGFS_DEFAULT_PATH;
 
 static const char * const debugfs_known_mountpoints[] = {
-	"/sys/kernel/debug",
+	DEBUGFS_DEFAULT_PATH,
 	"/debug",
 	0,
 };
 
 static bool debugfs_found;
 
+bool debugfs_configured(void)
+{
+	return debugfs_find_mountpoint() != NULL;
+}
+
 /* find the path to the mounted debugfs */
 const char *debugfs_find_mountpoint(void)
 {
-	const char * const *ptr;
-	char type[100];
-	FILE *fp;
+	const char *ret;
 
 	if (debugfs_found)
 		return (const char *)debugfs_mountpoint;
 
-	ptr = debugfs_known_mountpoints;
-	while (*ptr) {
-		if (debugfs_valid_mountpoint(*ptr) == 0) {
-			debugfs_found = true;
-			strcpy(debugfs_mountpoint, *ptr);
-			return debugfs_mountpoint;
-		}
-		ptr++;
-	}
+	ret = find_mountpoint("debugfs", (long) DEBUGFS_MAGIC,
+			      debugfs_mountpoint, PATH_MAX + 1,
+			      debugfs_known_mountpoints);
+	if (ret)
+		debugfs_found = true;
 
-	/* give up and parse /proc/mounts */
-	fp = fopen("/proc/mounts", "r");
-	if (fp == NULL)
-		return NULL;
-
-	while (fscanf(fp, "%*s %" STR(PATH_MAX) "s %99s %*s %*d %*d\n",
-		      debugfs_mountpoint, type) == 2) {
-		if (strcmp(type, "debugfs") == 0)
-			break;
-	}
-	fclose(fp);
-
-	if (strcmp(type, "debugfs") != 0)
-		return NULL;
-
-	debugfs_found = true;
-
-	return debugfs_mountpoint;
-}
-
-/* verify that a mountpoint is actually a debugfs instance */
-
-int debugfs_valid_mountpoint(const char *debugfs)
-{
-	struct statfs st_fs;
-
-	if (statfs(debugfs, &st_fs) < 0)
-		return -ENOENT;
-	else if ((long)st_fs.f_type != (long)DEBUGFS_MAGIC)
-		return -ENOENT;
-
-	return 0;
+	return ret;
 }
 
 /* mount the debugfs somewhere if it's not mounted */
@@ -87,7 +62,7 @@ char *debugfs_mount(const char *mountpoint)
 		mountpoint = getenv(PERF_DEBUGFS_ENVIRONMENT);
 		/* if no environment variable, use default */
 		if (mountpoint == NULL)
-			mountpoint = "/sys/kernel/debug";
+			mountpoint = DEBUGFS_DEFAULT_PATH;
 	}
 
 	if (mount(NULL, mountpoint, "debugfs", 0, NULL) < 0)
diff --git a/tools/lib/api/fs/debugfs.h b/tools/lib/api/fs/debugfs.h
index 0739881a9897..455023698d2b 100644
--- a/tools/lib/api/fs/debugfs.h
+++ b/tools/lib/api/fs/debugfs.h
@@ -1,16 +1,7 @@
 #ifndef __API_DEBUGFS_H__
 #define __API_DEBUGFS_H__
 
-#define _STR(x) #x
-#define STR(x) _STR(x)
-
-/*
- * On most systems <limits.h> would have given us this, but  not on some systems
- * (e.g. GNU/Hurd).
- */
-#ifndef PATH_MAX
-#define PATH_MAX 4096
-#endif
+#include "findfs.h"
 
 #ifndef DEBUGFS_MAGIC
 #define DEBUGFS_MAGIC          0x64626720
@@ -20,8 +11,8 @@
 #define PERF_DEBUGFS_ENVIRONMENT "PERF_DEBUGFS_DIR"
 #endif
 
+bool debugfs_configured(void);
 const char *debugfs_find_mountpoint(void);
-int debugfs_valid_mountpoint(const char *debugfs);
 char *debugfs_mount(const char *mountpoint);
 
 extern char debugfs_mountpoint[];
diff --git a/tools/lib/api/fs/findfs.c b/tools/lib/api/fs/findfs.c
new file mode 100644
index 000000000000..49946cb6d7af
--- /dev/null
+++ b/tools/lib/api/fs/findfs.c
@@ -0,0 +1,63 @@
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <sys/vfs.h>
+
+#include "findfs.h"
+
+/* verify that a mountpoint is actually the type we want */
+
+int valid_mountpoint(const char *mount, long magic)
+{
+	struct statfs st_fs;
+
+	if (statfs(mount, &st_fs) < 0)
+		return -ENOENT;
+	else if ((long)st_fs.f_type != magic)
+		return -ENOENT;
+
+	return 0;
+}
+
+/* find the path to a mounted file system */
+const char *find_mountpoint(const char *fstype, long magic,
+			    char *mountpoint, int len,
+			    const char * const *known_mountpoints)
+{
+	const char * const *ptr;
+	char format[128];
+	char type[100];
+	FILE *fp;
+
+	if (known_mountpoints) {
+		ptr = known_mountpoints;
+		while (*ptr) {
+			if (valid_mountpoint(*ptr, magic) == 0) {
+				strncpy(mountpoint, *ptr, len - 1);
+				mountpoint[len-1] = 0;
+				return mountpoint;
+			}
+			ptr++;
+		}
+	}
+
+	/* give up and parse /proc/mounts */
+	fp = fopen("/proc/mounts", "r");
+	if (fp == NULL)
+		return NULL;
+
+	snprintf(format, 128, "%%*s %%%ds %%99s %%*s %%*d %%*d\n", len);
+
+	while (fscanf(fp, format, mountpoint, type) == 2) {
+		if (strcmp(type, fstype) == 0)
+			break;
+	}
+	fclose(fp);
+
+	if (strcmp(type, fstype) != 0)
+		return NULL;
+
+	return mountpoint;
+}
diff --git a/tools/lib/api/fs/findfs.h b/tools/lib/api/fs/findfs.h
new file mode 100644
index 000000000000..b6f5d05acc42
--- /dev/null
+++ b/tools/lib/api/fs/findfs.h
@@ -0,0 +1,23 @@
+#ifndef __API_FINDFS_H__
+#define __API_FINDFS_H__
+
+#include <stdbool.h>
+
+#define _STR(x) #x
+#define STR(x) _STR(x)
+
+/*
+ * On most systems <limits.h> would have given us this, but  not on some systems
+ * (e.g. GNU/Hurd).
+ */
+#ifndef PATH_MAX
+#define PATH_MAX 4096
+#endif
+
+const char *find_mountpoint(const char *fstype, long magic,
+			    char *mountpoint, int len,
+			    const char * const *known_mountpoints);
+
+int valid_mountpoint(const char *mount, long magic);
+
+#endif /* __API_FINDFS_H__ */
diff --git a/tools/lib/api/fs/tracefs.c b/tools/lib/api/fs/tracefs.c
new file mode 100644
index 000000000000..e4aa9688b71e
--- /dev/null
+++ b/tools/lib/api/fs/tracefs.c
@@ -0,0 +1,78 @@
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <sys/vfs.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <linux/kernel.h>
+
+#include "tracefs.h"
+
+#ifndef TRACEFS_DEFAULT_PATH
+#define TRACEFS_DEFAULT_PATH		"/sys/kernel/tracing"
+#endif
+
+char tracefs_mountpoint[PATH_MAX + 1] = TRACEFS_DEFAULT_PATH;
+
+static const char * const tracefs_known_mountpoints[] = {
+	TRACEFS_DEFAULT_PATH,
+	"/sys/kernel/debug/tracing",
+	"/tracing",
+	"/trace",
+	0,
+};
+
+static bool tracefs_found;
+
+bool tracefs_configured(void)
+{
+	return tracefs_find_mountpoint() != NULL;
+}
+
+/* find the path to the mounted tracefs */
+const char *tracefs_find_mountpoint(void)
+{
+	const char *ret;
+
+	if (tracefs_found)
+		return (const char *)tracefs_mountpoint;
+
+	ret = find_mountpoint("tracefs", (long) TRACEFS_MAGIC,
+			      tracefs_mountpoint, PATH_MAX + 1,
+			      tracefs_known_mountpoints);
+
+	if (ret)
+		tracefs_found = true;
+
+	return ret;
+}
+
+/* mount the tracefs somewhere if it's not mounted */
+char *tracefs_mount(const char *mountpoint)
+{
+	/* see if it's already mounted */
+	if (tracefs_find_mountpoint())
+		goto out;
+
+	/* if not mounted and no argument */
+	if (mountpoint == NULL) {
+		/* see if environment variable set */
+		mountpoint = getenv(PERF_TRACEFS_ENVIRONMENT);
+		/* if no environment variable, use default */
+		if (mountpoint == NULL)
+			mountpoint = TRACEFS_DEFAULT_PATH;
+	}
+
+	if (mount(NULL, mountpoint, "tracefs", 0, NULL) < 0)
+		return NULL;
+
+	/* save the mountpoint */
+	tracefs_found = true;
+	strncpy(tracefs_mountpoint, mountpoint, sizeof(tracefs_mountpoint));
+out:
+	return tracefs_mountpoint;
+}
diff --git a/tools/lib/api/fs/tracefs.h b/tools/lib/api/fs/tracefs.h
new file mode 100644
index 000000000000..da780ac49acb
--- /dev/null
+++ b/tools/lib/api/fs/tracefs.h
@@ -0,0 +1,21 @@
+#ifndef __API_TRACEFS_H__
+#define __API_TRACEFS_H__
+
+#include "findfs.h"
+
+#ifndef TRACEFS_MAGIC
+#define TRACEFS_MAGIC          0x74726163
+#endif
+
+#ifndef PERF_TRACEFS_ENVIRONMENT
+#define PERF_TRACEFS_ENVIRONMENT "PERF_TRACEFS_DIR"
+#endif
+
+bool tracefs_configured(void);
+const char *tracefs_find_mountpoint(void);
+int tracefs_valid_mountpoint(const char *debugfs);
+char *tracefs_mount(const char *mountpoint);
+
+extern char tracefs_mountpoint[];
+
+#endif /* __API_DEBUGFS_H__ */
diff --git a/tools/lib/lockdep/Build b/tools/lib/lockdep/Build
new file mode 100644
index 000000000000..6f667355b068
--- /dev/null
+++ b/tools/lib/lockdep/Build
@@ -0,0 +1 @@
+liblockdep-y += common.o lockdep.o preload.o rbtree.o
diff --git a/tools/lib/lockdep/Makefile b/tools/lib/lockdep/Makefile
index 4b866c54f624..0c356fb65022 100644
--- a/tools/lib/lockdep/Makefile
+++ b/tools/lib/lockdep/Makefile
@@ -35,6 +35,10 @@ bindir = $(prefix)/$(bindir_relative)
 
 export DESTDIR DESTDIR_SQ INSTALL
 
+MAKEFLAGS += --no-print-directory
+
+include ../../scripts/Makefile.include
+
 # copy a bit from Linux kbuild
 
 ifeq ("$(origin V)", "command line")
@@ -44,56 +48,21 @@ ifndef VERBOSE
   VERBOSE = 0
 endif
 
-ifeq ("$(origin O)", "command line")
-  BUILD_OUTPUT := $(O)
+ifeq ($(srctree),)
+srctree := $(patsubst %/,%,$(dir $(shell pwd)))
+srctree := $(patsubst %/,%,$(dir $(srctree)))
+srctree := $(patsubst %/,%,$(dir $(srctree)))
+#$(info Determined 'srctree' to be $(srctree))
 endif
 
-ifeq ($(BUILD_SRC),)
-ifneq ($(BUILD_OUTPUT),)
-
-define build_output
-	$(if $(VERBOSE:1=),@)$(MAKE) -C $(BUILD_OUTPUT)	\
-	BUILD_SRC=$(CURDIR) -f $(CURDIR)/Makefile $1
-endef
-
-saved-output := $(BUILD_OUTPUT)
-BUILD_OUTPUT := $(shell cd $(BUILD_OUTPUT) && /bin/pwd)
-$(if $(BUILD_OUTPUT),, \
-     $(error output directory "$(saved-output)" does not exist))
-
-all: sub-make
-
-gui: force
-	$(call build_output, all_cmd)
-
-$(filter-out gui,$(MAKECMDGOALS)): sub-make
-
-sub-make: force
-	$(call build_output, $(MAKECMDGOALS))
-
-
-# Leave processing to above invocation of make
-skip-makefile := 1
-
-endif # BUILD_OUTPUT
-endif # BUILD_SRC
-
-# We process the rest of the Makefile if this is the final invocation of make
-ifeq ($(skip-makefile),)
-
-srctree		:= $(realpath $(if $(BUILD_SRC),$(BUILD_SRC),$(CURDIR)))
-objtree		:= $(realpath $(CURDIR))
-src		:= $(srctree)
-obj		:= $(objtree)
-
-export prefix libdir bindir src obj
-
 # Shell quotes
 libdir_SQ = $(subst ','\'',$(libdir))
 bindir_SQ = $(subst ','\'',$(bindir))
 
-LIB_FILE = liblockdep.a liblockdep.so.$(LIBLOCKDEP_VERSION)
+LIB_IN := $(OUTPUT)liblockdep-in.o
+
 BIN_FILE = lockdep
+LIB_FILE = $(OUTPUT)liblockdep.a $(OUTPUT)liblockdep.so.$(LIBLOCKDEP_VERSION)
 
 CONFIG_INCLUDES =
 CONFIG_LIBS	=
@@ -108,33 +77,23 @@ INCLUDES = -I. -I./uinclude -I./include -I../../include $(CONFIG_INCLUDES)
 
 # Set compile option CFLAGS if not set elsewhere
 CFLAGS ?= -g -DCONFIG_LOCKDEP -DCONFIG_STACKTRACE -DCONFIG_PROVE_LOCKING -DBITS_PER_LONG=__WORDSIZE -DLIBLOCKDEP_VERSION='"$(LIBLOCKDEP_VERSION)"' -rdynamic -O0 -g
+CFLAGS += -fPIC
 
 override CFLAGS += $(CONFIG_FLAGS) $(INCLUDES) $(PLUGIN_DIR_SQ)
 
 ifeq ($(VERBOSE),1)
   Q =
-  print_compile =
-  print_app_build =
-  print_fpic_compile =
   print_shared_lib_compile =
   print_install =
 else
   Q = @
-  print_compile =		echo '  CC                 '$(OBJ);
-  print_app_build =		echo '  BUILD              '$(OBJ);
-  print_fpic_compile =		echo '  CC FPIC            '$(OBJ);
-  print_shared_lib_compile =	echo '  BUILD SHARED LIB   '$(OBJ);
-  print_static_lib_build =	echo '  BUILD STATIC LIB   '$(OBJ);
-  print_install =		echo '  INSTALL     '$1'	to	$(DESTDIR_SQ)$2';
+  print_shared_lib_compile =	echo '  LD       '$(OBJ);
+  print_static_lib_build =	echo '  LD       '$(OBJ);
+  print_install =		echo '  INSTALL  '$1'	to	$(DESTDIR_SQ)$2';
 endif
 
-do_fpic_compile =					\
-	($(print_fpic_compile)				\
-	$(CC) -c $(CFLAGS) $(EXT) -fPIC $< -o $@)
-
-do_app_build =						\
-	($(print_app_build)				\
-	$(CC) $^ -rdynamic -o $@ $(CONFIG_LIBS) $(LIBS))
+export srctree OUTPUT CC LD CFLAGS V
+build := -f $(srctree)/tools/build/Makefile.build dir=. obj
 
 do_compile_shared_library =			\
 	($(print_shared_lib_compile)		\
@@ -144,22 +103,6 @@ do_build_static_lib =				\
 	($(print_static_lib_build)		\
 	$(RM) $@;  $(AR) rcs $@ $^)
 
-
-define do_compile
-	$(print_compile)						\
-	$(CC) -c $(CFLAGS) $(EXT) $< -o $(obj)/$@;
-endef
-
-$(obj)/%.o: $(src)/%.c
-	$(Q)$(call do_compile)
-
-%.o: $(src)/%.c
-	$(Q)$(call do_compile)
-
-PEVENT_LIB_OBJS = common.o lockdep.o preload.o rbtree.o
-
-ALL_OBJS = $(PEVENT_LIB_OBJS)
-
 CMD_TARGETS = $(LIB_FILE)
 
 TARGETS = $(CMD_TARGETS)
@@ -169,42 +112,15 @@ all: all_cmd
 
 all_cmd: $(CMD_TARGETS)
 
-liblockdep.so.$(LIBLOCKDEP_VERSION): $(PEVENT_LIB_OBJS)
+$(LIB_IN): force
+	$(Q)$(MAKE) $(build)=liblockdep
+
+liblockdep.so.$(LIBLOCKDEP_VERSION): $(LIB_IN)
 	$(Q)$(do_compile_shared_library)
 
-liblockdep.a: $(PEVENT_LIB_OBJS)
+liblockdep.a: $(LIB_IN)
 	$(Q)$(do_build_static_lib)
 
-$(PEVENT_LIB_OBJS): %.o: $(src)/%.c
-	$(Q)$(do_fpic_compile)
-
-## make deps
-
-all_objs := $(sort $(ALL_OBJS))
-all_deps := $(all_objs:%.o=.%.d)
-
-# let .d file also depends on the source and header files
-define check_deps
-		@set -e; $(RM) $@; \
-		$(CC) -MM $(CFLAGS) $< > $@.$$$$; \
-		sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \
-		$(RM) $@.$$$$
-endef
-
-$(all_deps): .%.d: $(src)/%.c
-	$(Q)$(call check_deps)
-
-$(all_objs) : %.o : .%.d
-
-dep_includes := $(wildcard $(all_deps))
-
-ifneq ($(dep_includes),)
- include $(dep_includes)
-endif
-
-### Detect environment changes
-TRACK_CFLAGS = $(subst ','\'',$(CFLAGS)):$(ARCH):$(CROSS_COMPILE)
-
 tags:	force
 	$(RM) tags
 	find . -name '*.[ch]' | xargs ctags --extra=+f --c-kinds=+px \
@@ -233,8 +149,6 @@ clean:
 	$(RM) *.o *~ $(TARGETS) *.a *liblockdep*.so* $(VERSION_FILES) .*.d
 	$(RM) tags TAGS
 
-endif # skip-makefile
-
 PHONY += force
 force:
 
diff --git a/tools/lib/traceevent/Build b/tools/lib/traceevent/Build
new file mode 100644
index 000000000000..c681d0575d16
--- /dev/null
+++ b/tools/lib/traceevent/Build
@@ -0,0 +1,17 @@
+libtraceevent-y += event-parse.o
+libtraceevent-y += event-plugin.o
+libtraceevent-y += trace-seq.o
+libtraceevent-y += parse-filter.o
+libtraceevent-y += parse-utils.o
+libtraceevent-y += kbuffer-parse.o
+
+plugin_jbd2-y         += plugin_jbd2.o
+plugin_hrtimer-y      += plugin_hrtimer.o
+plugin_kmem-y         += plugin_kmem.o
+plugin_kvm-y          += plugin_kvm.o
+plugin_mac80211-y     += plugin_mac80211.o
+plugin_sched_switch-y += plugin_sched_switch.o
+plugin_function-y     += plugin_function.o
+plugin_xen-y          += plugin_xen.o
+plugin_scsi-y         += plugin_scsi.o
+plugin_cfg80211-y     += plugin_cfg80211.o
diff --git a/tools/lib/traceevent/Makefile b/tools/lib/traceevent/Makefile
index 005c9cc06935..d410da335e3d 100644
--- a/tools/lib/traceevent/Makefile
+++ b/tools/lib/traceevent/Makefile
@@ -67,7 +67,7 @@ PLUGIN_DIR = -DPLUGIN_DIR="$(plugin_dir)"
 PLUGIN_DIR_SQ = '$(subst ','\'',$(PLUGIN_DIR))'
 endif
 
-include $(if $(BUILD_SRC),$(BUILD_SRC)/)../../scripts/Makefile.include
+include ../../scripts/Makefile.include
 
 # copy a bit from Linux kbuild
 
@@ -78,40 +78,13 @@ ifndef VERBOSE
   VERBOSE = 0
 endif
 
-ifeq ("$(origin O)", "command line")
-  BUILD_OUTPUT := $(O)
+ifeq ($(srctree),)
+srctree := $(patsubst %/,%,$(dir $(shell pwd)))
+srctree := $(patsubst %/,%,$(dir $(srctree)))
+srctree := $(patsubst %/,%,$(dir $(srctree)))
+#$(info Determined 'srctree' to be $(srctree))
 endif
 
-ifeq ($(BUILD_SRC),)
-ifneq ($(OUTPUT),)
-
-define build_output
-  $(if $(VERBOSE:1=),@)+$(MAKE) -C $(OUTPUT) \
-  BUILD_SRC=$(CURDIR)/ -f $(CURDIR)/Makefile $1
-endef
-
-all: sub-make
-
-$(MAKECMDGOALS): sub-make
-
-sub-make: force
-	$(call build_output, $(MAKECMDGOALS))
-
-
-# Leave processing to above invocation of make
-skip-makefile := 1
-
-endif # OUTPUT
-endif # BUILD_SRC
-
-# We process the rest of the Makefile if this is the final invocation of make
-ifeq ($(skip-makefile),)
-
-srctree		:= $(if $(BUILD_SRC),$(BUILD_SRC),$(CURDIR))
-objtree		:= $(CURDIR)
-src		:= $(srctree)
-obj		:= $(objtree)
-
 export prefix bindir src obj
 
 # Shell quotes
@@ -132,16 +105,19 @@ EXTRAVERSION	= $(EP_EXTRAVERSION)
 OBJ		= $@
 N		=
 
-export Q VERBOSE
-
 EVENT_PARSE_VERSION = $(EP_VERSION).$(EP_PATCHLEVEL).$(EP_EXTRAVERSION)
 
-INCLUDES = -I. -I $(srctree)/../../include $(CONFIG_INCLUDES)
+INCLUDES = -I. -I $(srctree)/tools/include $(CONFIG_INCLUDES)
 
-# Set compile option CFLAGS if not set elsewhere
-CFLAGS ?= -g -Wall
+# Set compile option CFLAGS
+ifdef EXTRA_CFLAGS
+  CFLAGS := $(EXTRA_CFLAGS)
+else
+  CFLAGS := -g -Wall
+endif
 
 # Append required CFLAGS
+override CFLAGS += -fPIC
 override CFLAGS += $(CONFIG_FLAGS) $(INCLUDES) $(PLUGIN_DIR_SQ)
 override CFLAGS += $(udis86-flags) -D_GNU_SOURCE
 
@@ -151,74 +127,58 @@ else
   Q = @
 endif
 
-do_compile_shared_library =			\
-	($(print_shared_lib_compile)		\
-	$(CC) --shared $^ -o $@)
+# Disable command line variables (CFLAGS) overide from top
+# level Makefile (perf), otherwise build Makefile will get
+# the same command line setup.
+MAKEOVERRIDES=
 
-do_plugin_build =				\
-	($(print_plugin_build)			\
-	$(CC) $(CFLAGS) -shared -nostartfiles -o $@ $<)
+export srctree OUTPUT CC LD CFLAGS V
+build := -f $(srctree)/tools/build/Makefile.build dir=. obj
 
-do_build_static_lib =				\
-	($(print_static_lib_build)		\
-	$(RM) $@;  $(AR) rcs $@ $^)
+PLUGINS  = plugin_jbd2.so
+PLUGINS += plugin_hrtimer.so
+PLUGINS += plugin_kmem.so
+PLUGINS += plugin_kvm.so
+PLUGINS += plugin_mac80211.so
+PLUGINS += plugin_sched_switch.so
+PLUGINS += plugin_function.so
+PLUGINS += plugin_xen.so
+PLUGINS += plugin_scsi.so
+PLUGINS += plugin_cfg80211.so
 
+PLUGINS    := $(addprefix $(OUTPUT),$(PLUGINS))
+PLUGINS_IN := $(PLUGINS:.so=-in.o)
 
-do_compile = $(QUIET_CC)$(CC) -c $(CFLAGS) $(EXT) $< -o $(obj)/$@;
-
-$(obj)/%.o: $(src)/%.c
-	$(call do_compile)
-
-%.o: $(src)/%.c
-	$(call do_compile)
-
-PEVENT_LIB_OBJS  = event-parse.o
-PEVENT_LIB_OBJS += event-plugin.o
-PEVENT_LIB_OBJS += trace-seq.o
-PEVENT_LIB_OBJS += parse-filter.o
-PEVENT_LIB_OBJS += parse-utils.o
-PEVENT_LIB_OBJS += kbuffer-parse.o
-
-PLUGIN_OBJS  = plugin_jbd2.o
-PLUGIN_OBJS += plugin_hrtimer.o
-PLUGIN_OBJS += plugin_kmem.o
-PLUGIN_OBJS += plugin_kvm.o
-PLUGIN_OBJS += plugin_mac80211.o
-PLUGIN_OBJS += plugin_sched_switch.o
-PLUGIN_OBJS += plugin_function.o
-PLUGIN_OBJS += plugin_xen.o
-PLUGIN_OBJS += plugin_scsi.o
-PLUGIN_OBJS += plugin_cfg80211.o
-
-PLUGINS := $(PLUGIN_OBJS:.o=.so)
-
-ALL_OBJS = $(PEVENT_LIB_OBJS) $(PLUGIN_OBJS)
+TE_IN    := $(OUTPUT)libtraceevent-in.o
+LIB_FILE := $(addprefix $(OUTPUT),$(LIB_FILE))
 
 CMD_TARGETS = $(LIB_FILE) $(PLUGINS)
 
 TARGETS = $(CMD_TARGETS)
 
-
 all: all_cmd
 
 all_cmd: $(CMD_TARGETS)
 
-libtraceevent.so: $(PEVENT_LIB_OBJS)
+$(TE_IN): force
+	$(Q)$(MAKE) $(build)=libtraceevent
+
+$(OUTPUT)libtraceevent.so: $(TE_IN)
 	$(QUIET_LINK)$(CC) --shared $^ -o $@
 
-libtraceevent.a: $(PEVENT_LIB_OBJS)
+$(OUTPUT)libtraceevent.a: $(TE_IN)
 	$(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^
 
 plugins: $(PLUGINS)
 
-$(PEVENT_LIB_OBJS): %.o: $(src)/%.c TRACEEVENT-CFLAGS
-	$(QUIET_CC_FPIC)$(CC) -c $(CFLAGS) $(EXT) -fPIC $< -o $@
+__plugin_obj = $(notdir $@)
+  plugin_obj = $(__plugin_obj:-in.o=)
 
-$(PLUGIN_OBJS): %.o : $(src)/%.c
-	$(QUIET_CC_FPIC)$(CC) -c $(CFLAGS) -fPIC -o $@ $<
+$(PLUGINS_IN): force
+	$(Q)$(MAKE) $(build)=$(plugin_obj)
 
-$(PLUGINS): %.so: %.o
-	$(QUIET_LINK)$(CC) $(CFLAGS) -shared -nostartfiles -o $@ $<
+$(OUTPUT)%.so: $(OUTPUT)%-in.o
+	$(QUIET_LINK)$(CC) $(CFLAGS) -shared -nostartfiles -o $@ $^
 
 define make_version.h
   (echo '/* This file is automatically generated. Do not modify. */';		\
@@ -255,40 +215,6 @@ define update_dir
    fi);
 endef
 
-## make deps
-
-all_objs := $(sort $(ALL_OBJS))
-all_deps := $(all_objs:%.o=.%.d)
-
-# let .d file also depends on the source and header files
-define check_deps
-  @set -e; $(RM) $@; \
-  $(CC) -MM $(CFLAGS) $< > $@.$$$$; \
-  sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \
-  $(RM) $@.$$$$
-endef
-
-$(all_deps): .%.d: $(src)/%.c
-	$(Q)$(call check_deps)
-
-$(all_objs) : %.o : .%.d
-
-dep_includes := $(wildcard $(all_deps))
-
-ifneq ($(dep_includes),)
- include $(dep_includes)
-endif
-
-### Detect environment changes
-TRACK_CFLAGS = $(subst ','\'',$(CFLAGS)):$(ARCH):$(CROSS_COMPILE)
-
-TRACEEVENT-CFLAGS: force
-	@FLAGS='$(TRACK_CFLAGS)'; \
-	    if test x"$$FLAGS" != x"`cat TRACEEVENT-CFLAGS 2>/dev/null`" ; then \
-		echo 1>&2 "  FLAGS:   * new build flags or cross compiler"; \
-		echo "$$FLAGS" >TRACEEVENT-CFLAGS; \
-            fi
-
 tags:	force
 	$(RM) tags
 	find . -name '*.[ch]' | xargs ctags --extra=+f --c-kinds=+px \
@@ -327,14 +253,9 @@ clean:
 		$(RM) *.o *~ $(TARGETS) *.a *.so $(VERSION_FILES) .*.d \
 		$(RM) TRACEEVENT-CFLAGS tags TAGS
 
-endif # skip-makefile
-
 PHONY += force plugins
 force:
 
-plugins:
-	@echo > /dev/null
-
 # Declare the contents of the .PHONY variable as phony.  We keep that
 # information in a variable so we can use it in if_changed and friends.
 .PHONY: $(PHONY)
diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c
index 2c0bd8f2aad0..e0917c0f5d9f 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -304,7 +304,10 @@ int pevent_register_comm(struct pevent *pevent, const char *comm, int pid)
 	if (!item)
 		return -1;
 
-	item->comm = strdup(comm);
+	if (comm)
+		item->comm = strdup(comm);
+	else
+		item->comm = strdup("<...>");
 	if (!item->comm) {
 		free(item);
 		return -1;
@@ -318,9 +321,14 @@ int pevent_register_comm(struct pevent *pevent, const char *comm, int pid)
 	return 0;
 }
 
-void pevent_register_trace_clock(struct pevent *pevent, char *trace_clock)
+int pevent_register_trace_clock(struct pevent *pevent, const char *trace_clock)
 {
-	pevent->trace_clock = trace_clock;
+	pevent->trace_clock = strdup(trace_clock);
+	if (!pevent->trace_clock) {
+		errno = ENOMEM;
+		return -1;
+	}
+	return 0;
 }
 
 struct func_map {
@@ -758,6 +766,11 @@ static void free_arg(struct print_arg *arg)
 		free_arg(arg->hex.field);
 		free_arg(arg->hex.size);
 		break;
+	case PRINT_INT_ARRAY:
+		free_arg(arg->int_array.field);
+		free_arg(arg->int_array.count);
+		free_arg(arg->int_array.el_size);
+		break;
 	case PRINT_TYPE:
 		free(arg->typecast.type);
 		free_arg(arg->typecast.item);
@@ -1926,7 +1939,22 @@ process_op(struct event_format *event, struct print_arg *arg, char **tok)
 			goto out_warn_free;
 
 		type = process_arg_token(event, right, tok, type);
-		arg->op.right = right;
+
+		if (right->type == PRINT_OP &&
+		    get_op_prio(arg->op.op) < get_op_prio(right->op.op)) {
+			struct print_arg tmp;
+
+			/* rotate ops according to the priority */
+			arg->op.right = right->op.left;
+
+			tmp = *arg;
+			*arg = *right;
+			*right = tmp;
+
+			arg->op.left = right;
+		} else {
+			arg->op.right = right;
+		}
 
 	} else if (strcmp(token, "[") == 0) {
 
@@ -2014,6 +2042,38 @@ process_entry(struct event_format *event __maybe_unused, struct print_arg *arg,
 	return EVENT_ERROR;
 }
 
+static int alloc_and_process_delim(struct event_format *event, char *next_token,
+				   struct print_arg **print_arg)
+{
+	struct print_arg *field;
+	enum event_type type;
+	char *token;
+	int ret = 0;
+
+	field = alloc_arg();
+	if (!field) {
+		do_warning_event(event, "%s: not enough memory!", __func__);
+		errno = ENOMEM;
+		return -1;
+	}
+
+	type = process_arg(event, field, &token);
+
+	if (test_type_token(type, token, EVENT_DELIM, next_token)) {
+		errno = EINVAL;
+		ret = -1;
+		free_arg(field);
+		goto out_free_token;
+	}
+
+	*print_arg = field;
+
+out_free_token:
+	free_token(token);
+
+	return ret;
+}
+
 static char *arg_eval (struct print_arg *arg);
 
 static unsigned long long
@@ -2486,49 +2546,46 @@ out_free:
 static enum event_type
 process_hex(struct event_format *event, struct print_arg *arg, char **tok)
 {
-	struct print_arg *field;
-	enum event_type type;
-	char *token = NULL;
-
 	memset(arg, 0, sizeof(*arg));
 	arg->type = PRINT_HEX;
 
-	field = alloc_arg();
-	if (!field) {
-		do_warning_event(event, "%s: not enough memory!", __func__);
-		goto out_free;
-	}
+	if (alloc_and_process_delim(event, ",", &arg->hex.field))
+		goto out;
 
-	type = process_arg(event, field, &token);
+	if (alloc_and_process_delim(event, ")", &arg->hex.size))
+		goto free_field;
 
-	if (test_type_token(type, token, EVENT_DELIM, ","))
-		goto out_free;
+	return read_token_item(tok);
 
-	arg->hex.field = field;
+free_field:
+	free_arg(arg->hex.field);
+out:
+	*tok = NULL;
+	return EVENT_ERROR;
+}
 
-	free_token(token);
+static enum event_type
+process_int_array(struct event_format *event, struct print_arg *arg, char **tok)
+{
+	memset(arg, 0, sizeof(*arg));
+	arg->type = PRINT_INT_ARRAY;
 
-	field = alloc_arg();
-	if (!field) {
-		do_warning_event(event, "%s: not enough memory!", __func__);
-		*tok = NULL;
-		return EVENT_ERROR;
-	}
+	if (alloc_and_process_delim(event, ",", &arg->int_array.field))
+		goto out;
 
-	type = process_arg(event, field, &token);
+	if (alloc_and_process_delim(event, ",", &arg->int_array.count))
+		goto free_field;
 
-	if (test_type_token(type, token, EVENT_DELIM, ")"))
-		goto out_free;
+	if (alloc_and_process_delim(event, ")", &arg->int_array.el_size))
+		goto free_size;
 
-	arg->hex.size = field;
+	return read_token_item(tok);
 
-	free_token(token);
-	type = read_token_item(tok);
-	return type;
-
- out_free:
-	free_arg(field);
-	free_token(token);
+free_size:
+	free_arg(arg->int_array.count);
+free_field:
+	free_arg(arg->int_array.field);
+out:
 	*tok = NULL;
 	return EVENT_ERROR;
 }
@@ -2828,6 +2885,10 @@ process_function(struct event_format *event, struct print_arg *arg,
 		free_token(token);
 		return process_hex(event, arg, tok);
 	}
+	if (strcmp(token, "__print_array") == 0) {
+		free_token(token);
+		return process_int_array(event, arg, tok);
+	}
 	if (strcmp(token, "__get_str") == 0) {
 		free_token(token);
 		return process_str(event, arg, tok);
@@ -3356,6 +3417,7 @@ eval_num_arg(void *data, int size, struct event_format *event, struct print_arg
 		break;
 	case PRINT_FLAGS:
 	case PRINT_SYMBOL:
+	case PRINT_INT_ARRAY:
 	case PRINT_HEX:
 		break;
 	case PRINT_TYPE:
@@ -3568,7 +3630,7 @@ static const struct flag flags[] = {
 	{ "HRTIMER_RESTART", 1 },
 };
 
-static unsigned long long eval_flag(const char *flag)
+static long long eval_flag(const char *flag)
 {
 	int i;
 
@@ -3584,7 +3646,7 @@ static unsigned long long eval_flag(const char *flag)
 		if (strcmp(flags[i].name, flag) == 0)
 			return flags[i].value;
 
-	return 0;
+	return -1LL;
 }
 
 static void print_str_to_seq(struct trace_seq *s, const char *format,
@@ -3658,7 +3720,7 @@ static void print_str_arg(struct trace_seq *s, void *data, int size,
 	struct print_flag_sym *flag;
 	struct format_field *field;
 	struct printk_map *printk;
-	unsigned long long val, fval;
+	long long val, fval;
 	unsigned long addr;
 	char *str;
 	unsigned char *hex;
@@ -3717,11 +3779,11 @@ static void print_str_arg(struct trace_seq *s, void *data, int size,
 		print = 0;
 		for (flag = arg->flags.flags; flag; flag = flag->next) {
 			fval = eval_flag(flag->value);
-			if (!val && !fval) {
+			if (!val && fval < 0) {
 				print_str_to_seq(s, format, len_arg, flag->str);
 				break;
 			}
-			if (fval && (val & fval) == fval) {
+			if (fval > 0 && (val & fval) == fval) {
 				if (print && arg->flags.delim)
 					trace_seq_puts(s, arg->flags.delim);
 				print_str_to_seq(s, format, len_arg, flag->str);
@@ -3766,6 +3828,54 @@ static void print_str_arg(struct trace_seq *s, void *data, int size,
 		}
 		break;
 
+	case PRINT_INT_ARRAY: {
+		void *num;
+		int el_size;
+
+		if (arg->int_array.field->type == PRINT_DYNAMIC_ARRAY) {
+			unsigned long offset;
+			struct format_field *field =
+				arg->int_array.field->dynarray.field;
+			offset = pevent_read_number(pevent,
+						    data + field->offset,
+						    field->size);
+			num = data + (offset & 0xffff);
+		} else {
+			field = arg->int_array.field->field.field;
+			if (!field) {
+				str = arg->int_array.field->field.name;
+				field = pevent_find_any_field(event, str);
+				if (!field)
+					goto out_warning_field;
+				arg->int_array.field->field.field = field;
+			}
+			num = data + field->offset;
+		}
+		len = eval_num_arg(data, size, event, arg->int_array.count);
+		el_size = eval_num_arg(data, size, event,
+				       arg->int_array.el_size);
+		for (i = 0; i < len; i++) {
+			if (i)
+				trace_seq_putc(s, ' ');
+
+			if (el_size == 1) {
+				trace_seq_printf(s, "%u", *(uint8_t *)num);
+			} else if (el_size == 2) {
+				trace_seq_printf(s, "%u", *(uint16_t *)num);
+			} else if (el_size == 4) {
+				trace_seq_printf(s, "%u", *(uint32_t *)num);
+			} else if (el_size == 8) {
+				trace_seq_printf(s, "%lu", *(uint64_t *)num);
+			} else {
+				trace_seq_printf(s, "BAD SIZE:%d 0x%x",
+						 el_size, *(uint8_t *)num);
+				el_size = 1;
+			}
+
+			num += el_size;
+		}
+		break;
+	}
 	case PRINT_TYPE:
 		break;
 	case PRINT_STRING: {
@@ -3997,6 +4107,10 @@ static struct print_arg *make_bprint_args(char *fmt, void *data, int size, struc
 				goto process_again;
 			case '.':
 				goto process_again;
+			case 'z':
+			case 'Z':
+				ls = 1;
+				goto process_again;
 			case 'p':
 				ls = 1;
 				/* fall through */
@@ -4939,6 +5053,96 @@ const char *pevent_data_comm_from_pid(struct pevent *pevent, int pid)
 	return comm;
 }
 
+static struct cmdline *
+pid_from_cmdlist(struct pevent *pevent, const char *comm, struct cmdline *next)
+{
+	struct cmdline_list *cmdlist = (struct cmdline_list *)next;
+
+	if (cmdlist)
+		cmdlist = cmdlist->next;
+	else
+		cmdlist = pevent->cmdlist;
+
+	while (cmdlist && strcmp(cmdlist->comm, comm) != 0)
+		cmdlist = cmdlist->next;
+
+	return (struct cmdline *)cmdlist;
+}
+
+/**
+ * pevent_data_pid_from_comm - return the pid from a given comm
+ * @pevent: a handle to the pevent
+ * @comm: the cmdline to find the pid from
+ * @next: the cmdline structure to find the next comm
+ *
+ * This returns the cmdline structure that holds a pid for a given
+ * comm, or NULL if none found. As there may be more than one pid for
+ * a given comm, the result of this call can be passed back into
+ * a recurring call in the @next paramater, and then it will find the
+ * next pid.
+ * Also, it does a linear seach, so it may be slow.
+ */
+struct cmdline *pevent_data_pid_from_comm(struct pevent *pevent, const char *comm,
+					  struct cmdline *next)
+{
+	struct cmdline *cmdline;
+
+	/*
+	 * If the cmdlines have not been converted yet, then use
+	 * the list.
+	 */
+	if (!pevent->cmdlines)
+		return pid_from_cmdlist(pevent, comm, next);
+
+	if (next) {
+		/*
+		 * The next pointer could have been still from
+		 * a previous call before cmdlines were created
+		 */
+		if (next < pevent->cmdlines ||
+		    next >= pevent->cmdlines + pevent->cmdline_count)
+			next = NULL;
+		else
+			cmdline  = next++;
+	}
+
+	if (!next)
+		cmdline = pevent->cmdlines;
+
+	while (cmdline < pevent->cmdlines + pevent->cmdline_count) {
+		if (strcmp(cmdline->comm, comm) == 0)
+			return cmdline;
+		cmdline++;
+	}
+	return NULL;
+}
+
+/**
+ * pevent_cmdline_pid - return the pid associated to a given cmdline
+ * @cmdline: The cmdline structure to get the pid from
+ *
+ * Returns the pid for a give cmdline. If @cmdline is NULL, then
+ * -1 is returned.
+ */
+int pevent_cmdline_pid(struct pevent *pevent, struct cmdline *cmdline)
+{
+	struct cmdline_list *cmdlist = (struct cmdline_list *)cmdline;
+
+	if (!cmdline)
+		return -1;
+
+	/*
+	 * If cmdlines have not been created yet, or cmdline is
+	 * not part of the array, then treat it as a cmdlist instead.
+	 */
+	if (!pevent->cmdlines ||
+	    cmdline < pevent->cmdlines ||
+	    cmdline >= pevent->cmdlines + pevent->cmdline_count)
+		return cmdlist->pid;
+
+	return cmdline->pid;
+}
+
 /**
  * pevent_data_comm_from_pid - parse the data into the print format
  * @s: the trace_seq to write to
@@ -5256,6 +5460,15 @@ static void print_args(struct print_arg *args)
 		print_args(args->hex.size);
 		printf(")");
 		break;
+	case PRINT_INT_ARRAY:
+		printf("__print_array(");
+		print_args(args->int_array.field);
+		printf(", ");
+		print_args(args->int_array.count);
+		printf(", ");
+		print_args(args->int_array.el_size);
+		printf(")");
+		break;
 	case PRINT_STRING:
 	case PRINT_BSTRING:
 		printf("__get_str(%s)", args->string.string);
@@ -6228,15 +6441,20 @@ void pevent_ref(struct pevent *pevent)
 	pevent->ref_count++;
 }
 
+void pevent_free_format_field(struct format_field *field)
+{
+	free(field->type);
+	free(field->name);
+	free(field);
+}
+
 static void free_format_fields(struct format_field *field)
 {
 	struct format_field *next;
 
 	while (field) {
 		next = field->next;
-		free(field->type);
-		free(field->name);
-		free(field);
+		pevent_free_format_field(field);
 		field = next;
 	}
 }
@@ -6341,6 +6559,7 @@ void pevent_free(struct pevent *pevent)
 		free_handler(handle);
 	}
 
+	free(pevent->trace_clock);
 	free(pevent->events);
 	free(pevent->sort_events);
 
diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h
index 7a3873ff9a4f..86a5839fb048 100644
--- a/tools/lib/traceevent/event-parse.h
+++ b/tools/lib/traceevent/event-parse.h
@@ -22,6 +22,7 @@
 
 #include <stdbool.h>
 #include <stdarg.h>
+#include <stdio.h>
 #include <regex.h>
 #include <string.h>
 
@@ -91,6 +92,7 @@ extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
 
 extern void trace_seq_terminate(struct trace_seq *s);
 
+extern int trace_seq_do_fprintf(struct trace_seq *s, FILE *fp);
 extern int trace_seq_do_printf(struct trace_seq *s);
 
 
@@ -114,7 +116,7 @@ struct pevent_plugin_option {
 	char				*name;
 	char				*plugin_alias;
 	char				*description;
-	char				*value;
+	const char			*value;
 	void				*priv;
 	int				set;
 };
@@ -152,6 +154,10 @@ struct pevent_plugin_option {
  *   .plugin_alias is used to give a shorter name to access
  *   the vairable. Useful if a plugin handles more than one event.
  *
+ *   If .value is not set, then it is considered a boolean and only
+ *   .set will be processed. If .value is defined, then it is considered
+ *   a string option and .set will be ignored.
+ *
  * PEVENT_PLUGIN_ALIAS: (optional)
  *   The name to use for finding options (uses filename if not defined)
  */
@@ -245,6 +251,12 @@ struct print_arg_hex {
 	struct print_arg	*size;
 };
 
+struct print_arg_int_array {
+	struct print_arg	*field;
+	struct print_arg	*count;
+	struct print_arg	*el_size;
+};
+
 struct print_arg_dynarray {
 	struct format_field	*field;
 	struct print_arg	*index;
@@ -273,6 +285,7 @@ enum print_arg_type {
 	PRINT_FLAGS,
 	PRINT_SYMBOL,
 	PRINT_HEX,
+	PRINT_INT_ARRAY,
 	PRINT_TYPE,
 	PRINT_STRING,
 	PRINT_BSTRING,
@@ -292,6 +305,7 @@ struct print_arg {
 		struct print_arg_flags		flags;
 		struct print_arg_symbol		symbol;
 		struct print_arg_hex		hex;
+		struct print_arg_int_array	int_array;
 		struct print_arg_func		func;
 		struct print_arg_string		string;
 		struct print_arg_bitmask	bitmask;
@@ -597,7 +611,7 @@ enum trace_flag_type {
 };
 
 int pevent_register_comm(struct pevent *pevent, const char *comm, int pid);
-void pevent_register_trace_clock(struct pevent *pevent, char *trace_clock);
+int pevent_register_trace_clock(struct pevent *pevent, const char *trace_clock);
 int pevent_register_function(struct pevent *pevent, char *name,
 			     unsigned long long addr, char *mod);
 int pevent_register_print_string(struct pevent *pevent, const char *fmt,
@@ -617,6 +631,7 @@ enum pevent_errno pevent_parse_format(struct pevent *pevent,
 				      const char *buf,
 				      unsigned long size, const char *sys);
 void pevent_free_format(struct event_format *event);
+void pevent_free_format_field(struct format_field *field);
 
 void *pevent_get_field_raw(struct trace_seq *s, struct event_format *event,
 			   const char *name, struct pevent_record *record,
@@ -675,6 +690,11 @@ int pevent_data_type(struct pevent *pevent, struct pevent_record *rec);
 struct event_format *pevent_data_event_from_type(struct pevent *pevent, int type);
 int pevent_data_pid(struct pevent *pevent, struct pevent_record *rec);
 const char *pevent_data_comm_from_pid(struct pevent *pevent, int pid);
+struct cmdline;
+struct cmdline *pevent_data_pid_from_comm(struct pevent *pevent, const char *comm,
+					  struct cmdline *next);
+int pevent_cmdline_pid(struct pevent *pevent, struct cmdline *cmdline);
+
 void pevent_event_info(struct trace_seq *s, struct event_format *event,
 		       struct pevent_record *record);
 int pevent_strerror(struct pevent *pevent, enum pevent_errno errnum,
diff --git a/tools/lib/traceevent/event-plugin.c b/tools/lib/traceevent/event-plugin.c
index 136162c03af1..a16756ae3526 100644
--- a/tools/lib/traceevent/event-plugin.c
+++ b/tools/lib/traceevent/event-plugin.c
@@ -18,6 +18,7 @@
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
+#include <ctype.h>
 #include <stdio.h>
 #include <string.h>
 #include <dlfcn.h>
@@ -49,6 +50,52 @@ struct plugin_list {
 	void			*handle;
 };
 
+static void lower_case(char *str)
+{
+	if (!str)
+		return;
+	for (; *str; str++)
+		*str = tolower(*str);
+}
+
+static int update_option_value(struct pevent_plugin_option *op, const char *val)
+{
+	char *op_val;
+
+	if (!val) {
+		/* toggle, only if option is boolean */
+		if (op->value)
+			/* Warn? */
+			return 0;
+		op->set ^= 1;
+		return 0;
+	}
+
+	/*
+	 * If the option has a value then it takes a string
+	 * otherwise the option is a boolean.
+	 */
+	if (op->value) {
+		op->value = val;
+		return 0;
+	}
+
+	/* Option is boolean, must be either "1", "0", "true" or "false" */
+
+	op_val = strdup(val);
+	if (!op_val)
+		return -1;
+	lower_case(op_val);
+
+	if (strcmp(val, "1") == 0 || strcmp(val, "true") == 0)
+		op->set = 1;
+	else if (strcmp(val, "0") == 0 || strcmp(val, "false") == 0)
+		op->set = 0;
+	free(op_val);
+
+	return 0;
+}
+
 /**
  * traceevent_plugin_list_options - get list of plugin options
  *
@@ -120,6 +167,7 @@ update_option(const char *file, struct pevent_plugin_option *option)
 {
 	struct trace_plugin_options *op;
 	char *plugin;
+	int ret = 0;
 
 	if (option->plugin_alias) {
 		plugin = strdup(option->plugin_alias);
@@ -144,9 +192,10 @@ update_option(const char *file, struct pevent_plugin_option *option)
 		if (strcmp(op->option, option->name) != 0)
 			continue;
 
-		option->value = op->value;
-		option->set ^= 1;
-		goto out;
+		ret = update_option_value(option, op->value);
+		if (ret)
+			goto out;
+		break;
 	}
 
 	/* first look for unnamed options */
@@ -156,14 +205,13 @@ update_option(const char *file, struct pevent_plugin_option *option)
 		if (strcmp(op->option, option->name) != 0)
 			continue;
 
-		option->value = op->value;
-		option->set ^= 1;
+		ret = update_option_value(option, op->value);
 		break;
 	}
 
  out:
 	free(plugin);
-	return 0;
+	return ret;
 }
 
 /**
diff --git a/tools/lib/traceevent/kbuffer-parse.c b/tools/lib/traceevent/kbuffer-parse.c
index dcc665228c71..3bcada3ae05a 100644
--- a/tools/lib/traceevent/kbuffer-parse.c
+++ b/tools/lib/traceevent/kbuffer-parse.c
@@ -372,7 +372,6 @@ translate_data(struct kbuffer *kbuf, void *data, void **rptr,
 	switch (type_len) {
 	case KBUFFER_TYPE_PADDING:
 		*length = read_4(kbuf, data);
-		data += *length;
 		break;
 
 	case KBUFFER_TYPE_TIME_EXTEND:
@@ -730,3 +729,14 @@ void kbuffer_set_old_format(struct kbuffer *kbuf)
 
 	kbuf->next_event = __old_next_event;
 }
+
+/**
+ * kbuffer_start_of_data - return offset of where data starts on subbuffer
+ * @kbuf:	The kbuffer
+ *
+ * Returns the location on the subbuffer where the data starts.
+ */
+int kbuffer_start_of_data(struct kbuffer *kbuf)
+{
+	return kbuf->start;
+}
diff --git a/tools/lib/traceevent/kbuffer.h b/tools/lib/traceevent/kbuffer.h
index c831f64b17a0..03dce757553f 100644
--- a/tools/lib/traceevent/kbuffer.h
+++ b/tools/lib/traceevent/kbuffer.h
@@ -63,5 +63,6 @@ int kbuffer_missed_events(struct kbuffer *kbuf);
 int kbuffer_subbuffer_size(struct kbuffer *kbuf);
 
 void kbuffer_set_old_format(struct kbuffer *kbuf);
+int kbuffer_start_of_data(struct kbuffer *kbuf);
 
 #endif /* _K_BUFFER_H */
diff --git a/tools/lib/traceevent/parse-filter.c b/tools/lib/traceevent/parse-filter.c
index b50234402fc2..0144b3d1bb77 100644
--- a/tools/lib/traceevent/parse-filter.c
+++ b/tools/lib/traceevent/parse-filter.c
@@ -1058,6 +1058,7 @@ process_filter(struct event_format *event, struct filter_arg **parg,
 					*parg = current_op;
 				else
 					*parg = current_exp;
+				free(token);
 				return PEVENT_ERRNO__UNBALANCED_PAREN;
 			}
 			break;
@@ -1168,6 +1169,7 @@ process_filter(struct event_format *event, struct filter_arg **parg,
 
 	*parg = current_op;
 
+	free(token);
 	return 0;
 
  fail_alloc:
diff --git a/tools/lib/traceevent/trace-seq.c b/tools/lib/traceevent/trace-seq.c
index ec3bd16a5488..292dc9f1d233 100644
--- a/tools/lib/traceevent/trace-seq.c
+++ b/tools/lib/traceevent/trace-seq.c
@@ -231,19 +231,24 @@ void trace_seq_terminate(struct trace_seq *s)
 	s->buffer[s->len] = 0;
 }
 
-int trace_seq_do_printf(struct trace_seq *s)
+int trace_seq_do_fprintf(struct trace_seq *s, FILE *fp)
 {
 	TRACE_SEQ_CHECK(s);
 
 	switch (s->state) {
 	case TRACE_SEQ__GOOD:
-		return printf("%.*s", s->len, s->buffer);
+		return fprintf(fp, "%.*s", s->len, s->buffer);
 	case TRACE_SEQ__BUFFER_POISONED:
-		puts("Usage of trace_seq after it was destroyed");
+		fprintf(fp, "%s\n", "Usage of trace_seq after it was destroyed");
 		break;
 	case TRACE_SEQ__MEM_ALLOC_FAILED:
-		puts("Can't allocate trace_seq buffer memory");
+		fprintf(fp, "%s\n", "Can't allocate trace_seq buffer memory");
 		break;
 	}
 	return -1;
 }
+
+int trace_seq_do_printf(struct trace_seq *s)
+{
+	return trace_seq_do_fprintf(s, stdout);
+}
diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore
index 40399c3d97d6..812f904193e8 100644
--- a/tools/perf/.gitignore
+++ b/tools/perf/.gitignore
@@ -1,6 +1,7 @@
 PERF-CFLAGS
 PERF-GUI-VARS
 PERF-VERSION-FILE
+FEATURE-DUMP
 perf
 perf-read-vdso32
 perf-read-vdsox32
diff --git a/tools/perf/Build b/tools/perf/Build
new file mode 100644
index 000000000000..b77370ef7005
--- /dev/null
+++ b/tools/perf/Build
@@ -0,0 +1,44 @@
+perf-y += builtin-bench.o
+perf-y += builtin-annotate.o
+perf-y += builtin-diff.o
+perf-y += builtin-evlist.o
+perf-y += builtin-help.o
+perf-y += builtin-sched.o
+perf-y += builtin-buildid-list.o
+perf-y += builtin-buildid-cache.o
+perf-y += builtin-list.o
+perf-y += builtin-record.o
+perf-y += builtin-report.o
+perf-y += builtin-stat.o
+perf-y += builtin-timechart.o
+perf-y += builtin-top.o
+perf-y += builtin-script.o
+perf-y += builtin-kmem.o
+perf-y += builtin-lock.o
+perf-y += builtin-kvm.o
+perf-y += builtin-inject.o
+perf-y += builtin-mem.o
+perf-y += builtin-data.o
+
+perf-$(CONFIG_AUDIT) += builtin-trace.o
+perf-$(CONFIG_LIBELF) += builtin-probe.o
+
+perf-y += bench/
+perf-y += tests/
+
+perf-y += perf.o
+
+paths += -DPERF_HTML_PATH="BUILD_STR($(htmldir_SQ))"
+paths += -DPERF_INFO_PATH="BUILD_STR($(infodir_SQ))"
+paths += -DPERF_MAN_PATH="BUILD_STR($(mandir_SQ))"
+
+CFLAGS_builtin-help.o      += $(paths)
+CFLAGS_builtin-timechart.o += $(paths)
+CFLAGS_perf.o              += -DPERF_HTML_PATH="BUILD_STR($(htmldir_SQ))" -include $(OUTPUT)PERF-VERSION-FILE
+
+libperf-y += util/
+libperf-y += arch/
+libperf-y += ui/
+libperf-y += scripts/
+
+gtk-y += ui/gtk/
diff --git a/tools/perf/Documentation/Build.txt b/tools/perf/Documentation/Build.txt
new file mode 100644
index 000000000000..f6fc6507ba55
--- /dev/null
+++ b/tools/perf/Documentation/Build.txt
@@ -0,0 +1,49 @@
+
+1) perf build
+=============
+The perf build process consists of several separated building blocks,
+which are linked together to form the perf binary:
+  - libperf library (static)
+  - perf builtin commands
+  - traceevent library (static)
+  - GTK ui library
+
+Several makefiles govern the perf build:
+
+  - Makefile
+    top level Makefile working as a wrapper that calls the main
+    Makefile.perf with a -j option to do parallel builds.
+
+  - Makefile.perf
+    main makefile that triggers build of all perf objects including
+    installation and documentation processing.
+
+  - tools/build/Makefile.build
+    main makefile of the build framework
+
+  - tools/build/Build.include
+    build framework generic definitions
+
+  - Build makefiles
+    makefiles that defines build objects
+
+Please refer to tools/build/Documentation/Build.txt for more
+information about build framework.
+
+
+2) perf build
+=============
+The Makefile.perf triggers the build framework for build objects:
+   perf, libperf, gtk
+
+resulting in following objects:
+  $ ls  *-in.o
+  gtk-in.o  libperf-in.o  perf-in.o
+
+Those objects are then used in final linking:
+  libperf-gtk.so <- gtk-in.o  libperf-in.o
+  perf           <- perf-in.o libperf-in.o
+
+
+NOTE this description is omitting other libraries involved, only
+     focusing on build framework outcomes
diff --git a/tools/perf/Documentation/perf-buildid-cache.txt b/tools/perf/Documentation/perf-buildid-cache.txt
index 0294c57b1f5e..dd07b55f58d8 100644
--- a/tools/perf/Documentation/perf-buildid-cache.txt
+++ b/tools/perf/Documentation/perf-buildid-cache.txt
@@ -12,9 +12,9 @@ SYNOPSIS
 
 DESCRIPTION
 -----------
-This command manages the build-id cache. It can add and remove files to/from
-the cache. In the future it should as well purge older entries, set upper
-limits for the space used by the cache, etc.
+This command manages the build-id cache. It can add, remove, update and purge
+files to/from the cache. In the future it should as well set upper limits for
+the space used by the cache, etc.
 
 OPTIONS
 -------
@@ -36,14 +36,24 @@ OPTIONS
         actually made.
 -r::
 --remove=::
-        Remove specified file from the cache.
+        Remove a cached binary which has same build-id of specified file
+        from the cache.
+-p::
+--purge=::
+        Purge all cached binaries including older caches which have specified
+	path from the cache.
 -M::
 --missing=::
 	List missing build ids in the cache for the specified file.
 -u::
---update::
-	Update specified file of the cache. It can be used to update kallsyms
-	kernel dso to vmlinux in order to support annotation.
+--update=::
+	Update specified file of the cache. Note that this doesn't remove
+	older entires since those may be still needed for annotating old
+	(or remote) perf.data. Only if there is already a cache which has
+	exactly same build-id, that is replaced by new one. It can be used
+	to update kallsyms and kernel dso to vmlinux in order to support
+	annotation.
+
 -v::
 --verbose::
 	Be more verbose.
diff --git a/tools/perf/Documentation/perf-data.txt b/tools/perf/Documentation/perf-data.txt
new file mode 100644
index 000000000000..be8fa1a0a97e
--- /dev/null
+++ b/tools/perf/Documentation/perf-data.txt
@@ -0,0 +1,40 @@
+perf-data(1)
+==============
+
+NAME
+----
+perf-data - Data file related processing
+
+SYNOPSIS
+--------
+[verse]
+'perf data' [<common options>] <command> [<options>]",
+
+DESCRIPTION
+-----------
+Data file related processing.
+
+COMMANDS
+--------
+convert::
+	Converts perf data file into another format (only CTF [1] format is support by now).
+	It's possible to set data-convert debug variable to get debug messages from conversion,
+	like:
+	  perf --debug data-convert data convert ...
+
+OPTIONS for 'convert'
+---------------------
+--to-ctf::
+	Triggers the CTF conversion, specify the path of CTF data directory.
+
+-i::
+	Specify input perf data file path.
+
+-v::
+--verbose::
+        Be more verbose (show counter open errors, etc).
+
+SEE ALSO
+--------
+linkperf:perf[1]
+[1] Common Trace Format - http://www.efficios.com/ctf
diff --git a/tools/perf/Documentation/perf-diff.txt b/tools/perf/Documentation/perf-diff.txt
index e463caa3eb49..d1deb573877f 100644
--- a/tools/perf/Documentation/perf-diff.txt
+++ b/tools/perf/Documentation/perf-diff.txt
@@ -20,12 +20,20 @@ If no parameters are passed it will assume perf.data.old and perf.data.
 The differential profile is displayed only for events matching both
 specified perf.data files.
 
+If no parameters are passed the samples will be sorted by dso and symbol.
+As the perf.data files could come from different binaries, the symbols addresses
+could vary. So perf diff is based on the comparison of the files and
+symbols name.
+
 OPTIONS
 -------
 -D::
 --dump-raw-trace::
         Dump raw trace in ASCII.
 
+--kallsyms=<file>::
+        kallsyms pathname
+
 -m::
 --modules::
         Load module symbols. WARNING: use only with -k and LIVE kernel
diff --git a/tools/perf/Documentation/perf-kmem.txt b/tools/perf/Documentation/perf-kmem.txt
index 7c8fbbf3f61c..150253cc3c97 100644
--- a/tools/perf/Documentation/perf-kmem.txt
+++ b/tools/perf/Documentation/perf-kmem.txt
@@ -25,6 +25,10 @@ OPTIONS
 --input=<file>::
 	Select the input file (default: perf.data unless stdin is a fifo)
 
+-v::
+--verbose::
+        Be more verbose. (show symbol address, etc)
+
 --caller::
 	Show per-callsite statistics
 
diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
index 3e2aec94f806..bada8933fdd4 100644
--- a/tools/perf/Documentation/perf-list.txt
+++ b/tools/perf/Documentation/perf-list.txt
@@ -26,6 +26,7 @@ counted. The following modifiers exist:
  u - user-space counting
  k - kernel counting
  h - hypervisor counting
+ I - non idle counting
  G - guest counting (in KVM guests)
  H - host counting (not in KVM guests)
  p - precise level
@@ -127,6 +128,12 @@ To limit the list use:
 One or more types can be used at the same time, listing the events for the
 types specified.
 
+Support raw format:
+
+. '--raw-dump', shows the raw-dump of all the events.
+. '--raw-dump [hw|sw|cache|tracepoint|pmu|event_glob]', shows the raw-dump of
+  a certain kind of events.
+
 SEE ALSO
 --------
 linkperf:perf-stat[1], linkperf:perf-top[1],
diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
index aaa869be3dc1..239609c09f83 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -47,6 +47,12 @@ OPTIONS
 -v::
 --verbose::
         Be more verbose (show parsed arguments, etc).
+	Can not use with -q.
+
+-q::
+--quiet::
+	Be quiet (do not show any messages including errors).
+	Can not use with -v.
 
 -a::
 --add=::
@@ -96,7 +102,7 @@ OPTIONS
 	Dry run. With this option, --add and --del doesn't execute actual
 	adding and removal operations.
 
---max-probes::
+--max-probes=NUM::
 	Set the maximum number of probe points for an event. Default is 128.
 
 -x::
@@ -104,8 +110,13 @@ OPTIONS
 	Specify path to the executable or shared library file for user
 	space tracing. Can also be used with --funcs option.
 
+--demangle::
+	Demangle application symbols. --no-demangle is also available
+	for disabling demangling.
+
 --demangle-kernel::
-	Demangle kernel symbols.
+	Demangle kernel symbols. --no-demangle-kernel is also available
+	for disabling kernel demangling.
 
 In absence of -m/-x options, perf probe checks if the first argument after
 the options is an absolute path name. If its an absolute path, perf probe
@@ -137,6 +148,7 @@ Each probe argument follows below syntax.
  [NAME=]LOCALVAR|$retval|%REG|@SYMBOL[:TYPE]
 
 'NAME' specifies the name of this argument (optional). You can use the name of local variable, local data structure member (e.g. var->field, var.field2), local array with fixed index (e.g. array[1], var->array[0], var->pointer[2]), or kprobe-tracer argument format (e.g. $retval, %ax, etc). Note that the name of this argument will be set as the last member name if you specify a local data structure member (e.g. field2 for 'var->field1.field2'.)
+'$vars' special argument is also available for NAME, it is expanded to the local variables which can access at given probe point.
 'TYPE' casts the type of this argument (optional). If omitted, perf probe automatically set the type based on debuginfo. You can specify 'string' type only for the local variable or structure member which is an array of or a pointer to 'char' or 'unsigned char' type.
 
 On x86 systems %REG is always the short form of the register: for example %AX. %RAX or %EAX is not valid.
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 31e977459c51..4847a793de65 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -55,6 +55,11 @@ OPTIONS
           If you want to profile write accesses in [0x1000~1008), just set
           'mem:0x1000/8:w'.
 
+	- a group of events surrounded by a pair of brace ("{event1,event2,...}").
+	  Each event is separated by commas and the group should be quoted to
+	  prevent the shell interpretation.  You also need to use --group on
+	  "perf report" to view group events together.
+
 --filter=<filter>::
         Event filter.
 
@@ -62,9 +67,6 @@ OPTIONS
 --all-cpus::
         System-wide collection from all CPUs.
 
--l::
-        Scale counter values.
-
 -p::
 --pid=::
 	Record events on existing process ID (comma separated list).
@@ -107,6 +109,10 @@ OPTIONS
 	specification with appended unit character - B/K/M/G. The
 	size is rounded up to have nearest pages power of two value.
 
+--group::
+	Put all events in a single event group.  This precedes the --event
+	option and remains only for backward compatibility.  See --event.
+
 -g::
 	Enables call-graph (stack chain/backtrace) recording.
 
@@ -115,13 +121,19 @@ OPTIONS
 	implies -g.
 
 	Allows specifying "fp" (frame pointer) or "dwarf"
-	(DWARF's CFI - Call Frame Information) as the method to collect
+	(DWARF's CFI - Call Frame Information) or "lbr"
+	(Hardware Last Branch Record facility) as the method to collect
 	the information used to show the call graphs.
 
 	In some systems, where binaries are build with gcc
 	--fomit-frame-pointer, using the "fp" method will produce bogus
 	call graphs, using "dwarf", if available (perf tools linked to
 	the libunwind library) should be used instead.
+	Using the "lbr" method doesn't require any compiler options. It
+	will produce call graphs from the hardware LBR registers. The
+	main limition is that it is only available on new Intel
+	platforms, such as Haswell. It can only get user call chain. It
+	doesn't work with branch stack sampling at the same time.
 
 -q::
 --quiet::
@@ -235,6 +247,16 @@ Capture machine state (registers) at interrupt, i.e., on counter overflows for
 each sample. List of captured registers depends on the architecture. This option
 is off by default.
 
+--running-time::
+Record running and enabled time for read events (:S)
+
+-k::
+--clockid::
+Sets the clock id to use for the various time fields in the perf_event_type
+records. See clock_gettime(). In particular CLOCK_MONOTONIC and
+CLOCK_MONOTONIC_RAW are supported, some events might also allow
+CLOCK_BOOTTIME, CLOCK_REALTIME and CLOCK_TAI.
+
 SEE ALSO
 --------
 linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index dd7cccdde498..4879cf638824 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -40,6 +40,11 @@ OPTIONS
 	Only consider symbols in these comms. CSV that understands
 	file://filename entries.  This option will affect the percentage of
 	the overhead column.  See --percentage for more info.
+--pid=::
+        Only show events for given process ID (comma separated list).
+
+--tid=::
+        Only show events for given thread ID (comma separated list).
 -d::
 --dsos=::
 	Only consider symbols in these dsos. CSV that understands
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index a21eec05bc42..79445750fcb3 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -193,6 +193,12 @@ OPTIONS
 	Only display events for these comms. CSV that understands
 	file://filename entries.
 
+--pid=::
+	Only show events for given process ID (comma separated list).
+
+--tid=::
+	Only show events for given thread ID (comma separated list).
+
 -I::
 --show-info::
 	Display extended information about the perf.data file. This adds
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
index 7e1b1f2bb83c..ba03fd5d1a54 100644
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -55,6 +55,9 @@ OPTIONS
 --uid=::
         Record events in threads owned by uid. Name or number.
 
+--filter-pids=::
+	Filter out events for these pids and for 'trace' itself (comma separated list).
+
 -v::
 --verbose=::
         Verbosity level.
@@ -115,6 +118,9 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
 --syscalls::
 	Trace system calls. This options is enabled by default.
 
+--event::
+	Trace other events, see 'perf list' for a complete list.
+
 PAGEFAULTS
 ----------
 
diff --git a/tools/perf/Documentation/perf.txt b/tools/perf/Documentation/perf.txt
index 1e8e400b4493..2b131776363e 100644
--- a/tools/perf/Documentation/perf.txt
+++ b/tools/perf/Documentation/perf.txt
@@ -13,11 +13,16 @@ SYNOPSIS
 OPTIONS
 -------
 --debug::
-	Setup debug variable (just verbose for now) in value
+	Setup debug variable (see list below) in value
 	range (0, 10). Use like:
 	  --debug verbose   # sets verbose = 1
 	  --debug verbose=2 # sets verbose = 2
 
+	List of debug variables allowed to set:
+	  verbose          - general debug messages
+	  ordered-events   - ordered events object debug messages
+	  data-convert     - data convert command debug messages
+
 --buildid-dir::
 	Setup buildid cache directory. It has higher priority than
 	buildid.dir config file option.
diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST
index fbbfdc39271d..11ccbb22ea2b 100644
--- a/tools/perf/MANIFEST
+++ b/tools/perf/MANIFEST
@@ -1,5 +1,6 @@
 tools/perf
 tools/scripts
+tools/build
 tools/lib/traceevent
 tools/lib/api
 tools/lib/symbol/kallsyms.c
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index cb2e5868c8e8..c699dc35eef9 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -24,8 +24,8 @@ unexport MAKEFLAGS
 # (To override it, run 'make JOBS=1' and similar.)
 #
 ifeq ($(JOBS),)
-  JOBS := $(shell grep -c ^processor /proc/cpuinfo 2>/dev/null)
-  ifeq ($(JOBS),)
+  JOBS := $(shell egrep -c '^processor|^CPU' /proc/cpuinfo 2>/dev/null)
+  ifeq ($(JOBS),0)
     JOBS := 1
   endif
 endif
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index aa6a50447c32..c43a20517591 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -68,7 +68,11 @@ include config/utilities.mak
 # for reading the x32 mode 32-bit compatibility VDSO in 64-bit mode
 #
 # Define NO_ZLIB if you do not want to support compressed kernel modules
-
+#
+# Define LIBBABELTRACE if you DO want libbabeltrace support
+# for CTF data format.
+#
+# Define NO_LZMA if you do not want to support compressed (xz) kernel modules
 
 ifeq ($(srctree),)
 srctree := $(patsubst %/,%,$(dir $(shell pwd)))
@@ -82,13 +86,29 @@ endif
 
 ifneq ($(OUTPUT),)
 #$(info Determined 'OUTPUT' to be $(OUTPUT))
+# Adding $(OUTPUT) as a directory to look for source files,
+# because use generated output files as sources dependency
+# for flex/bison parsers.
+VPATH += $(OUTPUT)
+export VPATH
 endif
 
+ifeq ($(V),1)
+  Q =
+else
+  Q = @
+endif
+
+# Do not use make's built-in rules
+# (this improves performance and avoids hard-to-debug behaviour);
+MAKEFLAGS += -r
+
 $(OUTPUT)PERF-VERSION-FILE: ../../.git/HEAD
-	@$(SHELL_PATH) util/PERF-VERSION-GEN $(OUTPUT)
-	@touch $(OUTPUT)PERF-VERSION-FILE
+	$(Q)$(SHELL_PATH) util/PERF-VERSION-GEN $(OUTPUT)
+	$(Q)touch $(OUTPUT)PERF-VERSION-FILE
 
 CC = $(CROSS_COMPILE)gcc
+LD = $(CROSS_COMPILE)ld
 AR = $(CROSS_COMPILE)ar
 PKG_CONFIG = $(CROSS_COMPILE)pkg-config
 
@@ -127,10 +147,6 @@ export prefix bindir sharedir sysconfdir DESTDIR
 SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__
 
 # Guard against environment variables
-BUILTIN_OBJS =
-LIB_H =
-LIB_OBJS =
-GTK_OBJS =
 PYRF_OBJS =
 SCRIPT_SH =
 
@@ -155,8 +171,8 @@ endif
 LIBTRACEEVENT = $(TE_PATH)libtraceevent.a
 export LIBTRACEEVENT
 
-LIBAPIKFS = $(LIB_PATH)libapikfs.a
-export LIBAPIKFS
+LIBAPI = $(LIB_PATH)libapi.a
+export LIBAPI
 
 # python extension build directories
 PYTHON_EXTBUILD     := $(OUTPUT)python_ext_build/
@@ -167,7 +183,7 @@ export PYTHON_EXTBUILD_LIB PYTHON_EXTBUILD_TMP
 python-clean := $(call QUIET_CLEAN, python) $(RM) -r $(PYTHON_EXTBUILD) $(OUTPUT)python/perf.so
 
 PYTHON_EXT_SRCS := $(shell grep -v ^\# util/python-ext-sources)
-PYTHON_EXT_DEPS := util/python-ext-sources util/setup.py $(LIBTRACEEVENT) $(LIBAPIKFS)
+PYTHON_EXT_DEPS := util/python-ext-sources util/setup.py $(LIBTRACEEVENT) $(LIBAPI)
 
 $(OUTPUT)python/perf.so: $(PYTHON_EXT_SRCS) $(PYTHON_EXT_DEPS)
 	$(QUIET_GEN)CFLAGS='$(CFLAGS)' $(PYTHON_WORD) util/setup.py \
@@ -206,297 +222,9 @@ endif
 
 export PERL_PATH
 
-$(OUTPUT)util/parse-events-flex.c: util/parse-events.l $(OUTPUT)util/parse-events-bison.c
-	$(QUIET_FLEX)$(FLEX) -o $@ --header-file=$(OUTPUT)util/parse-events-flex.h $(PARSER_DEBUG_FLEX) util/parse-events.l
-
-$(OUTPUT)util/parse-events-bison.c: util/parse-events.y
-	$(QUIET_BISON)$(BISON) -v util/parse-events.y -d $(PARSER_DEBUG_BISON) -o $(OUTPUT)util/parse-events-bison.c -p parse_events_
-
-$(OUTPUT)util/pmu-flex.c: util/pmu.l $(OUTPUT)util/pmu-bison.c
-	$(QUIET_FLEX)$(FLEX) -o $@ --header-file=$(OUTPUT)util/pmu-flex.h util/pmu.l
-
-$(OUTPUT)util/pmu-bison.c: util/pmu.y
-	$(QUIET_BISON)$(BISON) -v util/pmu.y -d -o $(OUTPUT)util/pmu-bison.c -p perf_pmu_
-
-$(OUTPUT)util/parse-events.o: $(OUTPUT)util/parse-events-flex.c $(OUTPUT)util/parse-events-bison.c
-$(OUTPUT)util/pmu.o: $(OUTPUT)util/pmu-flex.c $(OUTPUT)util/pmu-bison.c
-
 LIB_FILE=$(OUTPUT)libperf.a
 
-LIB_H += ../lib/symbol/kallsyms.h
-LIB_H += ../../include/uapi/linux/perf_event.h
-LIB_H += ../../include/linux/rbtree.h
-LIB_H += ../../include/linux/list.h
-LIB_H += ../../include/uapi/linux/const.h
-LIB_H += ../include/linux/hash.h
-LIB_H += ../../include/linux/stringify.h
-LIB_H += util/include/linux/bitmap.h
-LIB_H += ../include/linux/bitops.h
-LIB_H += ../include/asm-generic/bitops/arch_hweight.h
-LIB_H += ../include/asm-generic/bitops/atomic.h
-LIB_H += ../include/asm-generic/bitops/const_hweight.h
-LIB_H += ../include/asm-generic/bitops/find.h
-LIB_H += ../include/asm-generic/bitops/fls64.h
-LIB_H += ../include/asm-generic/bitops/fls.h
-LIB_H += ../include/asm-generic/bitops/__ffs.h
-LIB_H += ../include/asm-generic/bitops/__fls.h
-LIB_H += ../include/asm-generic/bitops/hweight.h
-LIB_H += ../include/asm-generic/bitops.h
-LIB_H += ../include/linux/compiler.h
-LIB_H += ../include/linux/log2.h
-LIB_H += util/include/linux/const.h
-LIB_H += util/include/linux/ctype.h
-LIB_H += util/include/linux/kernel.h
-LIB_H += util/include/linux/list.h
-LIB_H += ../include/linux/export.h
-LIB_H += util/include/linux/poison.h
-LIB_H += util/include/linux/rbtree.h
-LIB_H += util/include/linux/rbtree_augmented.h
-LIB_H += util/include/linux/string.h
-LIB_H += ../include/linux/types.h
-LIB_H += util/include/linux/linkage.h
-LIB_H += util/include/asm/asm-offsets.h
-LIB_H += ../include/asm/bug.h
-LIB_H += util/include/asm/byteorder.h
-LIB_H += util/include/asm/swab.h
-LIB_H += util/include/asm/system.h
-LIB_H += util/include/asm/uaccess.h
-LIB_H += util/include/dwarf-regs.h
-LIB_H += util/include/asm/dwarf2.h
-LIB_H += util/include/asm/cpufeature.h
-LIB_H += util/include/asm/unistd_32.h
-LIB_H += util/include/asm/unistd_64.h
-LIB_H += perf.h
-LIB_H += util/annotate.h
-LIB_H += util/cache.h
-LIB_H += util/callchain.h
-LIB_H += util/build-id.h
-LIB_H += util/db-export.h
-LIB_H += util/debug.h
-LIB_H += util/pmu.h
-LIB_H += util/event.h
-LIB_H += util/evsel.h
-LIB_H += util/evlist.h
-LIB_H += util/exec_cmd.h
-LIB_H += util/find-vdso-map.c
-LIB_H += util/levenshtein.h
-LIB_H += util/machine.h
-LIB_H += util/map.h
-LIB_H += util/parse-options.h
-LIB_H += util/parse-events.h
-LIB_H += util/quote.h
-LIB_H += util/util.h
-LIB_H += util/xyarray.h
-LIB_H += util/header.h
-LIB_H += util/help.h
-LIB_H += util/session.h
-LIB_H += util/ordered-events.h
-LIB_H += util/strbuf.h
-LIB_H += util/strlist.h
-LIB_H += util/strfilter.h
-LIB_H += util/svghelper.h
-LIB_H += util/tool.h
-LIB_H += util/run-command.h
-LIB_H += util/sigchain.h
-LIB_H += util/dso.h
-LIB_H += util/symbol.h
-LIB_H += util/color.h
-LIB_H += util/values.h
-LIB_H += util/sort.h
-LIB_H += util/hist.h
-LIB_H += util/comm.h
-LIB_H += util/thread.h
-LIB_H += util/thread_map.h
-LIB_H += util/trace-event.h
-LIB_H += util/probe-finder.h
-LIB_H += util/dwarf-aux.h
-LIB_H += util/probe-event.h
-LIB_H += util/pstack.h
-LIB_H += util/cpumap.h
-LIB_H += util/top.h
-LIB_H += $(ARCH_INCLUDE)
-LIB_H += util/cgroup.h
-LIB_H += $(LIB_INCLUDE)traceevent/event-parse.h
-LIB_H += util/target.h
-LIB_H += util/rblist.h
-LIB_H += util/intlist.h
-LIB_H += util/perf_regs.h
-LIB_H += util/unwind.h
-LIB_H += util/vdso.h
-LIB_H += util/tsc.h
-LIB_H += ui/helpline.h
-LIB_H += ui/progress.h
-LIB_H += ui/util.h
-LIB_H += ui/ui.h
-LIB_H += util/data.h
-LIB_H += util/kvm-stat.h
-LIB_H += util/thread-stack.h
-
-LIB_OBJS += $(OUTPUT)util/abspath.o
-LIB_OBJS += $(OUTPUT)util/alias.o
-LIB_OBJS += $(OUTPUT)util/annotate.o
-LIB_OBJS += $(OUTPUT)util/build-id.o
-LIB_OBJS += $(OUTPUT)util/config.o
-LIB_OBJS += $(OUTPUT)util/ctype.o
-LIB_OBJS += $(OUTPUT)util/db-export.o
-LIB_OBJS += $(OUTPUT)util/pmu.o
-LIB_OBJS += $(OUTPUT)util/environment.o
-LIB_OBJS += $(OUTPUT)util/event.o
-LIB_OBJS += $(OUTPUT)util/evlist.o
-LIB_OBJS += $(OUTPUT)util/evsel.o
-LIB_OBJS += $(OUTPUT)util/exec_cmd.o
-LIB_OBJS += $(OUTPUT)util/find_next_bit.o
-LIB_OBJS += $(OUTPUT)util/help.o
-LIB_OBJS += $(OUTPUT)util/kallsyms.o
-LIB_OBJS += $(OUTPUT)util/levenshtein.o
-LIB_OBJS += $(OUTPUT)util/parse-options.o
-LIB_OBJS += $(OUTPUT)util/parse-events.o
-LIB_OBJS += $(OUTPUT)util/path.o
-LIB_OBJS += $(OUTPUT)util/rbtree.o
-LIB_OBJS += $(OUTPUT)util/bitmap.o
-LIB_OBJS += $(OUTPUT)util/hweight.o
-LIB_OBJS += $(OUTPUT)util/run-command.o
-LIB_OBJS += $(OUTPUT)util/quote.o
-LIB_OBJS += $(OUTPUT)util/strbuf.o
-LIB_OBJS += $(OUTPUT)util/string.o
-LIB_OBJS += $(OUTPUT)util/strlist.o
-LIB_OBJS += $(OUTPUT)util/strfilter.o
-LIB_OBJS += $(OUTPUT)util/top.o
-LIB_OBJS += $(OUTPUT)util/usage.o
-LIB_OBJS += $(OUTPUT)util/wrapper.o
-LIB_OBJS += $(OUTPUT)util/sigchain.o
-LIB_OBJS += $(OUTPUT)util/dso.o
-LIB_OBJS += $(OUTPUT)util/symbol.o
-LIB_OBJS += $(OUTPUT)util/symbol-elf.o
-LIB_OBJS += $(OUTPUT)util/color.o
-LIB_OBJS += $(OUTPUT)util/pager.o
-LIB_OBJS += $(OUTPUT)util/header.o
-LIB_OBJS += $(OUTPUT)util/callchain.o
-LIB_OBJS += $(OUTPUT)util/values.o
-LIB_OBJS += $(OUTPUT)util/debug.o
-LIB_OBJS += $(OUTPUT)util/machine.o
-LIB_OBJS += $(OUTPUT)util/map.o
-LIB_OBJS += $(OUTPUT)util/pstack.o
-LIB_OBJS += $(OUTPUT)util/session.o
-LIB_OBJS += $(OUTPUT)util/ordered-events.o
-LIB_OBJS += $(OUTPUT)util/comm.o
-LIB_OBJS += $(OUTPUT)util/thread.o
-LIB_OBJS += $(OUTPUT)util/thread_map.o
-LIB_OBJS += $(OUTPUT)util/trace-event-parse.o
-LIB_OBJS += $(OUTPUT)util/parse-events-flex.o
-LIB_OBJS += $(OUTPUT)util/parse-events-bison.o
-LIB_OBJS += $(OUTPUT)util/pmu-flex.o
-LIB_OBJS += $(OUTPUT)util/pmu-bison.o
-LIB_OBJS += $(OUTPUT)util/trace-event-read.o
-LIB_OBJS += $(OUTPUT)util/trace-event-info.o
-LIB_OBJS += $(OUTPUT)util/trace-event-scripting.o
-LIB_OBJS += $(OUTPUT)util/trace-event.o
-LIB_OBJS += $(OUTPUT)util/svghelper.o
-LIB_OBJS += $(OUTPUT)util/sort.o
-LIB_OBJS += $(OUTPUT)util/hist.o
-LIB_OBJS += $(OUTPUT)util/probe-event.o
-LIB_OBJS += $(OUTPUT)util/util.o
-LIB_OBJS += $(OUTPUT)util/xyarray.o
-LIB_OBJS += $(OUTPUT)util/cpumap.o
-LIB_OBJS += $(OUTPUT)util/cgroup.o
-LIB_OBJS += $(OUTPUT)util/target.o
-LIB_OBJS += $(OUTPUT)util/rblist.o
-LIB_OBJS += $(OUTPUT)util/intlist.o
-LIB_OBJS += $(OUTPUT)util/vdso.o
-LIB_OBJS += $(OUTPUT)util/stat.o
-LIB_OBJS += $(OUTPUT)util/record.o
-LIB_OBJS += $(OUTPUT)util/srcline.o
-LIB_OBJS += $(OUTPUT)util/data.o
-LIB_OBJS += $(OUTPUT)util/tsc.o
-LIB_OBJS += $(OUTPUT)util/cloexec.o
-LIB_OBJS += $(OUTPUT)util/thread-stack.o
-
-LIB_OBJS += $(OUTPUT)ui/setup.o
-LIB_OBJS += $(OUTPUT)ui/helpline.o
-LIB_OBJS += $(OUTPUT)ui/progress.o
-LIB_OBJS += $(OUTPUT)ui/util.o
-LIB_OBJS += $(OUTPUT)ui/hist.o
-LIB_OBJS += $(OUTPUT)ui/stdio/hist.o
-
-LIB_OBJS += $(OUTPUT)arch/common.o
-
-LIB_OBJS += $(OUTPUT)tests/parse-events.o
-LIB_OBJS += $(OUTPUT)tests/dso-data.o
-LIB_OBJS += $(OUTPUT)tests/attr.o
-LIB_OBJS += $(OUTPUT)tests/vmlinux-kallsyms.o
-LIB_OBJS += $(OUTPUT)tests/open-syscall.o
-LIB_OBJS += $(OUTPUT)tests/open-syscall-all-cpus.o
-LIB_OBJS += $(OUTPUT)tests/open-syscall-tp-fields.o
-LIB_OBJS += $(OUTPUT)tests/mmap-basic.o
-LIB_OBJS += $(OUTPUT)tests/perf-record.o
-LIB_OBJS += $(OUTPUT)tests/rdpmc.o
-LIB_OBJS += $(OUTPUT)tests/evsel-roundtrip-name.o
-LIB_OBJS += $(OUTPUT)tests/evsel-tp-sched.o
-LIB_OBJS += $(OUTPUT)tests/fdarray.o
-LIB_OBJS += $(OUTPUT)tests/pmu.o
-LIB_OBJS += $(OUTPUT)tests/hists_common.o
-LIB_OBJS += $(OUTPUT)tests/hists_link.o
-LIB_OBJS += $(OUTPUT)tests/hists_filter.o
-LIB_OBJS += $(OUTPUT)tests/hists_output.o
-LIB_OBJS += $(OUTPUT)tests/hists_cumulate.o
-LIB_OBJS += $(OUTPUT)tests/python-use.o
-LIB_OBJS += $(OUTPUT)tests/bp_signal.o
-LIB_OBJS += $(OUTPUT)tests/bp_signal_overflow.o
-LIB_OBJS += $(OUTPUT)tests/task-exit.o
-LIB_OBJS += $(OUTPUT)tests/sw-clock.o
-ifeq ($(ARCH),x86)
-LIB_OBJS += $(OUTPUT)tests/perf-time-to-tsc.o
-endif
-LIB_OBJS += $(OUTPUT)tests/code-reading.o
-LIB_OBJS += $(OUTPUT)tests/sample-parsing.o
-LIB_OBJS += $(OUTPUT)tests/parse-no-sample-id-all.o
-ifndef NO_DWARF_UNWIND
-ifeq ($(ARCH),$(filter $(ARCH),x86 arm))
-LIB_OBJS += $(OUTPUT)tests/dwarf-unwind.o
-endif
-endif
-LIB_OBJS += $(OUTPUT)tests/mmap-thread-lookup.o
-LIB_OBJS += $(OUTPUT)tests/thread-mg-share.o
-LIB_OBJS += $(OUTPUT)tests/switch-tracking.o
-
-BUILTIN_OBJS += $(OUTPUT)builtin-annotate.o
-BUILTIN_OBJS += $(OUTPUT)builtin-bench.o
-# Benchmark modules
-BUILTIN_OBJS += $(OUTPUT)bench/sched-messaging.o
-BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o
-ifeq ($(ARCH), x86)
-ifeq ($(IS_64_BIT), 1)
-BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy-x86-64-asm.o
-BUILTIN_OBJS += $(OUTPUT)bench/mem-memset-x86-64-asm.o
-endif
-endif
-BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy.o
-BUILTIN_OBJS += $(OUTPUT)bench/futex-hash.o
-BUILTIN_OBJS += $(OUTPUT)bench/futex-wake.o
-BUILTIN_OBJS += $(OUTPUT)bench/futex-requeue.o
-
-BUILTIN_OBJS += $(OUTPUT)builtin-diff.o
-BUILTIN_OBJS += $(OUTPUT)builtin-evlist.o
-BUILTIN_OBJS += $(OUTPUT)builtin-help.o
-BUILTIN_OBJS += $(OUTPUT)builtin-sched.o
-BUILTIN_OBJS += $(OUTPUT)builtin-buildid-list.o
-BUILTIN_OBJS += $(OUTPUT)builtin-buildid-cache.o
-BUILTIN_OBJS += $(OUTPUT)builtin-list.o
-BUILTIN_OBJS += $(OUTPUT)builtin-record.o
-BUILTIN_OBJS += $(OUTPUT)builtin-report.o
-BUILTIN_OBJS += $(OUTPUT)builtin-stat.o
-BUILTIN_OBJS += $(OUTPUT)builtin-timechart.o
-BUILTIN_OBJS += $(OUTPUT)builtin-top.o
-BUILTIN_OBJS += $(OUTPUT)builtin-script.o
-BUILTIN_OBJS += $(OUTPUT)builtin-probe.o
-BUILTIN_OBJS += $(OUTPUT)builtin-kmem.o
-BUILTIN_OBJS += $(OUTPUT)builtin-lock.o
-BUILTIN_OBJS += $(OUTPUT)builtin-kvm.o
-BUILTIN_OBJS += $(OUTPUT)builtin-inject.o
-BUILTIN_OBJS += $(OUTPUT)tests/builtin-test.o
-BUILTIN_OBJS += $(OUTPUT)builtin-mem.o
-
-PERFLIBS = $(LIB_FILE) $(LIBAPIKFS) $(LIBTRACEEVENT)
+PERFLIBS = $(LIB_FILE) $(LIBAPI) $(LIBTRACEEVENT)
 
 # We choose to avoid "if .. else if .. else .. endif endif"
 # because maintaining the nesting to match is a pain.  If
@@ -508,67 +236,9 @@ ifneq ($(OUTPUT),)
   CFLAGS += -I$(OUTPUT)
 endif
 
-ifdef NO_LIBELF
-# Remove ELF/DWARF dependent codes
-LIB_OBJS := $(filter-out $(OUTPUT)util/symbol-elf.o,$(LIB_OBJS))
-LIB_OBJS := $(filter-out $(OUTPUT)util/dwarf-aux.o,$(LIB_OBJS))
-LIB_OBJS := $(filter-out $(OUTPUT)util/probe-event.o,$(LIB_OBJS))
-LIB_OBJS := $(filter-out $(OUTPUT)util/probe-finder.o,$(LIB_OBJS))
-
-BUILTIN_OBJS := $(filter-out $(OUTPUT)builtin-probe.o,$(BUILTIN_OBJS))
-
-# Use minimal symbol handling
-LIB_OBJS += $(OUTPUT)util/symbol-minimal.o
-
-else # NO_LIBELF
-ifndef NO_DWARF
-  LIB_OBJS += $(OUTPUT)util/probe-finder.o
-  LIB_OBJS += $(OUTPUT)util/dwarf-aux.o
-endif # NO_DWARF
-endif # NO_LIBELF
-
-ifndef NO_LIBDW_DWARF_UNWIND
-  LIB_OBJS += $(OUTPUT)util/unwind-libdw.o
-  LIB_H += util/unwind-libdw.h
-endif
-
-ifndef NO_LIBUNWIND
-  LIB_OBJS += $(OUTPUT)util/unwind-libunwind.o
-endif
-LIB_OBJS += $(OUTPUT)tests/keep-tracking.o
-
-ifndef NO_LIBAUDIT
-  BUILTIN_OBJS += $(OUTPUT)builtin-trace.o
-endif
-
-ifndef NO_SLANG
-  LIB_OBJS += $(OUTPUT)ui/browser.o
-  LIB_OBJS += $(OUTPUT)ui/browsers/annotate.o
-  LIB_OBJS += $(OUTPUT)ui/browsers/hists.o
-  LIB_OBJS += $(OUTPUT)ui/browsers/map.o
-  LIB_OBJS += $(OUTPUT)ui/browsers/scripts.o
-  LIB_OBJS += $(OUTPUT)ui/browsers/header.o
-  LIB_OBJS += $(OUTPUT)ui/tui/setup.o
-  LIB_OBJS += $(OUTPUT)ui/tui/util.o
-  LIB_OBJS += $(OUTPUT)ui/tui/helpline.o
-  LIB_OBJS += $(OUTPUT)ui/tui/progress.o
-  LIB_H += ui/tui/tui.h
-  LIB_H += ui/browser.h
-  LIB_H += ui/browsers/map.h
-  LIB_H += ui/keysyms.h
-  LIB_H += ui/libslang.h
-endif
-
 ifndef NO_GTK2
   ALL_PROGRAMS += $(OUTPUT)libperf-gtk.so
-
-  GTK_OBJS += $(OUTPUT)ui/gtk/browser.o
-  GTK_OBJS += $(OUTPUT)ui/gtk/hists.o
-  GTK_OBJS += $(OUTPUT)ui/gtk/setup.o
-  GTK_OBJS += $(OUTPUT)ui/gtk/util.o
-  GTK_OBJS += $(OUTPUT)ui/gtk/helpline.o
-  GTK_OBJS += $(OUTPUT)ui/gtk/progress.o
-  GTK_OBJS += $(OUTPUT)ui/gtk/annotate.o
+  GTK_IN := $(OUTPUT)gtk-in.o
 
 install-gtk: $(OUTPUT)libperf-gtk.so
 	$(call QUIET_INSTALL, 'GTK UI') \
@@ -576,31 +246,6 @@ install-gtk: $(OUTPUT)libperf-gtk.so
 		$(INSTALL) $(OUTPUT)libperf-gtk.so '$(DESTDIR_SQ)$(libdir_SQ)'
 endif
 
-ifndef NO_LIBPERL
-  LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-perl.o
-  LIB_OBJS += $(OUTPUT)scripts/perl/Perf-Trace-Util/Context.o
-endif
-
-ifndef NO_LIBPYTHON
-  LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-python.o
-  LIB_OBJS += $(OUTPUT)scripts/python/Perf-Trace-Util/Context.o
-endif
-
-ifeq ($(NO_PERF_REGS),0)
-  ifeq ($(ARCH),x86)
-    LIB_H += arch/x86/include/perf_regs.h
-  endif
-  LIB_OBJS += $(OUTPUT)util/perf_regs.o
-endif
-
-ifndef NO_LIBNUMA
-  BUILTIN_OBJS += $(OUTPUT)bench/numa.o
-endif
-
-ifndef NO_ZLIB
-  LIB_OBJS += $(OUTPUT)util/zlib.o
-endif
-
 ifdef ASCIIDOC8
   export ASCIIDOC8
 endif
@@ -616,40 +261,30 @@ SHELL = $(SHELL_PATH)
 all: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(OTHER_PROGRAMS)
 
 please_set_SHELL_PATH_to_a_more_modern_shell:
-	@$$(:)
+	$(Q)$$(:)
 
 shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell
 
 strip: $(PROGRAMS) $(OUTPUT)perf
 	$(STRIP) $(STRIP_OPTS) $(PROGRAMS) $(OUTPUT)perf
 
-$(OUTPUT)perf.o: perf.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -include $(OUTPUT)PERF-VERSION-FILE \
-		'-DPERF_HTML_PATH="$(htmldir_SQ)"' \
-		$(CFLAGS) -c $(filter %.c,$^) -o $@
+PERF_IN := $(OUTPUT)perf-in.o
 
-$(OUTPUT)perf: $(OUTPUT)perf.o $(BUILTIN_OBJS) $(PERFLIBS)
-	$(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $(OUTPUT)perf.o \
-               $(BUILTIN_OBJS) $(LIBS) -o $@
+export srctree OUTPUT RM CC LD AR CFLAGS V BISON FLEX
+build := -f $(srctree)/tools/build/Makefile.build dir=. obj
 
-$(GTK_OBJS): $(OUTPUT)%.o: %.c $(LIB_H)
-	$(QUIET_CC)$(CC) -o $@ -c -fPIC $(CFLAGS) $(GTK_CFLAGS) $<
+$(PERF_IN): $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h FORCE
+	$(Q)$(MAKE) $(build)=perf
 
-$(OUTPUT)libperf-gtk.so: $(GTK_OBJS) $(PERFLIBS)
+$(OUTPUT)perf: $(PERFLIBS) $(PERF_IN)
+	$(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $(PERF_IN) $(LIBS) -o $@
+
+$(GTK_IN): FORCE
+	$(Q)$(MAKE) $(build)=gtk
+
+$(OUTPUT)libperf-gtk.so: $(GTK_IN) $(PERFLIBS)
 	$(QUIET_LINK)$(CC) -o $@ -shared $(LDFLAGS) $(filter %.o,$^) $(GTK_LIBS)
 
-$(OUTPUT)builtin-help.o: builtin-help.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \
-		'-DPERF_HTML_PATH="$(htmldir_SQ)"' \
-		'-DPERF_MAN_PATH="$(mandir_SQ)"' \
-		'-DPERF_INFO_PATH="$(infodir_SQ)"' $<
-
-$(OUTPUT)builtin-timechart.o: builtin-timechart.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \
-		'-DPERF_HTML_PATH="$(htmldir_SQ)"' \
-		'-DPERF_MAN_PATH="$(mandir_SQ)"' \
-		'-DPERF_INFO_PATH="$(infodir_SQ)"' $<
-
 $(OUTPUT)common-cmds.h: util/generate-cmdlist.sh command-list.txt
 
 $(OUTPUT)common-cmds.h: $(wildcard Documentation/perf-*.txt)
@@ -659,8 +294,7 @@ $(SCRIPTS) : % : %.sh
 	$(QUIET_GEN)$(INSTALL) '$@.sh' '$(OUTPUT)$@'
 
 # These can record PERF_VERSION
-$(OUTPUT)perf.o perf.spec \
-	$(SCRIPTS) \
+perf.spec $(SCRIPTS) \
 	: $(OUTPUT)PERF-VERSION-FILE
 
 .SUFFIXES:
@@ -683,90 +317,33 @@ endif
 # These two need to be here so that when O= is not used they take precedence
 # over the general rule for .o
 
-$(OUTPUT)util/%-flex.o: $(OUTPUT)util/%-flex.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c -Iutil/ $(CFLAGS) -w $<
+# get relative building directory (to $(OUTPUT))
+# and '.' if it's $(OUTPUT) itself
+__build-dir = $(subst $(OUTPUT),,$(dir $@))
+build-dir   = $(if $(__build-dir),$(__build-dir),.)
 
-$(OUTPUT)util/%-bison.o: $(OUTPUT)util/%-bison.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c -Iutil/ $(CFLAGS) -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w $<
+single_dep: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h
 
-$(OUTPUT)%.o: %.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $<
-$(OUTPUT)%.i: %.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -E $(CFLAGS) $<
-$(OUTPUT)%.s: %.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -S $(CFLAGS) $<
-$(OUTPUT)%.o: %.S
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $<
-$(OUTPUT)%.s: %.S
-	$(QUIET_CC)$(CC) -o $@ -E $(CFLAGS) $<
+$(OUTPUT)%.o: %.c single_dep FORCE
+	$(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=$(build-dir) $@
 
-$(OUTPUT)util/exec_cmd.o: util/exec_cmd.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \
-		'-DPERF_EXEC_PATH="$(perfexecdir_SQ)"' \
-		'-DPREFIX="$(prefix_SQ)"' \
-		$<
+$(OUTPUT)%.i: %.c single_dep FORCE
+	$(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=$(build-dir) $@
 
-$(OUTPUT)tests/attr.o: tests/attr.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \
-		'-DBINDIR="$(bindir_SQ)"' -DPYTHON='"$(PYTHON_WORD)"' \
-		$<
+$(OUTPUT)%.s: %.c single_dep FORCE
+	$(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=$(build-dir) $@
 
-$(OUTPUT)tests/python-use.o: tests/python-use.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \
-		-DPYTHONPATH='"$(OUTPUT)python"' \
-		-DPYTHON='"$(PYTHON_WORD)"' \
-		$<
+$(OUTPUT)%-bison.o: %.c single_dep FORCE
+	$(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=$(build-dir) $@
 
-$(OUTPUT)tests/dwarf-unwind.o: tests/dwarf-unwind.c
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -fno-optimize-sibling-calls $<
+$(OUTPUT)%-flex.o: %.c single_dep FORCE
+	$(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=$(build-dir) $@
 
-$(OUTPUT)util/config.o: util/config.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
+$(OUTPUT)%.o: %.S single_dep FORCE
+	$(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=$(build-dir) $@
 
-$(OUTPUT)ui/setup.o: ui/setup.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DLIBDIR='"$(libdir_SQ)"' $<
-
-$(OUTPUT)ui/browser.o: ui/browser.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $<
-
-$(OUTPUT)ui/browsers/annotate.o: ui/browsers/annotate.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $<
-
-$(OUTPUT)ui/browsers/hists.o: ui/browsers/hists.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $<
-
-$(OUTPUT)ui/browsers/map.o: ui/browsers/map.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $<
-
-$(OUTPUT)ui/browsers/scripts.o: ui/browsers/scripts.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $<
-
-$(OUTPUT)util/kallsyms.o: ../lib/symbol/kallsyms.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $<
-
-$(OUTPUT)util/rbtree.o: ../../lib/rbtree.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -Wno-unused-parameter -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
-
-$(OUTPUT)util/hweight.o: ../../lib/hweight.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -Wno-unused-parameter -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
-
-$(OUTPUT)util/find_next_bit.o: ../lib/util/find_next_bit.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -Wno-unused-parameter -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
-
-$(OUTPUT)util/parse-events.o: util/parse-events.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -Wno-redundant-decls $<
-
-$(OUTPUT)util/scripting-engines/trace-event-perl.o: util/scripting-engines/trace-event-perl.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow -Wno-undef -Wno-switch-default $<
-
-$(OUTPUT)scripts/perl/Perf-Trace-Util/Context.o: scripts/perl/Perf-Trace-Util/Context.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs -Wno-undef -Wno-switch-default $<
-
-$(OUTPUT)util/scripting-engines/trace-event-python.o: util/scripting-engines/trace-event-python.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow $<
-
-$(OUTPUT)scripts/python/Perf-Trace-Util/Context.o: scripts/python/Perf-Trace-Util/Context.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs $<
+$(OUTPUT)%.i: %.S single_dep FORCE
+	$(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=$(build-dir) $@
 
 $(OUTPUT)perf-%: %.o $(PERFLIBS)
 	$(QUIET_LINK)$(CC) $(CFLAGS) -o $@ $(LDFLAGS) $(filter %.o,$^) $(LIBS)
@@ -781,58 +358,34 @@ $(OUTPUT)perf-read-vdsox32: perf-read-vdso.c util/find-vdso-map.c
 	$(QUIET_CC)$(CC) -mx32 $(filter -static,$(LDFLAGS)) -Wall -Werror -o $@ perf-read-vdso.c
 endif
 
-$(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H)
-$(patsubst perf-%,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h)
+$(patsubst perf-%,%.o,$(PROGRAMS)): $(wildcard */*.h)
 
-# we compile into subdirectories. if the target directory is not the source directory, they might not exists. So
-# we depend the various files onto their directories.
-DIRECTORY_DEPS = $(LIB_OBJS) $(BUILTIN_OBJS) $(GTK_OBJS)
-DIRECTORY_DEPS += $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h
-# no need to add flex objects, because they depend on bison ones
-DIRECTORY_DEPS += $(OUTPUT)util/parse-events-bison.c
-DIRECTORY_DEPS += $(OUTPUT)util/pmu-bison.c
+LIBPERF_IN := $(OUTPUT)libperf-in.o
 
-OUTPUT_DIRECTORIES := $(sort $(dir $(DIRECTORY_DEPS)))
+$(LIBPERF_IN): FORCE
+	$(Q)$(MAKE) $(build)=libperf
 
-$(DIRECTORY_DEPS): | $(OUTPUT_DIRECTORIES)
-# In the second step, we make a rule to actually create these directories
-$(OUTPUT_DIRECTORIES):
-	$(QUIET_MKDIR)$(MKDIR) -p $@ 2>/dev/null
+$(LIB_FILE): $(LIBPERF_IN)
+	$(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIBPERF_IN) $(LIB_OBJS)
 
-$(LIB_FILE): $(LIB_OBJS)
-	$(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS)
-
-# libtraceevent.a
-TE_SOURCES = $(wildcard $(TRACE_EVENT_DIR)*.[ch])
-
-LIBTRACEEVENT_FLAGS  = $(QUIET_SUBDIR1) O=$(OUTPUT)
-LIBTRACEEVENT_FLAGS += CFLAGS="-g -Wall $(EXTRA_CFLAGS)"
 LIBTRACEEVENT_FLAGS += plugin_dir=$(plugindir_SQ)
 
-$(LIBTRACEEVENT): $(TE_SOURCES) $(OUTPUT)PERF-CFLAGS
-	$(QUIET_SUBDIR0)$(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) libtraceevent.a plugins
+$(LIBTRACEEVENT): FORCE
+	$(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) $(OUTPUT)libtraceevent.a plugins
 
 $(LIBTRACEEVENT)-clean:
 	$(call QUIET_CLEAN, libtraceevent)
-	@$(MAKE) -C $(TRACE_EVENT_DIR) O=$(OUTPUT) clean >/dev/null
+	$(Q)$(MAKE) -C $(TRACE_EVENT_DIR) O=$(OUTPUT) clean >/dev/null
 
 install-traceevent-plugins: $(LIBTRACEEVENT)
-	$(QUIET_SUBDIR0)$(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) install_plugins
+	$(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) install_plugins
 
-LIBAPIKFS_SOURCES = $(wildcard $(LIB_PATH)fs/*.[ch] $(LIB_PATH)fd/*.[ch])
+$(LIBAPI): FORCE
+	$(Q)$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) $(OUTPUT)libapi.a
 
-# if subdir is set, we've been called from above so target has been built
-# already
-$(LIBAPIKFS): $(LIBAPIKFS_SOURCES)
-ifeq ($(subdir),)
-	$(QUIET_SUBDIR0)$(LIB_DIR) $(QUIET_SUBDIR1) O=$(OUTPUT) libapikfs.a
-endif
-
-$(LIBAPIKFS)-clean:
-ifeq ($(subdir),)
-	$(call QUIET_CLEAN, libapikfs)
-	@$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) clean >/dev/null
-endif
+$(LIBAPI)-clean:
+	$(call QUIET_CLEAN, libapi)
+	$(Q)$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) clean >/dev/null
 
 help:
 	@echo 'Perf make targets:'
@@ -888,17 +441,6 @@ cscope:
 	$(QUIET_GEN)$(RM) cscope*; \
 	$(FIND) $(TAG_FOLDERS) -name '*.[hcS]' -print | xargs cscope -b $(TAG_FILES)
 
-### Detect prefix changes
-TRACK_CFLAGS = $(subst ','\'',$(CFLAGS)):\
-             $(bindir_SQ):$(perfexecdir_SQ):$(template_dir_SQ):$(prefix_SQ):$(plugindir_SQ)
-
-$(OUTPUT)PERF-CFLAGS: .FORCE-PERF-CFLAGS
-	@FLAGS='$(TRACK_CFLAGS)'; \
-	    if test x"$$FLAGS" != x"`cat $(OUTPUT)PERF-CFLAGS 2>/dev/null`" ; then \
-		echo 1>&2 "  FLAGS:   * new build flags or prefix"; \
-		echo "$$FLAGS" >$(OUTPUT)PERF-CFLAGS; \
-            fi
-
 ### Testing rules
 
 # GNU make supports exporting all variables by "export" without parameters.
@@ -981,12 +523,14 @@ $(INSTALL_DOC_TARGETS):
 #
 config-clean:
 	$(call QUIET_CLEAN, config)
-	@$(MAKE) -C config/feature-checks clean >/dev/null
+	$(Q)$(MAKE) -C $(srctree)/tools/build/feature/ clean >/dev/null
 
-clean: $(LIBTRACEEVENT)-clean $(LIBAPIKFS)-clean config-clean
-	$(call QUIET_CLEAN, core-objs)  $(RM) $(LIB_OBJS) $(BUILTIN_OBJS) $(LIB_FILE) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(OUTPUT)perf.o $(LANG_BINDINGS) $(GTK_OBJS)
+clean: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean config-clean
+	$(call QUIET_CLEAN, core-objs)  $(RM) $(LIB_FILE) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(LANG_BINDINGS)
+	$(Q)find . -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
+	$(Q)$(RM) .config-detected
 	$(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 perf-read-vdsox32
-	$(call QUIET_CLEAN, core-gen)   $(RM)  *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)PERF-CFLAGS $(OUTPUT)PERF-FEATURES $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex*
+	$(call QUIET_CLEAN, core-gen)   $(RM)  *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)FEATURE-DUMP $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex*
 	$(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) clean
 	$(python-clean)
 
@@ -1000,7 +544,9 @@ else
     GIT-HEAD-PHONY =
 endif
 
+FORCE:
+
 .PHONY: all install clean config-clean strip install-gtk
 .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
-.PHONY: $(GIT-HEAD-PHONY) TAGS tags cscope .FORCE-PERF-CFLAGS
+.PHONY: $(GIT-HEAD-PHONY) TAGS tags cscope FORCE single_dep
 
diff --git a/tools/perf/arch/Build b/tools/perf/arch/Build
new file mode 100644
index 000000000000..109eb75cf7de
--- /dev/null
+++ b/tools/perf/arch/Build
@@ -0,0 +1,2 @@
+libperf-y += common.o
+libperf-y += $(ARCH)/
diff --git a/tools/perf/arch/arm/Build b/tools/perf/arch/arm/Build
new file mode 100644
index 000000000000..41bf61da476a
--- /dev/null
+++ b/tools/perf/arch/arm/Build
@@ -0,0 +1,2 @@
+libperf-y += util/
+libperf-$(CONFIG_DWARF_UNWIND) += tests/
diff --git a/tools/perf/arch/arm/Makefile b/tools/perf/arch/arm/Makefile
index 09d62153d384..7fbca175099e 100644
--- a/tools/perf/arch/arm/Makefile
+++ b/tools/perf/arch/arm/Makefile
@@ -1,14 +1,3 @@
 ifndef NO_DWARF
 PERF_HAVE_DWARF_REGS := 1
-LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
-endif
-ifndef NO_LIBUNWIND
-LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind-libunwind.o
-endif
-ifndef NO_LIBDW_DWARF_UNWIND
-LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind-libdw.o
-endif
-ifndef NO_DWARF_UNWIND
-LIB_OBJS += $(OUTPUT)arch/$(ARCH)/tests/regs_load.o
-LIB_OBJS += $(OUTPUT)arch/$(ARCH)/tests/dwarf-unwind.o
 endif
diff --git a/tools/perf/arch/arm/tests/Build b/tools/perf/arch/arm/tests/Build
new file mode 100644
index 000000000000..b30eff9bcc83
--- /dev/null
+++ b/tools/perf/arch/arm/tests/Build
@@ -0,0 +1,2 @@
+libperf-y += regs_load.o
+libperf-y += dwarf-unwind.o
diff --git a/tools/perf/arch/arm/util/Build b/tools/perf/arch/arm/util/Build
new file mode 100644
index 000000000000..d22e3d07de3d
--- /dev/null
+++ b/tools/perf/arch/arm/util/Build
@@ -0,0 +1,4 @@
+libperf-$(CONFIG_DWARF) += dwarf-regs.o
+
+libperf-$(CONFIG_LIBUNWIND)          += unwind-libunwind.o
+libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
diff --git a/tools/perf/arch/arm64/Build b/tools/perf/arch/arm64/Build
new file mode 100644
index 000000000000..54afe4a467e7
--- /dev/null
+++ b/tools/perf/arch/arm64/Build
@@ -0,0 +1 @@
+libperf-y += util/
diff --git a/tools/perf/arch/arm64/Makefile b/tools/perf/arch/arm64/Makefile
index 67e9b3d38e89..7fbca175099e 100644
--- a/tools/perf/arch/arm64/Makefile
+++ b/tools/perf/arch/arm64/Makefile
@@ -1,7 +1,3 @@
 ifndef NO_DWARF
 PERF_HAVE_DWARF_REGS := 1
-LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
-endif
-ifndef NO_LIBUNWIND
-LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind-libunwind.o
 endif
diff --git a/tools/perf/arch/arm64/util/Build b/tools/perf/arch/arm64/util/Build
new file mode 100644
index 000000000000..e58123a8912b
--- /dev/null
+++ b/tools/perf/arch/arm64/util/Build
@@ -0,0 +1,2 @@
+libperf-$(CONFIG_DWARF)     += dwarf-regs.o
+libperf-$(CONFIG_LIBUNWIND) += unwind-libunwind.o
diff --git a/tools/perf/arch/powerpc/Build b/tools/perf/arch/powerpc/Build
new file mode 100644
index 000000000000..54afe4a467e7
--- /dev/null
+++ b/tools/perf/arch/powerpc/Build
@@ -0,0 +1 @@
+libperf-y += util/
diff --git a/tools/perf/arch/powerpc/Makefile b/tools/perf/arch/powerpc/Makefile
index 6f7782bea5dd..7fbca175099e 100644
--- a/tools/perf/arch/powerpc/Makefile
+++ b/tools/perf/arch/powerpc/Makefile
@@ -1,6 +1,3 @@
 ifndef NO_DWARF
 PERF_HAVE_DWARF_REGS := 1
-LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
-LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/skip-callchain-idx.o
 endif
-LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/header.o
diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build
new file mode 100644
index 000000000000..0af6e9b3f728
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/Build
@@ -0,0 +1,4 @@
+libperf-y += header.o
+
+libperf-$(CONFIG_DWARF) += dwarf-regs.o
+libperf-$(CONFIG_DWARF) += skip-callchain-idx.o
diff --git a/tools/perf/arch/s390/Build b/tools/perf/arch/s390/Build
new file mode 100644
index 000000000000..54afe4a467e7
--- /dev/null
+++ b/tools/perf/arch/s390/Build
@@ -0,0 +1 @@
+libperf-y += util/
diff --git a/tools/perf/arch/s390/Makefile b/tools/perf/arch/s390/Makefile
index 798ac7379c5f..21322e0385b8 100644
--- a/tools/perf/arch/s390/Makefile
+++ b/tools/perf/arch/s390/Makefile
@@ -1,7 +1,4 @@
 ifndef NO_DWARF
 PERF_HAVE_DWARF_REGS := 1
-LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
 endif
-LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/header.o
 HAVE_KVM_STAT_SUPPORT := 1
-LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/kvm-stat.o
diff --git a/tools/perf/arch/s390/util/Build b/tools/perf/arch/s390/util/Build
new file mode 100644
index 000000000000..8a61372bb47a
--- /dev/null
+++ b/tools/perf/arch/s390/util/Build
@@ -0,0 +1,4 @@
+libperf-y += header.o
+libperf-y += kvm-stat.o
+
+libperf-$(CONFIG_DWARF) += dwarf-regs.o
diff --git a/tools/perf/arch/sh/Build b/tools/perf/arch/sh/Build
new file mode 100644
index 000000000000..54afe4a467e7
--- /dev/null
+++ b/tools/perf/arch/sh/Build
@@ -0,0 +1 @@
+libperf-y += util/
diff --git a/tools/perf/arch/sh/Makefile b/tools/perf/arch/sh/Makefile
index 15130b50dfe3..7fbca175099e 100644
--- a/tools/perf/arch/sh/Makefile
+++ b/tools/perf/arch/sh/Makefile
@@ -1,4 +1,3 @@
 ifndef NO_DWARF
 PERF_HAVE_DWARF_REGS := 1
-LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
 endif
diff --git a/tools/perf/arch/sh/util/Build b/tools/perf/arch/sh/util/Build
new file mode 100644
index 000000000000..954e287bbb89
--- /dev/null
+++ b/tools/perf/arch/sh/util/Build
@@ -0,0 +1 @@
+libperf-$(CONFIG_DWARF) += dwarf-regs.o
diff --git a/tools/perf/arch/sparc/Build b/tools/perf/arch/sparc/Build
new file mode 100644
index 000000000000..54afe4a467e7
--- /dev/null
+++ b/tools/perf/arch/sparc/Build
@@ -0,0 +1 @@
+libperf-y += util/
diff --git a/tools/perf/arch/sparc/Makefile b/tools/perf/arch/sparc/Makefile
index 15130b50dfe3..7fbca175099e 100644
--- a/tools/perf/arch/sparc/Makefile
+++ b/tools/perf/arch/sparc/Makefile
@@ -1,4 +1,3 @@
 ifndef NO_DWARF
 PERF_HAVE_DWARF_REGS := 1
-LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
 endif
diff --git a/tools/perf/arch/sparc/util/Build b/tools/perf/arch/sparc/util/Build
new file mode 100644
index 000000000000..954e287bbb89
--- /dev/null
+++ b/tools/perf/arch/sparc/util/Build
@@ -0,0 +1 @@
+libperf-$(CONFIG_DWARF) += dwarf-regs.o
diff --git a/tools/perf/arch/x86/Build b/tools/perf/arch/x86/Build
new file mode 100644
index 000000000000..41bf61da476a
--- /dev/null
+++ b/tools/perf/arch/x86/Build
@@ -0,0 +1,2 @@
+libperf-y += util/
+libperf-$(CONFIG_DWARF_UNWIND) += tests/
diff --git a/tools/perf/arch/x86/Makefile b/tools/perf/arch/x86/Makefile
index 9b21881db52f..21322e0385b8 100644
--- a/tools/perf/arch/x86/Makefile
+++ b/tools/perf/arch/x86/Makefile
@@ -1,19 +1,4 @@
 ifndef NO_DWARF
 PERF_HAVE_DWARF_REGS := 1
-LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
 endif
-ifndef NO_LIBUNWIND
-LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind-libunwind.o
-endif
-ifndef NO_LIBDW_DWARF_UNWIND
-LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind-libdw.o
-endif
-ifndef NO_DWARF_UNWIND
-LIB_OBJS += $(OUTPUT)arch/$(ARCH)/tests/regs_load.o
-LIB_OBJS += $(OUTPUT)arch/$(ARCH)/tests/dwarf-unwind.o
-endif
-LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/header.o
-LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/tsc.o
-LIB_H += arch/$(ARCH)/util/tsc.h
 HAVE_KVM_STAT_SUPPORT := 1
-LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/kvm-stat.o
diff --git a/tools/perf/arch/x86/tests/Build b/tools/perf/arch/x86/tests/Build
new file mode 100644
index 000000000000..b30eff9bcc83
--- /dev/null
+++ b/tools/perf/arch/x86/tests/Build
@@ -0,0 +1,2 @@
+libperf-y += regs_load.o
+libperf-y += dwarf-unwind.o
diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build
new file mode 100644
index 000000000000..cfbccc4e3187
--- /dev/null
+++ b/tools/perf/arch/x86/util/Build
@@ -0,0 +1,8 @@
+libperf-y += header.o
+libperf-y += tsc.o
+libperf-y += kvm-stat.o
+
+libperf-$(CONFIG_DWARF) += dwarf-regs.o
+
+libperf-$(CONFIG_LIBUNWIND)          += unwind-libunwind.o
+libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
new file mode 100644
index 000000000000..5ce98023d518
--- /dev/null
+++ b/tools/perf/bench/Build
@@ -0,0 +1,11 @@
+perf-y += sched-messaging.o
+perf-y += sched-pipe.o
+perf-y += mem-memcpy.o
+perf-y += futex-hash.o
+perf-y += futex-wake.o
+perf-y += futex-requeue.o
+
+perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o
+perf-$(CONFIG_X86_64) += mem-memset-x86-64-asm.o
+
+perf-$(CONFIG_NUMA) += numa.o
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 747f86103599..71bf7451c0ca 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -208,7 +208,7 @@ static int __cmd_annotate(struct perf_annotate *ann)
 			goto out;
 	}
 
-	ret = perf_session__process_events(session, &ann->tool);
+	ret = perf_session__process_events(session);
 	if (ret)
 		goto out;
 
diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c
index 50e6b66aea1f..d47a0cdc71c9 100644
--- a/tools/perf/builtin-buildid-cache.c
+++ b/tools/perf/builtin-buildid-cache.c
@@ -125,8 +125,7 @@ static int build_id_cache__kcore_existing(const char *from_dir, char *to_dir,
 	return ret;
 }
 
-static int build_id_cache__add_kcore(const char *filename, const char *debugdir,
-				     bool force)
+static int build_id_cache__add_kcore(const char *filename, bool force)
 {
 	char dir[32], sbuildid[BUILD_ID_SIZE * 2 + 1];
 	char from_dir[PATH_MAX], to_dir[PATH_MAX];
@@ -143,7 +142,7 @@ static int build_id_cache__add_kcore(const char *filename, const char *debugdir,
 		return -1;
 
 	scnprintf(to_dir, sizeof(to_dir), "%s/[kernel.kcore]/%s",
-		  debugdir, sbuildid);
+		  buildid_dir, sbuildid);
 
 	if (!force &&
 	    !build_id_cache__kcore_existing(from_dir, to_dir, sizeof(to_dir))) {
@@ -155,7 +154,7 @@ static int build_id_cache__add_kcore(const char *filename, const char *debugdir,
 		return -1;
 
 	scnprintf(to_dir, sizeof(to_dir), "%s/[kernel.kcore]/%s/%s",
-		  debugdir, sbuildid, dir);
+		  buildid_dir, sbuildid, dir);
 
 	if (mkdir_p(to_dir, 0755))
 		return -1;
@@ -183,7 +182,7 @@ static int build_id_cache__add_kcore(const char *filename, const char *debugdir,
 	return 0;
 }
 
-static int build_id_cache__add_file(const char *filename, const char *debugdir)
+static int build_id_cache__add_file(const char *filename)
 {
 	char sbuild_id[BUILD_ID_SIZE * 2 + 1];
 	u8 build_id[BUILD_ID_SIZE];
@@ -195,16 +194,14 @@ static int build_id_cache__add_file(const char *filename, const char *debugdir)
 	}
 
 	build_id__sprintf(build_id, sizeof(build_id), sbuild_id);
-	err = build_id_cache__add_s(sbuild_id, debugdir, filename,
+	err = build_id_cache__add_s(sbuild_id, filename,
 				    false, false);
-	if (verbose)
-		pr_info("Adding %s %s: %s\n", sbuild_id, filename,
-			err ? "FAIL" : "Ok");
+	pr_debug("Adding %s %s: %s\n", sbuild_id, filename,
+		 err ? "FAIL" : "Ok");
 	return err;
 }
 
-static int build_id_cache__remove_file(const char *filename,
-				       const char *debugdir)
+static int build_id_cache__remove_file(const char *filename)
 {
 	u8 build_id[BUILD_ID_SIZE];
 	char sbuild_id[BUILD_ID_SIZE * 2 + 1];
@@ -217,10 +214,34 @@ static int build_id_cache__remove_file(const char *filename,
 	}
 
 	build_id__sprintf(build_id, sizeof(build_id), sbuild_id);
-	err = build_id_cache__remove_s(sbuild_id, debugdir);
-	if (verbose)
-		pr_info("Removing %s %s: %s\n", sbuild_id, filename,
-			err ? "FAIL" : "Ok");
+	err = build_id_cache__remove_s(sbuild_id);
+	pr_debug("Removing %s %s: %s\n", sbuild_id, filename,
+		 err ? "FAIL" : "Ok");
+
+	return err;
+}
+
+static int build_id_cache__purge_path(const char *pathname)
+{
+	struct strlist *list;
+	struct str_node *pos;
+	int err;
+
+	err = build_id_cache__list_build_ids(pathname, &list);
+	if (err)
+		goto out;
+
+	strlist__for_each(pos, list) {
+		err = build_id_cache__remove_s(pos->s);
+		pr_debug("Removing %s %s: %s\n", pos->s, pathname,
+			 err ? "FAIL" : "Ok");
+		if (err)
+			break;
+	}
+	strlist__delete(list);
+
+out:
+	pr_debug("Purging %s: %s\n", pathname, err ? "FAIL" : "Ok");
 
 	return err;
 }
@@ -252,13 +273,12 @@ static int build_id_cache__fprintf_missing(struct perf_session *session, FILE *f
 	return 0;
 }
 
-static int build_id_cache__update_file(const char *filename,
-				       const char *debugdir)
+static int build_id_cache__update_file(const char *filename)
 {
 	u8 build_id[BUILD_ID_SIZE];
 	char sbuild_id[BUILD_ID_SIZE * 2 + 1];
 
-	int err;
+	int err = 0;
 
 	if (filename__read_build_id(filename, &build_id, sizeof(build_id)) < 0) {
 		pr_debug("Couldn't read a build-id in %s\n", filename);
@@ -266,14 +286,14 @@ static int build_id_cache__update_file(const char *filename,
 	}
 
 	build_id__sprintf(build_id, sizeof(build_id), sbuild_id);
-	err = build_id_cache__remove_s(sbuild_id, debugdir);
-	if (!err) {
-		err = build_id_cache__add_s(sbuild_id, debugdir, filename,
-					    false, false);
-	}
-	if (verbose)
-		pr_info("Updating %s %s: %s\n", sbuild_id, filename,
-			err ? "FAIL" : "Ok");
+	if (build_id_cache__cached(sbuild_id))
+		err = build_id_cache__remove_s(sbuild_id);
+
+	if (!err)
+		err = build_id_cache__add_s(sbuild_id, filename, false, false);
+
+	pr_debug("Updating %s %s: %s\n", sbuild_id, filename,
+		 err ? "FAIL" : "Ok");
 
 	return err;
 }
@@ -287,6 +307,7 @@ int cmd_buildid_cache(int argc, const char **argv,
 	bool force = false;
 	char const *add_name_list_str = NULL,
 		   *remove_name_list_str = NULL,
+		   *purge_name_list_str = NULL,
 		   *missing_filename = NULL,
 		   *update_name_list_str = NULL,
 		   *kcore_filename = NULL;
@@ -304,6 +325,8 @@ int cmd_buildid_cache(int argc, const char **argv,
 		   "file", "kcore file to add"),
 	OPT_STRING('r', "remove", &remove_name_list_str, "file list",
 		    "file(s) to remove"),
+	OPT_STRING('p', "purge", &purge_name_list_str, "path list",
+		    "path(s) to remove (remove old caches too)"),
 	OPT_STRING('M', "missing", &missing_filename, "file",
 		   "to find missing build ids in the cache"),
 	OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
@@ -320,6 +343,11 @@ int cmd_buildid_cache(int argc, const char **argv,
 	argc = parse_options(argc, argv, buildid_cache_options,
 			     buildid_cache_usage, 0);
 
+	if (argc || (!add_name_list_str && !kcore_filename &&
+		     !remove_name_list_str && !purge_name_list_str &&
+		     !missing_filename && !update_name_list_str))
+		usage_with_options(buildid_cache_usage, buildid_cache_options);
+
 	if (missing_filename) {
 		file.path = missing_filename;
 		file.force = force;
@@ -338,7 +366,7 @@ int cmd_buildid_cache(int argc, const char **argv,
 		list = strlist__new(true, add_name_list_str);
 		if (list) {
 			strlist__for_each(pos, list)
-				if (build_id_cache__add_file(pos->s, buildid_dir)) {
+				if (build_id_cache__add_file(pos->s)) {
 					if (errno == EEXIST) {
 						pr_debug("%s already in the cache\n",
 							 pos->s);
@@ -356,7 +384,25 @@ int cmd_buildid_cache(int argc, const char **argv,
 		list = strlist__new(true, remove_name_list_str);
 		if (list) {
 			strlist__for_each(pos, list)
-				if (build_id_cache__remove_file(pos->s, buildid_dir)) {
+				if (build_id_cache__remove_file(pos->s)) {
+					if (errno == ENOENT) {
+						pr_debug("%s wasn't in the cache\n",
+							 pos->s);
+						continue;
+					}
+					pr_warning("Couldn't remove %s: %s\n",
+						   pos->s, strerror_r(errno, sbuf, sizeof(sbuf)));
+				}
+
+			strlist__delete(list);
+		}
+	}
+
+	if (purge_name_list_str) {
+		list = strlist__new(true, purge_name_list_str);
+		if (list) {
+			strlist__for_each(pos, list)
+				if (build_id_cache__purge_path(pos->s)) {
 					if (errno == ENOENT) {
 						pr_debug("%s wasn't in the cache\n",
 							 pos->s);
@@ -377,7 +423,7 @@ int cmd_buildid_cache(int argc, const char **argv,
 		list = strlist__new(true, update_name_list_str);
 		if (list) {
 			strlist__for_each(pos, list)
-				if (build_id_cache__update_file(pos->s, buildid_dir)) {
+				if (build_id_cache__update_file(pos->s)) {
 					if (errno == ENOENT) {
 						pr_debug("%s wasn't in the cache\n",
 							 pos->s);
@@ -391,8 +437,7 @@ int cmd_buildid_cache(int argc, const char **argv,
 		}
 	}
 
-	if (kcore_filename &&
-	    build_id_cache__add_kcore(kcore_filename, buildid_dir, force))
+	if (kcore_filename && build_id_cache__add_kcore(kcore_filename, force))
 		pr_warning("Couldn't add %s\n", kcore_filename);
 
 out:
diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c
index ed3873b3e238..feb420f74c2d 100644
--- a/tools/perf/builtin-buildid-list.c
+++ b/tools/perf/builtin-buildid-list.c
@@ -74,7 +74,7 @@ static int perf_session__list_build_ids(bool force, bool with_hits)
 	 * the record stream. Buildids are stored as RECORD_HEADER_BUILD_ID
 	 */
 	if (with_hits || perf_data_file__is_pipe(&file))
-		perf_session__process_events(session, &build_id__mark_dso_hit_ops);
+		perf_session__process_events(session);
 
 	perf_session__fprintf_dsos_buildid(session, stdout, dso__skip_buildid, with_hits);
 	perf_session__delete(session);
diff --git a/tools/perf/builtin-data.c b/tools/perf/builtin-data.c
new file mode 100644
index 000000000000..d6525bc54d13
--- /dev/null
+++ b/tools/perf/builtin-data.c
@@ -0,0 +1,123 @@
+#include <linux/compiler.h>
+#include "builtin.h"
+#include "perf.h"
+#include "debug.h"
+#include "parse-options.h"
+#include "data-convert-bt.h"
+
+typedef int (*data_cmd_fn_t)(int argc, const char **argv, const char *prefix);
+
+struct data_cmd {
+	const char	*name;
+	const char	*summary;
+	data_cmd_fn_t	fn;
+};
+
+static struct data_cmd data_cmds[];
+
+#define for_each_cmd(cmd) \
+	for (cmd = data_cmds; cmd && cmd->name; cmd++)
+
+static const struct option data_options[] = {
+	OPT_END()
+};
+
+static const char * const data_subcommands[] = { "convert", NULL };
+
+static const char *data_usage[] = {
+	"perf data [<common options>] <command> [<options>]",
+	NULL
+};
+
+static void print_usage(void)
+{
+	struct data_cmd *cmd;
+
+	printf("Usage:\n");
+	printf("\t%s\n\n", data_usage[0]);
+	printf("\tAvailable commands:\n");
+
+	for_each_cmd(cmd) {
+		printf("\t %s\t- %s\n", cmd->name, cmd->summary);
+	}
+
+	printf("\n");
+}
+
+static const char * const data_convert_usage[] = {
+	"perf data convert [<options>]",
+	NULL
+};
+
+static int cmd_data_convert(int argc, const char **argv,
+			    const char *prefix __maybe_unused)
+{
+	const char *to_ctf     = NULL;
+	bool force = false;
+	const struct option options[] = {
+		OPT_INCR('v', "verbose", &verbose, "be more verbose"),
+		OPT_STRING('i', "input", &input_name, "file", "input file name"),
+#ifdef HAVE_LIBBABELTRACE_SUPPORT
+		OPT_STRING(0, "to-ctf", &to_ctf, NULL, "Convert to CTF format"),
+#endif
+		OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
+		OPT_END()
+	};
+
+#ifndef HAVE_LIBBABELTRACE_SUPPORT
+	pr_err("No conversion support compiled in.\n");
+	return -1;
+#endif
+
+	argc = parse_options(argc, argv, options,
+			     data_convert_usage, 0);
+	if (argc) {
+		usage_with_options(data_convert_usage, options);
+		return -1;
+	}
+
+	if (to_ctf) {
+#ifdef HAVE_LIBBABELTRACE_SUPPORT
+		return bt_convert__perf2ctf(input_name, to_ctf, force);
+#else
+		pr_err("The libbabeltrace support is not compiled in.\n");
+		return -1;
+#endif
+	}
+
+	return 0;
+}
+
+static struct data_cmd data_cmds[] = {
+	{ "convert", "converts data file between formats", cmd_data_convert },
+	{ .name = NULL, },
+};
+
+int cmd_data(int argc, const char **argv, const char *prefix)
+{
+	struct data_cmd *cmd;
+	const char *cmdstr;
+
+	/* No command specified. */
+	if (argc < 2)
+		goto usage;
+
+	argc = parse_options_subcommand(argc, argv, data_options, data_subcommands, data_usage,
+			     PARSE_OPT_STOP_AT_NON_OPTION);
+	if (argc < 1)
+		goto usage;
+
+	cmdstr = argv[0];
+
+	for_each_cmd(cmd) {
+		if (strcmp(cmd->name, cmdstr))
+			continue;
+
+		return cmd->fn(argc, argv, prefix);
+	}
+
+	pr_err("Unknown command: %s\n", cmdstr);
+usage:
+	print_usage();
+	return -1;
+}
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 74aada554b12..df6307b4050a 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -747,7 +747,7 @@ static int __cmd_diff(void)
 			goto out_delete;
 		}
 
-		ret = perf_session__process_events(d->session, &tool);
+		ret = perf_session__process_events(d->session);
 		if (ret) {
 			pr_err("Failed to process %s\n", d->file.path);
 			goto out_delete;
@@ -791,6 +791,8 @@ static const struct option options[] = {
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
 	OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
+	OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name,
+		   "file", "kallsyms pathname"),
 	OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules,
 		    "load module symbols - WARNING: use only with -k and LIVE kernel"),
 	OPT_STRING('d', "dsos", &symbol_conf.dso_list_str, "dso[,dso...]",
@@ -802,7 +804,7 @@ static const struct option options[] = {
 	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
 		   "sort by key(s): pid, comm, dso, symbol, parent, cpu, srcline, ..."
 		   " Please refer the man page for the complete list."),
-	OPT_STRING('t', "field-separator", &symbol_conf.field_sep, "separator",
+	OPT_STRING_NOEMPTY('t', "field-separator", &symbol_conf.field_sep, "separator",
 		   "separator for columns, no spaces will be added between "
 		   "columns '.' is reserved."),
 	OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
diff --git a/tools/perf/builtin-evlist.c b/tools/perf/builtin-evlist.c
index 0f93f859b782..695ec5a50cf2 100644
--- a/tools/perf/builtin-evlist.c
+++ b/tools/perf/builtin-evlist.c
@@ -24,6 +24,7 @@ static int __cmd_evlist(const char *file_name, struct perf_attr_details *details
 	struct perf_data_file file = {
 		.path = file_name,
 		.mode = PERF_DATA_MODE_READ,
+		.force = details->force,
 	};
 
 	session = perf_session__new(&file, 0, NULL);
@@ -47,6 +48,7 @@ int cmd_evlist(int argc, const char **argv, const char *prefix __maybe_unused)
 		    "Show all event attr details"),
 	OPT_BOOLEAN('g', "group", &details.event_group,
 		    "Show event group information"),
+	OPT_BOOLEAN('f', "force", &details.force, "don't complain, do it"),
 	OPT_END()
 	};
 	const char * const evlist_usage[] = {
diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c
index 25d20628212e..36486eade1ef 100644
--- a/tools/perf/builtin-help.c
+++ b/tools/perf/builtin-help.c
@@ -437,7 +437,18 @@ int cmd_help(int argc, const char **argv, const char *prefix __maybe_unused)
 			HELP_FORMAT_INFO),
 	OPT_END(),
 	};
-	const char * const builtin_help_usage[] = {
+	const char * const builtin_help_subcommands[] = {
+		"buildid-cache", "buildid-list", "diff", "evlist", "help", "list",
+		"record", "report", "bench", "stat", "timechart", "top", "annotate",
+		"script", "sched", "kmem", "lock", "kvm", "test", "inject", "mem", "data",
+#ifdef HAVE_LIBELF_SUPPORT
+		"probe",
+#endif
+#ifdef HAVE_LIBAUDIT_SUPPORT
+		"trace",
+#endif
+	NULL };
+	const char *builtin_help_usage[] = {
 		"perf help [--all] [--man|--web|--info] [command]",
 		NULL
 	};
@@ -448,8 +459,8 @@ int cmd_help(int argc, const char **argv, const char *prefix __maybe_unused)
 
 	perf_config(perf_help_config, &help_format);
 
-	argc = parse_options(argc, argv, builtin_help_options,
-			builtin_help_usage, 0);
+	argc = parse_options_subcommand(argc, argv, builtin_help_options,
+			builtin_help_subcommands, builtin_help_usage, 0);
 
 	if (show_all) {
 		printf("\n usage: %s\n\n", perf_usage_string);
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index a13641e066f5..40a33d7334cc 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -53,6 +53,13 @@ static int perf_event__repipe_synth(struct perf_tool *tool,
 	return 0;
 }
 
+static int perf_event__repipe_oe_synth(struct perf_tool *tool,
+				       union perf_event *event,
+				       struct ordered_events *oe __maybe_unused)
+{
+	return perf_event__repipe_synth(tool, event);
+}
+
 static int perf_event__repipe_op2_synth(struct perf_tool *tool,
 					union perf_event *event,
 					struct perf_session *session
@@ -359,8 +366,6 @@ static int __cmd_inject(struct perf_inject *inject)
 	} else if (inject->sched_stat) {
 		struct perf_evsel *evsel;
 
-		inject->tool.ordered_events = true;
-
 		evlist__for_each(session->evlist, evsel) {
 			const char *name = perf_evsel__name(evsel);
 
@@ -379,7 +384,7 @@ static int __cmd_inject(struct perf_inject *inject)
 	if (!file_out->is_pipe)
 		lseek(fd, session->header.data_offset, SEEK_SET);
 
-	ret = perf_session__process_events(session, &inject->tool);
+	ret = perf_session__process_events(session);
 
 	if (!file_out->is_pipe) {
 		if (inject->build_ids)
@@ -408,7 +413,7 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
 			.unthrottle	= perf_event__repipe,
 			.attr		= perf_event__repipe_attr,
 			.tracing_data	= perf_event__repipe_op2_synth,
-			.finished_round	= perf_event__repipe_op2_synth,
+			.finished_round	= perf_event__repipe_oe_synth,
 			.build_id	= perf_event__repipe_op2_synth,
 			.id_index	= perf_event__repipe_op2_synth,
 		},
@@ -438,6 +443,7 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
 			 "be more verbose (show build ids, etc)"),
 		OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name, "file",
 			   "kallsyms pathname"),
+		OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
 		OPT_END()
 	};
 	const char * const inject_usage[] = {
@@ -458,6 +464,8 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
 		return -1;
 	}
 
+	inject.tool.ordered_events = inject.sched_stat;
+
 	file.path = inject.input_name;
 	inject.session = perf_session__new(&file, true, &inject.tool);
 	if (inject.session == NULL)
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index f295141025bc..4ebf65c79434 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -20,6 +20,7 @@
 
 #include <linux/rbtree.h>
 #include <linux/string.h>
+#include <locale.h>
 
 struct alloc_stat;
 typedef int (*sort_fn_t)(struct alloc_stat *, struct alloc_stat *);
@@ -275,10 +276,10 @@ static void __print_result(struct rb_root *root, struct perf_session *session,
 	struct rb_node *next;
 	struct machine *machine = &session->machines.host;
 
-	printf("%.102s\n", graph_dotted_line);
+	printf("%.105s\n", graph_dotted_line);
 	printf(" %-34s |",  is_caller ? "Callsite": "Alloc Ptr");
 	printf(" Total_alloc/Per | Total_req/Per   | Hit      | Ping-pong | Frag\n");
-	printf("%.102s\n", graph_dotted_line);
+	printf("%.105s\n", graph_dotted_line);
 
 	next = rb_first(root);
 
@@ -304,7 +305,7 @@ static void __print_result(struct rb_root *root, struct perf_session *session,
 			snprintf(buf, sizeof(buf), "%#" PRIx64 "", addr);
 		printf(" %-34s |", buf);
 
-		printf(" %9llu/%-5lu | %9llu/%-5lu | %8lu | %8lu | %6.3f%%\n",
+		printf(" %9llu/%-5lu | %9llu/%-5lu | %8lu | %9lu | %6.3f%%\n",
 		       (unsigned long long)data->bytes_alloc,
 		       (unsigned long)data->bytes_alloc / data->hit,
 		       (unsigned long long)data->bytes_req,
@@ -317,21 +318,21 @@ static void __print_result(struct rb_root *root, struct perf_session *session,
 	}
 
 	if (n_lines == -1)
-		printf(" ...                                | ...             | ...             | ...    | ...      | ...   \n");
+		printf(" ...                                | ...             | ...             | ...      | ...       | ...   \n");
 
-	printf("%.102s\n", graph_dotted_line);
+	printf("%.105s\n", graph_dotted_line);
 }
 
 static void print_summary(void)
 {
 	printf("\nSUMMARY\n=======\n");
-	printf("Total bytes requested: %lu\n", total_requested);
-	printf("Total bytes allocated: %lu\n", total_allocated);
-	printf("Total bytes wasted on internal fragmentation: %lu\n",
+	printf("Total bytes requested: %'lu\n", total_requested);
+	printf("Total bytes allocated: %'lu\n", total_allocated);
+	printf("Total bytes wasted on internal fragmentation: %'lu\n",
 	       total_allocated - total_requested);
 	printf("Internal fragmentation: %f%%\n",
 	       fragmentation(total_requested, total_allocated));
-	printf("Cross CPU allocations: %lu/%lu\n", nr_cross_allocs, nr_allocs);
+	printf("Cross CPU allocations: %'lu/%'lu\n", nr_cross_allocs, nr_allocs);
 }
 
 static void print_result(struct perf_session *session)
@@ -426,7 +427,7 @@ static int __cmd_kmem(struct perf_session *session)
 	}
 
 	setup_pager();
-	err = perf_session__process_events(session, &perf_kmem);
+	err = perf_session__process_events(session);
 	if (err != 0)
 		goto out;
 	sort_result();
@@ -559,6 +560,7 @@ static int setup_sorting(struct list_head *sort_list, const char *arg)
 {
 	char *tok;
 	char *str = strdup(arg);
+	char *pos = str;
 
 	if (!str) {
 		pr_err("%s: strdup failed\n", __func__);
@@ -566,7 +568,7 @@ static int setup_sorting(struct list_head *sort_list, const char *arg)
 	}
 
 	while (true) {
-		tok = strsep(&str, ",");
+		tok = strsep(&pos, ",");
 		if (!tok)
 			break;
 		if (sort_dimension__add(tok, sort_list) < 0) {
@@ -660,8 +662,13 @@ static int __cmd_record(int argc, const char **argv)
 int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
 {
 	const char * const default_sort_order = "frag,hit,bytes";
+	struct perf_data_file file = {
+		.mode = PERF_DATA_MODE_READ,
+	};
 	const struct option kmem_options[] = {
 	OPT_STRING('i', "input", &input_name, "file", "input file name"),
+	OPT_INCR('v', "verbose", &verbose,
+		    "be more verbose (show symbol address, etc)"),
 	OPT_CALLBACK_NOOPT(0, "caller", NULL, NULL,
 			   "show per-callsite statistics", parse_caller_opt),
 	OPT_CALLBACK_NOOPT(0, "alloc", NULL, NULL,
@@ -671,6 +678,7 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
 		     parse_sort_opt),
 	OPT_CALLBACK('l', "line", NULL, "num", "show n lines", parse_line_opt),
 	OPT_BOOLEAN(0, "raw-ip", &raw_ip, "show raw ip instead of symbol"),
+	OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
 	OPT_END()
 	};
 	const char *const kmem_subcommands[] = { "record", "stat", NULL };
@@ -679,10 +687,6 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
 		NULL
 	};
 	struct perf_session *session;
-	struct perf_data_file file = {
-		.path = input_name,
-		.mode = PERF_DATA_MODE_READ,
-	};
 	int ret = -1;
 
 	argc = parse_options_subcommand(argc, argv, kmem_options,
@@ -696,6 +700,8 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
 		return __cmd_record(argc, argv);
 	}
 
+	file.path = input_name;
+
 	session = perf_session__new(&file, false, &perf_kmem);
 	if (session == NULL)
 		return -1;
@@ -703,6 +709,8 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
 	symbol__init(&session->header.env);
 
 	if (!strcmp(argv[0], "stat")) {
+		setlocale(LC_ALL, "");
+
 		if (cpu__setup_cpunode_map())
 			goto out_delete;
 
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index 0894a817f67e..1f9338f6109c 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -18,6 +18,7 @@
 #include "util/stat.h"
 #include "util/top.h"
 #include "util/data.h"
+#include "util/ordered-events.h"
 
 #include <sys/prctl.h>
 #ifdef HAVE_TIMERFD_SUPPORT
@@ -730,9 +731,9 @@ static s64 perf_kvm__mmap_read_idx(struct perf_kvm_stat *kvm, int idx,
 			return -1;
 		}
 
-		err = perf_session_queue_event(kvm->session, event, &kvm->tool, &sample, 0);
+		err = perf_session__queue_event(kvm->session, event, &sample, 0);
 		/*
-		 * FIXME: Here we can't consume the event, as perf_session_queue_event will
+		 * FIXME: Here we can't consume the event, as perf_session__queue_event will
 		 *        point to it, and it'll get possibly overwritten by the kernel.
 		 */
 		perf_evlist__mmap_consume(kvm->evlist, idx);
@@ -783,8 +784,10 @@ static int perf_kvm__mmap_read(struct perf_kvm_stat *kvm)
 
 	/* flush queue after each round in which we processed events */
 	if (ntotal) {
-		kvm->session->ordered_events.next_flush = flush_time;
-		err = kvm->tool.finished_round(&kvm->tool, NULL, kvm->session);
+		struct ordered_events *oe = &kvm->session->ordered_events;
+
+		oe->next_flush = flush_time;
+		err = ordered_events__flush(oe, OE_FLUSH__ROUND);
 		if (err) {
 			if (kvm->lost_events)
 				pr_info("\nLost events: %" PRIu64 "\n\n",
@@ -1044,6 +1047,7 @@ static int read_events(struct perf_kvm_stat *kvm)
 	struct perf_data_file file = {
 		.path = kvm->file_name,
 		.mode = PERF_DATA_MODE_READ,
+		.force = kvm->force,
 	};
 
 	kvm->tool = eops;
@@ -1066,7 +1070,7 @@ static int read_events(struct perf_kvm_stat *kvm)
 	if (ret < 0)
 		return ret;
 
-	return perf_session__process_events(kvm->session, &kvm->tool);
+	return perf_session__process_events(kvm->session);
 }
 
 static int parse_target_str(struct perf_kvm_stat *kvm)
@@ -1201,6 +1205,7 @@ kvm_events_report(struct perf_kvm_stat *kvm, int argc, const char **argv)
 			    " time (sort by avg time)"),
 		OPT_STRING('p', "pid", &kvm->opts.target.pid, "pid",
 			   "analyze events only for given process id(s)"),
+		OPT_BOOLEAN('f', "force", &kvm->force, "don't complain, do it"),
 		OPT_END()
 	};
 
diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c
index 198f3c3aff95..af5bd0514108 100644
--- a/tools/perf/builtin-list.c
+++ b/tools/perf/builtin-list.c
@@ -36,38 +36,36 @@ int cmd_list(int argc, const char **argv, const char *prefix __maybe_unused)
 
 	setup_pager();
 
-	if (raw_dump) {
-		print_events(NULL, true);
-		return 0;
-	}
+	if (!raw_dump)
+		printf("\nList of pre-defined events (to be used in -e):\n\n");
 
 	if (argc == 0) {
-		print_events(NULL, false);
+		print_events(NULL, raw_dump);
 		return 0;
 	}
 
 	for (i = 0; i < argc; ++i) {
-		if (i)
-			putchar('\n');
-		if (strncmp(argv[i], "tracepoint", 10) == 0)
-			print_tracepoint_events(NULL, NULL, false);
+		if (strcmp(argv[i], "tracepoint") == 0)
+			print_tracepoint_events(NULL, NULL, raw_dump);
 		else if (strcmp(argv[i], "hw") == 0 ||
 			 strcmp(argv[i], "hardware") == 0)
-			print_events_type(PERF_TYPE_HARDWARE);
+			print_symbol_events(NULL, PERF_TYPE_HARDWARE,
+					event_symbols_hw, PERF_COUNT_HW_MAX, raw_dump);
 		else if (strcmp(argv[i], "sw") == 0 ||
 			 strcmp(argv[i], "software") == 0)
-			print_events_type(PERF_TYPE_SOFTWARE);
+			print_symbol_events(NULL, PERF_TYPE_SOFTWARE,
+					event_symbols_sw, PERF_COUNT_SW_MAX, raw_dump);
 		else if (strcmp(argv[i], "cache") == 0 ||
 			 strcmp(argv[i], "hwcache") == 0)
-			print_hwcache_events(NULL, false);
+			print_hwcache_events(NULL, raw_dump);
 		else if (strcmp(argv[i], "pmu") == 0)
-			print_pmu_events(NULL, false);
+			print_pmu_events(NULL, raw_dump);
 		else {
 			char *sep = strchr(argv[i], ':'), *s;
 			int sep_idx;
 
 			if (sep == NULL) {
-				print_events(argv[i], false);
+				print_events(argv[i], raw_dump);
 				continue;
 			}
 			sep_idx = sep - argv[i];
@@ -76,7 +74,7 @@ int cmd_list(int argc, const char **argv, const char *prefix __maybe_unused)
 				return -1;
 
 			s[sep_idx] = '\0';
-			print_tracepoint_events(s, s + sep_idx + 1, false);
+			print_tracepoint_events(s, s + sep_idx + 1, raw_dump);
 			free(s);
 		}
 	}
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index e7ec71589da6..d49c2ab85fc2 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -846,6 +846,8 @@ static const struct perf_evsel_str_handler lock_tracepoints[] = {
 	{ "lock:lock_release",	 perf_evsel__process_lock_release,   }, /* CONFIG_LOCKDEP */
 };
 
+static bool force;
+
 static int __cmd_report(bool display_info)
 {
 	int err = -EINVAL;
@@ -857,6 +859,7 @@ static int __cmd_report(bool display_info)
 	struct perf_data_file file = {
 		.path = input_name,
 		.mode = PERF_DATA_MODE_READ,
+		.force = force,
 	};
 
 	session = perf_session__new(&file, false, &eops);
@@ -878,7 +881,7 @@ static int __cmd_report(bool display_info)
 	if (select_key())
 		goto out_delete;
 
-	err = perf_session__process_events(session, &eops);
+	err = perf_session__process_events(session);
 	if (err)
 		goto out_delete;
 
@@ -945,6 +948,7 @@ int cmd_lock(int argc, const char **argv, const char *prefix __maybe_unused)
 		    "dump thread list in perf.data"),
 	OPT_BOOLEAN('m', "map", &info_map,
 		    "map of lock instances (address:name table)"),
+	OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
 	OPT_END()
 	};
 	const struct option lock_options[] = {
@@ -956,6 +960,7 @@ int cmd_lock(int argc, const char **argv, const char *prefix __maybe_unused)
 	const struct option report_options[] = {
 	OPT_STRING('k', "key", &sort_key, "acquired",
 		    "key for sorting (acquired / contended / avg_wait / wait_total / wait_max / wait_min)"),
+	OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
 	/* TODO: type */
 	OPT_END()
 	};
diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index 9b5663950a4d..675216e08bfc 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -15,6 +15,7 @@ struct perf_mem {
 	char const		*input_name;
 	bool			hide_unresolved;
 	bool			dump_raw;
+	bool			force;
 	int			operation;
 	const char		*cpu_list;
 	DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
@@ -120,6 +121,7 @@ static int report_raw_events(struct perf_mem *mem)
 	struct perf_data_file file = {
 		.path = input_name,
 		.mode = PERF_DATA_MODE_READ,
+		.force = mem->force,
 	};
 	int err = -EINVAL;
 	int ret;
@@ -141,7 +143,7 @@ static int report_raw_events(struct perf_mem *mem)
 
 	printf("# PID, TID, IP, ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n");
 
-	err = perf_session__process_events(session, &mem->tool);
+	err = perf_session__process_events(session);
 	if (err)
 		return err;
 
@@ -286,10 +288,11 @@ int cmd_mem(int argc, const char **argv, const char *prefix __maybe_unused)
 		   "input file name"),
 	OPT_STRING('C', "cpu", &mem.cpu_list, "cpu",
 		   "list of cpus to profile"),
-	OPT_STRING('x', "field-separator", &symbol_conf.field_sep,
+	OPT_STRING_NOEMPTY('x', "field-separator", &symbol_conf.field_sep,
 		   "separator",
 		   "separator for columns, no spaces will be added"
 		   " between columns '.' is reserved."),
+	OPT_BOOLEAN('f', "force", &mem.force, "don't complain, do it"),
 	OPT_END()
 	};
 	const char *const mem_subcommands[] = { "record", "report", NULL };
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index 921bb6942503..f7b1af67e9f6 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -56,6 +56,7 @@ static struct {
 	bool mod_events;
 	bool uprobes;
 	bool quiet;
+	bool target_used;
 	int nevents;
 	struct perf_probe_event events[MAX_PROBES];
 	struct strlist *dellist;
@@ -78,6 +79,12 @@ static int parse_probe_event(const char *str)
 	}
 
 	pev->uprobes = params.uprobes;
+	if (params.target) {
+		pev->target = strdup(params.target);
+		if (!pev->target)
+			return -ENOMEM;
+		params.target_used = true;
+	}
 
 	/* Parse a perf-probe command into event */
 	ret = parse_perf_probe_command(str, pev);
@@ -102,6 +109,7 @@ static int set_target(const char *ptr)
 		params.target = strdup(ptr);
 		if (!params.target)
 			return -ENOMEM;
+		params.target_used = false;
 
 		found = 1;
 		buf = ptr + (strlen(ptr) - 3);
@@ -178,7 +186,7 @@ static int opt_set_target(const struct option *opt, const char *str,
 	int ret = -ENOENT;
 	char *tmp;
 
-	if  (str && !params.target) {
+	if  (str) {
 		if (!strcmp(opt->long_name, "exec"))
 			params.uprobes = true;
 #ifdef HAVE_DWARF_SUPPORT
@@ -200,7 +208,9 @@ static int opt_set_target(const struct option *opt, const char *str,
 			if (!tmp)
 				return -ENOMEM;
 		}
+		free(params.target);
 		params.target = tmp;
+		params.target_used = false;
 		ret = 0;
 	}
 
@@ -485,9 +495,14 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
 	}
 
 	if (params.nevents) {
+		/* Ensure the last given target is used */
+		if (params.target && !params.target_used) {
+			pr_warning("  Error: -x/-m must follow the probe definitions.\n");
+			usage_with_options(probe_usage, options);
+		}
+
 		ret = add_perf_probe_events(params.events, params.nevents,
 					    params.max_probe_points,
-					    params.target,
 					    params.force_add);
 		if (ret < 0) {
 			pr_err_with_code("  Error: Failed to add events.", ret);
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 404ab3434052..c3efdfb630b5 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -70,8 +70,8 @@ static int process_synthesized_event(struct perf_tool *tool,
 static int record__mmap_read(struct record *rec, int idx)
 {
 	struct perf_mmap *md = &rec->evlist->mmap[idx];
-	unsigned int head = perf_mmap__read_head(md);
-	unsigned int old = md->prev;
+	u64 head = perf_mmap__read_head(md);
+	u64 old = md->prev;
 	unsigned char *data = md->base + page_size;
 	unsigned long size;
 	void *buf;
@@ -161,8 +161,9 @@ try_again:
 		}
 	}
 
-	if (perf_evlist__apply_filters(evlist)) {
-		error("failed to set filter with %d (%s)\n", errno,
+	if (perf_evlist__apply_filters(evlist, &pos)) {
+		error("failed to set filter \"%s\" on event %s with %d (%s)\n",
+			pos->filter, perf_evsel__name(pos), errno,
 			strerror_r(errno, msg, sizeof(msg)));
 		rc = -1;
 		goto out;
@@ -225,7 +226,7 @@ static int process_buildids(struct record *rec)
 	 */
 	symbol_conf.ignore_vmlinux_buildid = true;
 
-	return perf_session__process_events(session, &rec->tool);
+	return perf_session__process_events(session);
 }
 
 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
@@ -343,7 +344,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 	signal(SIGINT, sig_handler);
 	signal(SIGTERM, sig_handler);
 
-	session = perf_session__new(file, false, NULL);
+	session = perf_session__new(file, false, tool);
 	if (session == NULL) {
 		pr_err("Perf session creation failed.\n");
 		return -1;
@@ -658,7 +659,7 @@ error:
 
 static void callchain_debug(void)
 {
-	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF" };
+	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
 
 	pr_debug("callchain: type %s\n", str[callchain_param.record_mode]);
 
@@ -710,6 +711,90 @@ static int perf_record_config(const char *var, const char *value, void *cb)
 	return perf_default_config(var, value, cb);
 }
 
+struct clockid_map {
+	const char *name;
+	int clockid;
+};
+
+#define CLOCKID_MAP(n, c)	\
+	{ .name = n, .clockid = (c), }
+
+#define CLOCKID_END	{ .name = NULL, }
+
+
+/*
+ * Add the missing ones, we need to build on many distros...
+ */
+#ifndef CLOCK_MONOTONIC_RAW
+#define CLOCK_MONOTONIC_RAW 4
+#endif
+#ifndef CLOCK_BOOTTIME
+#define CLOCK_BOOTTIME 7
+#endif
+#ifndef CLOCK_TAI
+#define CLOCK_TAI 11
+#endif
+
+static const struct clockid_map clockids[] = {
+	/* available for all events, NMI safe */
+	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
+	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
+
+	/* available for some events */
+	CLOCKID_MAP("realtime", CLOCK_REALTIME),
+	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
+	CLOCKID_MAP("tai", CLOCK_TAI),
+
+	/* available for the lazy */
+	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
+	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
+	CLOCKID_MAP("real", CLOCK_REALTIME),
+	CLOCKID_MAP("boot", CLOCK_BOOTTIME),
+
+	CLOCKID_END,
+};
+
+static int parse_clockid(const struct option *opt, const char *str, int unset)
+{
+	struct record_opts *opts = (struct record_opts *)opt->value;
+	const struct clockid_map *cm;
+	const char *ostr = str;
+
+	if (unset) {
+		opts->use_clockid = 0;
+		return 0;
+	}
+
+	/* no arg passed */
+	if (!str)
+		return 0;
+
+	/* no setting it twice */
+	if (opts->use_clockid)
+		return -1;
+
+	opts->use_clockid = true;
+
+	/* if its a number, we're done */
+	if (sscanf(str, "%d", &opts->clockid) == 1)
+		return 0;
+
+	/* allow a "CLOCK_" prefix to the name */
+	if (!strncasecmp(str, "CLOCK_", 6))
+		str += 6;
+
+	for (cm = clockids; cm->name; cm++) {
+		if (!strcasecmp(str, cm->name)) {
+			opts->clockid = cm->clockid;
+			return 0;
+		}
+	}
+
+	opts->use_clockid = false;
+	ui__warning("unknown clockid %s, check man page\n", ostr);
+	return -1;
+}
+
 static const char * const __record_usage[] = {
 	"perf record [<options>] [<command>]",
 	"perf record [<options>] -- <command> [<options>]",
@@ -751,9 +836,9 @@ static struct record record = {
 #define CALLCHAIN_HELP "setup and enables call-graph (stack chain/backtrace) recording: "
 
 #ifdef HAVE_DWARF_UNWIND_SUPPORT
-const char record_callchain_help[] = CALLCHAIN_HELP "fp dwarf";
+const char record_callchain_help[] = CALLCHAIN_HELP "fp dwarf lbr";
 #else
-const char record_callchain_help[] = CALLCHAIN_HELP "fp";
+const char record_callchain_help[] = CALLCHAIN_HELP "fp lbr";
 #endif
 
 /*
@@ -839,6 +924,11 @@ struct option __record_options[] = {
 		    "use per-thread mmaps"),
 	OPT_BOOLEAN('I', "intr-regs", &record.opts.sample_intr_regs,
 		    "Sample machine registers on interrupt"),
+	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
+		    "Record running/enabled time of read (:S) events"),
+	OPT_CALLBACK('k', "clockid", &record.opts,
+	"clockid", "clockid to use for events, see clock_gettime()",
+	parse_clockid),
 	OPT_END()
 };
 
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 2f91094e228b..476cdf7afcca 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -249,6 +249,8 @@ static int report__setup_sample_type(struct report *rep)
 		if ((sample_type & PERF_SAMPLE_REGS_USER) &&
 		    (sample_type & PERF_SAMPLE_STACK_USER))
 			callchain_param.record_mode = CALLCHAIN_DWARF;
+		else if (sample_type & PERF_SAMPLE_BRANCH_STACK)
+			callchain_param.record_mode = CALLCHAIN_LBR;
 		else
 			callchain_param.record_mode = CALLCHAIN_FP;
 	}
@@ -302,7 +304,7 @@ static size_t hists__fprintf_nr_sample_events(struct hists *hists, struct report
 
 	if (rep->mem_mode) {
 		ret += fprintf(fp, "\n# Total weight : %" PRIu64, nr_events);
-		ret += fprintf(fp, "\n# Sort order   : %s", sort_order);
+		ret += fprintf(fp, "\n# Sort order   : %s", sort_order ? : default_mem_sort_order);
 	} else
 		ret += fprintf(fp, "\n# Event count (approx.): %" PRIu64, nr_events);
 	return ret + fprintf(fp, "\n#\n");
@@ -345,7 +347,7 @@ static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist,
 static void report__warn_kptr_restrict(const struct report *rep)
 {
 	struct map *kernel_map = rep->session->machines.host.vmlinux_maps[MAP__FUNCTION];
-	struct kmap *kernel_kmap = map__kmap(kernel_map);
+	struct kmap *kernel_kmap = kernel_map ? map__kmap(kernel_map) : NULL;
 
 	if (kernel_map == NULL ||
 	    (kernel_map->dso->hit &&
@@ -480,7 +482,7 @@ static int __cmd_report(struct report *rep)
 	if (ret)
 		return ret;
 
-	ret = perf_session__process_events(session, &rep->tool);
+	ret = perf_session__process_events(session);
 	if (ret)
 		return ret;
 
@@ -667,6 +669,10 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
 		   "only consider symbols in these dsos"),
 	OPT_STRING('c', "comms", &symbol_conf.comm_list_str, "comm[,comm...]",
 		   "only consider symbols in these comms"),
+	OPT_STRING(0, "pid", &symbol_conf.pid_list_str, "pid[,pid...]",
+		   "only consider symbols in these pids"),
+	OPT_STRING(0, "tid", &symbol_conf.tid_list_str, "tid[,tid...]",
+		   "only consider symbols in these tids"),
 	OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",
 		   "only consider these symbols"),
 	OPT_STRING(0, "symbol-filter", &report.symbol_filter_str, "filter",
@@ -674,7 +680,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
 	OPT_STRING('w', "column-widths", &symbol_conf.col_width_list_str,
 		   "width[,width...]",
 		   "don't try to adjust column width, use these fixed values"),
-	OPT_STRING('t', "field-separator", &symbol_conf.field_sep, "separator",
+	OPT_STRING_NOEMPTY('t', "field-separator", &symbol_conf.field_sep, "separator",
 		   "separator for columns, no spaces will be added between "
 		   "columns '.' is reserved."),
 	OPT_BOOLEAN('U', "hide-unresolved", &report.hide_unresolved,
@@ -766,7 +772,7 @@ repeat:
 	 * 0/1 means the user chose a mode.
 	 */
 	if (((branch_mode == -1 && has_br_stack) || branch_mode == 1) &&
-	    branch_call_mode == -1) {
+	    !branch_call_mode) {
 		sort__mode = SORT_MODE__BRANCH;
 		symbol_conf.cumulate_callchain = false;
 	}
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 891c3930080e..5275bab70313 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -23,12 +23,13 @@
 #include <semaphore.h>
 #include <pthread.h>
 #include <math.h>
+#include <api/fs/fs.h>
 
 #define PR_SET_NAME		15               /* Set process name */
 #define MAX_CPUS		4096
 #define COMM_LEN		20
 #define SYM_LEN			129
-#define MAX_PID			65536
+#define MAX_PID			1024000
 
 struct sched_atom;
 
@@ -124,7 +125,7 @@ struct perf_sched {
 	struct perf_tool tool;
 	const char	 *sort_order;
 	unsigned long	 nr_tasks;
-	struct task_desc *pid_to_task[MAX_PID];
+	struct task_desc **pid_to_task;
 	struct task_desc **tasks;
 	const struct trace_sched_handler *tp_handler;
 	pthread_mutex_t	 start_work_mutex;
@@ -169,6 +170,7 @@ struct perf_sched {
 	u64		 cpu_last_switched[MAX_CPUS];
 	struct rb_root	 atom_root, sorted_atom_root;
 	struct list_head sort_list, cmp_pid;
+	bool force;
 };
 
 static u64 get_nsecs(void)
@@ -326,8 +328,19 @@ static struct task_desc *register_pid(struct perf_sched *sched,
 				      unsigned long pid, const char *comm)
 {
 	struct task_desc *task;
+	static int pid_max;
 
-	BUG_ON(pid >= MAX_PID);
+	if (sched->pid_to_task == NULL) {
+		if (sysctl__read_int("kernel/pid_max", &pid_max) < 0)
+			pid_max = MAX_PID;
+		BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL);
+	}
+	if (pid >= (unsigned long)pid_max) {
+		BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + 1) *
+			sizeof(struct task_desc *))) == NULL);
+		while (pid >= (unsigned long)pid_max)
+			sched->pid_to_task[pid_max++] = NULL;
+	}
 
 	task = sched->pid_to_task[pid];
 
@@ -346,7 +359,7 @@ static struct task_desc *register_pid(struct perf_sched *sched,
 
 	sched->pid_to_task[pid] = task;
 	sched->nr_tasks++;
-	sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_task *));
+	sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
 	BUG_ON(!sched->tasks);
 	sched->tasks[task->nr] = task;
 
@@ -425,24 +438,45 @@ static u64 get_cpu_usage_nsec_parent(void)
 	return sum;
 }
 
-static int self_open_counters(void)
+static int self_open_counters(struct perf_sched *sched, unsigned long cur_task)
 {
 	struct perf_event_attr attr;
-	char sbuf[STRERR_BUFSIZE];
+	char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE];
 	int fd;
+	struct rlimit limit;
+	bool need_privilege = false;
 
 	memset(&attr, 0, sizeof(attr));
 
 	attr.type = PERF_TYPE_SOFTWARE;
 	attr.config = PERF_COUNT_SW_TASK_CLOCK;
 
+force_again:
 	fd = sys_perf_event_open(&attr, 0, -1, -1,
 				 perf_event_open_cloexec_flag());
 
-	if (fd < 0)
+	if (fd < 0) {
+		if (errno == EMFILE) {
+			if (sched->force) {
+				BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1);
+				limit.rlim_cur += sched->nr_tasks - cur_task;
+				if (limit.rlim_cur > limit.rlim_max) {
+					limit.rlim_max = limit.rlim_cur;
+					need_privilege = true;
+				}
+				if (setrlimit(RLIMIT_NOFILE, &limit) == -1) {
+					if (need_privilege && errno == EPERM)
+						strcpy(info, "Need privilege\n");
+				} else
+					goto force_again;
+			} else
+				strcpy(info, "Have a try with -f option\n");
+		}
 		pr_err("Error: sys_perf_event_open() syscall returned "
-		       "with %d (%s)\n", fd,
-		       strerror_r(errno, sbuf, sizeof(sbuf)));
+		       "with %d (%s)\n%s", fd,
+		       strerror_r(errno, sbuf, sizeof(sbuf)), info);
+		exit(EXIT_FAILURE);
+	}
 	return fd;
 }
 
@@ -460,6 +494,7 @@ static u64 get_cpu_usage_nsec_self(int fd)
 struct sched_thread_parms {
 	struct task_desc  *task;
 	struct perf_sched *sched;
+	int fd;
 };
 
 static void *thread_func(void *ctx)
@@ -470,13 +505,12 @@ static void *thread_func(void *ctx)
 	u64 cpu_usage_0, cpu_usage_1;
 	unsigned long i, ret;
 	char comm2[22];
-	int fd;
+	int fd = parms->fd;
 
 	zfree(&parms);
 
 	sprintf(comm2, ":%s", this_task->comm);
 	prctl(PR_SET_NAME, comm2);
-	fd = self_open_counters();
 	if (fd < 0)
 		return NULL;
 again:
@@ -528,6 +562,7 @@ static void create_tasks(struct perf_sched *sched)
 		BUG_ON(parms == NULL);
 		parms->task = task = sched->tasks[i];
 		parms->sched = sched;
+		parms->fd = self_open_counters(sched, i);
 		sem_init(&task->sleep_sem, 0, 0);
 		sem_init(&task->ready_for_work, 0, 0);
 		sem_init(&task->work_done_sem, 0, 0);
@@ -572,13 +607,13 @@ static void wait_for_tasks(struct perf_sched *sched)
 	cpu_usage_1 = get_cpu_usage_nsec_parent();
 	if (!sched->runavg_cpu_usage)
 		sched->runavg_cpu_usage = sched->cpu_usage;
-	sched->runavg_cpu_usage = (sched->runavg_cpu_usage * 9 + sched->cpu_usage) / 10;
+	sched->runavg_cpu_usage = (sched->runavg_cpu_usage * (sched->replay_repeat - 1) + sched->cpu_usage) / sched->replay_repeat;
 
 	sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
 	if (!sched->runavg_parent_cpu_usage)
 		sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
-	sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * 9 +
-					 sched->parent_cpu_usage)/10;
+	sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) +
+					 sched->parent_cpu_usage)/sched->replay_repeat;
 
 	ret = pthread_mutex_lock(&sched->start_work_mutex);
 	BUG_ON(ret);
@@ -610,7 +645,7 @@ static void run_one_test(struct perf_sched *sched)
 	sched->sum_fluct += fluct;
 	if (!sched->run_avg)
 		sched->run_avg = delta;
-	sched->run_avg = (sched->run_avg * 9 + delta) / 10;
+	sched->run_avg = (sched->run_avg * (sched->replay_repeat - 1) + delta) / sched->replay_repeat;
 
 	printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / 1000000.0);
 
@@ -831,7 +866,7 @@ static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
 		return -1;
 	}
 
-	atoms->thread = thread;
+	atoms->thread = thread__get(thread);
 	INIT_LIST_HEAD(&atoms->work_list);
 	__thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
 	return 0;
@@ -1439,8 +1474,7 @@ static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_
 	return err;
 }
 
-static int perf_sched__read_events(struct perf_sched *sched,
-				   struct perf_session **psession)
+static int perf_sched__read_events(struct perf_sched *sched)
 {
 	const struct perf_evsel_str_handler handlers[] = {
 		{ "sched:sched_switch",	      process_sched_switch_event, },
@@ -1453,7 +1487,9 @@ static int perf_sched__read_events(struct perf_sched *sched,
 	struct perf_data_file file = {
 		.path = input_name,
 		.mode = PERF_DATA_MODE_READ,
+		.force = sched->force,
 	};
+	int rc = -1;
 
 	session = perf_session__new(&file, false, &sched->tool);
 	if (session == NULL) {
@@ -1467,27 +1503,21 @@ static int perf_sched__read_events(struct perf_sched *sched,
 		goto out_delete;
 
 	if (perf_session__has_traces(session, "record -R")) {
-		int err = perf_session__process_events(session, &sched->tool);
+		int err = perf_session__process_events(session);
 		if (err) {
 			pr_err("Failed to process events, error %d", err);
 			goto out_delete;
 		}
 
-		sched->nr_events      = session->stats.nr_events[0];
-		sched->nr_lost_events = session->stats.total_lost;
-		sched->nr_lost_chunks = session->stats.nr_events[PERF_RECORD_LOST];
+		sched->nr_events      = session->evlist->stats.nr_events[0];
+		sched->nr_lost_events = session->evlist->stats.total_lost;
+		sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST];
 	}
 
-	if (psession)
-		*psession = session;
-	else
-		perf_session__delete(session);
-
-	return 0;
-
+	rc = 0;
 out_delete:
 	perf_session__delete(session);
-	return -1;
+	return rc;
 }
 
 static void print_bad_events(struct perf_sched *sched)
@@ -1515,12 +1545,10 @@ static void print_bad_events(struct perf_sched *sched)
 static int perf_sched__lat(struct perf_sched *sched)
 {
 	struct rb_node *next;
-	struct perf_session *session;
 
 	setup_pager();
 
-	/* save session -- references to threads are held in work_list */
-	if (perf_sched__read_events(sched, &session))
+	if (perf_sched__read_events(sched))
 		return -1;
 
 	perf_sched__sort_lat(sched);
@@ -1537,6 +1565,7 @@ static int perf_sched__lat(struct perf_sched *sched)
 		work_list = rb_entry(next, struct work_atoms, node);
 		output_lat_thread(sched, work_list);
 		next = rb_next(next);
+		thread__zput(work_list->thread);
 	}
 
 	printf(" -----------------------------------------------------------------------------------------------------------------\n");
@@ -1548,7 +1577,6 @@ static int perf_sched__lat(struct perf_sched *sched)
 	print_bad_events(sched);
 	printf("\n");
 
-	perf_session__delete(session);
 	return 0;
 }
 
@@ -1557,7 +1585,7 @@ static int perf_sched__map(struct perf_sched *sched)
 	sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF);
 
 	setup_pager();
-	if (perf_sched__read_events(sched, NULL))
+	if (perf_sched__read_events(sched))
 		return -1;
 	print_bad_events(sched);
 	return 0;
@@ -1572,7 +1600,7 @@ static int perf_sched__replay(struct perf_sched *sched)
 
 	test_calibrations(sched);
 
-	if (perf_sched__read_events(sched, NULL))
+	if (perf_sched__read_events(sched))
 		return -1;
 
 	printf("nr_run_events:        %ld\n", sched->nr_run_events);
@@ -1693,6 +1721,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
 		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
+	OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"),
 	OPT_END()
 	};
 	const struct option sched_options[] = {
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index ce304dfd962a..58f10b8e6ff2 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -446,9 +446,9 @@ static void print_sample_bts(union perf_event *event,
 }
 
 static void process_event(union perf_event *event, struct perf_sample *sample,
-			  struct perf_evsel *evsel, struct thread *thread,
-			  struct addr_location *al)
+			  struct perf_evsel *evsel, struct addr_location *al)
 {
+	struct thread *thread = al->thread;
 	struct perf_event_attr *attr = &evsel->attr;
 
 	if (output[attr->type].fields == 0)
@@ -549,14 +549,6 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused,
 				struct machine *machine)
 {
 	struct addr_location al;
-	struct thread *thread = machine__findnew_thread(machine, sample->pid,
-							sample->tid);
-
-	if (thread == NULL) {
-		pr_debug("problem processing %d event, skipping it.\n",
-			 event->header.type);
-		return -1;
-	}
 
 	if (debug_mode) {
 		if (sample->time < last_timestamp) {
@@ -581,7 +573,7 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused,
 	if (cpu_list && !test_bit(sample->cpu, cpu_bitmap))
 		return 0;
 
-	scripting_ops->process_event(event, sample, evsel, thread, &al);
+	scripting_ops->process_event(event, sample, evsel, &al);
 
 	return 0;
 }
@@ -800,7 +792,7 @@ static int __cmd_script(struct perf_script *script)
 		script->tool.mmap2 = process_mmap2_event;
 	}
 
-	ret = perf_session__process_events(script->session, &script->tool);
+	ret = perf_session__process_events(script->session);
 
 	if (debug_mode)
 		pr_err("Misordered timestamps: %" PRIu64 "\n", nr_unordered);
@@ -1523,6 +1515,9 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 			.ordering_requires_timestamps = true,
 		},
 	};
+	struct perf_data_file file = {
+		.mode = PERF_DATA_MODE_READ,
+	};
 	const struct option options[] = {
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
@@ -1550,7 +1545,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 		    "When printing symbols do not display call chain"),
 	OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
 		    "Look for files with symbols relative to this directory"),
-	OPT_CALLBACK('f', "fields", NULL, "str",
+	OPT_CALLBACK('F', "fields", NULL, "str",
 		     "comma separated output fields prepend with 'type:'. "
 		     "Valid types: hw,sw,trace,raw. "
 		     "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,"
@@ -1562,6 +1557,10 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 	OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"),
 	OPT_STRING('c', "comms", &symbol_conf.comm_list_str, "comm[,comm...]",
 		   "only display events for these comms"),
+	OPT_STRING(0, "pid", &symbol_conf.pid_list_str, "pid[,pid...]",
+		   "only consider symbols in these pids"),
+	OPT_STRING(0, "tid", &symbol_conf.tid_list_str, "tid[,tid...]",
+		   "only consider symbols in these tids"),
 	OPT_BOOLEAN('I', "show-info", &show_full_info,
 		    "display extended information from perf.data file"),
 	OPT_BOOLEAN('\0', "show-kernel-path", &symbol_conf.show_kernel_path,
@@ -1570,9 +1569,11 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 		    "Show the fork/comm/exit events"),
 	OPT_BOOLEAN('\0', "show-mmap-events", &script.show_mmap_events,
 		    "Show the mmap events"),
+	OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
 	OPT_END()
 	};
-	const char * const script_usage[] = {
+	const char * const script_subcommands[] = { "record", "report", NULL };
+	const char *script_usage[] = {
 		"perf script [<options>]",
 		"perf script [<options>] record <script> [<record-options>] <command>",
 		"perf script [<options>] report <script> [script-args]",
@@ -1580,13 +1581,10 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 		"perf script [<options>] <top-script> [script-args]",
 		NULL
 	};
-	struct perf_data_file file = {
-		.mode = PERF_DATA_MODE_READ,
-	};
 
 	setup_scripting();
 
-	argc = parse_options(argc, argv, options, script_usage,
+	argc = parse_options_subcommand(argc, argv, options, script_subcommands, script_usage,
 			     PARSE_OPT_STOP_AT_NON_OPTION);
 
 	file.path = input_name;
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index e598e4e98170..f7b8218785f6 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -353,39 +353,40 @@ static struct perf_evsel *nth_evsel(int n)
  * more semantic information such as miss/hit ratios,
  * instruction rates, etc:
  */
-static void update_shadow_stats(struct perf_evsel *counter, u64 *count)
+static void update_shadow_stats(struct perf_evsel *counter, u64 *count,
+				int cpu)
 {
 	if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK))
-		update_stats(&runtime_nsecs_stats[0], count[0]);
+		update_stats(&runtime_nsecs_stats[cpu], count[0]);
 	else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
-		update_stats(&runtime_cycles_stats[0], count[0]);
+		update_stats(&runtime_cycles_stats[cpu], count[0]);
 	else if (transaction_run &&
 		 perf_evsel__cmp(counter, nth_evsel(T_CYCLES_IN_TX)))
-		update_stats(&runtime_cycles_in_tx_stats[0], count[0]);
+		update_stats(&runtime_cycles_in_tx_stats[cpu], count[0]);
 	else if (transaction_run &&
 		 perf_evsel__cmp(counter, nth_evsel(T_TRANSACTION_START)))
-		update_stats(&runtime_transaction_stats[0], count[0]);
+		update_stats(&runtime_transaction_stats[cpu], count[0]);
 	else if (transaction_run &&
 		 perf_evsel__cmp(counter, nth_evsel(T_ELISION_START)))
-		update_stats(&runtime_elision_stats[0], count[0]);
+		update_stats(&runtime_elision_stats[cpu], count[0]);
 	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
-		update_stats(&runtime_stalled_cycles_front_stats[0], count[0]);
+		update_stats(&runtime_stalled_cycles_front_stats[cpu], count[0]);
 	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
-		update_stats(&runtime_stalled_cycles_back_stats[0], count[0]);
+		update_stats(&runtime_stalled_cycles_back_stats[cpu], count[0]);
 	else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
-		update_stats(&runtime_branches_stats[0], count[0]);
+		update_stats(&runtime_branches_stats[cpu], count[0]);
 	else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
-		update_stats(&runtime_cacherefs_stats[0], count[0]);
+		update_stats(&runtime_cacherefs_stats[cpu], count[0]);
 	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
-		update_stats(&runtime_l1_dcache_stats[0], count[0]);
+		update_stats(&runtime_l1_dcache_stats[cpu], count[0]);
 	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I))
-		update_stats(&runtime_l1_icache_stats[0], count[0]);
+		update_stats(&runtime_l1_icache_stats[cpu], count[0]);
 	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL))
-		update_stats(&runtime_ll_cache_stats[0], count[0]);
+		update_stats(&runtime_ll_cache_stats[cpu], count[0]);
 	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB))
-		update_stats(&runtime_dtlb_cache_stats[0], count[0]);
+		update_stats(&runtime_dtlb_cache_stats[cpu], count[0]);
 	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
-		update_stats(&runtime_itlb_cache_stats[0], count[0]);
+		update_stats(&runtime_itlb_cache_stats[cpu], count[0]);
 }
 
 static void zero_per_pkg(struct perf_evsel *counter)
@@ -447,7 +448,8 @@ static int read_cb(struct perf_evsel *evsel, int cpu, int thread __maybe_unused,
 			perf_evsel__compute_deltas(evsel, cpu, count);
 		perf_counts_values__scale(count, scale, NULL);
 		evsel->counts->cpu[cpu] = *count;
-		update_shadow_stats(evsel, count->values);
+		if (aggr_mode == AGGR_NONE)
+			update_shadow_stats(evsel, count->values, cpu);
 		break;
 	case AGGR_GLOBAL:
 		aggr->val += count->val;
@@ -495,7 +497,7 @@ static int read_counter_aggr(struct perf_evsel *counter)
 	/*
 	 * Save the full runtime - to allow normalization during printout:
 	 */
-	update_shadow_stats(counter, count);
+	update_shadow_stats(counter, count, 0);
 
 	return 0;
 }
@@ -510,6 +512,9 @@ static int read_counter(struct perf_evsel *counter)
 	int ncpus = perf_evsel__nr_cpus(counter);
 	int cpu, thread;
 
+	if (!counter->supported)
+		return -ENOENT;
+
 	if (counter->system_wide)
 		nthreads = 1;
 
@@ -679,8 +684,9 @@ static int __run_perf_stat(int argc, const char **argv)
 			unit_width = l;
 	}
 
-	if (perf_evlist__apply_filters(evsel_list)) {
-		error("failed to set filter with %d (%s)\n", errno,
+	if (perf_evlist__apply_filters(evsel_list, &counter)) {
+		error("failed to set filter \"%s\" on event %s with %d (%s)\n",
+			counter->filter, perf_evsel__name(counter), errno,
 			strerror_r(errno, msg, sizeof(msg)));
 		return -1;
 	}
@@ -766,6 +772,19 @@ static int run_perf_stat(int argc, const char **argv)
 	return ret;
 }
 
+static void print_running(u64 run, u64 ena)
+{
+	if (csv_output) {
+		fprintf(output, "%s%" PRIu64 "%s%.2f",
+					csv_sep,
+					run,
+					csv_sep,
+					ena ? 100.0 * run / ena : 100.0);
+	} else if (run != ena) {
+		fprintf(output, "  (%.2f%%)", 100.0 * run / ena);
+	}
+}
+
 static void print_noise_pct(double total, double avg)
 {
 	double pct = rel_stddev_stats(total, avg);
@@ -1076,6 +1095,8 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 		if (total) {
 			ratio = avg / total;
 			fprintf(output, " #   %5.2f  insns per cycle        ", ratio);
+		} else {
+			fprintf(output, "                                   ");
 		}
 		total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]);
 		total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu]));
@@ -1145,6 +1166,8 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 		if (total) {
 			ratio = avg / total;
 			fprintf(output, " # %8.3f GHz                    ", ratio);
+		} else {
+			fprintf(output, "                                   ");
 		}
 	} else if (transaction_run &&
 		   perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX))) {
@@ -1249,6 +1272,7 @@ static void print_aggr(char *prefix)
 					fprintf(output, "%s%s",
 						csv_sep, counter->cgrp->name);
 
+				print_running(run, ena);
 				fputc('\n', output);
 				continue;
 			}
@@ -1259,13 +1283,10 @@ static void print_aggr(char *prefix)
 			else
 				abs_printout(id, nr, counter, uval);
 
-			if (!csv_output) {
+			if (!csv_output)
 				print_noise(counter, 1.0);
 
-				if (run != ena)
-					fprintf(output, "  (%.2f%%)",
-						100.0 * run / ena);
-			}
+			print_running(run, ena);
 			fputc('\n', output);
 		}
 	}
@@ -1281,11 +1302,15 @@ static void print_counter_aggr(struct perf_evsel *counter, char *prefix)
 	double avg = avg_stats(&ps->res_stats[0]);
 	int scaled = counter->counts->scaled;
 	double uval;
+	double avg_enabled, avg_running;
+
+	avg_enabled = avg_stats(&ps->res_stats[1]);
+	avg_running = avg_stats(&ps->res_stats[2]);
 
 	if (prefix)
 		fprintf(output, "%s", prefix);
 
-	if (scaled == -1) {
+	if (scaled == -1 || !counter->supported) {
 		fprintf(output, "%*s%s",
 			csv_output ? 0 : 18,
 			counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
@@ -1300,6 +1325,7 @@ static void print_counter_aggr(struct perf_evsel *counter, char *prefix)
 		if (counter->cgrp)
 			fprintf(output, "%s%s", csv_sep, counter->cgrp->name);
 
+		print_running(avg_running, avg_enabled);
 		fputc('\n', output);
 		return;
 	}
@@ -1313,19 +1339,7 @@ static void print_counter_aggr(struct perf_evsel *counter, char *prefix)
 
 	print_noise(counter, avg);
 
-	if (csv_output) {
-		fputc('\n', output);
-		return;
-	}
-
-	if (scaled) {
-		double avg_enabled, avg_running;
-
-		avg_enabled = avg_stats(&ps->res_stats[1]);
-		avg_running = avg_stats(&ps->res_stats[2]);
-
-		fprintf(output, " [%5.2f%%]", 100 * avg_running / avg_enabled);
-	}
+	print_running(avg_running, avg_enabled);
 	fprintf(output, "\n");
 }
 
@@ -1367,6 +1381,7 @@ static void print_counter(struct perf_evsel *counter, char *prefix)
 				fprintf(output, "%s%s",
 					csv_sep, counter->cgrp->name);
 
+			print_running(run, ena);
 			fputc('\n', output);
 			continue;
 		}
@@ -1378,13 +1393,10 @@ static void print_counter(struct perf_evsel *counter, char *prefix)
 		else
 			abs_printout(cpu, 0, counter, uval);
 
-		if (!csv_output) {
+		if (!csv_output)
 			print_noise(counter, 1.0);
+		print_running(run, ena);
 
-			if (run != ena)
-				fprintf(output, "  (%.2f%%)",
-					100.0 * run / ena);
-		}
 		fputc('\n', output);
 	}
 }
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index f3bb1a4bf060..e50fe1187b0b 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -67,6 +67,7 @@ struct timechart {
 				skip_eagain;
 	u64			min_time,
 				merge_dist;
+	bool			force;
 };
 
 struct per_pidcomm;
@@ -1598,6 +1599,7 @@ static int __cmd_timechart(struct timechart *tchart, const char *output_name)
 	struct perf_data_file file = {
 		.path = input_name,
 		.mode = PERF_DATA_MODE_READ,
+		.force = tchart->force,
 	};
 
 	struct perf_session *session = perf_session__new(&file, false,
@@ -1623,7 +1625,7 @@ static int __cmd_timechart(struct timechart *tchart, const char *output_name)
 		goto out_delete;
 	}
 
-	ret = perf_session__process_events(session, &tchart->tool);
+	ret = perf_session__process_events(session);
 	if (ret)
 		goto out_delete;
 
@@ -1956,9 +1958,11 @@ int cmd_timechart(int argc, const char **argv,
 	OPT_CALLBACK(0, "io-merge-dist", &tchart.merge_dist, "time",
 		     "merge events that are merge-dist us apart",
 		     parse_time),
+	OPT_BOOLEAN('f', "force", &tchart.force, "don't complain, do it"),
 	OPT_END()
 	};
-	const char * const timechart_usage[] = {
+	const char * const timechart_subcommands[] = { "record", NULL };
+	const char *timechart_usage[] = {
 		"perf timechart [<options>] {record}",
 		NULL
 	};
@@ -1976,8 +1980,8 @@ int cmd_timechart(int argc, const char **argv,
 		"perf timechart record [<options>]",
 		NULL
 	};
-	argc = parse_options(argc, argv, timechart_options, timechart_usage,
-			PARSE_OPT_STOP_AT_NON_OPTION);
+	argc = parse_options_subcommand(argc, argv, timechart_options, timechart_subcommands,
+			timechart_usage, PARSE_OPT_STOP_AT_NON_OPTION);
 
 	if (tchart.power_only && tchart.tasks_only) {
 		pr_err("-P and -T options cannot be used at the same time.\n");
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index c4c7eac69de4..1cb3436276d1 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -716,7 +716,7 @@ static void perf_event__process_sample(struct perf_tool *tool,
 
 	if (!machine) {
 		pr_err("%u unprocessable samples recorded.\r",
-		       top->session->stats.nr_unprocessable_samples++);
+		       top->session->evlist->stats.nr_unprocessable_samples++);
 		return;
 	}
 
@@ -757,8 +757,10 @@ static void perf_event__process_sample(struct perf_tool *tool,
 		    al.map == machine->vmlinux_maps[MAP__FUNCTION] &&
 		    RB_EMPTY_ROOT(&al.map->dso->symbols[MAP__FUNCTION])) {
 			if (symbol_conf.vmlinux_name) {
-				ui__warning("The %s file can't be used.\n%s",
-					    symbol_conf.vmlinux_name, msg);
+				char serr[256];
+				dso__strerror_load(al.map->dso, serr, sizeof(serr));
+				ui__warning("The %s file can't be used: %s\n%s",
+					    symbol_conf.vmlinux_name, serr, msg);
 			} else {
 				ui__warning("A vmlinux file was not found.\n%s",
 					    msg);
@@ -856,7 +858,7 @@ static void perf_top__mmap_read_idx(struct perf_top *top, int idx)
 			hists__inc_nr_events(evsel__hists(evsel), event->header.type);
 			machine__process_event(machine, event, &sample);
 		} else
-			++session->stats.nr_unknown_events;
+			++session->evlist->stats.nr_unknown_events;
 next_event:
 		perf_evlist__mmap_consume(top->evlist, idx);
 	}
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 7e935f1083ec..e124741be187 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -52,7 +52,9 @@ struct tp_field {
 #define TP_UINT_FIELD(bits) \
 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
 { \
-	return *(u##bits *)(sample->raw_data + field->offset); \
+	u##bits value; \
+	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
+	return value;  \
 }
 
 TP_UINT_FIELD(8);
@@ -63,7 +65,8 @@ TP_UINT_FIELD(64);
 #define TP_UINT_FIELD__SWAPPED(bits) \
 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
 { \
-	u##bits value = *(u##bits *)(sample->raw_data + field->offset); \
+	u##bits value; \
+	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
 	return bswap_##bits(value);\
 }
 
@@ -1132,6 +1135,8 @@ static struct syscall_fmt *syscall_fmt__find(const char *name)
 
 struct syscall {
 	struct event_format *tp_format;
+	int		    nr_args;
+	struct format_field *args;
 	const char	    *name;
 	bool		    filtered;
 	bool		    is_exit;
@@ -1219,7 +1224,9 @@ struct trace {
 		struct syscall  *table;
 	} syscalls;
 	struct record_opts	opts;
+	struct perf_evlist	*evlist;
 	struct machine		*host;
+	struct thread		*current;
 	u64			base_time;
 	FILE			*output;
 	unsigned long		nr_events;
@@ -1227,6 +1234,10 @@ struct trace {
 	const char 		*last_vfs_getname;
 	struct intlist		*tid_list;
 	struct intlist		*pid_list;
+	struct {
+		size_t		nr;
+		pid_t		*entries;
+	}			filter_pids;
 	double			duration_filter;
 	double			runtime_ms;
 	struct {
@@ -1243,6 +1254,7 @@ struct trace {
 	bool			show_comm;
 	bool			show_tool_stats;
 	bool			trace_syscalls;
+	bool			force;
 	int			trace_pgfaults;
 };
 
@@ -1433,14 +1445,14 @@ static int syscall__set_arg_fmts(struct syscall *sc)
 	struct format_field *field;
 	int idx = 0;
 
-	sc->arg_scnprintf = calloc(sc->tp_format->format.nr_fields - 1, sizeof(void *));
+	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
 	if (sc->arg_scnprintf == NULL)
 		return -1;
 
 	if (sc->fmt)
 		sc->arg_parm = sc->fmt->arg_parm;
 
-	for (field = sc->tp_format->format.fields->next; field; field = field->next) {
+	for (field = sc->args; field; field = field->next) {
 		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
 			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
 		else if (field->flags & FIELD_IS_POINTER)
@@ -1506,18 +1518,37 @@ static int trace__read_syscall_info(struct trace *trace, int id)
 	if (sc->tp_format == NULL)
 		return -1;
 
+	sc->args = sc->tp_format->format.fields;
+	sc->nr_args = sc->tp_format->format.nr_fields;
+	/* drop nr field - not relevant here; does not exist on older kernels */
+	if (sc->args && strcmp(sc->args->name, "nr") == 0) {
+		sc->args = sc->args->next;
+		--sc->nr_args;
+	}
+
 	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
 
 	return syscall__set_arg_fmts(sc);
 }
 
+/*
+ * args is to be interpreted as a series of longs but we need to handle
+ * 8-byte unaligned accesses. args points to raw_data within the event
+ * and raw_data is guaranteed to be 8-byte unaligned because it is
+ * preceded by raw_size which is a u32. So we need to copy args to a temp
+ * variable to read it. Most notably this avoids extended load instructions
+ * on unaligned addresses
+ */
+
 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
-				      unsigned long *args, struct trace *trace,
+				      unsigned char *args, struct trace *trace,
 				      struct thread *thread)
 {
 	size_t printed = 0;
+	unsigned char *p;
+	unsigned long val;
 
-	if (sc->tp_format != NULL) {
+	if (sc->args != NULL) {
 		struct format_field *field;
 		u8 bit = 1;
 		struct syscall_arg arg = {
@@ -1527,16 +1558,21 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
 			.thread = thread,
 		};
 
-		for (field = sc->tp_format->format.fields->next; field;
+		for (field = sc->args; field;
 		     field = field->next, ++arg.idx, bit <<= 1) {
 			if (arg.mask & bit)
 				continue;
+
+			/* special care for unaligned accesses */
+			p = args + sizeof(unsigned long) * arg.idx;
+			memcpy(&val, p, sizeof(val));
+
 			/*
  			 * Suppress this argument if its value is zero and
  			 * and we don't have a string associated in an
  			 * strarray for it.
  			 */
-			if (args[arg.idx] == 0 &&
+			if (val == 0 &&
 			    !(sc->arg_scnprintf &&
 			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
 			      sc->arg_parm[arg.idx]))
@@ -1545,23 +1581,26 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
 			printed += scnprintf(bf + printed, size - printed,
 					     "%s%s: ", printed ? ", " : "", field->name);
 			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
-				arg.val = args[arg.idx];
+				arg.val = val;
 				if (sc->arg_parm)
 					arg.parm = sc->arg_parm[arg.idx];
 				printed += sc->arg_scnprintf[arg.idx](bf + printed,
 								      size - printed, &arg);
 			} else {
 				printed += scnprintf(bf + printed, size - printed,
-						     "%ld", args[arg.idx]);
+						     "%ld", val);
 			}
 		}
 	} else {
 		int i = 0;
 
 		while (i < 6) {
+			/* special care for unaligned accesses */
+			p = args + sizeof(unsigned long) * i;
+			memcpy(&val, p, sizeof(val));
 			printed += scnprintf(bf + printed, size - printed,
 					     "%sarg%d: %ld",
-					     printed ? ", " : "", i, args[i]);
+					     printed ? ", " : "", i, val);
 			++i;
 		}
 	}
@@ -1642,6 +1681,29 @@ static void thread__update_stats(struct thread_trace *ttrace,
 	update_stats(stats, duration);
 }
 
+static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
+{
+	struct thread_trace *ttrace;
+	u64 duration;
+	size_t printed;
+
+	if (trace->current == NULL)
+		return 0;
+
+	ttrace = thread__priv(trace->current);
+
+	if (!ttrace->entry_pending)
+		return 0;
+
+	duration = sample->time - ttrace->entry_time;
+
+	printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
+	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
+	ttrace->entry_pending = false;
+
+	return printed;
+}
+
 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
 			    union perf_event *event __maybe_unused,
 			    struct perf_sample *sample)
@@ -1673,6 +1735,9 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
 			return -1;
 	}
 
+	if (!trace->summary_only)
+		trace__printf_interrupted_entry(trace, sample);
+
 	ttrace->entry_time = sample->time;
 	msg = ttrace->entry_str;
 	printed += scnprintf(msg + printed, 1024 - printed, "%s(", sc->name);
@@ -1688,6 +1753,11 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
 	} else
 		ttrace->entry_pending = true;
 
+	if (trace->current != thread) {
+		thread__put(trace->current);
+		trace->current = thread__get(thread);
+	}
+
 	return 0;
 }
 
@@ -1805,6 +1875,28 @@ out_dump:
 	return 0;
 }
 
+static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
+				union perf_event *event __maybe_unused,
+				struct perf_sample *sample)
+{
+	trace__printf_interrupted_entry(trace, sample);
+	trace__fprintf_tstamp(trace, sample->time, trace->output);
+
+	if (trace->trace_syscalls)
+		fprintf(trace->output, "(         ): ");
+
+	fprintf(trace->output, "%s:", evsel->name);
+
+	if (evsel->tp_format) {
+		event_format__fprintf(evsel->tp_format, sample->cpu,
+				      sample->raw_data, sample->raw_size,
+				      trace->output);
+	}
+
+	fprintf(trace->output, ")\n");
+	return 0;
+}
+
 static void print_location(FILE *f, struct perf_sample *sample,
 			   struct addr_location *al,
 			   bool print_dso, bool print_sym)
@@ -2037,10 +2129,39 @@ static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
 	return 0;
 }
 
+static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
+{
+	const u32 type = event->header.type;
+	struct perf_evsel *evsel;
+
+	if (!trace->full_time && trace->base_time == 0)
+		trace->base_time = sample->time;
+
+	if (type != PERF_RECORD_SAMPLE) {
+		trace__process_event(trace, trace->host, event, sample);
+		return;
+	}
+
+	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
+	if (evsel == NULL) {
+		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
+		return;
+	}
+
+	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
+	    sample->raw_data == NULL) {
+		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
+		       perf_evsel__name(evsel), sample->tid,
+		       sample->cpu, sample->raw_size);
+	} else {
+		tracepoint_handler handler = evsel->handler;
+		handler(trace, evsel, event, sample);
+	}
+}
+
 static int trace__run(struct trace *trace, int argc, const char **argv)
 {
-	struct perf_evlist *evlist = perf_evlist__new();
-	struct perf_evsel *evsel;
+	struct perf_evlist *evlist = trace->evlist;
 	int err = -1, i;
 	unsigned long before;
 	const bool forks = argc > 0;
@@ -2048,11 +2169,6 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
 
 	trace->live = true;
 
-	if (evlist == NULL) {
-		fprintf(trace->output, "Not enough memory to run!\n");
-		goto out;
-	}
-
 	if (trace->trace_syscalls &&
 	    perf_evlist__add_syscall_newtp(evlist, trace__sys_enter,
 					   trace__sys_exit))
@@ -2105,16 +2221,34 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
 	if (err < 0)
 		goto out_error_open;
 
+	/*
+	 * Better not use !target__has_task() here because we need to cover the
+	 * case where no threads were specified in the command line, but a
+	 * workload was, and in that case we will fill in the thread_map when
+	 * we fork the workload in perf_evlist__prepare_workload.
+	 */
+	if (trace->filter_pids.nr > 0)
+		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
+	else if (evlist->threads->map[0] == -1)
+		err = perf_evlist__set_filter_pid(evlist, getpid());
+
+	if (err < 0) {
+		printf("err=%d,%s\n", -err, strerror(-err));
+		exit(1);
+	}
+
 	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
 	if (err < 0)
 		goto out_error_mmap;
 
-	perf_evlist__enable(evlist);
-
 	if (forks)
 		perf_evlist__start_workload(evlist);
+	else
+		perf_evlist__enable(evlist);
 
-	trace->multiple_threads = evlist->threads->map[0] == -1 || evlist->threads->nr > 1;
+	trace->multiple_threads = evlist->threads->map[0] == -1 ||
+				  evlist->threads->nr > 1 ||
+				  perf_evlist__first(evlist)->attr.inherit;
 again:
 	before = trace->nr_events;
 
@@ -2122,8 +2256,6 @@ again:
 		union perf_event *event;
 
 		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
-			const u32 type = event->header.type;
-			tracepoint_handler handler;
 			struct perf_sample sample;
 
 			++trace->nr_events;
@@ -2134,30 +2266,7 @@ again:
 				goto next_event;
 			}
 
-			if (!trace->full_time && trace->base_time == 0)
-				trace->base_time = sample.time;
-
-			if (type != PERF_RECORD_SAMPLE) {
-				trace__process_event(trace, trace->host, event, &sample);
-				continue;
-			}
-
-			evsel = perf_evlist__id2evsel(evlist, sample.id);
-			if (evsel == NULL) {
-				fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample.id);
-				goto next_event;
-			}
-
-			if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
-			    sample.raw_data == NULL) {
-				fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
-				       perf_evsel__name(evsel), sample.tid,
-				       sample.cpu, sample.raw_size);
-				goto next_event;
-			}
-
-			handler = evsel->handler;
-			handler(trace, evsel, event, &sample);
+			trace__handle_event(trace, event, &sample);
 next_event:
 			perf_evlist__mmap_consume(evlist, i);
 
@@ -2180,6 +2289,8 @@ next_event:
 	}
 
 out_disable:
+	thread__zput(trace->current);
+
 	perf_evlist__disable(evlist);
 
 	if (!err) {
@@ -2197,7 +2308,7 @@ out_disable:
 
 out_delete_evlist:
 	perf_evlist__delete(evlist);
-out:
+	trace->evlist = NULL;
 	trace->live = false;
 	return err;
 {
@@ -2235,6 +2346,7 @@ static int trace__replay(struct trace *trace)
 	struct perf_data_file file = {
 		.path  = input_name,
 		.mode  = PERF_DATA_MODE_READ,
+		.force = trace->force,
 	};
 	struct perf_session *session;
 	struct perf_evsel *evsel;
@@ -2309,7 +2421,7 @@ static int trace__replay(struct trace *trace)
 
 	setup_pager();
 
-	err = perf_session__process_events(session, &trace->tool);
+	err = perf_session__process_events(session);
 	if (err)
 		pr_err("Failed to process events, error %d", err);
 
@@ -2434,6 +2546,38 @@ static int trace__set_duration(const struct option *opt, const char *str,
 	return 0;
 }
 
+static int trace__set_filter_pids(const struct option *opt, const char *str,
+				  int unset __maybe_unused)
+{
+	int ret = -1;
+	size_t i;
+	struct trace *trace = opt->value;
+	/*
+	 * FIXME: introduce a intarray class, plain parse csv and create a
+	 * { int nr, int entries[] } struct...
+	 */
+	struct intlist *list = intlist__new(str);
+
+	if (list == NULL)
+		return -1;
+
+	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
+	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
+
+	if (trace->filter_pids.entries == NULL)
+		goto out;
+
+	trace->filter_pids.entries[0] = getpid();
+
+	for (i = 1; i < trace->filter_pids.nr; ++i)
+		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
+
+	intlist__delete(list);
+	ret = 0;
+out:
+	return ret;
+}
+
 static int trace__open_output(struct trace *trace, const char *filename)
 {
 	struct stat st;
@@ -2468,9 +2612,17 @@ static int parse_pagefaults(const struct option *opt, const char *str,
 	return 0;
 }
 
+static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
+{
+	struct perf_evsel *evsel;
+
+	evlist__for_each(evlist, evsel)
+		evsel->handler = handler;
+}
+
 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
 {
-	const char * const trace_usage[] = {
+	const char *trace_usage[] = {
 		"perf trace [<options>] [<command>]",
 		"perf trace [<options>] -- <command> [<options>]",
 		"perf trace record [<options>] [<command>]",
@@ -2502,6 +2654,9 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
 	const char *output_name = NULL;
 	const char *ev_qualifier_str = NULL;
 	const struct option trace_options[] = {
+	OPT_CALLBACK(0, "event", &trace.evlist, "event",
+		     "event selector. use 'perf list' to list available events",
+		     parse_events_option),
 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
 		    "show the thread COMM next to its id"),
 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
@@ -2513,6 +2668,8 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
 		    "trace events on existing process id"),
 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
 		    "trace events on existing thread id"),
+	OPT_CALLBACK(0, "filter-pids", &trace, "float",
+		     "show only events with duration > N.M ms", trace__set_filter_pids),
 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
 		    "system-wide collection from all CPUs"),
 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
@@ -2538,19 +2695,36 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
 		     "Trace pagefaults", parse_pagefaults, "maj"),
 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
+	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
 	OPT_END()
 	};
+	const char * const trace_subcommands[] = { "record", NULL };
 	int err;
 	char bf[BUFSIZ];
 
-	argc = parse_options(argc, argv, trace_options, trace_usage,
-			     PARSE_OPT_STOP_AT_NON_OPTION);
+	signal(SIGSEGV, sighandler_dump_stack);
+	signal(SIGFPE, sighandler_dump_stack);
+
+	trace.evlist = perf_evlist__new();
+	if (trace.evlist == NULL)
+		return -ENOMEM;
+
+	if (trace.evlist == NULL) {
+		pr_err("Not enough memory to run!\n");
+		goto out;
+	}
+
+	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
+				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
 
 	if (trace.trace_pgfaults) {
 		trace.opts.sample_address = true;
 		trace.opts.sample_time = true;
 	}
 
+	if (trace.evlist->nr_entries > 0)
+		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
+
 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
 		return trace__record(&trace, argc-1, &argv[1]);
 
@@ -2558,7 +2732,8 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
 	if (trace.summary_only)
 		trace.summary = trace.summary_only;
 
-	if (!trace.trace_syscalls && !trace.trace_pgfaults) {
+	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
+	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
 		pr_err("Please specify something to trace.\n");
 		return -1;
 	}
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
index b210d62907e4..3688ad29085f 100644
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -37,6 +37,7 @@ extern int cmd_test(int argc, const char **argv, const char *prefix);
 extern int cmd_trace(int argc, const char **argv, const char *prefix);
 extern int cmd_inject(int argc, const char **argv, const char *prefix);
 extern int cmd_mem(int argc, const char **argv, const char *prefix);
+extern int cmd_data(int argc, const char **argv, const char *prefix);
 
 extern int find_scripts(char **scripts_array, char **scripts_path_array);
 #endif
diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt
index 0906fc401c52..00fcaf8a5b8d 100644
--- a/tools/perf/command-list.txt
+++ b/tools/perf/command-list.txt
@@ -7,6 +7,7 @@ perf-archive			mainporcelain common
 perf-bench			mainporcelain common
 perf-buildid-cache		mainporcelain common
 perf-buildid-list		mainporcelain common
+perf-data			mainporcelain common
 perf-diff			mainporcelain common
 perf-evlist			mainporcelain common
 perf-inject			mainporcelain common
diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile
index cc224080b525..59a98c643240 100644
--- a/tools/perf/config/Makefile
+++ b/tools/perf/config/Makefile
@@ -11,19 +11,26 @@ ifneq ($(obj-perf),)
 obj-perf := $(abspath $(obj-perf))/
 endif
 
-LIB_INCLUDE := $(srctree)/tools/lib/
+$(shell echo -n > .config-detected)
+detected     = $(shell echo "$(1)=y"       >> .config-detected)
+detected_var = $(shell echo "$(1)=$($(1))" >> .config-detected)
+
 CFLAGS := $(EXTRA_CFLAGS) $(EXTRA_WARNINGS)
 
 include $(src-perf)/config/Makefile.arch
 
+$(call detected_var,ARCH)
+
 NO_PERF_REGS := 1
 
 # Additional ARCH settings for x86
 ifeq ($(ARCH),x86)
+  $(call detected,CONFIG_X86)
   ifeq (${IS_64_BIT}, 1)
     CFLAGS += -DHAVE_ARCH_X86_64_SUPPORT
     ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S ../../arch/x86/lib/memset_64.S
     LIBUNWIND_LIBS = -lunwind -lunwind-x86_64
+    $(call detected,CONFIG_X86_64)
   else
     LIBUNWIND_LIBS = -lunwind -lunwind-x86
   endif
@@ -40,6 +47,10 @@ ifeq ($(ARCH),arm64)
   LIBUNWIND_LIBS = -lunwind -lunwind-aarch64
 endif
 
+ifeq ($(NO_PERF_REGS),0)
+  $(call detected,CONFIG_PERF_REGS)
+endif
+
 # So far there's only x86 and arm libdw unwind support merged in perf.
 # Disable it on all other architectures in case libdw unwind
 # support is detected in system. Add supported architectures
@@ -84,6 +95,17 @@ ifndef NO_LIBELF
   FEATURE_CHECK_LDFLAGS-libdw-dwarf-unwind := $(LIBDW_LDFLAGS) -ldw
 endif
 
+ifdef LIBBABELTRACE
+  # for linking with debug library, run like:
+  # make DEBUG=1 LIBBABELTRACE_DIR=/opt/libbabeltrace/
+  ifdef LIBBABELTRACE_DIR
+    LIBBABELTRACE_CFLAGS  := -I$(LIBBABELTRACE_DIR)/include
+    LIBBABELTRACE_LDFLAGS := -L$(LIBBABELTRACE_DIR)/lib
+  endif
+  FEATURE_CHECK_CFLAGS-libbabeltrace := $(LIBBABELTRACE_CFLAGS)
+  FEATURE_CHECK_LDFLAGS-libbabeltrace := $(LIBBABELTRACE_LDFLAGS) -lbabeltrace-ctf
+endif
+
 # include ARCH specific config
 -include $(src-perf)/arch/$(ARCH)/Makefile
 
@@ -114,6 +136,8 @@ ifdef PARSER_DEBUG
   PARSER_DEBUG_BISON := -t
   PARSER_DEBUG_FLEX  := -d
   CFLAGS             += -DPARSER_DEBUG
+  $(call detected_var,PARSER_DEBUG_BISON)
+  $(call detected_var,PARSER_DEBUG_FLEX)
 endif
 
 ifndef NO_LIBPYTHON
@@ -152,121 +176,7 @@ LDFLAGS += -Wl,-z,noexecstack
 
 EXTLIBS = -lpthread -lrt -lm -ldl
 
-ifneq ($(OUTPUT),)
-  OUTPUT_FEATURES = $(OUTPUT)config/feature-checks/
-  $(shell mkdir -p $(OUTPUT_FEATURES))
-endif
-
-feature_check = $(eval $(feature_check_code))
-define feature_check_code
-  feature-$(1) := $(shell $(MAKE) OUTPUT=$(OUTPUT_FEATURES) CFLAGS="$(EXTRA_CFLAGS) $(FEATURE_CHECK_CFLAGS-$(1))" LDFLAGS="$(LDFLAGS) $(FEATURE_CHECK_LDFLAGS-$(1))" -C config/feature-checks test-$1.bin >/dev/null 2>/dev/null && echo 1 || echo 0)
-endef
-
-feature_set = $(eval $(feature_set_code))
-define feature_set_code
-  feature-$(1) := 1
-endef
-
-#
-# Build the feature check binaries in parallel, ignore errors, ignore return value and suppress output:
-#
-
-#
-# Note that this is not a complete list of all feature tests, just
-# those that are typically built on a fully configured system.
-#
-# [ Feature tests not mentioned here have to be built explicitly in
-#   the rule that uses them - an example for that is the 'bionic'
-#   feature check. ]
-#
-CORE_FEATURE_TESTS =			\
-	backtrace			\
-	dwarf				\
-	fortify-source			\
-	sync-compare-and-swap		\
-	glibc				\
-	gtk2				\
-	gtk2-infobar			\
-	libaudit			\
-	libbfd				\
-	libelf				\
-	libelf-getphdrnum		\
-	libelf-mmap			\
-	libnuma				\
-	libperl				\
-	libpython			\
-	libpython-version		\
-	libslang			\
-	libunwind			\
-	pthread-attr-setaffinity-np	\
-	stackprotector-all		\
-	timerfd				\
-	libdw-dwarf-unwind		\
-	zlib
-
-LIB_FEATURE_TESTS =			\
-	dwarf				\
-	glibc				\
-	gtk2				\
-	libaudit			\
-	libbfd				\
-	libelf				\
-	libnuma				\
-	libperl				\
-	libpython			\
-	libslang			\
-	libunwind			\
-	libdw-dwarf-unwind		\
-	zlib
-
-VF_FEATURE_TESTS =			\
-	backtrace			\
-	fortify-source			\
-	sync-compare-and-swap		\
-	gtk2-infobar			\
-	libelf-getphdrnum		\
-	libelf-mmap			\
-	libpython-version		\
-	pthread-attr-setaffinity-np	\
-	stackprotector-all		\
-	timerfd				\
-	libunwind-debug-frame		\
-	bionic				\
-	liberty				\
-	liberty-z			\
-	cplus-demangle			\
-	compile-32			\
-	compile-x32
-
-# Set FEATURE_CHECK_(C|LD)FLAGS-all for all CORE_FEATURE_TESTS features.
-# If in the future we need per-feature checks/flags for features not
-# mentioned in this list we need to refactor this ;-).
-set_test_all_flags = $(eval $(set_test_all_flags_code))
-define set_test_all_flags_code
-  FEATURE_CHECK_CFLAGS-all  += $(FEATURE_CHECK_CFLAGS-$(1))
-  FEATURE_CHECK_LDFLAGS-all += $(FEATURE_CHECK_LDFLAGS-$(1))
-endef
-
-$(foreach feat,$(CORE_FEATURE_TESTS),$(call set_test_all_flags,$(feat)))
-
-#
-# Special fast-path for the 'all features are available' case:
-#
-$(call feature_check,all,$(MSG))
-
-#
-# Just in case the build freshly failed, make sure we print the
-# feature matrix:
-#
-ifeq ($(feature-all), 1)
-  #
-  # test-all.c passed - just set all the core feature flags to 1:
-  #
-  $(foreach feat,$(CORE_FEATURE_TESTS),$(call feature_set,$(feat)))
-else
-  $(shell $(MAKE) OUTPUT=$(OUTPUT_FEATURES) CFLAGS="$(EXTRA_CFLAGS)" LDFLAGS=$(LDFLAGS) -i -j -C config/feature-checks $(addsuffix .bin,$(CORE_FEATURE_TESTS)) >/dev/null 2>&1)
-  $(foreach feat,$(CORE_FEATURE_TESTS),$(call feature_check,$(feat)))
-endif
+include $(srctree)/tools/build/Makefile.feature
 
 ifeq ($(feature-stackprotector-all), 1)
   CFLAGS += -fstack-protector-all
@@ -295,7 +205,7 @@ endif
 
 CFLAGS += -I$(src-perf)/util
 CFLAGS += -I$(src-perf)
-CFLAGS += -I$(LIB_INCLUDE)
+CFLAGS += -I$(srctree)/tools/lib/
 
 CFLAGS += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE
 
@@ -361,6 +271,7 @@ endif # NO_LIBELF
 ifndef NO_LIBELF
   CFLAGS += -DHAVE_LIBELF_SUPPORT
   EXTLIBS += -lelf
+  $(call detected,CONFIG_LIBELF)
 
   ifeq ($(feature-libelf-mmap), 1)
     CFLAGS += -DHAVE_LIBELF_MMAP_SUPPORT
@@ -381,6 +292,7 @@ ifndef NO_LIBELF
       CFLAGS += -DHAVE_DWARF_SUPPORT $(LIBDW_CFLAGS)
       LDFLAGS += $(LIBDW_LDFLAGS)
       EXTLIBS += -ldw
+      $(call detected,CONFIG_DWARF)
     endif # PERF_HAVE_DWARF_REGS
   endif # NO_DWARF
 endif # NO_LIBELF
@@ -408,9 +320,11 @@ ifdef NO_LIBUNWIND
     dwarf-post-unwind := 0
   else
     dwarf-post-unwind-text := libdw
+    $(call detected,CONFIG_LIBDW_DWARF_UNWIND)
   endif
 else
   dwarf-post-unwind-text := libunwind
+  $(call detected,CONFIG_LIBUNWIND)
   # Enable libunwind support by default.
   ifndef NO_LIBDW_DWARF_UNWIND
     NO_LIBDW_DWARF_UNWIND := 1
@@ -419,6 +333,7 @@ endif
 
 ifeq ($(dwarf-post-unwind),1)
   CFLAGS += -DHAVE_DWARF_UNWIND_SUPPORT
+  $(call detected,CONFIG_DWARF_UNWIND)
 else
   NO_DWARF_UNWIND := 1
 endif
@@ -447,6 +362,7 @@ ifndef NO_LIBAUDIT
   else
     CFLAGS += -DHAVE_LIBAUDIT_SUPPORT
     EXTLIBS += -laudit
+    $(call detected,CONFIG_AUDIT)
   endif
 endif
 
@@ -463,6 +379,7 @@ ifndef NO_SLANG
     CFLAGS += -I/usr/include/slang
     CFLAGS += -DHAVE_SLANG_SUPPORT
     EXTLIBS += -lslang
+    $(call detected,CONFIG_SLANG)
   endif
 endif
 
@@ -497,10 +414,11 @@ else
   ifneq ($(feature-libperl), 1)
     CFLAGS += -DNO_LIBPERL
     NO_LIBPERL := 1
-    msg := $(warning Missing perl devel files. Disabling perl scripting support, consider installing perl-ExtUtils-Embed);
+    msg := $(warning Missing perl devel files. Disabling perl scripting support, please install perl-ExtUtils-Embed/libperl-dev);
   else
     LDFLAGS += $(PERL_EMBED_LDFLAGS)
     EXTLIBS += $(PERL_EMBED_LIBADD)
+    $(call detected,CONFIG_LIBPERL)
   endif
 endif
 
@@ -513,22 +431,21 @@ endif
 disable-python = $(eval $(disable-python_code))
 define disable-python_code
   CFLAGS += -DNO_LIBPYTHON
-  $(if $(1),$(warning No $(1) was found))
-  $(warning Python support will not be built)
+  $(warning $1)
   NO_LIBPYTHON := 1
 endef
 
 ifdef NO_LIBPYTHON
-  $(call disable-python)
+  $(call disable-python,Python support disabled by user)
 else
 
   ifndef PYTHON
-    $(call disable-python,python interpreter)
+    $(call disable-python,No python interpreter was found: disables Python support - please install python-devel/python-dev)
   else
     PYTHON_WORD := $(call shell-wordify,$(PYTHON))
 
     ifndef PYTHON_CONFIG
-      $(call disable-python,python-config tool)
+      $(call disable-python,No 'python-config' tool was found: disables Python support - please install python-devel/python-dev)
     else
 
       PYTHON_CONFIG_SQ := $(call shell-sq,$(PYTHON_CONFIG))
@@ -540,7 +457,7 @@ else
       FLAGS_PYTHON_EMBED := $(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS)
 
       ifneq ($(feature-libpython), 1)
-        $(call disable-python,Python.h (for Python 2.x))
+        $(call disable-python,No 'Python.h' (for Python 2.x support) was found: disables Python support - please install python-devel/python-dev)
       else
 
         ifneq ($(feature-libpython-version), 1)
@@ -560,6 +477,7 @@ else
           LDFLAGS += $(PYTHON_EMBED_LDFLAGS)
           EXTLIBS += $(PYTHON_EMBED_LIBADD)
           LANG_BINDINGS += $(obj-perf)python/perf.so
+          $(call detected,CONFIG_LIBPYTHON)
         endif
       endif
     endif
@@ -600,7 +518,7 @@ else
             EXTLIBS += -liberty
             CFLAGS += -DHAVE_CPLUS_DEMANGLE_SUPPORT
           else
-            msg := $(warning No bfd.h/libbfd found, install binutils-dev[el]/zlib-static to gain symbol demangling)
+            msg := $(warning No bfd.h/libbfd found, please install binutils-dev[el]/zlib-static/libiberty-dev to gain symbol demangling)
             CFLAGS += -DNO_DEMANGLE
           endif
         endif
@@ -617,11 +535,23 @@ ifndef NO_ZLIB
   ifeq ($(feature-zlib), 1)
     CFLAGS += -DHAVE_ZLIB_SUPPORT
     EXTLIBS += -lz
+    $(call detected,CONFIG_ZLIB)
   else
     NO_ZLIB := 1
   endif
 endif
 
+ifndef NO_LZMA
+  ifeq ($(feature-lzma), 1)
+    CFLAGS += -DHAVE_LZMA_SUPPORT
+    EXTLIBS += -llzma
+    $(call detected,CONFIG_LZMA)
+  else
+    msg := $(warning No liblzma found, disables xz kernel module decompression, please install xz-devel/liblzma-dev);
+    NO_LZMA := 1
+  endif
+endif
+
 ifndef NO_BACKTRACE
   ifeq ($(feature-backtrace), 1)
     CFLAGS += -DHAVE_BACKTRACE_SUPPORT
@@ -635,6 +565,7 @@ ifndef NO_LIBNUMA
   else
     CFLAGS += -DHAVE_LIBNUMA_SUPPORT
     EXTLIBS += -lnuma
+    $(call detected,CONFIG_NUMA)
   endif
 endif
 
@@ -651,7 +582,7 @@ ifeq (${IS_64_BIT}, 1)
       NO_PERF_READ_VDSO32 := 1
     endif
   endif
-  ifneq (${IS_X86_64}, 1)
+  ifneq ($(ARCH), x86)
     NO_PERF_READ_VDSOX32 := 1
   endif
   ifndef NO_PERF_READ_VDSOX32
@@ -667,6 +598,18 @@ else
   NO_PERF_READ_VDSOX32 := 1
 endif
 
+ifdef LIBBABELTRACE
+  $(call feature_check,libbabeltrace)
+  ifeq ($(feature-libbabeltrace), 1)
+    CFLAGS += -DHAVE_LIBBABELTRACE_SUPPORT $(LIBBABELTRACE_CFLAGS)
+    LDFLAGS += $(LIBBABELTRACE_LDFLAGS)
+    EXTLIBS += -lbabeltrace-ctf
+    $(call detected,CONFIG_LIBBABELTRACE)
+  else
+    msg := $(warning No libbabeltrace found, disables 'perf data' CTF format support, please install libbabeltrace-dev[el]/libbabeltrace-ctf-dev);
+  endif
+endif
+
 # Among the variables below, these:
 #   perfexecdir
 #   template_dir
@@ -699,7 +642,7 @@ sysconfdir = $(prefix)/etc
 ETC_PERFCONFIG = etc/perfconfig
 endif
 ifndef lib
-ifeq ($(IS_X86_64),1)
+ifeq ($(ARCH)$(IS_64_BIT), x861)
 lib = lib64
 else
 lib = lib
@@ -735,83 +678,33 @@ plugindir=$(libdir)/traceevent/plugins
 plugindir_SQ= $(subst ','\'',$(plugindir))
 endif
 
-#
-# Print the result of the feature test:
-#
-feature_print_status = $(eval $(feature_print_status_code)) $(info $(MSG))
-
-define feature_print_status_code
-  ifeq ($(feature-$(1)), 1)
-    MSG = $(shell printf '...%30s: [ \033[32mon\033[m  ]' $(1))
-  else
-    MSG = $(shell printf '...%30s: [ \033[31mOFF\033[m ]' $(1))
-  endif
-endef
-
-feature_print_var = $(eval $(feature_print_var_code)) $(info $(MSG))
-define feature_print_var_code
+print_var = $(eval $(print_var_code)) $(info $(MSG))
+define print_var_code
     MSG = $(shell printf '...%30s: %s' $(1) $($(1)))
 endef
 
-feature_print_text = $(eval $(feature_print_text_code)) $(info $(MSG))
-define feature_print_text_code
-    MSG = $(shell printf '...%30s: %s' $(1) $(2))
-endef
-
-PERF_FEATURES := $(foreach feat,$(LIB_FEATURE_TESTS),feature-$(feat)($(feature-$(feat))))
-PERF_FEATURES_FILE := $(shell touch $(OUTPUT)PERF-FEATURES; cat $(OUTPUT)PERF-FEATURES)
-
-ifeq ($(dwarf-post-unwind),1)
-  PERF_FEATURES += dwarf-post-unwind($(dwarf-post-unwind-text))
-endif
-
-# The $(display_lib) controls the default detection message
-# output. It's set if:
-# - detected features differes from stored features from
-#   last build (in PERF-FEATURES file)
-# - one of the $(LIB_FEATURE_TESTS) is not detected
-# - VF is enabled
-
-ifneq ("$(PERF_FEATURES)","$(PERF_FEATURES_FILE)")
-  $(shell echo "$(PERF_FEATURES)" > $(OUTPUT)PERF-FEATURES)
-  display_lib := 1
-endif
-
-feature_check = $(eval $(feature_check_code))
-define feature_check_code
-  ifneq ($(feature-$(1)), 1)
-    display_lib := 1
-  endif
-endef
-
-$(foreach feat,$(LIB_FEATURE_TESTS),$(call feature_check,$(feat)))
-
 ifeq ($(VF),1)
-  display_lib := 1
-  display_vf := 1
-endif
-
-ifeq ($(display_lib),1)
-  $(info )
-  $(info Auto-detecting system features:)
-  $(foreach feat,$(LIB_FEATURE_TESTS),$(call feature_print_status,$(feat),))
-
-  ifeq ($(dwarf-post-unwind),1)
-    $(call feature_print_text,"DWARF post unwind library", $(dwarf-post-unwind-text))
-  endif
-endif
-
-ifeq ($(display_vf),1)
-  $(foreach feat,$(VF_FEATURE_TESTS),$(call feature_print_status,$(feat),))
-  $(info )
-  $(call feature_print_var,prefix)
-  $(call feature_print_var,bindir)
-  $(call feature_print_var,libdir)
-  $(call feature_print_var,sysconfdir)
-  $(call feature_print_var,LIBUNWIND_DIR)
-  $(call feature_print_var,LIBDW_DIR)
-endif
-
-ifeq ($(display_lib),1)
+  $(call print_var,prefix)
+  $(call print_var,bindir)
+  $(call print_var,libdir)
+  $(call print_var,sysconfdir)
+  $(call print_var,LIBUNWIND_DIR)
+  $(call print_var,LIBDW_DIR)
   $(info )
 endif
+
+$(call detected_var,bindir_SQ)
+$(call detected_var,PYTHON_WORD)
+ifneq ($(OUTPUT),)
+$(call detected_var,OUTPUT)
+endif
+$(call detected_var,htmldir_SQ)
+$(call detected_var,infodir_SQ)
+$(call detected_var,mandir_SQ)
+$(call detected_var,ETC_PERFCONFIG_SQ)
+$(call detected_var,prefix_SQ)
+$(call detected_var,perfexecdir_SQ)
+$(call detected_var,LIBDIR)
+$(call detected_var,GTK_CFLAGS)
+$(call detected_var,PERL_EMBED_CCOPTS)
+$(call detected_var,PYTHON_EMBED_CCOPTS)
diff --git a/tools/perf/config/Makefile.arch b/tools/perf/config/Makefile.arch
index ac8721ffa6c8..e11fbd6fae78 100644
--- a/tools/perf/config/Makefile.arch
+++ b/tools/perf/config/Makefile.arch
@@ -1,32 +1,15 @@
+ifndef ARCH
+ARCH := $(shell uname -m 2>/dev/null || echo not)
+endif
 
-uname_M := $(shell uname -m 2>/dev/null || echo not)
-
-RAW_ARCH := $(shell echo $(uname_M) | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \
-                                  -e s/arm.*/arm/ -e s/sa110/arm/ \
+ARCH := $(shell echo $(ARCH) | sed -e s/i.86/x86/ -e s/x86_64/x86/ \
+                                  -e s/sun4u/sparc/ -e s/sparc64/sparc/ \
+                                  -e /arm64/!s/arm.*/arm/ -e s/sa110/arm/ \
                                   -e s/s390x/s390/ -e s/parisc64/parisc/ \
                                   -e s/ppc.*/powerpc/ -e s/mips.*/mips/ \
                                   -e s/sh[234].*/sh/ -e s/aarch64.*/arm64/ \
                                   -e s/tile.*/tile/ )
 
-# Additional ARCH settings for x86
-ifeq ($(RAW_ARCH),i386)
-  ARCH ?= x86
-endif
-
-ifeq ($(RAW_ARCH),x86_64)
-  ARCH ?= x86
-
-  ifneq (, $(findstring m32,$(CFLAGS)))
-    RAW_ARCH := x86_32
-  endif
-endif
-
-ifeq ($(RAW_ARCH),sparc64)
-  ARCH ?= sparc
-endif
-
-ARCH ?= $(RAW_ARCH)
-
 LP64 := $(shell echo __LP64__ | ${CC} ${CFLAGS} -E -x c - | tail -n 1)
 ifeq ($(LP64), 1)
   IS_64_BIT := 1
diff --git a/tools/perf/config/utilities.mak b/tools/perf/config/utilities.mak
index 7076a62d0ff7..c16ce833079c 100644
--- a/tools/perf/config/utilities.mak
+++ b/tools/perf/config/utilities.mak
@@ -175,6 +175,5 @@ _ge-abspath = $(if $(is-executable),$(1))
 define get-executable-or-default
 $(if $($(1)),$(call _ge_attempt,$($(1)),$(1)),$(call _ge_attempt,$(2)))
 endef
-_ge_attempt = $(if $(get-executable),$(get-executable),$(_gea_warn)$(call _gea_err,$(2)))
-_gea_warn = $(warning The path '$(1)' is not executable.)
+_ge_attempt = $(if $(get-executable),$(get-executable),$(call _gea_err,$(2)))
 _gea_err  = $(if $(1),$(error Please set '$(1)' appropriately))
diff --git a/tools/perf/perf-completion.sh b/tools/perf/perf-completion.sh
index 33569847fdcc..3ba80b2359cc 100644
--- a/tools/perf/perf-completion.sh
+++ b/tools/perf/perf-completion.sh
@@ -47,8 +47,16 @@ __my_reassemble_comp_words_by_ref()
 	done
 }
 
-type _get_comp_words_by_ref &>/dev/null ||
-_get_comp_words_by_ref()
+# Define preload_get_comp_words_by_ref="false", if the function
+# __perf_get_comp_words_by_ref() is required instead.
+preload_get_comp_words_by_ref="true"
+
+if [ $preload_get_comp_words_by_ref = "true" ]; then
+	type _get_comp_words_by_ref &>/dev/null ||
+	preload_get_comp_words_by_ref="false"
+fi
+[ $preload_get_comp_words_by_ref = "true" ] ||
+__perf_get_comp_words_by_ref()
 {
 	local exclude cur_ words_ cword_
 	if [ "$1" = "-n" ]; then
@@ -76,8 +84,16 @@ _get_comp_words_by_ref()
 	done
 }
 
-type __ltrim_colon_completions &>/dev/null ||
-__ltrim_colon_completions()
+# Define preload__ltrim_colon_completions="false", if the function
+# __perf__ltrim_colon_completions() is required instead.
+preload__ltrim_colon_completions="true"
+
+if [ $preload__ltrim_colon_completions = "true" ]; then
+	type __ltrim_colon_completions &>/dev/null ||
+	preload__ltrim_colon_completions="false"
+fi
+[ $preload__ltrim_colon_completions = "true" ] ||
+__perf__ltrim_colon_completions()
 {
 	if [[ "$1" == *:* && "$COMP_WORDBREAKS" == *:* ]]; then
 		# Remove colon-word prefix from COMPREPLY items
@@ -97,7 +113,32 @@ __perfcomp ()
 __perfcomp_colon ()
 {
 	__perfcomp "$1" "$2"
-	__ltrim_colon_completions $cur
+	if [ $preload__ltrim_colon_completions = "true" ]; then
+		__ltrim_colon_completions $cur
+	else
+		__perf__ltrim_colon_completions $cur
+	fi
+}
+
+__perf_prev_skip_opts ()
+{
+	local i cmd_ cmds_
+
+	let i=cword-1
+	cmds_=$($cmd $1 --list-cmds)
+	prev_skip_opts=()
+	while [ $i -ge 0 ]; do
+		if [[ ${words[i]} == $1 ]]; then
+			return
+		fi
+		for cmd_ in $cmds_; do
+			if [[ ${words[i]} == $cmd_ ]]; then
+				prev_skip_opts=${words[i]}
+				return
+			fi
+		done
+		((i--))
+	done
 }
 
 __perf_main ()
@@ -107,29 +148,36 @@ __perf_main ()
 	cmd=${words[0]}
 	COMPREPLY=()
 
+	# Skip options backward and find the last perf command
+	__perf_prev_skip_opts
 	# List perf subcommands or long options
-	if [ $cword -eq 1 ]; then
+	if [ -z $prev_skip_opts ]; then
 		if [[ $cur == --* ]]; then
-			__perfcomp '--help --version \
-			--exec-path --html-path --paginate --no-pager \
-			--perf-dir --work-tree --debugfs-dir' -- "$cur"
+			cmds=$($cmd --list-opts)
 		else
 			cmds=$($cmd --list-cmds)
-			__perfcomp "$cmds" "$cur"
 		fi
+		__perfcomp "$cmds" "$cur"
 	# List possible events for -e option
-	elif [[ $prev == "-e" && "${words[1]}" == @(record|stat|top) ]]; then
+	elif [[ $prev == @("-e"|"--event") &&
+		$prev_skip_opts == @(record|stat|top) ]]; then
 		evts=$($cmd list --raw-dump)
 		__perfcomp_colon "$evts" "$cur"
-	# List subcommands for perf commands
-	elif [[ $prev == @(kvm|kmem|mem|lock|sched) ]]; then
-		subcmds=$($cmd $prev --list-cmds)
-		__perfcomp_colon "$subcmds" "$cur"
-	# List long option names
-	elif [[ $cur == --* ]];  then
-		subcmd=${words[1]}
-		opts=$($cmd $subcmd --list-opts)
-		__perfcomp "$opts" "$cur"
+	else
+		# List subcommands for perf commands
+		if [[ $prev_skip_opts == @(kvm|kmem|mem|lock|sched|
+			|data|help|script|test|timechart|trace) ]]; then
+			subcmds=$($cmd $prev_skip_opts --list-cmds)
+			__perfcomp_colon "$subcmds" "$cur"
+		fi
+		# List long option names
+		if [[ $cur == --* ]];  then
+			subcmd=$prev_skip_opts
+			__perf_prev_skip_opts $subcmd
+			subcmd=$subcmd" "$prev_skip_opts
+			opts=$($cmd $subcmd --list-opts)
+			__perfcomp "$opts" "$cur"
+		fi
 	fi
 }
 
@@ -198,7 +246,11 @@ type perf &>/dev/null &&
 _perf()
 {
 	local cur words cword prev
-	_get_comp_words_by_ref -n =: cur words cword prev
+	if [ $preload_get_comp_words_by_ref = "true" ]; then
+		_get_comp_words_by_ref -n =: cur words cword prev
+	else
+		__perf_get_comp_words_by_ref -n =: cur words cword prev
+	fi
 	__perf_main
 } &&
 
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 3700a7faca6c..b857fcbd00cf 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -13,6 +13,7 @@
 #include "util/quote.h"
 #include "util/run-command.h"
 #include "util/parse-events.h"
+#include "util/parse-options.h"
 #include "util/debug.h"
 #include <api/fs/debugfs.h>
 #include <pthread.h>
@@ -62,6 +63,7 @@ static struct cmd_struct commands[] = {
 #endif
 	{ "inject",	cmd_inject,	0 },
 	{ "mem",	cmd_mem,	0 },
+	{ "data",	cmd_data,	0 },
 };
 
 struct pager_config {
@@ -124,6 +126,23 @@ static void commit_pager_choice(void)
 	}
 }
 
+struct option options[] = {
+	OPT_ARGUMENT("help", "help"),
+	OPT_ARGUMENT("version", "version"),
+	OPT_ARGUMENT("exec-path", "exec-path"),
+	OPT_ARGUMENT("html-path", "html-path"),
+	OPT_ARGUMENT("paginate", "paginate"),
+	OPT_ARGUMENT("no-pager", "no-pager"),
+	OPT_ARGUMENT("perf-dir", "perf-dir"),
+	OPT_ARGUMENT("work-tree", "work-tree"),
+	OPT_ARGUMENT("debugfs-dir", "debugfs-dir"),
+	OPT_ARGUMENT("buildid-dir", "buildid-dir"),
+	OPT_ARGUMENT("list-cmds", "list-cmds"),
+	OPT_ARGUMENT("list-opts", "list-opts"),
+	OPT_ARGUMENT("debug", "debug"),
+	OPT_END()
+};
+
 static int handle_options(const char ***argv, int *argc, int *envchanged)
 {
 	int handled = 0;
@@ -222,6 +241,16 @@ static int handle_options(const char ***argv, int *argc, int *envchanged)
 				struct cmd_struct *p = commands+i;
 				printf("%s ", p->cmd);
 			}
+			putchar('\n');
+			exit(0);
+		} else if (!strcmp(cmd, "--list-opts")) {
+			unsigned int i;
+
+			for (i = 0; i < ARRAY_SIZE(options)-1; i++) {
+				struct option *p = options+i;
+				printf("--%s ", p->long_name);
+			}
+			putchar('\n');
 			exit(0);
 		} else if (!strcmp(cmd, "--debug")) {
 			if (*argc < 2) {
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 1dabb8553499..e14bb637255c 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -29,7 +29,7 @@ static inline unsigned long long rdclock(void)
 	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
 }
 
-#define MAX_NR_CPUS			256
+#define MAX_NR_CPUS			1024
 
 extern const char *input_name;
 extern bool perf_host, perf_guest;
@@ -53,6 +53,7 @@ struct record_opts {
 	bool	     sample_time;
 	bool	     period;
 	bool	     sample_intr_regs;
+	bool	     running_time;
 	unsigned int freq;
 	unsigned int mmap_pages;
 	unsigned int user_freq;
@@ -61,6 +62,8 @@ struct record_opts {
 	u64	     user_interval;
 	bool	     sample_transaction;
 	unsigned     initial_delay;
+	bool         use_clockid;
+	clockid_t    clockid;
 };
 
 struct option;
diff --git a/tools/perf/scripts/Build b/tools/perf/scripts/Build
new file mode 100644
index 000000000000..41efd7e368b3
--- /dev/null
+++ b/tools/perf/scripts/Build
@@ -0,0 +1,2 @@
+libperf-$(CONFIG_LIBPERL)   += perl/Perf-Trace-Util/
+libperf-$(CONFIG_LIBPYTHON) += python/Perf-Trace-Util/
diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/Build b/tools/perf/scripts/perl/Perf-Trace-Util/Build
new file mode 100644
index 000000000000..928e110179cb
--- /dev/null
+++ b/tools/perf/scripts/perl/Perf-Trace-Util/Build
@@ -0,0 +1,3 @@
+libperf-y += Context.o
+
+CFLAGS_Context.o += $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs -Wno-undef -Wno-switch-default
diff --git a/tools/perf/scripts/python/Perf-Trace-Util/Build b/tools/perf/scripts/python/Perf-Trace-Util/Build
new file mode 100644
index 000000000000..aefc15c9444a
--- /dev/null
+++ b/tools/perf/scripts/python/Perf-Trace-Util/Build
@@ -0,0 +1,3 @@
+libperf-y += Context.o
+
+CFLAGS_Context.o += $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs
diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build
new file mode 100644
index 000000000000..6a8801b32017
--- /dev/null
+++ b/tools/perf/tests/Build
@@ -0,0 +1,43 @@
+perf-y += builtin-test.o
+perf-y += parse-events.o
+perf-y += dso-data.o
+perf-y += attr.o
+perf-y += vmlinux-kallsyms.o
+perf-y += open-syscall.o
+perf-y += open-syscall-all-cpus.o
+perf-y += open-syscall-tp-fields.o
+perf-y += mmap-basic.o
+perf-y += perf-record.o
+perf-y += rdpmc.o
+perf-y += evsel-roundtrip-name.o
+perf-y += evsel-tp-sched.o
+perf-y += fdarray.o
+perf-y += pmu.o
+perf-y += hists_common.o
+perf-y += hists_link.o
+perf-y += hists_filter.o
+perf-y += hists_output.o
+perf-y += hists_cumulate.o
+perf-y += python-use.o
+perf-y += bp_signal.o
+perf-y += bp_signal_overflow.o
+perf-y += task-exit.o
+perf-y += sw-clock.o
+perf-y += mmap-thread-lookup.o
+perf-y += thread-mg-share.o
+perf-y += switch-tracking.o
+perf-y += keep-tracking.o
+perf-y += code-reading.o
+perf-y += sample-parsing.o
+perf-y += parse-no-sample-id-all.o
+perf-y += kmod-path.o
+
+perf-$(CONFIG_X86) += perf-time-to-tsc.o
+
+ifeq ($(ARCH),$(filter $(ARCH),x86 arm))
+perf-$(CONFIG_DWARF_UNWIND) += dwarf-unwind.o
+endif
+
+CFLAGS_attr.o         += -DBINDIR="BUILD_STR($(bindir_SQ))" -DPYTHON="BUILD_STR($(PYTHON_WORD))"
+CFLAGS_python-use.o   += -DPYTHONPATH="BUILD_STR($(OUTPUT)python)" -DPYTHON="BUILD_STR($(PYTHON_WORD))"
+CFLAGS_dwarf-unwind.o += -fno-optimize-sibling-calls
diff --git a/tools/perf/tests/attr/base-record b/tools/perf/tests/attr/base-record
index d3095dafed36..7e6d74946e04 100644
--- a/tools/perf/tests/attr/base-record
+++ b/tools/perf/tests/attr/base-record
@@ -5,7 +5,7 @@ group_fd=-1
 flags=0|8
 cpu=*
 type=0|1
-size=104
+size=112
 config=0
 sample_period=4000
 sample_type=263
diff --git a/tools/perf/tests/attr/base-stat b/tools/perf/tests/attr/base-stat
index 872ed7e24c7c..f4cf148f14cb 100644
--- a/tools/perf/tests/attr/base-stat
+++ b/tools/perf/tests/attr/base-stat
@@ -5,7 +5,7 @@ group_fd=-1
 flags=0|8
 cpu=*
 type=0
-size=104
+size=112
 config=0
 sample_period=0
 sample_type=0
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index 4b7d9ab0f049..4f4098167112 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -166,6 +166,10 @@ static struct test {
 		.desc = "Add fd to a fdarray, making it autogrow",
 		.func = test__fdarray__add,
 	},
+	{
+		.desc = "Test kmod_path__parse function",
+		.func = test__kmod_path__parse,
+	},
 	{
 		.func = NULL,
 	},
@@ -291,7 +295,7 @@ static int perf_test__list(int argc, const char **argv)
 
 int cmd_test(int argc, const char **argv, const char *prefix __maybe_unused)
 {
-	const char * const test_usage[] = {
+	const char *test_usage[] = {
 	"perf test [<options>] [{list <test-name-fragment>|[<test-name-fragments>|<test-numbers>]}]",
 	NULL,
 	};
@@ -302,13 +306,14 @@ int cmd_test(int argc, const char **argv, const char *prefix __maybe_unused)
 		    "be more verbose (show symbol address, etc)"),
 	OPT_END()
 	};
+	const char * const test_subcommands[] = { "list", NULL };
 	struct intlist *skiplist = NULL;
         int ret = hists__init();
 
         if (ret < 0)
                 return ret;
 
-	argc = parse_options(argc, argv, test_options, test_usage, 0);
+	argc = parse_options_subcommand(argc, argv, test_options, test_subcommands, test_usage, 0);
 	if (argc >= 1 && !strcmp(argv[0], "list"))
 		return perf_test__list(argc, argv);
 
diff --git a/tools/perf/tests/dso-data.c b/tools/perf/tests/dso-data.c
index caaf37f079b1..513e5febbe5a 100644
--- a/tools/perf/tests/dso-data.c
+++ b/tools/perf/tests/dso-data.c
@@ -112,6 +112,9 @@ int test__dso_data(void)
 
 	dso = dso__new((const char *)file);
 
+	TEST_ASSERT_VAL("Failed to access to dso",
+			dso__data_fd(dso, &machine) >= 0);
+
 	/* Basic 10 bytes tests. */
 	for (i = 0; i < ARRAY_SIZE(offsets); i++) {
 		struct test_data_offset *data = &offsets[i];
@@ -243,8 +246,8 @@ int test__dso_data_cache(void)
 	limit = nr * 4;
 	TEST_ASSERT_VAL("failed to set file limit", !set_fd_limit(limit));
 
-	/* and this is now our dso open FDs limit + 1 extra */
-	dso_cnt = limit / 2 + 1;
+	/* and this is now our dso open FDs limit */
+	dso_cnt = limit / 2;
 	TEST_ASSERT_VAL("failed to create dsos\n",
 		!dsos__create(dso_cnt, TEST_FILE_SIZE));
 
@@ -252,13 +255,13 @@ int test__dso_data_cache(void)
 		struct dso *dso = dsos[i];
 
 		/*
-		 * Open dsos via dso__data_fd or dso__data_read_offset.
-		 * Both opens the data file and keep it open.
+		 * Open dsos via dso__data_fd(), it opens the data
+		 * file and keep it open (unless open file limit).
 		 */
+		fd = dso__data_fd(dso, &machine);
+		TEST_ASSERT_VAL("failed to get fd", fd > 0);
+
 		if (i % 2) {
-			fd = dso__data_fd(dso, &machine);
-			TEST_ASSERT_VAL("failed to get fd", fd > 0);
-		} else {
 			#define BUFSIZE 10
 			u8 buf[BUFSIZE];
 			ssize_t n;
@@ -268,7 +271,10 @@ int test__dso_data_cache(void)
 		}
 	}
 
-	/* open +1 dso over the allowed limit */
+	/* verify the first one is already open */
+	TEST_ASSERT_VAL("dsos[0] is not open", dsos[0]->data.fd != -1);
+
+	/* open +1 dso to reach the allowed limit */
 	fd = dso__data_fd(dsos[i], &machine);
 	TEST_ASSERT_VAL("failed to get fd", fd > 0);
 
diff --git a/tools/perf/tests/kmod-path.c b/tools/perf/tests/kmod-path.c
new file mode 100644
index 000000000000..e8d7cbb9320c
--- /dev/null
+++ b/tools/perf/tests/kmod-path.c
@@ -0,0 +1,73 @@
+#include <stdbool.h>
+#include "tests.h"
+#include "dso.h"
+#include "debug.h"
+
+static int test(const char *path, bool alloc_name, bool alloc_ext,
+		bool kmod, bool comp, const char *name, const char *ext)
+{
+	struct kmod_path m;
+
+	memset(&m, 0x0, sizeof(m));
+
+	TEST_ASSERT_VAL("kmod_path__parse",
+			!__kmod_path__parse(&m, path, alloc_name, alloc_ext));
+
+	pr_debug("%s - alloc name %d, alloc ext %d, kmod %d, comp %d, name '%s', ext '%s'\n",
+		 path, alloc_name, alloc_ext, m.kmod, m.comp, m.name, m.ext);
+
+	TEST_ASSERT_VAL("wrong kmod", m.kmod == kmod);
+	TEST_ASSERT_VAL("wrong comp", m.comp == comp);
+
+	if (ext)
+		TEST_ASSERT_VAL("wrong ext", m.ext && !strcmp(ext, m.ext));
+	else
+		TEST_ASSERT_VAL("wrong ext", !m.ext);
+
+	if (name)
+		TEST_ASSERT_VAL("wrong name", m.name && !strcmp(name, m.name));
+	else
+		TEST_ASSERT_VAL("wrong name", !m.name);
+
+	free(m.name);
+	free(m.ext);
+	return 0;
+}
+
+#define T(path, an, ae, k, c, n, e) \
+	TEST_ASSERT_VAL("failed", !test(path, an, ae, k, c, n, e))
+
+int test__kmod_path__parse(void)
+{
+	/* path                alloc_name  alloc_ext   kmod  comp   name     ext */
+	T("/xxxx/xxxx/x-x.ko", true      , true      , true, false, "[x_x]", NULL);
+	T("/xxxx/xxxx/x-x.ko", false     , true      , true, false, NULL   , NULL);
+	T("/xxxx/xxxx/x-x.ko", true      , false     , true, false, "[x_x]", NULL);
+	T("/xxxx/xxxx/x-x.ko", false     , false     , true, false, NULL   , NULL);
+
+	/* path                alloc_name  alloc_ext   kmod  comp  name   ext */
+	T("/xxxx/xxxx/x.ko.gz", true     , true      , true, true, "[x]", "gz");
+	T("/xxxx/xxxx/x.ko.gz", false    , true      , true, true, NULL , "gz");
+	T("/xxxx/xxxx/x.ko.gz", true     , false     , true, true, "[x]", NULL);
+	T("/xxxx/xxxx/x.ko.gz", false    , false     , true, true, NULL , NULL);
+
+	/* path              alloc_name  alloc_ext  kmod   comp  name    ext */
+	T("/xxxx/xxxx/x.gz", true      , true     , false, true, "x.gz" ,"gz");
+	T("/xxxx/xxxx/x.gz", false     , true     , false, true, NULL   ,"gz");
+	T("/xxxx/xxxx/x.gz", true      , false    , false, true, "x.gz" , NULL);
+	T("/xxxx/xxxx/x.gz", false     , false    , false, true, NULL   , NULL);
+
+	/* path   alloc_name  alloc_ext  kmod   comp  name     ext */
+	T("x.gz", true      , true     , false, true, "x.gz", "gz");
+	T("x.gz", false     , true     , false, true, NULL  , "gz");
+	T("x.gz", true      , false    , false, true, "x.gz", NULL);
+	T("x.gz", false     , false    , false, true, NULL  , NULL);
+
+	/* path      alloc_name  alloc_ext  kmod  comp  name  ext */
+	T("x.ko.gz", true      , true     , true, true, "[x]", "gz");
+	T("x.ko.gz", false     , true     , true, true, NULL , "gz");
+	T("x.ko.gz", true      , false    , true, true, "[x]", NULL);
+	T("x.ko.gz", false     , false    , true, true, NULL , NULL);
+
+	return 0;
+}
diff --git a/tools/perf/tests/make b/tools/perf/tests/make
index 75709d2b17b4..bff85324f799 100644
--- a/tools/perf/tests/make
+++ b/tools/perf/tests/make
@@ -5,7 +5,7 @@ include config/Makefile.arch
 
 # FIXME looks like x86 is the only arch running tests ;-)
 # we need some IS_(32/64) flag to make this generic
-ifeq ($(IS_X86_64),1)
+ifeq ($(ARCH)$(IS_64_BIT), x861)
 lib = lib64
 else
 lib = lib
diff --git a/tools/perf/tests/open-syscall-all-cpus.c b/tools/perf/tests/open-syscall-all-cpus.c
index 8fa82d1700c7..3ec885c48f8f 100644
--- a/tools/perf/tests/open-syscall-all-cpus.c
+++ b/tools/perf/tests/open-syscall-all-cpus.c
@@ -29,7 +29,12 @@ int test__open_syscall_event_on_all_cpus(void)
 
 	evsel = perf_evsel__newtp("syscalls", "sys_enter_open");
 	if (evsel == NULL) {
-		pr_debug("is debugfs mounted on /sys/kernel/debug?\n");
+		if (tracefs_configured())
+			pr_debug("is tracefs mounted on /sys/kernel/tracing?\n");
+		else if (debugfs_configured())
+			pr_debug("is debugfs mounted on /sys/kernel/debug?\n");
+		else
+			pr_debug("Neither tracefs or debugfs is enabled in this kernel\n");
 		goto out_thread_map_delete;
 	}
 
diff --git a/tools/perf/tests/open-syscall.c b/tools/perf/tests/open-syscall.c
index a33b2daae40f..07aa319bf334 100644
--- a/tools/perf/tests/open-syscall.c
+++ b/tools/perf/tests/open-syscall.c
@@ -18,7 +18,12 @@ int test__open_syscall_event(void)
 
 	evsel = perf_evsel__newtp("syscalls", "sys_enter_open");
 	if (evsel == NULL) {
-		pr_debug("is debugfs mounted on /sys/kernel/debug?\n");
+		if (tracefs_configured())
+			pr_debug("is tracefs mounted on /sys/kernel/tracing?\n");
+		else if (debugfs_configured())
+			pr_debug("is debugfs mounted on /sys/kernel/debug?\n");
+		else
+			pr_debug("Neither tracefs or debugfs is enabled in this kernel\n");
 		goto out_thread_map_delete;
 	}
 
diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c
index 1cdab0ce00e2..3de744961739 100644
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -3,6 +3,7 @@
 #include "evsel.h"
 #include "evlist.h"
 #include <api/fs/fs.h>
+#include <api/fs/tracefs.h>
 #include <api/fs/debugfs.h>
 #include "tests.h"
 #include "debug.h"
@@ -294,6 +295,36 @@ static int test__checkevent_genhw_modifier(struct perf_evlist *evlist)
 	return test__checkevent_genhw(evlist);
 }
 
+static int test__checkevent_exclude_idle_modifier(struct perf_evlist *evlist)
+{
+	struct perf_evsel *evsel = perf_evlist__first(evlist);
+
+	TEST_ASSERT_VAL("wrong exclude idle", evsel->attr.exclude_idle);
+	TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+	TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
+	TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
+	TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
+	TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv);
+	TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+
+	return test__checkevent_symbolic_name(evlist);
+}
+
+static int test__checkevent_exclude_idle_modifier_1(struct perf_evlist *evlist)
+{
+	struct perf_evsel *evsel = perf_evlist__first(evlist);
+
+	TEST_ASSERT_VAL("wrong exclude idle", evsel->attr.exclude_idle);
+	TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+	TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host);
+	TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
+	TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
+	TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
+	TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+
+	return test__checkevent_symbolic_name(evlist);
+}
+
 static int test__checkevent_breakpoint_modifier(struct perf_evlist *evlist)
 {
 	struct perf_evsel *evsel = perf_evlist__first(evlist);
@@ -1192,11 +1223,19 @@ static int count_tracepoints(void)
 {
 	char events_path[PATH_MAX];
 	struct dirent *events_ent;
+	const char *mountpoint;
 	DIR *events_dir;
 	int cnt = 0;
 
-	scnprintf(events_path, PATH_MAX, "%s/tracing/events",
-		  debugfs_find_mountpoint());
+	mountpoint = tracefs_find_mountpoint();
+	if (mountpoint) {
+		scnprintf(events_path, PATH_MAX, "%s/events",
+			  mountpoint);
+	} else {
+		mountpoint = debugfs_find_mountpoint();
+		scnprintf(events_path, PATH_MAX, "%s/tracing/events",
+			  mountpoint);
+	}
 
 	events_dir = opendir(events_path);
 
@@ -1485,6 +1524,16 @@ static struct evlist_test test__events[] = {
 		.id    = 100,
 	},
 #endif
+	{
+		.name  = "instructions:I",
+		.check = test__checkevent_exclude_idle_modifier,
+		.id    = 45,
+	},
+	{
+		.name  = "instructions:kIG",
+		.check = test__checkevent_exclude_idle_modifier_1,
+		.id    = 46,
+	},
 };
 
 static struct evlist_test test__events_pmu[] = {
diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h
index 00e776a87a9c..52758a33f64c 100644
--- a/tools/perf/tests/tests.h
+++ b/tools/perf/tests/tests.h
@@ -51,6 +51,7 @@ int test__hists_cumulate(void);
 int test__switch_tracking(void);
 int test__fdarray__filter(void);
 int test__fdarray__add(void);
+int test__kmod_path__parse(void);
 
 #if defined(__x86_64__) || defined(__i386__) || defined(__arm__)
 #ifdef HAVE_DWARF_UNWIND_SUPPORT
diff --git a/tools/perf/ui/Build b/tools/perf/ui/Build
new file mode 100644
index 000000000000..0a73538c0441
--- /dev/null
+++ b/tools/perf/ui/Build
@@ -0,0 +1,14 @@
+libperf-y += setup.o
+libperf-y += helpline.o
+libperf-y += progress.o
+libperf-y += util.o
+libperf-y += hist.o
+libperf-y += stdio/hist.o
+
+CFLAGS_setup.o += -DLIBDIR="BUILD_STR($(LIBDIR))"
+
+libperf-$(CONFIG_SLANG) += browser.o
+libperf-$(CONFIG_SLANG) += browsers/
+libperf-$(CONFIG_SLANG) += tui/
+
+CFLAGS_browser.o += -DENABLE_SLFUTURE_CONST
diff --git a/tools/perf/ui/browsers/Build b/tools/perf/ui/browsers/Build
new file mode 100644
index 000000000000..de223f5bed58
--- /dev/null
+++ b/tools/perf/ui/browsers/Build
@@ -0,0 +1,10 @@
+libperf-y += annotate.o
+libperf-y += hists.o
+libperf-y += map.o
+libperf-y += scripts.o
+libperf-y += header.o
+
+CFLAGS_annotate.o += -DENABLE_SLFUTURE_CONST
+CFLAGS_hists.o    += -DENABLE_SLFUTURE_CONST
+CFLAGS_map.o      += -DENABLE_SLFUTURE_CONST
+CFLAGS_scripts.o  += -DENABLE_SLFUTURE_CONST
diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c
index 9d32e3c0cfee..e5250eb2dd57 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -829,10 +829,16 @@ out:
 	return key;
 }
 
+int map_symbol__tui_annotate(struct map_symbol *ms, struct perf_evsel *evsel,
+			     struct hist_browser_timer *hbt)
+{
+	return symbol__tui_annotate(ms->sym, ms->map, evsel, hbt);
+}
+
 int hist_entry__tui_annotate(struct hist_entry *he, struct perf_evsel *evsel,
 			     struct hist_browser_timer *hbt)
 {
-	return symbol__tui_annotate(he->ms.sym, he->ms.map, evsel, hbt);
+	return map_symbol__tui_annotate(&he->ms, evsel, hbt);
 }
 
 static void annotate_browser__mark_jump_targets(struct annotate_browser *browser,
diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c
index 788506eef567..995b7a8596b1 100644
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -48,6 +48,24 @@ static bool hist_browser__has_filter(struct hist_browser *hb)
 	return hists__has_filter(hb->hists) || hb->min_pcnt;
 }
 
+static int hist_browser__get_folding(struct hist_browser *browser)
+{
+	struct rb_node *nd;
+	struct hists *hists = browser->hists;
+	int unfolded_rows = 0;
+
+	for (nd = rb_first(&hists->entries);
+	     (nd = hists__filter_entries(nd, browser->min_pcnt)) != NULL;
+	     nd = rb_next(nd)) {
+		struct hist_entry *he =
+			rb_entry(nd, struct hist_entry, rb_node);
+
+		if (he->ms.unfolded)
+			unfolded_rows += he->nr_rows;
+	}
+	return unfolded_rows;
+}
+
 static u32 hist_browser__nr_entries(struct hist_browser *hb)
 {
 	u32 nr_entries;
@@ -57,6 +75,7 @@ static u32 hist_browser__nr_entries(struct hist_browser *hb)
 	else
 		nr_entries = hb->hists->nr_entries;
 
+	hb->nr_callchain_rows = hist_browser__get_folding(hb);
 	return nr_entries + hb->nr_callchain_rows;
 }
 
@@ -492,6 +511,7 @@ static void hist_browser__show_callchain_entry(struct hist_browser *browser,
 {
 	int color, width;
 	char folded_sign = callchain_list__folded(chain);
+	bool show_annotated = browser->show_dso && chain->ms.sym && symbol__annotation(chain->ms.sym)->src;
 
 	color = HE_COLORSET_NORMAL;
 	width = browser->b.width - (offset + 2);
@@ -504,7 +524,8 @@ static void hist_browser__show_callchain_entry(struct hist_browser *browser,
 	ui_browser__set_color(&browser->b, color);
 	hist_browser__gotorc(browser, row, 0);
 	slsmg_write_nstring(" ", offset);
-	slsmg_printf("%c ", folded_sign);
+	slsmg_printf("%c", folded_sign);
+	ui_browser__write_graph(&browser->b, show_annotated ? SLSMG_RARROW_CHAR : ' ');
 	slsmg_write_nstring(str, width);
 }
 
@@ -1467,7 +1488,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 		perf_hpp__set_user_width(symbol_conf.col_width_list_str);
 
 	while (1) {
-		const struct thread *thread = NULL;
+		struct thread *thread = NULL;
 		const struct dso *dso = NULL;
 		int choice = 0,
 		    annotate = -2, zoom_dso = -2, zoom_thread = -2,
@@ -1593,28 +1614,30 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 		if (!sort__has_sym)
 			goto add_exit_option;
 
+		if (browser->selection == NULL)
+			goto skip_annotation;
+
 		if (sort__mode == SORT_MODE__BRANCH) {
 			bi = browser->he_selection->branch_info;
-			if (browser->selection != NULL &&
-			    bi &&
-			    bi->from.sym != NULL &&
-			    !bi->from.map->dso->annotate_warned &&
-				asprintf(&options[nr_options], "Annotate %s",
-					 bi->from.sym->name) > 0)
-				annotate_f = nr_options++;
 
-			if (browser->selection != NULL &&
-			    bi &&
-			    bi->to.sym != NULL &&
+			if (bi == NULL)
+				goto skip_annotation;
+
+			if (bi->from.sym != NULL &&
+			    !bi->from.map->dso->annotate_warned &&
+			    asprintf(&options[nr_options], "Annotate %s", bi->from.sym->name) > 0) {
+				annotate_f = nr_options++;
+			}
+
+			if (bi->to.sym != NULL &&
 			    !bi->to.map->dso->annotate_warned &&
 			    (bi->to.sym != bi->from.sym ||
 			     bi->to.map->dso != bi->from.map->dso) &&
-				asprintf(&options[nr_options], "Annotate %s",
-					 bi->to.sym->name) > 0)
+			    asprintf(&options[nr_options], "Annotate %s", bi->to.sym->name) > 0) {
 				annotate_t = nr_options++;
+			}
 		} else {
-			if (browser->selection != NULL &&
-			    browser->selection->sym != NULL &&
+			if (browser->selection->sym != NULL &&
 			    !browser->selection->map->dso->annotate_warned) {
 				struct annotation *notes;
 
@@ -1622,11 +1645,12 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 
 				if (notes->src &&
 				    asprintf(&options[nr_options], "Annotate %s",
-						 browser->selection->sym->name) > 0)
+						 browser->selection->sym->name) > 0) {
 					annotate = nr_options++;
+				}
 			}
 		}
-
+skip_annotation:
 		if (thread != NULL &&
 		    asprintf(&options[nr_options], "Zoom %s %s(%d) thread",
 			     (browser->hists->thread_filter ? "out of" : "into"),
@@ -1682,6 +1706,7 @@ retry_popup_menu:
 		if (choice == annotate || choice == annotate_t || choice == annotate_f) {
 			struct hist_entry *he;
 			struct annotation *notes;
+			struct map_symbol ms;
 			int err;
 do_annotate:
 			if (!objdump_path && perf_session_env__lookup_objdump(env))
@@ -1691,30 +1716,21 @@ do_annotate:
 			if (he == NULL)
 				continue;
 
-			/*
-			 * we stash the branch_info symbol + map into the
-			 * the ms so we don't have to rewrite all the annotation
-			 * code to use branch_info.
-			 * in branch mode, the ms struct is not used
-			 */
 			if (choice == annotate_f) {
-				he->ms.sym = he->branch_info->from.sym;
-				he->ms.map = he->branch_info->from.map;
-			}  else if (choice == annotate_t) {
-				he->ms.sym = he->branch_info->to.sym;
-				he->ms.map = he->branch_info->to.map;
+				ms.map = he->branch_info->from.map;
+				ms.sym = he->branch_info->from.sym;
+			} else if (choice == annotate_t) {
+				ms.map = he->branch_info->to.map;
+				ms.sym = he->branch_info->to.sym;
+			} else {
+				ms = *browser->selection;
 			}
 
-			notes = symbol__annotation(he->ms.sym);
+			notes = symbol__annotation(ms.sym);
 			if (!notes->src)
 				continue;
 
-			/*
-			 * Don't let this be freed, say, by hists__decay_entry.
-			 */
-			he->used = true;
-			err = hist_entry__tui_annotate(he, evsel, hbt);
-			he->used = false;
+			err = map_symbol__tui_annotate(&ms, evsel, hbt);
 			/*
 			 * offer option to annotate the other branch source or target
 			 * (if they exists) when returning from annotate
@@ -1754,13 +1770,13 @@ zoom_thread:
 				pstack__remove(fstack, &browser->hists->thread_filter);
 zoom_out_thread:
 				ui_helpline__pop();
-				browser->hists->thread_filter = NULL;
+				thread__zput(browser->hists->thread_filter);
 				perf_hpp__set_elide(HISTC_THREAD, false);
 			} else {
 				ui_helpline__fpush("To zoom out press <- or -> + \"Zoom out of %s(%d) thread\"",
 						   thread->comm_set ? thread__comm_str(thread) : "",
 						   thread->tid);
-				browser->hists->thread_filter = thread;
+				browser->hists->thread_filter = thread__get(thread);
 				perf_hpp__set_elide(HISTC_THREAD, false);
 				pstack__push(fstack, &browser->hists->thread_filter);
 			}
diff --git a/tools/perf/ui/gtk/Build b/tools/perf/ui/gtk/Build
new file mode 100644
index 000000000000..ec22e899a224
--- /dev/null
+++ b/tools/perf/ui/gtk/Build
@@ -0,0 +1,9 @@
+CFLAGS_gtk += -fPIC $(GTK_CFLAGS)
+
+gtk-y += browser.o
+gtk-y += hists.o
+gtk-y += setup.o
+gtk-y += util.o
+gtk-y += helpline.o
+gtk-y += progress.o
+gtk-y += annotate.o
diff --git a/tools/perf/ui/tui/Build b/tools/perf/ui/tui/Build
new file mode 100644
index 000000000000..9e4c6ca41a9f
--- /dev/null
+++ b/tools/perf/ui/tui/Build
@@ -0,0 +1,4 @@
+libperf-y += setup.o
+libperf-y += util.o
+libperf-y += helpline.o
+libperf-y += progress.o
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
new file mode 100644
index 000000000000..797490a40075
--- /dev/null
+++ b/tools/perf/util/Build
@@ -0,0 +1,145 @@
+libperf-y += abspath.o
+libperf-y += alias.o
+libperf-y += annotate.o
+libperf-y += build-id.o
+libperf-y += config.o
+libperf-y += ctype.o
+libperf-y += db-export.o
+libperf-y += environment.o
+libperf-y += event.o
+libperf-y += evlist.o
+libperf-y += evsel.o
+libperf-y += exec_cmd.o
+libperf-y += find_next_bit.o
+libperf-y += help.o
+libperf-y += kallsyms.o
+libperf-y += levenshtein.o
+libperf-y += parse-options.o
+libperf-y += parse-events.o
+libperf-y += path.o
+libperf-y += rbtree.o
+libperf-y += bitmap.o
+libperf-y += hweight.o
+libperf-y += run-command.o
+libperf-y += quote.o
+libperf-y += strbuf.o
+libperf-y += string.o
+libperf-y += strlist.o
+libperf-y += strfilter.o
+libperf-y += top.o
+libperf-y += usage.o
+libperf-y += wrapper.o
+libperf-y += sigchain.o
+libperf-y += dso.o
+libperf-y += symbol.o
+libperf-y += color.o
+libperf-y += pager.o
+libperf-y += header.o
+libperf-y += callchain.o
+libperf-y += values.o
+libperf-y += debug.o
+libperf-y += machine.o
+libperf-y += map.o
+libperf-y += pstack.o
+libperf-y += session.o
+libperf-y += ordered-events.o
+libperf-y += comm.o
+libperf-y += thread.o
+libperf-y += thread_map.o
+libperf-y += trace-event-parse.o
+libperf-y += parse-events-flex.o
+libperf-y += parse-events-bison.o
+libperf-y += pmu.o
+libperf-y += pmu-flex.o
+libperf-y += pmu-bison.o
+libperf-y += trace-event-read.o
+libperf-y += trace-event-info.o
+libperf-y += trace-event-scripting.o
+libperf-y += trace-event.o
+libperf-y += svghelper.o
+libperf-y += sort.o
+libperf-y += hist.o
+libperf-y += util.o
+libperf-y += xyarray.o
+libperf-y += cpumap.o
+libperf-y += cgroup.o
+libperf-y += target.o
+libperf-y += rblist.o
+libperf-y += intlist.o
+libperf-y += vdso.o
+libperf-y += stat.o
+libperf-y += record.o
+libperf-y += srcline.o
+libperf-y += data.o
+libperf-$(CONFIG_X86) += tsc.o
+libperf-y += cloexec.o
+libperf-y += thread-stack.o
+
+libperf-$(CONFIG_LIBELF) += symbol-elf.o
+libperf-$(CONFIG_LIBELF) += probe-event.o
+
+ifndef CONFIG_LIBELF
+libperf-y += symbol-minimal.o
+endif
+
+libperf-$(CONFIG_DWARF) += probe-finder.o
+libperf-$(CONFIG_DWARF) += dwarf-aux.o
+
+libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
+libperf-$(CONFIG_LIBUNWIND)          += unwind-libunwind.o
+
+libperf-$(CONFIG_LIBBABELTRACE) += data-convert-bt.o
+
+libperf-y += scripting-engines/
+
+libperf-$(CONFIG_PERF_REGS) += perf_regs.o
+libperf-$(CONFIG_ZLIB) += zlib.o
+libperf-$(CONFIG_LZMA) += lzma.o
+
+CFLAGS_config.o   += -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))"
+CFLAGS_exec_cmd.o += -DPERF_EXEC_PATH="BUILD_STR($(perfexecdir_SQ))" -DPREFIX="BUILD_STR($(prefix_SQ))"
+
+$(OUTPUT)util/parse-events-flex.c: util/parse-events.l $(OUTPUT)util/parse-events-bison.c
+	$(call rule_mkdir)
+	@$(call echo-cmd,flex)$(FLEX) -o $@ --header-file=$(OUTPUT)util/parse-events-flex.h $(PARSER_DEBUG_FLEX) util/parse-events.l
+
+$(OUTPUT)util/parse-events-bison.c: util/parse-events.y
+	$(call rule_mkdir)
+	@$(call echo-cmd,bison)$(BISON) -v util/parse-events.y -d $(PARSER_DEBUG_BISON) -o $@ -p parse_events_
+
+$(OUTPUT)util/pmu-flex.c: util/pmu.l $(OUTPUT)util/pmu-bison.c
+	$(call rule_mkdir)
+	@$(call echo-cmd,flex)$(FLEX) -o $@ --header-file=$(OUTPUT)util/pmu-flex.h util/pmu.l
+
+$(OUTPUT)util/pmu-bison.c: util/pmu.y
+	$(call rule_mkdir)
+	@$(call echo-cmd,bison)$(BISON) -v util/pmu.y -d -o $@ -p perf_pmu_
+
+CFLAGS_parse-events-flex.o  += -w
+CFLAGS_pmu-flex.o           += -w
+CFLAGS_parse-events-bison.o += -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w
+CFLAGS_pmu-bison.o          += -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w
+
+$(OUTPUT)util/parse-events.o: $(OUTPUT)util/parse-events-flex.c $(OUTPUT)util/parse-events-bison.c
+$(OUTPUT)util/pmu.o: $(OUTPUT)util/pmu-flex.c $(OUTPUT)util/pmu-bison.c
+
+CFLAGS_find_next_bit.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))"
+CFLAGS_rbtree.o        += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))"
+CFLAGS_hweight.o       += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))"
+CFLAGS_parse-events.o  += -Wno-redundant-decls
+
+$(OUTPUT)util/kallsyms.o: ../lib/symbol/kallsyms.c FORCE
+	$(call rule_mkdir)
+	$(call if_changed_dep,cc_o_c)
+
+$(OUTPUT)util/find_next_bit.o: ../lib/util/find_next_bit.c FORCE
+	$(call rule_mkdir)
+	$(call if_changed_dep,cc_o_c)
+
+$(OUTPUT)util/rbtree.o: ../../lib/rbtree.c FORCE
+	$(call rule_mkdir)
+	$(call if_changed_dep,cc_o_c)
+
+$(OUTPUT)util/hweight.o: ../../lib/hweight.c FORCE
+	$(call rule_mkdir)
+	$(call if_changed_dep,cc_o_c)
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 9d9db3b296dd..7f5bdfc9bc87 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -1010,6 +1010,32 @@ fallback:
 			}
 			filename = symfs_filename;
 		}
+	} else if (dso__needs_decompress(dso)) {
+		char tmp[PATH_MAX];
+		struct kmod_path m;
+		int fd;
+		bool ret;
+
+		if (kmod_path__parse_ext(&m, symfs_filename))
+			goto out_free_filename;
+
+		snprintf(tmp, PATH_MAX, "/tmp/perf-kmod-XXXXXX");
+
+		fd = mkstemp(tmp);
+		if (fd < 0) {
+			free(m.ext);
+			goto out_free_filename;
+		}
+
+		ret = decompress_to_file(m.ext, symfs_filename, fd);
+
+		free(m.ext);
+		close(fd);
+
+		if (!ret)
+			goto out_free_filename;
+
+		strcpy(symfs_filename, tmp);
 	}
 
 	snprintf(command, sizeof(command),
@@ -1029,7 +1055,7 @@ fallback:
 
 	file = popen(command, "r");
 	if (!file)
-		goto out_free_filename;
+		goto out_remove_tmp;
 
 	while (!feof(file))
 		if (symbol__parse_objdump_line(sym, map, file, privsize,
@@ -1044,6 +1070,10 @@ fallback:
 		delete_last_nop(sym);
 
 	pclose(file);
+
+out_remove_tmp:
+	if (dso__needs_decompress(dso))
+		unlink(symfs_filename);
 out_free_filename:
 	if (delete_extract)
 		kcore_extract__delete(&kce);
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c
index 0c72680a977f..61867dff5d5a 100644
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -59,11 +59,8 @@ static int perf_event__exit_del_thread(struct perf_tool *tool __maybe_unused,
 	dump_printf("(%d:%d):(%d:%d)\n", event->fork.pid, event->fork.tid,
 		    event->fork.ppid, event->fork.ptid);
 
-	if (thread) {
-		rb_erase(&thread->rb_node, &machine->threads);
-		machine->last_match = NULL;
-		thread__delete(thread);
-	}
+	if (thread)
+		machine__remove_thread(machine, thread);
 
 	return 0;
 }
@@ -93,6 +90,35 @@ int build_id__sprintf(const u8 *build_id, int len, char *bf)
 	return raw - build_id;
 }
 
+/* asnprintf consolidates asprintf and snprintf */
+static int asnprintf(char **strp, size_t size, const char *fmt, ...)
+{
+	va_list ap;
+	int ret;
+
+	if (!strp)
+		return -EINVAL;
+
+	va_start(ap, fmt);
+	if (*strp)
+		ret = vsnprintf(*strp, size, fmt, ap);
+	else
+		ret = vasprintf(strp, fmt, ap);
+	va_end(ap);
+
+	return ret;
+}
+
+static char *build_id__filename(const char *sbuild_id, char *bf, size_t size)
+{
+	char *tmp = bf;
+	int ret = asnprintf(&bf, size, "%s/.build-id/%.2s/%s", buildid_dir,
+			    sbuild_id, sbuild_id + 2);
+	if (ret < 0 || (tmp && size < (unsigned int)ret))
+		return NULL;
+	return bf;
+}
+
 char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size)
 {
 	char build_id_hex[BUILD_ID_SIZE * 2 + 1];
@@ -101,14 +127,7 @@ char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size)
 		return NULL;
 
 	build_id__sprintf(dso->build_id, sizeof(dso->build_id), build_id_hex);
-	if (bf == NULL) {
-		if (asprintf(&bf, "%s/.build-id/%.2s/%s", buildid_dir,
-			     build_id_hex, build_id_hex + 2) < 0)
-			return NULL;
-	} else
-		snprintf(bf, size, "%s/.build-id/%.2s/%s", buildid_dir,
-			 build_id_hex, build_id_hex + 2);
-	return bf;
+	return build_id__filename(build_id_hex, bf, size);
 }
 
 #define dsos__for_each_with_build_id(pos, head)	\
@@ -259,52 +278,113 @@ void disable_buildid_cache(void)
 	no_buildid_cache = true;
 }
 
-int build_id_cache__add_s(const char *sbuild_id, const char *debugdir,
-			  const char *name, bool is_kallsyms, bool is_vdso)
+static char *build_id_cache__dirname_from_path(const char *name,
+					       bool is_kallsyms, bool is_vdso)
 {
-	const size_t size = PATH_MAX;
-	char *realname, *filename = zalloc(size),
-	     *linkname = zalloc(size), *targetname;
-	int len, err = -1;
+	char *realname = (char *)name, *filename;
 	bool slash = is_kallsyms || is_vdso;
 
-	if (is_kallsyms) {
-		if (symbol_conf.kptr_restrict) {
-			pr_debug("Not caching a kptr_restrict'ed /proc/kallsyms\n");
-			err = 0;
-			goto out_free;
-		}
-		realname = (char *) name;
-	} else
+	if (!slash) {
 		realname = realpath(name, NULL);
+		if (!realname)
+			return NULL;
+	}
 
-	if (realname == NULL || filename == NULL || linkname == NULL)
+	if (asprintf(&filename, "%s%s%s", buildid_dir, slash ? "/" : "",
+		     is_vdso ? DSO__NAME_VDSO : realname) < 0)
+		filename = NULL;
+
+	if (!slash)
+		free(realname);
+
+	return filename;
+}
+
+int build_id_cache__list_build_ids(const char *pathname,
+				   struct strlist **result)
+{
+	struct strlist *list;
+	char *dir_name;
+	DIR *dir;
+	struct dirent *d;
+	int ret = 0;
+
+	list = strlist__new(true, NULL);
+	dir_name = build_id_cache__dirname_from_path(pathname, false, false);
+	if (!list || !dir_name) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* List up all dirents */
+	dir = opendir(dir_name);
+	if (!dir) {
+		ret = -errno;
+		goto out;
+	}
+
+	while ((d = readdir(dir)) != NULL) {
+		if (!strcmp(d->d_name, ".") || !strcmp(d->d_name, ".."))
+			continue;
+		strlist__add(list, d->d_name);
+	}
+	closedir(dir);
+
+out:
+	free(dir_name);
+	if (ret)
+		strlist__delete(list);
+	else
+		*result = list;
+
+	return ret;
+}
+
+int build_id_cache__add_s(const char *sbuild_id, const char *name,
+			  bool is_kallsyms, bool is_vdso)
+{
+	const size_t size = PATH_MAX;
+	char *realname = NULL, *filename = NULL, *dir_name = NULL,
+	     *linkname = zalloc(size), *targetname, *tmp;
+	int err = -1;
+
+	if (!is_kallsyms) {
+		realname = realpath(name, NULL);
+		if (!realname)
+			goto out_free;
+	}
+
+	dir_name = build_id_cache__dirname_from_path(name, is_kallsyms, is_vdso);
+	if (!dir_name)
 		goto out_free;
 
-	len = scnprintf(filename, size, "%s%s%s",
-		       debugdir, slash ? "/" : "",
-		       is_vdso ? DSO__NAME_VDSO : realname);
-	if (mkdir_p(filename, 0755))
+	if (mkdir_p(dir_name, 0755))
 		goto out_free;
 
-	snprintf(filename + len, size - len, "/%s", sbuild_id);
+	if (asprintf(&filename, "%s/%s", dir_name, sbuild_id) < 0) {
+		filename = NULL;
+		goto out_free;
+	}
 
 	if (access(filename, F_OK)) {
 		if (is_kallsyms) {
 			 if (copyfile("/proc/kallsyms", filename))
 				goto out_free;
-		} else if (link(realname, filename) && copyfile(name, filename))
+		} else if (link(realname, filename) && errno != EEXIST &&
+				copyfile(name, filename))
 			goto out_free;
 	}
 
-	len = scnprintf(linkname, size, "%s/.build-id/%.2s",
-		       debugdir, sbuild_id);
+	if (!build_id__filename(sbuild_id, linkname, size))
+		goto out_free;
+	tmp = strrchr(linkname, '/');
+	*tmp = '\0';
 
 	if (access(linkname, X_OK) && mkdir_p(linkname, 0755))
 		goto out_free;
 
-	snprintf(linkname + len, size - len, "/%s", sbuild_id + 2);
-	targetname = filename + strlen(debugdir) - 5;
+	*tmp = '/';
+	targetname = filename + strlen(buildid_dir) - 5;
 	memcpy(targetname, "../..", 5);
 
 	if (symlink(targetname, linkname) == 0)
@@ -313,34 +393,46 @@ out_free:
 	if (!is_kallsyms)
 		free(realname);
 	free(filename);
+	free(dir_name);
 	free(linkname);
 	return err;
 }
 
 static int build_id_cache__add_b(const u8 *build_id, size_t build_id_size,
-				 const char *name, const char *debugdir,
-				 bool is_kallsyms, bool is_vdso)
+				 const char *name, bool is_kallsyms,
+				 bool is_vdso)
 {
 	char sbuild_id[BUILD_ID_SIZE * 2 + 1];
 
 	build_id__sprintf(build_id, build_id_size, sbuild_id);
 
-	return build_id_cache__add_s(sbuild_id, debugdir, name,
-				     is_kallsyms, is_vdso);
+	return build_id_cache__add_s(sbuild_id, name, is_kallsyms, is_vdso);
 }
 
-int build_id_cache__remove_s(const char *sbuild_id, const char *debugdir)
+bool build_id_cache__cached(const char *sbuild_id)
+{
+	bool ret = false;
+	char *filename = build_id__filename(sbuild_id, NULL, 0);
+
+	if (filename && !access(filename, F_OK))
+		ret = true;
+	free(filename);
+
+	return ret;
+}
+
+int build_id_cache__remove_s(const char *sbuild_id)
 {
 	const size_t size = PATH_MAX;
 	char *filename = zalloc(size),
-	     *linkname = zalloc(size);
+	     *linkname = zalloc(size), *tmp;
 	int err = -1;
 
 	if (filename == NULL || linkname == NULL)
 		goto out_free;
 
-	snprintf(linkname, size, "%s/.build-id/%.2s/%s",
-		 debugdir, sbuild_id, sbuild_id + 2);
+	if (!build_id__filename(sbuild_id, linkname, size))
+		goto out_free;
 
 	if (access(linkname, F_OK))
 		goto out_free;
@@ -354,8 +446,8 @@ int build_id_cache__remove_s(const char *sbuild_id, const char *debugdir)
 	/*
 	 * Since the link is relative, we must make it absolute:
 	 */
-	snprintf(linkname, size, "%s/.build-id/%.2s/%s",
-		 debugdir, sbuild_id, filename);
+	tmp = strrchr(linkname, '/') + 1;
+	snprintf(tmp, size - (tmp - linkname), "%s", filename);
 
 	if (unlink(linkname))
 		goto out_free;
@@ -367,8 +459,7 @@ out_free:
 	return err;
 }
 
-static int dso__cache_build_id(struct dso *dso, struct machine *machine,
-			       const char *debugdir)
+static int dso__cache_build_id(struct dso *dso, struct machine *machine)
 {
 	bool is_kallsyms = dso->kernel && dso->long_name[0] != '/';
 	bool is_vdso = dso__is_vdso(dso);
@@ -381,28 +472,26 @@ static int dso__cache_build_id(struct dso *dso, struct machine *machine,
 		name = nm;
 	}
 	return build_id_cache__add_b(dso->build_id, sizeof(dso->build_id), name,
-				     debugdir, is_kallsyms, is_vdso);
+				     is_kallsyms, is_vdso);
 }
 
 static int __dsos__cache_build_ids(struct list_head *head,
-				   struct machine *machine, const char *debugdir)
+				   struct machine *machine)
 {
 	struct dso *pos;
 	int err = 0;
 
 	dsos__for_each_with_build_id(pos, head)
-		if (dso__cache_build_id(pos, machine, debugdir))
+		if (dso__cache_build_id(pos, machine))
 			err = -1;
 
 	return err;
 }
 
-static int machine__cache_build_ids(struct machine *machine, const char *debugdir)
+static int machine__cache_build_ids(struct machine *machine)
 {
-	int ret = __dsos__cache_build_ids(&machine->kernel_dsos.head, machine,
-					  debugdir);
-	ret |= __dsos__cache_build_ids(&machine->user_dsos.head, machine,
-				       debugdir);
+	int ret = __dsos__cache_build_ids(&machine->kernel_dsos.head, machine);
+	ret |= __dsos__cache_build_ids(&machine->user_dsos.head, machine);
 	return ret;
 }
 
@@ -417,11 +506,11 @@ int perf_session__cache_build_ids(struct perf_session *session)
 	if (mkdir(buildid_dir, 0755) != 0 && errno != EEXIST)
 		return -1;
 
-	ret = machine__cache_build_ids(&session->machines.host, buildid_dir);
+	ret = machine__cache_build_ids(&session->machines.host);
 
 	for (nd = rb_first(&session->machines.guests); nd; nd = rb_next(nd)) {
 		struct machine *pos = rb_entry(nd, struct machine, rb_node);
-		ret |= machine__cache_build_ids(pos, buildid_dir);
+		ret |= machine__cache_build_ids(pos);
 	}
 	return ret ? -1 : 0;
 }
diff --git a/tools/perf/util/build-id.h b/tools/perf/util/build-id.h
index 8236319514d5..85011222cc14 100644
--- a/tools/perf/util/build-id.h
+++ b/tools/perf/util/build-id.h
@@ -4,6 +4,7 @@
 #define BUILD_ID_SIZE 20
 
 #include "tool.h"
+#include "strlist.h"
 #include <linux/types.h>
 
 extern struct perf_tool build_id__mark_dso_hit_ops;
@@ -22,9 +23,12 @@ bool perf_session__read_build_ids(struct perf_session *session, bool with_hits);
 int perf_session__write_buildid_table(struct perf_session *session, int fd);
 int perf_session__cache_build_ids(struct perf_session *session);
 
-int build_id_cache__add_s(const char *sbuild_id, const char *debugdir,
+int build_id_cache__list_build_ids(const char *pathname,
+				   struct strlist **result);
+bool build_id_cache__cached(const char *sbuild_id);
+int build_id_cache__add_s(const char *sbuild_id,
 			  const char *name, bool is_kallsyms, bool is_vdso);
-int build_id_cache__remove_s(const char *sbuild_id, const char *debugdir);
+int build_id_cache__remove_s(const char *sbuild_id);
 void disable_buildid_cache(void);
 
 #endif
diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h
index d04d770d90f6..fbcca21d66ab 100644
--- a/tools/perf/util/cache.h
+++ b/tools/perf/util/cache.h
@@ -17,6 +17,7 @@
 #define EXEC_PATH_ENVIRONMENT "PERF_EXEC_PATH"
 #define DEFAULT_PERF_DIR_ENVIRONMENT ".perf"
 #define PERF_DEBUGFS_ENVIRONMENT "PERF_DEBUGFS_DIR"
+#define PERF_TRACEFS_ENVIRONMENT "PERF_TRACEFS_DIR"
 
 typedef int (*config_fn_t)(const char *, const char *, void *);
 extern int perf_default_config(const char *, const char *, void *);
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 14e7a123d43b..9f643ee77001 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -97,6 +97,14 @@ int parse_callchain_record_opt(const char *arg)
 				callchain_param.dump_size = size;
 			}
 #endif /* HAVE_DWARF_UNWIND_SUPPORT */
+		} else if (!strncmp(name, "lbr", sizeof("lbr"))) {
+			if (!strtok_r(NULL, ",", &saveptr)) {
+				callchain_param.record_mode = CALLCHAIN_LBR;
+				ret = 0;
+			} else
+				pr_err("callchain: No more arguments "
+					"needed for --call-graph lbr\n");
+			break;
 		} else {
 			pr_err("callchain: Unknown --call-graph option "
 			       "value: %s\n", arg);
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index c0ec1acc38e4..6033a0a212ca 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -11,6 +11,7 @@ enum perf_call_graph_mode {
 	CALLCHAIN_NONE,
 	CALLCHAIN_FP,
 	CALLCHAIN_DWARF,
+	CALLCHAIN_LBR,
 	CALLCHAIN_MAX
 };
 
diff --git a/tools/perf/util/cloexec.c b/tools/perf/util/cloexec.c
index 6da965bdbc2c..85b523885f9d 100644
--- a/tools/perf/util/cloexec.c
+++ b/tools/perf/util/cloexec.c
@@ -7,6 +7,12 @@
 
 static unsigned long flag = PERF_FLAG_FD_CLOEXEC;
 
+int __weak sched_getcpu(void)
+{
+	errno = ENOSYS;
+	return -1;
+}
+
 static int perf_flag_probe(void)
 {
 	/* use 'safest' configuration as used in perf_evsel__fallback() */
diff --git a/tools/perf/util/cloexec.h b/tools/perf/util/cloexec.h
index 94a5a7d829d5..68888c29b04a 100644
--- a/tools/perf/util/cloexec.h
+++ b/tools/perf/util/cloexec.h
@@ -3,4 +3,10 @@
 
 unsigned long perf_event_open_cloexec_flag(void);
 
+#ifdef __GLIBC_PREREQ
+#if !__GLIBC_PREREQ(2, 6)
+extern int sched_getcpu(void) __THROW;
+#endif
+#endif
+
 #endif /* __PERF_CLOEXEC_H */
diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c
new file mode 100644
index 000000000000..dd17c9a32fbc
--- /dev/null
+++ b/tools/perf/util/data-convert-bt.c
@@ -0,0 +1,857 @@
+/*
+ * CTF writing support via babeltrace.
+ *
+ * Copyright (C) 2014, Jiri Olsa <jolsa@redhat.com>
+ * Copyright (C) 2014, Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
+ */
+
+#include <linux/compiler.h>
+#include <babeltrace/ctf-writer/writer.h>
+#include <babeltrace/ctf-writer/clock.h>
+#include <babeltrace/ctf-writer/stream.h>
+#include <babeltrace/ctf-writer/event.h>
+#include <babeltrace/ctf-writer/event-types.h>
+#include <babeltrace/ctf-writer/event-fields.h>
+#include <babeltrace/ctf/events.h>
+#include <traceevent/event-parse.h>
+#include "asm/bug.h"
+#include "data-convert-bt.h"
+#include "session.h"
+#include "util.h"
+#include "debug.h"
+#include "tool.h"
+#include "evlist.h"
+#include "evsel.h"
+#include "machine.h"
+
+#define pr_N(n, fmt, ...) \
+	eprintf(n, debug_data_convert, fmt, ##__VA_ARGS__)
+
+#define pr(fmt, ...)  pr_N(1, pr_fmt(fmt), ##__VA_ARGS__)
+#define pr2(fmt, ...) pr_N(2, pr_fmt(fmt), ##__VA_ARGS__)
+
+#define pr_time2(t, fmt, ...) pr_time_N(2, debug_data_convert, t, pr_fmt(fmt), ##__VA_ARGS__)
+
+struct evsel_priv {
+	struct bt_ctf_event_class *event_class;
+};
+
+struct ctf_writer {
+	/* writer primitives */
+	struct bt_ctf_writer		*writer;
+	struct bt_ctf_stream		*stream;
+	struct bt_ctf_stream_class	*stream_class;
+	struct bt_ctf_clock		*clock;
+
+	/* data types */
+	union {
+		struct {
+			struct bt_ctf_field_type	*s64;
+			struct bt_ctf_field_type	*u64;
+			struct bt_ctf_field_type	*s32;
+			struct bt_ctf_field_type	*u32;
+			struct bt_ctf_field_type	*string;
+			struct bt_ctf_field_type	*u64_hex;
+		};
+		struct bt_ctf_field_type *array[6];
+	} data;
+};
+
+struct convert {
+	struct perf_tool	tool;
+	struct ctf_writer	writer;
+
+	u64			events_size;
+	u64			events_count;
+};
+
+static int value_set(struct bt_ctf_field_type *type,
+		     struct bt_ctf_event *event,
+		     const char *name, u64 val)
+{
+	struct bt_ctf_field *field;
+	bool sign = bt_ctf_field_type_integer_get_signed(type);
+	int ret;
+
+	field = bt_ctf_field_create(type);
+	if (!field) {
+		pr_err("failed to create a field %s\n", name);
+		return -1;
+	}
+
+	if (sign) {
+		ret = bt_ctf_field_signed_integer_set_value(field, val);
+		if (ret) {
+			pr_err("failed to set field value %s\n", name);
+			goto err;
+		}
+	} else {
+		ret = bt_ctf_field_unsigned_integer_set_value(field, val);
+		if (ret) {
+			pr_err("failed to set field value %s\n", name);
+			goto err;
+		}
+	}
+
+	ret = bt_ctf_event_set_payload(event, name, field);
+	if (ret) {
+		pr_err("failed to set payload %s\n", name);
+		goto err;
+	}
+
+	pr2("  SET [%s = %" PRIu64 "]\n", name, val);
+
+err:
+	bt_ctf_field_put(field);
+	return ret;
+}
+
+#define __FUNC_VALUE_SET(_name, _val_type)				\
+static __maybe_unused int value_set_##_name(struct ctf_writer *cw,	\
+			     struct bt_ctf_event *event,		\
+			     const char *name,				\
+			     _val_type val)				\
+{									\
+	struct bt_ctf_field_type *type = cw->data._name;		\
+	return value_set(type, event, name, (u64) val);			\
+}
+
+#define FUNC_VALUE_SET(_name) __FUNC_VALUE_SET(_name, _name)
+
+FUNC_VALUE_SET(s32)
+FUNC_VALUE_SET(u32)
+FUNC_VALUE_SET(s64)
+FUNC_VALUE_SET(u64)
+__FUNC_VALUE_SET(u64_hex, u64)
+
+static struct bt_ctf_field_type*
+get_tracepoint_field_type(struct ctf_writer *cw, struct format_field *field)
+{
+	unsigned long flags = field->flags;
+
+	if (flags & FIELD_IS_STRING)
+		return cw->data.string;
+
+	if (!(flags & FIELD_IS_SIGNED)) {
+		/* unsigned long are mostly pointers */
+		if (flags & FIELD_IS_LONG || flags & FIELD_IS_POINTER)
+			return cw->data.u64_hex;
+	}
+
+	if (flags & FIELD_IS_SIGNED) {
+		if (field->size == 8)
+			return cw->data.s64;
+		else
+			return cw->data.s32;
+	}
+
+	if (field->size == 8)
+		return cw->data.u64;
+	else
+		return cw->data.u32;
+}
+
+static int add_tracepoint_field_value(struct ctf_writer *cw,
+				      struct bt_ctf_event_class *event_class,
+				      struct bt_ctf_event *event,
+				      struct perf_sample *sample,
+				      struct format_field *fmtf)
+{
+	struct bt_ctf_field_type *type;
+	struct bt_ctf_field *array_field;
+	struct bt_ctf_field *field;
+	const char *name = fmtf->name;
+	void *data = sample->raw_data;
+	unsigned long long value_int;
+	unsigned long flags = fmtf->flags;
+	unsigned int n_items;
+	unsigned int i;
+	unsigned int offset;
+	unsigned int len;
+	int ret;
+
+	offset = fmtf->offset;
+	len = fmtf->size;
+	if (flags & FIELD_IS_STRING)
+		flags &= ~FIELD_IS_ARRAY;
+
+	if (flags & FIELD_IS_DYNAMIC) {
+		unsigned long long tmp_val;
+
+		tmp_val = pevent_read_number(fmtf->event->pevent,
+				data + offset, len);
+		offset = tmp_val;
+		len = offset >> 16;
+		offset &= 0xffff;
+	}
+
+	if (flags & FIELD_IS_ARRAY) {
+
+		type = bt_ctf_event_class_get_field_by_name(
+				event_class, name);
+		array_field = bt_ctf_field_create(type);
+		bt_ctf_field_type_put(type);
+		if (!array_field) {
+			pr_err("Failed to create array type %s\n", name);
+			return -1;
+		}
+
+		len = fmtf->size / fmtf->arraylen;
+		n_items = fmtf->arraylen;
+	} else {
+		n_items = 1;
+		array_field = NULL;
+	}
+
+	type = get_tracepoint_field_type(cw, fmtf);
+
+	for (i = 0; i < n_items; i++) {
+		if (!(flags & FIELD_IS_STRING))
+			value_int = pevent_read_number(
+					fmtf->event->pevent,
+					data + offset + i * len, len);
+
+		if (flags & FIELD_IS_ARRAY)
+			field = bt_ctf_field_array_get_field(array_field, i);
+		else
+			field = bt_ctf_field_create(type);
+
+		if (!field) {
+			pr_err("failed to create a field %s\n", name);
+			return -1;
+		}
+
+		if (flags & FIELD_IS_STRING)
+			ret = bt_ctf_field_string_set_value(field,
+					data + offset + i * len);
+		else if (!(flags & FIELD_IS_SIGNED))
+			ret = bt_ctf_field_unsigned_integer_set_value(
+					field, value_int);
+		else
+			ret = bt_ctf_field_signed_integer_set_value(
+					field, value_int);
+		if (ret) {
+			pr_err("failed to set file value %s\n", name);
+			goto err_put_field;
+		}
+		if (!(flags & FIELD_IS_ARRAY)) {
+			ret = bt_ctf_event_set_payload(event, name, field);
+			if (ret) {
+				pr_err("failed to set payload %s\n", name);
+				goto err_put_field;
+			}
+		}
+		bt_ctf_field_put(field);
+	}
+	if (flags & FIELD_IS_ARRAY) {
+		ret = bt_ctf_event_set_payload(event, name, array_field);
+		if (ret) {
+			pr_err("Failed add payload array %s\n", name);
+			return -1;
+		}
+		bt_ctf_field_put(array_field);
+	}
+	return 0;
+
+err_put_field:
+	bt_ctf_field_put(field);
+	return -1;
+}
+
+static int add_tracepoint_fields_values(struct ctf_writer *cw,
+					struct bt_ctf_event_class *event_class,
+					struct bt_ctf_event *event,
+					struct format_field *fields,
+					struct perf_sample *sample)
+{
+	struct format_field *field;
+	int ret;
+
+	for (field = fields; field; field = field->next) {
+		ret = add_tracepoint_field_value(cw, event_class, event, sample,
+				field);
+		if (ret)
+			return -1;
+	}
+	return 0;
+}
+
+static int add_tracepoint_values(struct ctf_writer *cw,
+				 struct bt_ctf_event_class *event_class,
+				 struct bt_ctf_event *event,
+				 struct perf_evsel *evsel,
+				 struct perf_sample *sample)
+{
+	struct format_field *common_fields = evsel->tp_format->format.common_fields;
+	struct format_field *fields        = evsel->tp_format->format.fields;
+	int ret;
+
+	ret = add_tracepoint_fields_values(cw, event_class, event,
+					   common_fields, sample);
+	if (!ret)
+		ret = add_tracepoint_fields_values(cw, event_class, event,
+						   fields, sample);
+
+	return ret;
+}
+
+static int add_generic_values(struct ctf_writer *cw,
+			      struct bt_ctf_event *event,
+			      struct perf_evsel *evsel,
+			      struct perf_sample *sample)
+{
+	u64 type = evsel->attr.sample_type;
+	int ret;
+
+	/*
+	 * missing:
+	 *   PERF_SAMPLE_TIME         - not needed as we have it in
+	 *                              ctf event header
+	 *   PERF_SAMPLE_READ         - TODO
+	 *   PERF_SAMPLE_CALLCHAIN    - TODO
+	 *   PERF_SAMPLE_RAW          - tracepoint fields are handled separately
+	 *   PERF_SAMPLE_BRANCH_STACK - TODO
+	 *   PERF_SAMPLE_REGS_USER    - TODO
+	 *   PERF_SAMPLE_STACK_USER   - TODO
+	 */
+
+	if (type & PERF_SAMPLE_IP) {
+		ret = value_set_u64_hex(cw, event, "perf_ip", sample->ip);
+		if (ret)
+			return -1;
+	}
+
+	if (type & PERF_SAMPLE_TID) {
+		ret = value_set_s32(cw, event, "perf_tid", sample->tid);
+		if (ret)
+			return -1;
+
+		ret = value_set_s32(cw, event, "perf_pid", sample->pid);
+		if (ret)
+			return -1;
+	}
+
+	if ((type & PERF_SAMPLE_ID) ||
+	    (type & PERF_SAMPLE_IDENTIFIER)) {
+		ret = value_set_u64(cw, event, "perf_id", sample->id);
+		if (ret)
+			return -1;
+	}
+
+	if (type & PERF_SAMPLE_STREAM_ID) {
+		ret = value_set_u64(cw, event, "perf_stream_id", sample->stream_id);
+		if (ret)
+			return -1;
+	}
+
+	if (type & PERF_SAMPLE_CPU) {
+		ret = value_set_u32(cw, event, "perf_cpu", sample->cpu);
+		if (ret)
+			return -1;
+	}
+
+	if (type & PERF_SAMPLE_PERIOD) {
+		ret = value_set_u64(cw, event, "perf_period", sample->period);
+		if (ret)
+			return -1;
+	}
+
+	if (type & PERF_SAMPLE_WEIGHT) {
+		ret = value_set_u64(cw, event, "perf_weight", sample->weight);
+		if (ret)
+			return -1;
+	}
+
+	if (type & PERF_SAMPLE_DATA_SRC) {
+		ret = value_set_u64(cw, event, "perf_data_src",
+				sample->data_src);
+		if (ret)
+			return -1;
+	}
+
+	if (type & PERF_SAMPLE_TRANSACTION) {
+		ret = value_set_u64(cw, event, "perf_transaction",
+				sample->transaction);
+		if (ret)
+			return -1;
+	}
+
+	return 0;
+}
+
+static int process_sample_event(struct perf_tool *tool,
+				union perf_event *_event __maybe_unused,
+				struct perf_sample *sample,
+				struct perf_evsel *evsel,
+				struct machine *machine __maybe_unused)
+{
+	struct convert *c = container_of(tool, struct convert, tool);
+	struct evsel_priv *priv = evsel->priv;
+	struct ctf_writer *cw = &c->writer;
+	struct bt_ctf_event_class *event_class;
+	struct bt_ctf_event *event;
+	int ret;
+
+	if (WARN_ONCE(!priv, "Failed to setup all events.\n"))
+		return 0;
+
+	event_class = priv->event_class;
+
+	/* update stats */
+	c->events_count++;
+	c->events_size += _event->header.size;
+
+	pr_time2(sample->time, "sample %" PRIu64 "\n", c->events_count);
+
+	event = bt_ctf_event_create(event_class);
+	if (!event) {
+		pr_err("Failed to create an CTF event\n");
+		return -1;
+	}
+
+	bt_ctf_clock_set_time(cw->clock, sample->time);
+
+	ret = add_generic_values(cw, event, evsel, sample);
+	if (ret)
+		return -1;
+
+	if (evsel->attr.type == PERF_TYPE_TRACEPOINT) {
+		ret = add_tracepoint_values(cw, event_class, event,
+					    evsel, sample);
+		if (ret)
+			return -1;
+	}
+
+	bt_ctf_stream_append_event(cw->stream, event);
+	bt_ctf_event_put(event);
+	return 0;
+}
+
+static int add_tracepoint_fields_types(struct ctf_writer *cw,
+				       struct format_field *fields,
+				       struct bt_ctf_event_class *event_class)
+{
+	struct format_field *field;
+	int ret;
+
+	for (field = fields; field; field = field->next) {
+		struct bt_ctf_field_type *type;
+		unsigned long flags = field->flags;
+
+		pr2("  field '%s'\n", field->name);
+
+		type = get_tracepoint_field_type(cw, field);
+		if (!type)
+			return -1;
+
+		/*
+		 * A string is an array of chars. For this we use the string
+		 * type and don't care that it is an array. What we don't
+		 * support is an array of strings.
+		 */
+		if (flags & FIELD_IS_STRING)
+			flags &= ~FIELD_IS_ARRAY;
+
+		if (flags & FIELD_IS_ARRAY)
+			type = bt_ctf_field_type_array_create(type, field->arraylen);
+
+		ret = bt_ctf_event_class_add_field(event_class, type,
+				field->name);
+
+		if (flags & FIELD_IS_ARRAY)
+			bt_ctf_field_type_put(type);
+
+		if (ret) {
+			pr_err("Failed to add field '%s\n", field->name);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+static int add_tracepoint_types(struct ctf_writer *cw,
+				struct perf_evsel *evsel,
+				struct bt_ctf_event_class *class)
+{
+	struct format_field *common_fields = evsel->tp_format->format.common_fields;
+	struct format_field *fields        = evsel->tp_format->format.fields;
+	int ret;
+
+	ret = add_tracepoint_fields_types(cw, common_fields, class);
+	if (!ret)
+		ret = add_tracepoint_fields_types(cw, fields, class);
+
+	return ret;
+}
+
+static int add_generic_types(struct ctf_writer *cw, struct perf_evsel *evsel,
+			     struct bt_ctf_event_class *event_class)
+{
+	u64 type = evsel->attr.sample_type;
+
+	/*
+	 * missing:
+	 *   PERF_SAMPLE_TIME         - not needed as we have it in
+	 *                              ctf event header
+	 *   PERF_SAMPLE_READ         - TODO
+	 *   PERF_SAMPLE_CALLCHAIN    - TODO
+	 *   PERF_SAMPLE_RAW          - tracepoint fields are handled separately
+	 *   PERF_SAMPLE_BRANCH_STACK - TODO
+	 *   PERF_SAMPLE_REGS_USER    - TODO
+	 *   PERF_SAMPLE_STACK_USER   - TODO
+	 */
+
+#define ADD_FIELD(cl, t, n)						\
+	do {								\
+		pr2("  field '%s'\n", n);				\
+		if (bt_ctf_event_class_add_field(cl, t, n)) {		\
+			pr_err("Failed to add field '%s;\n", n);	\
+			return -1;					\
+		}							\
+	} while (0)
+
+	if (type & PERF_SAMPLE_IP)
+		ADD_FIELD(event_class, cw->data.u64_hex, "perf_ip");
+
+	if (type & PERF_SAMPLE_TID) {
+		ADD_FIELD(event_class, cw->data.s32, "perf_tid");
+		ADD_FIELD(event_class, cw->data.s32, "perf_pid");
+	}
+
+	if ((type & PERF_SAMPLE_ID) ||
+	    (type & PERF_SAMPLE_IDENTIFIER))
+		ADD_FIELD(event_class, cw->data.u64, "perf_id");
+
+	if (type & PERF_SAMPLE_STREAM_ID)
+		ADD_FIELD(event_class, cw->data.u64, "perf_stream_id");
+
+	if (type & PERF_SAMPLE_CPU)
+		ADD_FIELD(event_class, cw->data.u32, "perf_cpu");
+
+	if (type & PERF_SAMPLE_PERIOD)
+		ADD_FIELD(event_class, cw->data.u64, "perf_period");
+
+	if (type & PERF_SAMPLE_WEIGHT)
+		ADD_FIELD(event_class, cw->data.u64, "perf_weight");
+
+	if (type & PERF_SAMPLE_DATA_SRC)
+		ADD_FIELD(event_class, cw->data.u64, "perf_data_src");
+
+	if (type & PERF_SAMPLE_TRANSACTION)
+		ADD_FIELD(event_class, cw->data.u64, "perf_transaction");
+
+#undef ADD_FIELD
+	return 0;
+}
+
+static int add_event(struct ctf_writer *cw, struct perf_evsel *evsel)
+{
+	struct bt_ctf_event_class *event_class;
+	struct evsel_priv *priv;
+	const char *name = perf_evsel__name(evsel);
+	int ret;
+
+	pr("Adding event '%s' (type %d)\n", name, evsel->attr.type);
+
+	event_class = bt_ctf_event_class_create(name);
+	if (!event_class)
+		return -1;
+
+	ret = add_generic_types(cw, evsel, event_class);
+	if (ret)
+		goto err;
+
+	if (evsel->attr.type == PERF_TYPE_TRACEPOINT) {
+		ret = add_tracepoint_types(cw, evsel, event_class);
+		if (ret)
+			goto err;
+	}
+
+	ret = bt_ctf_stream_class_add_event_class(cw->stream_class, event_class);
+	if (ret) {
+		pr("Failed to add event class into stream.\n");
+		goto err;
+	}
+
+	priv = malloc(sizeof(*priv));
+	if (!priv)
+		goto err;
+
+	priv->event_class = event_class;
+	evsel->priv       = priv;
+	return 0;
+
+err:
+	bt_ctf_event_class_put(event_class);
+	pr_err("Failed to add event '%s'.\n", name);
+	return -1;
+}
+
+static int setup_events(struct ctf_writer *cw, struct perf_session *session)
+{
+	struct perf_evlist *evlist = session->evlist;
+	struct perf_evsel *evsel;
+	int ret;
+
+	evlist__for_each(evlist, evsel) {
+		ret = add_event(cw, evsel);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+static int ctf_writer__setup_env(struct ctf_writer *cw,
+				 struct perf_session *session)
+{
+	struct perf_header *header = &session->header;
+	struct bt_ctf_writer *writer = cw->writer;
+
+#define ADD(__n, __v)							\
+do {									\
+	if (bt_ctf_writer_add_environment_field(writer, __n, __v))	\
+		return -1;						\
+} while (0)
+
+	ADD("host",    header->env.hostname);
+	ADD("sysname", "Linux");
+	ADD("release", header->env.os_release);
+	ADD("version", header->env.version);
+	ADD("machine", header->env.arch);
+	ADD("domain", "kernel");
+	ADD("tracer_name", "perf");
+
+#undef ADD
+	return 0;
+}
+
+static int ctf_writer__setup_clock(struct ctf_writer *cw)
+{
+	struct bt_ctf_clock *clock = cw->clock;
+
+	bt_ctf_clock_set_description(clock, "perf clock");
+
+#define SET(__n, __v)				\
+do {						\
+	if (bt_ctf_clock_set_##__n(clock, __v))	\
+		return -1;			\
+} while (0)
+
+	SET(frequency,   1000000000);
+	SET(offset_s,    0);
+	SET(offset,      0);
+	SET(precision,   10);
+	SET(is_absolute, 0);
+
+#undef SET
+	return 0;
+}
+
+static struct bt_ctf_field_type *create_int_type(int size, bool sign, bool hex)
+{
+	struct bt_ctf_field_type *type;
+
+	type = bt_ctf_field_type_integer_create(size);
+	if (!type)
+		return NULL;
+
+	if (sign &&
+	    bt_ctf_field_type_integer_set_signed(type, 1))
+		goto err;
+
+	if (hex &&
+	    bt_ctf_field_type_integer_set_base(type, BT_CTF_INTEGER_BASE_HEXADECIMAL))
+		goto err;
+
+	pr2("Created type: INTEGER %d-bit %ssigned %s\n",
+	    size, sign ? "un" : "", hex ? "hex" : "");
+	return type;
+
+err:
+	bt_ctf_field_type_put(type);
+	return NULL;
+}
+
+static void ctf_writer__cleanup_data(struct ctf_writer *cw)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(cw->data.array); i++)
+		bt_ctf_field_type_put(cw->data.array[i]);
+}
+
+static int ctf_writer__init_data(struct ctf_writer *cw)
+{
+#define CREATE_INT_TYPE(type, size, sign, hex)		\
+do {							\
+	(type) = create_int_type(size, sign, hex);	\
+	if (!(type))					\
+		goto err;				\
+} while (0)
+
+	CREATE_INT_TYPE(cw->data.s64, 64, true,  false);
+	CREATE_INT_TYPE(cw->data.u64, 64, false, false);
+	CREATE_INT_TYPE(cw->data.s32, 32, true,  false);
+	CREATE_INT_TYPE(cw->data.u32, 32, false, false);
+	CREATE_INT_TYPE(cw->data.u64_hex, 64, false, true);
+
+	cw->data.string  = bt_ctf_field_type_string_create();
+	if (cw->data.string)
+		return 0;
+
+err:
+	ctf_writer__cleanup_data(cw);
+	pr_err("Failed to create data types.\n");
+	return -1;
+}
+
+static void ctf_writer__cleanup(struct ctf_writer *cw)
+{
+	ctf_writer__cleanup_data(cw);
+
+	bt_ctf_clock_put(cw->clock);
+	bt_ctf_stream_put(cw->stream);
+	bt_ctf_stream_class_put(cw->stream_class);
+	bt_ctf_writer_put(cw->writer);
+
+	/* and NULL all the pointers */
+	memset(cw, 0, sizeof(*cw));
+}
+
+static int ctf_writer__init(struct ctf_writer *cw, const char *path)
+{
+	struct bt_ctf_writer		*writer;
+	struct bt_ctf_stream_class	*stream_class;
+	struct bt_ctf_stream		*stream;
+	struct bt_ctf_clock		*clock;
+
+	/* CTF writer */
+	writer = bt_ctf_writer_create(path);
+	if (!writer)
+		goto err;
+
+	cw->writer = writer;
+
+	/* CTF clock */
+	clock = bt_ctf_clock_create("perf_clock");
+	if (!clock) {
+		pr("Failed to create CTF clock.\n");
+		goto err_cleanup;
+	}
+
+	cw->clock = clock;
+
+	if (ctf_writer__setup_clock(cw)) {
+		pr("Failed to setup CTF clock.\n");
+		goto err_cleanup;
+	}
+
+	/* CTF stream class */
+	stream_class = bt_ctf_stream_class_create("perf_stream");
+	if (!stream_class) {
+		pr("Failed to create CTF stream class.\n");
+		goto err_cleanup;
+	}
+
+	cw->stream_class = stream_class;
+
+	/* CTF clock stream setup */
+	if (bt_ctf_stream_class_set_clock(stream_class, clock)) {
+		pr("Failed to assign CTF clock to stream class.\n");
+		goto err_cleanup;
+	}
+
+	if (ctf_writer__init_data(cw))
+		goto err_cleanup;
+
+	/* CTF stream instance */
+	stream = bt_ctf_writer_create_stream(writer, stream_class);
+	if (!stream) {
+		pr("Failed to create CTF stream.\n");
+		goto err_cleanup;
+	}
+
+	cw->stream = stream;
+
+	/* CTF clock writer setup */
+	if (bt_ctf_writer_add_clock(writer, clock)) {
+		pr("Failed to assign CTF clock to writer.\n");
+		goto err_cleanup;
+	}
+
+	return 0;
+
+err_cleanup:
+	ctf_writer__cleanup(cw);
+err:
+	pr_err("Failed to setup CTF writer.\n");
+	return -1;
+}
+
+int bt_convert__perf2ctf(const char *input, const char *path, bool force)
+{
+	struct perf_session *session;
+	struct perf_data_file file = {
+		.path = input,
+		.mode = PERF_DATA_MODE_READ,
+		.force = force,
+	};
+	struct convert c = {
+		.tool = {
+			.sample          = process_sample_event,
+			.mmap            = perf_event__process_mmap,
+			.mmap2           = perf_event__process_mmap2,
+			.comm            = perf_event__process_comm,
+			.exit            = perf_event__process_exit,
+			.fork            = perf_event__process_fork,
+			.lost            = perf_event__process_lost,
+			.tracing_data    = perf_event__process_tracing_data,
+			.build_id        = perf_event__process_build_id,
+			.ordered_events  = true,
+			.ordering_requires_timestamps = true,
+		},
+	};
+	struct ctf_writer *cw = &c.writer;
+	int err = -1;
+
+	/* CTF writer */
+	if (ctf_writer__init(cw, path))
+		return -1;
+
+	/* perf.data session */
+	session = perf_session__new(&file, 0, &c.tool);
+	if (!session)
+		goto free_writer;
+
+	/* CTF writer env/clock setup  */
+	if (ctf_writer__setup_env(cw, session))
+		goto free_session;
+
+	/* CTF events setup */
+	if (setup_events(cw, session))
+		goto free_session;
+
+	err = perf_session__process_events(session);
+	if (!err)
+		err = bt_ctf_stream_flush(cw->stream);
+
+	fprintf(stderr,
+		"[ perf data convert: Converted '%s' into CTF data '%s' ]\n",
+		file.path, path);
+
+	fprintf(stderr,
+		"[ perf data convert: Converted and wrote %.3f MB (%" PRIu64 " samples) ]\n",
+		(double) c.events_size / 1024.0 / 1024.0,
+		c.events_count);
+
+	/* its all good */
+free_session:
+	perf_session__delete(session);
+
+free_writer:
+	ctf_writer__cleanup(cw);
+	return err;
+}
diff --git a/tools/perf/util/data-convert-bt.h b/tools/perf/util/data-convert-bt.h
new file mode 100644
index 000000000000..4c204342a9d8
--- /dev/null
+++ b/tools/perf/util/data-convert-bt.h
@@ -0,0 +1,8 @@
+#ifndef __DATA_CONVERT_BT_H
+#define __DATA_CONVERT_BT_H
+#ifdef HAVE_LIBBABELTRACE_SUPPORT
+
+int bt_convert__perf2ctf(const char *input_name, const char *to_ctf, bool force);
+
+#endif /* HAVE_LIBBABELTRACE_SUPPORT */
+#endif /* __DATA_CONVERT_BT_H */
diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c
index c81dae399763..bb39a3ffc70b 100644
--- a/tools/perf/util/db-export.c
+++ b/tools/perf/util/db-export.c
@@ -282,13 +282,13 @@ int db_export__branch_type(struct db_export *dbe, u32 branch_type,
 
 int db_export__sample(struct db_export *dbe, union perf_event *event,
 		      struct perf_sample *sample, struct perf_evsel *evsel,
-		      struct thread *thread, struct addr_location *al)
+		      struct addr_location *al)
 {
+	struct thread* thread = al->thread;
 	struct export_sample es = {
 		.event = event,
 		.sample = sample,
 		.evsel = evsel,
-		.thread = thread,
 		.al = al,
 	};
 	struct thread *main_thread;
diff --git a/tools/perf/util/db-export.h b/tools/perf/util/db-export.h
index adbd22d66798..25e22fd76aca 100644
--- a/tools/perf/util/db-export.h
+++ b/tools/perf/util/db-export.h
@@ -34,7 +34,6 @@ struct export_sample {
 	union perf_event	*event;
 	struct perf_sample	*sample;
 	struct perf_evsel	*evsel;
-	struct thread		*thread;
 	struct addr_location	*al;
 	u64			db_id;
 	u64			comm_db_id;
@@ -97,7 +96,7 @@ int db_export__branch_type(struct db_export *dbe, u32 branch_type,
 			   const char *name);
 int db_export__sample(struct db_export *dbe, union perf_event *event,
 		      struct perf_sample *sample, struct perf_evsel *evsel,
-		      struct thread *thread, struct addr_location *al);
+		      struct addr_location *al);
 
 int db_export__branch_types(struct db_export *dbe);
 
diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c
index ad60b2f20258..2da5581ec74d 100644
--- a/tools/perf/util/debug.c
+++ b/tools/perf/util/debug.c
@@ -20,6 +20,7 @@ int verbose;
 bool dump_trace = false, quiet = false;
 int debug_ordered_events;
 static int redirect_to_stderr;
+int debug_data_convert;
 
 static int _eprintf(int level, int var, const char *fmt, va_list args)
 {
@@ -147,6 +148,7 @@ static struct debug_variable {
 	{ .name = "verbose",		.ptr = &verbose },
 	{ .name = "ordered-events",	.ptr = &debug_ordered_events},
 	{ .name = "stderr",		.ptr = &redirect_to_stderr},
+	{ .name = "data-convert",	.ptr = &debug_data_convert },
 	{ .name = NULL, }
 };
 
diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h
index be264d6f3b30..caac2fdc6105 100644
--- a/tools/perf/util/debug.h
+++ b/tools/perf/util/debug.h
@@ -12,6 +12,7 @@
 extern int verbose;
 extern bool quiet, dump_trace;
 extern int debug_ordered_events;
+extern int debug_data_convert;
 
 #ifndef pr_fmt
 #define pr_fmt(fmt) fmt
diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index c2f7d3b90966..fc0ddd5792a9 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -45,13 +45,13 @@ int dso__read_binary_type_filename(const struct dso *dso,
 	case DSO_BINARY_TYPE__DEBUGLINK: {
 		char *debuglink;
 
-		strncpy(filename, dso->long_name, size);
-		debuglink = filename + dso->long_name_len;
+		len = __symbol__join_symfs(filename, size, dso->long_name);
+		debuglink = filename + len;
 		while (debuglink != filename && *debuglink != '/')
 			debuglink--;
 		if (*debuglink == '/')
 			debuglink++;
-		ret = filename__read_debuglink(dso->long_name, debuglink,
+		ret = filename__read_debuglink(filename, debuglink,
 					       size - (debuglink - filename));
 		}
 		break;
@@ -147,6 +147,9 @@ static const struct {
 } compressions[] = {
 #ifdef HAVE_ZLIB_SUPPORT
 	{ "gz", gzip_decompress_to_file },
+#endif
+#ifdef HAVE_LZMA_SUPPORT
+	{ "xz", lzma_decompress_to_file },
 #endif
 	{ NULL, NULL },
 };
@@ -162,32 +165,14 @@ bool is_supported_compression(const char *ext)
 	return false;
 }
 
-bool is_kmodule_extension(const char *ext)
+bool is_kernel_module(const char *pathname)
 {
-	if (strncmp(ext, "ko", 2))
-		return false;
+	struct kmod_path m;
 
-	if (ext[2] == '\0' || (ext[2] == '.' && is_supported_compression(ext+3)))
-		return true;
+	if (kmod_path__parse(&m, pathname))
+		return NULL;
 
-	return false;
-}
-
-bool is_kernel_module(const char *pathname, bool *compressed)
-{
-	const char *ext = strrchr(pathname, '.');
-
-	if (ext == NULL)
-		return false;
-
-	if (is_supported_compression(ext + 1)) {
-		if (compressed)
-			*compressed = true;
-		ext -= 3;
-	} else if (compressed)
-		*compressed = false;
-
-	return is_kmodule_extension(ext + 1);
+	return m.kmod;
 }
 
 bool decompress_to_file(const char *ext, const char *filename, int output_fd)
@@ -208,6 +193,72 @@ bool dso__needs_decompress(struct dso *dso)
 		dso->symtab_type == DSO_BINARY_TYPE__GUEST_KMODULE_COMP;
 }
 
+/*
+ * Parses kernel module specified in @path and updates
+ * @m argument like:
+ *
+ *    @comp - true if @path contains supported compression suffix,
+ *            false otherwise
+ *    @kmod - true if @path contains '.ko' suffix in right position,
+ *            false otherwise
+ *    @name - if (@alloc_name && @kmod) is true, it contains strdup-ed base name
+ *            of the kernel module without suffixes, otherwise strudup-ed
+ *            base name of @path
+ *    @ext  - if (@alloc_ext && @comp) is true, it contains strdup-ed string
+ *            the compression suffix
+ *
+ * Returns 0 if there's no strdup error, -ENOMEM otherwise.
+ */
+int __kmod_path__parse(struct kmod_path *m, const char *path,
+		       bool alloc_name, bool alloc_ext)
+{
+	const char *name = strrchr(path, '/');
+	const char *ext  = strrchr(path, '.');
+
+	memset(m, 0x0, sizeof(*m));
+	name = name ? name + 1 : path;
+
+	/* No extension, just return name. */
+	if (ext == NULL) {
+		if (alloc_name) {
+			m->name = strdup(name);
+			return m->name ? 0 : -ENOMEM;
+		}
+		return 0;
+	}
+
+	if (is_supported_compression(ext + 1)) {
+		m->comp = true;
+		ext -= 3;
+	}
+
+	/* Check .ko extension only if there's enough name left. */
+	if (ext > name)
+		m->kmod = !strncmp(ext, ".ko", 3);
+
+	if (alloc_name) {
+		if (m->kmod) {
+			if (asprintf(&m->name, "[%.*s]", (int) (ext - name), name) == -1)
+				return -ENOMEM;
+		} else {
+			if (asprintf(&m->name, "%s", name) == -1)
+				return -ENOMEM;
+		}
+
+		strxfrchar(m->name, '-', '_');
+	}
+
+	if (alloc_ext && m->comp) {
+		m->ext = strdup(ext + 4);
+		if (!m->ext) {
+			free((void *) m->name);
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
 /*
  * Global list of open DSOs and the counter.
  */
@@ -240,7 +291,7 @@ static int do_open(char *name)
 		if (fd >= 0)
 			return fd;
 
-		pr_debug("dso open failed, mmap: %s\n",
+		pr_debug("dso open failed: %s\n",
 			 strerror_r(errno, sbuf, sizeof(sbuf)));
 		if (!dso__data_open_cnt || errno != EMFILE)
 			break;
@@ -1002,19 +1053,22 @@ struct dso *dsos__find(const struct dsos *dsos, const char *name,
 	return dso__find_by_longname(&dsos->root, name);
 }
 
+struct dso *dsos__addnew(struct dsos *dsos, const char *name)
+{
+	struct dso *dso = dso__new(name);
+
+	if (dso != NULL) {
+		dsos__add(dsos, dso);
+		dso__set_basename(dso);
+	}
+	return dso;
+}
+
 struct dso *__dsos__findnew(struct dsos *dsos, const char *name)
 {
 	struct dso *dso = dsos__find(dsos, name, false);
 
-	if (!dso) {
-		dso = dso__new(name);
-		if (dso != NULL) {
-			dsos__add(dsos, dso);
-			dso__set_basename(dso);
-		}
-	}
-
-	return dso;
+	return dso ? dso : dsos__addnew(dsos, name);
 }
 
 size_t __dsos__fprintf_buildid(struct list_head *head, FILE *fp,
@@ -1083,3 +1137,36 @@ enum dso_type dso__type(struct dso *dso, struct machine *machine)
 
 	return dso__type_fd(fd);
 }
+
+int dso__strerror_load(struct dso *dso, char *buf, size_t buflen)
+{
+	int idx, errnum = dso->load_errno;
+	/*
+	 * This must have a same ordering as the enum dso_load_errno.
+	 */
+	static const char *dso_load__error_str[] = {
+	"Internal tools/perf/ library error",
+	"Invalid ELF file",
+	"Can not read build id",
+	"Mismatching build id",
+	"Decompression failure",
+	};
+
+	BUG_ON(buflen == 0);
+
+	if (errnum >= 0) {
+		const char *err = strerror_r(errnum, buf, buflen);
+
+		if (err != buf)
+			scnprintf(buf, buflen, "%s", err);
+
+		return 0;
+	}
+
+	if (errnum <  __DSO_LOAD_ERRNO__START || errnum >= __DSO_LOAD_ERRNO__END)
+		return -1;
+
+	idx = errnum - __DSO_LOAD_ERRNO__START;
+	scnprintf(buf, buflen, "%s", dso_load__error_str[idx]);
+	return 0;
+}
diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
index ced92841ff97..e0901b4ed8de 100644
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -60,6 +60,31 @@ enum dso_type {
 	DSO__TYPE_X32BIT,
 };
 
+enum dso_load_errno {
+	DSO_LOAD_ERRNO__SUCCESS		= 0,
+
+	/*
+	 * Choose an arbitrary negative big number not to clash with standard
+	 * errno since SUS requires the errno has distinct positive values.
+	 * See 'Issue 6' in the link below.
+	 *
+	 * http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/errno.h.html
+	 */
+	__DSO_LOAD_ERRNO__START		= -10000,
+
+	DSO_LOAD_ERRNO__INTERNAL_ERROR	= __DSO_LOAD_ERRNO__START,
+
+	/* for symsrc__init() */
+	DSO_LOAD_ERRNO__INVALID_ELF,
+	DSO_LOAD_ERRNO__CANNOT_READ_BUILDID,
+	DSO_LOAD_ERRNO__MISMATCHING_BUILDID,
+
+	/* for decompress_kmodule */
+	DSO_LOAD_ERRNO__DECOMPRESSION_FAILURE,
+
+	__DSO_LOAD_ERRNO__END,
+};
+
 #define DSO__SWAP(dso, type, val)			\
 ({							\
 	type ____r = val;				\
@@ -113,6 +138,7 @@ struct dso {
 	enum dso_swap_type	needs_swap;
 	enum dso_binary_type	symtab_type;
 	enum dso_binary_type	binary_type;
+	enum dso_load_errno	load_errno;
 	u8		 adjust_symbols:1;
 	u8		 has_build_id:1;
 	u8		 has_srcline:1;
@@ -139,7 +165,8 @@ struct dso {
 		u32		 status_seen;
 		size_t		 file_size;
 		struct list_head open_entry;
-		u64		 frame_offset;
+		u64		 debug_frame_offset;
+		u64		 eh_frame_hdr_offset;
 	} data;
 
 	union { /* Tool specific area */
@@ -189,11 +216,24 @@ char dso__symtab_origin(const struct dso *dso);
 int dso__read_binary_type_filename(const struct dso *dso, enum dso_binary_type type,
 				   char *root_dir, char *filename, size_t size);
 bool is_supported_compression(const char *ext);
-bool is_kmodule_extension(const char *ext);
-bool is_kernel_module(const char *pathname, bool *compressed);
+bool is_kernel_module(const char *pathname);
 bool decompress_to_file(const char *ext, const char *filename, int output_fd);
 bool dso__needs_decompress(struct dso *dso);
 
+struct kmod_path {
+	char *name;
+	char *ext;
+	bool  comp;
+	bool  kmod;
+};
+
+int __kmod_path__parse(struct kmod_path *m, const char *path,
+		     bool alloc_name, bool alloc_ext);
+
+#define kmod_path__parse(__m, __p)      __kmod_path__parse(__m, __p, false, false)
+#define kmod_path__parse_name(__m, __p) __kmod_path__parse(__m, __p, true , false)
+#define kmod_path__parse_ext(__m, __p)  __kmod_path__parse(__m, __p, false, true)
+
 /*
  * The dso__data_* external interface provides following functions:
  *   dso__data_fd
@@ -249,6 +289,7 @@ struct dso *dso__kernel_findnew(struct machine *machine, const char *name,
 				const char *short_name, int dso_type);
 
 void dsos__add(struct dsos *dsos, struct dso *dso);
+struct dso *dsos__addnew(struct dsos *dsos, const char *name);
 struct dso *dsos__find(const struct dsos *dsos, const char *name,
 		       bool cmp_short);
 struct dso *__dsos__findnew(struct dsos *dsos, const char *name);
@@ -279,4 +320,6 @@ void dso__free_a2l(struct dso *dso);
 
 enum dso_type dso__type(struct dso *dso, struct machine *machine);
 
+int dso__strerror_load(struct dso *dso, char *buf, size_t buflen);
+
 #endif /* __PERF_DSO */
diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c
index cc66c4049e09..c34e024020c7 100644
--- a/tools/perf/util/dwarf-aux.c
+++ b/tools/perf/util/dwarf-aux.c
@@ -277,6 +277,21 @@ bool die_is_func_def(Dwarf_Die *dw_die)
 		dwarf_attr(dw_die, DW_AT_declaration, &attr) == NULL);
 }
 
+/**
+ * die_is_func_instance - Ensure that this DIE is an instance of a subprogram
+ * @dw_die: a DIE
+ *
+ * Ensure that this DIE is an instance (which has an entry address).
+ * This returns true if @dw_die is a function instance. If not, you need to
+ * call die_walk_instances() to find actual instances.
+ **/
+bool die_is_func_instance(Dwarf_Die *dw_die)
+{
+	Dwarf_Addr tmp;
+
+	/* Actually gcc optimizes non-inline as like as inlined */
+	return !dwarf_func_inline(dw_die) && dwarf_entrypc(dw_die, &tmp) == 0;
+}
 /**
  * die_get_data_member_location - Get the data-member offset
  * @mb_die: a DIE of a member of a data structure
@@ -786,10 +801,16 @@ static int __die_find_member_cb(Dwarf_Die *die_mem, void *data)
 {
 	const char *name = data;
 
-	if ((dwarf_tag(die_mem) == DW_TAG_member) &&
-	    die_compare_name(die_mem, name))
-		return DIE_FIND_CB_END;
-
+	if (dwarf_tag(die_mem) == DW_TAG_member) {
+		if (die_compare_name(die_mem, name))
+			return DIE_FIND_CB_END;
+		else if (!dwarf_diename(die_mem)) {	/* Unnamed structure */
+			Dwarf_Die type_die, tmp_die;
+			if (die_get_type(die_mem, &type_die) &&
+			    die_find_member(&type_die, name, &tmp_die))
+				return DIE_FIND_CB_END;
+		}
+	}
 	return DIE_FIND_CB_SIBLING;
 }
 
diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h
index b4fe90c6cb2d..af7dbcd5f929 100644
--- a/tools/perf/util/dwarf-aux.h
+++ b/tools/perf/util/dwarf-aux.h
@@ -41,6 +41,9 @@ extern int cu_walk_functions_at(Dwarf_Die *cu_die, Dwarf_Addr addr,
 /* Ensure that this DIE is a subprogram and definition (not declaration) */
 extern bool die_is_func_def(Dwarf_Die *dw_die);
 
+/* Ensure that this DIE is an instance of a subprogram */
+extern bool die_is_func_instance(Dwarf_Die *dw_die);
+
 /* Compare diename and tname */
 extern bool die_compare_name(Dwarf_Die *dw_die, const char *tname);
 
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 6c6d044e959a..ff866c4d2e2f 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -49,72 +49,103 @@ static struct perf_sample synth_sample = {
 	.period	   = 1,
 };
 
-static pid_t perf_event__get_comm_tgid(pid_t pid, char *comm, size_t len)
+/*
+ * Assumes that the first 4095 bytes of /proc/pid/stat contains
+ * the comm, tgid and ppid.
+ */
+static int perf_event__get_comm_ids(pid_t pid, char *comm, size_t len,
+				    pid_t *tgid, pid_t *ppid)
 {
 	char filename[PATH_MAX];
-	char bf[BUFSIZ];
-	FILE *fp;
-	size_t size = 0;
-	pid_t tgid = -1;
+	char bf[4096];
+	int fd;
+	size_t size = 0, n;
+	char *nl, *name, *tgids, *ppids;
+
+	*tgid = -1;
+	*ppid = -1;
 
 	snprintf(filename, sizeof(filename), "/proc/%d/status", pid);
 
-	fp = fopen(filename, "r");
-	if (fp == NULL) {
+	fd = open(filename, O_RDONLY);
+	if (fd < 0) {
 		pr_debug("couldn't open %s\n", filename);
-		return 0;
+		return -1;
 	}
 
-	while (!comm[0] || (tgid < 0)) {
-		if (fgets(bf, sizeof(bf), fp) == NULL) {
-			pr_warning("couldn't get COMM and pgid, malformed %s\n",
-				   filename);
-			break;
-		}
+	n = read(fd, bf, sizeof(bf) - 1);
+	close(fd);
+	if (n <= 0) {
+		pr_warning("Couldn't get COMM, tigd and ppid for pid %d\n",
+			   pid);
+		return -1;
+	}
+	bf[n] = '\0';
 
-		if (memcmp(bf, "Name:", 5) == 0) {
-			char *name = bf + 5;
-			while (*name && isspace(*name))
-				++name;
-			size = strlen(name) - 1;
-			if (size >= len)
-				size = len - 1;
-			memcpy(comm, name, size);
-			comm[size] = '\0';
+	name = strstr(bf, "Name:");
+	tgids = strstr(bf, "Tgid:");
+	ppids = strstr(bf, "PPid:");
 
-		} else if (memcmp(bf, "Tgid:", 5) == 0) {
-			char *tgids = bf + 5;
-			while (*tgids && isspace(*tgids))
-				++tgids;
-			tgid = atoi(tgids);
-		}
+	if (name) {
+		name += 5;  /* strlen("Name:") */
+
+		while (*name && isspace(*name))
+			++name;
+
+		nl = strchr(name, '\n');
+		if (nl)
+			*nl = '\0';
+
+		size = strlen(name);
+		if (size >= len)
+			size = len - 1;
+		memcpy(comm, name, size);
+		comm[size] = '\0';
+	} else {
+		pr_debug("Name: string not found for pid %d\n", pid);
 	}
 
-	fclose(fp);
+	if (tgids) {
+		tgids += 5;  /* strlen("Tgid:") */
+		*tgid = atoi(tgids);
+	} else {
+		pr_debug("Tgid: string not found for pid %d\n", pid);
+	}
 
-	return tgid;
+	if (ppids) {
+		ppids += 5;  /* strlen("PPid:") */
+		*ppid = atoi(ppids);
+	} else {
+		pr_debug("PPid: string not found for pid %d\n", pid);
+	}
+
+	return 0;
 }
 
-static pid_t perf_event__synthesize_comm(struct perf_tool *tool,
-					 union perf_event *event, pid_t pid,
-					 perf_event__handler_t process,
-					 struct machine *machine)
+static int perf_event__prepare_comm(union perf_event *event, pid_t pid,
+				    struct machine *machine,
+				    pid_t *tgid, pid_t *ppid)
 {
 	size_t size;
-	pid_t tgid;
+
+	*ppid = -1;
 
 	memset(&event->comm, 0, sizeof(event->comm));
 
-	if (machine__is_host(machine))
-		tgid = perf_event__get_comm_tgid(pid, event->comm.comm,
-						 sizeof(event->comm.comm));
-	else
-		tgid = machine->pid;
+	if (machine__is_host(machine)) {
+		if (perf_event__get_comm_ids(pid, event->comm.comm,
+					     sizeof(event->comm.comm),
+					     tgid, ppid) != 0) {
+			return -1;
+		}
+	} else {
+		*tgid = machine->pid;
+	}
 
-	if (tgid < 0)
-		goto out;
+	if (*tgid < 0)
+		return -1;
 
-	event->comm.pid = tgid;
+	event->comm.pid = *tgid;
 	event->comm.header.type = PERF_RECORD_COMM;
 
 	size = strlen(event->comm.comm) + 1;
@@ -125,23 +156,45 @@ static pid_t perf_event__synthesize_comm(struct perf_tool *tool,
 				machine->id_hdr_size);
 	event->comm.tid = pid;
 
+	return 0;
+}
+
+static pid_t perf_event__synthesize_comm(struct perf_tool *tool,
+					 union perf_event *event, pid_t pid,
+					 perf_event__handler_t process,
+					 struct machine *machine)
+{
+	pid_t tgid, ppid;
+
+	if (perf_event__prepare_comm(event, pid, machine, &tgid, &ppid) != 0)
+		return -1;
+
 	if (process(tool, event, &synth_sample, machine) != 0)
 		return -1;
 
-out:
 	return tgid;
 }
 
 static int perf_event__synthesize_fork(struct perf_tool *tool,
-				       union perf_event *event, pid_t pid,
-				       pid_t tgid, perf_event__handler_t process,
+				       union perf_event *event,
+				       pid_t pid, pid_t tgid, pid_t ppid,
+				       perf_event__handler_t process,
 				       struct machine *machine)
 {
 	memset(&event->fork, 0, sizeof(event->fork) + machine->id_hdr_size);
 
-	/* this is really a clone event but we use fork to synthesize it */
-	event->fork.ppid = tgid;
-	event->fork.ptid = tgid;
+	/*
+	 * for main thread set parent to ppid from status file. For other
+	 * threads set parent pid to main thread. ie., assume main thread
+	 * spawns all threads in a process
+	*/
+	if (tgid == pid) {
+		event->fork.ppid = ppid;
+		event->fork.ptid = ppid;
+	} else {
+		event->fork.ppid = tgid;
+		event->fork.ptid = tgid;
+	}
 	event->fork.pid  = tgid;
 	event->fork.tid  = pid;
 	event->fork.header.type = PERF_RECORD_FORK;
@@ -333,7 +386,8 @@ static int __event__synthesize_thread(union perf_event *comm_event,
 	char filename[PATH_MAX];
 	DIR *tasks;
 	struct dirent dirent, *next;
-	pid_t tgid;
+	pid_t tgid, ppid;
+	int rc = 0;
 
 	/* special case: only send one comm event using passed in pid */
 	if (!full) {
@@ -361,34 +415,38 @@ static int __event__synthesize_thread(union perf_event *comm_event,
 
 	while (!readdir_r(tasks, &dirent, &next) && next) {
 		char *end;
-		int rc = 0;
 		pid_t _pid;
 
 		_pid = strtol(dirent.d_name, &end, 10);
 		if (*end)
 			continue;
 
-		tgid = perf_event__synthesize_comm(tool, comm_event, _pid,
-						   process, machine);
-		if (tgid == -1)
-			return -1;
+		rc = -1;
+		if (perf_event__prepare_comm(comm_event, _pid, machine,
+					     &tgid, &ppid) != 0)
+			break;
 
+		if (perf_event__synthesize_fork(tool, fork_event, _pid, tgid,
+						ppid, process, machine) < 0)
+			break;
+		/*
+		 * Send the prepared comm event
+		 */
+		if (process(tool, comm_event, &synth_sample, machine) != 0)
+			break;
+
+		rc = 0;
 		if (_pid == pid) {
 			/* process the parent's maps too */
 			rc = perf_event__synthesize_mmap_events(tool, mmap_event, pid, tgid,
 						process, machine, mmap_data);
-		} else {
-			/* only fork the tid's map, to save time */
-			rc = perf_event__synthesize_fork(tool, fork_event, _pid, tgid,
-						 process, machine);
+			if (rc)
+				break;
 		}
-
-		if (rc)
-			return rc;
 	}
 
 	closedir(tasks);
-	return 0;
+	return rc;
 }
 
 int perf_event__synthesize_thread_map(struct perf_tool *tool,
@@ -615,7 +673,7 @@ size_t perf_event__fprintf_comm(union perf_event *event, FILE *fp)
 	else
 		s = "";
 
-	return fprintf(fp, "%s: %s:%d\n", s, event->comm.comm, event->comm.tid);
+	return fprintf(fp, "%s: %s:%d/%d\n", s, event->comm.comm, event->comm.pid, event->comm.tid);
 }
 
 int perf_event__process_comm(struct perf_tool *tool __maybe_unused,
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index c4ffe2bd0738..09b9e8d3fcf7 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -242,7 +242,6 @@ struct events_stats {
 	u32 nr_invalid_chains;
 	u32 nr_unknown_id;
 	u32 nr_unprocessable_samples;
-	u32 nr_unordered_events;
 };
 
 struct attr_event {
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 28b8ce86bf12..080be93eea96 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -7,7 +7,6 @@
  * Released under the GPL v2. (and only v2, not any later version)
  */
 #include "util.h"
-#include <api/fs/debugfs.h>
 #include <api/fs/fs.h>
 #include <poll.h>
 #include "cpumap.h"
@@ -635,8 +634,8 @@ static struct perf_evsel *perf_evlist__event2evsel(struct perf_evlist *evlist,
 union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx)
 {
 	struct perf_mmap *md = &evlist->mmap[idx];
-	unsigned int head = perf_mmap__read_head(md);
-	unsigned int old = md->prev;
+	u64 head = perf_mmap__read_head(md);
+	u64 old = md->prev;
 	unsigned char *data = md->base + page_size;
 	union perf_event *event = NULL;
 
@@ -696,7 +695,7 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx)
 
 static bool perf_mmap__empty(struct perf_mmap *md)
 {
-	return perf_mmap__read_head(md) != md->prev;
+	return perf_mmap__read_head(md) == md->prev;
 }
 
 static void perf_evlist__mmap_get(struct perf_evlist *evlist, int idx)
@@ -717,7 +716,7 @@ void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx)
 	struct perf_mmap *md = &evlist->mmap[idx];
 
 	if (!evlist->overwrite) {
-		unsigned int old = md->prev;
+		u64 old = md->prev;
 
 		perf_mmap__write_tail(md, old);
 	}
@@ -1051,7 +1050,7 @@ out_delete_threads:
 	return -1;
 }
 
-int perf_evlist__apply_filters(struct perf_evlist *evlist)
+int perf_evlist__apply_filters(struct perf_evlist *evlist, struct perf_evsel **err_evsel)
 {
 	struct perf_evsel *evsel;
 	int err = 0;
@@ -1063,8 +1062,10 @@ int perf_evlist__apply_filters(struct perf_evlist *evlist)
 			continue;
 
 		err = perf_evsel__set_filter(evsel, ncpus, nthreads, evsel->filter);
-		if (err)
+		if (err) {
+			*err_evsel = evsel;
 			break;
+		}
 	}
 
 	return err;
@@ -1086,6 +1087,38 @@ int perf_evlist__set_filter(struct perf_evlist *evlist, const char *filter)
 	return err;
 }
 
+int perf_evlist__set_filter_pids(struct perf_evlist *evlist, size_t npids, pid_t *pids)
+{
+	char *filter;
+	int ret = -1;
+	size_t i;
+
+	for (i = 0; i < npids; ++i) {
+		if (i == 0) {
+			if (asprintf(&filter, "common_pid != %d", pids[i]) < 0)
+				return -1;
+		} else {
+			char *tmp;
+
+			if (asprintf(&tmp, "%s && common_pid != %d", filter, pids[i]) < 0)
+				goto out_free;
+
+			free(filter);
+			filter = tmp;
+		}
+	}
+
+	ret = perf_evlist__set_filter(evlist, filter);
+out_free:
+	free(filter);
+	return ret;
+}
+
+int perf_evlist__set_filter_pid(struct perf_evlist *evlist, pid_t pid)
+{
+	return perf_evlist__set_filter_pids(evlist, 1, &pid);
+}
+
 bool perf_evlist__valid_sample_type(struct perf_evlist *evlist)
 {
 	struct perf_evsel *pos;
@@ -1329,7 +1362,7 @@ int perf_evlist__prepare_workload(struct perf_evlist *evlist, struct target *tar
 		 * writing exactly one byte, in workload.cork_fd, usually via
 		 * perf_evlist__start_workload().
 		 *
-		 * For cancelling the workload without actuallin running it,
+		 * For cancelling the workload without actually running it,
 		 * the parent will just close workload.cork_fd, without writing
 		 * anything, i.e. read will return zero and we just exit()
 		 * here.
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index e99a67632831..b5cce95d644e 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -27,7 +27,7 @@ struct perf_mmap {
 	void		 *base;
 	int		 mask;
 	int		 refcnt;
-	unsigned int	 prev;
+	u64		 prev;
 	char		 event_copy[PERF_SAMPLE_MAX_SIZE] __attribute__((aligned(8)));
 };
 
@@ -51,6 +51,7 @@ struct perf_evlist {
 	struct thread_map *threads;
 	struct cpu_map	  *cpus;
 	struct perf_evsel *selected;
+	struct events_stats stats;
 };
 
 struct perf_evsel_str_handler {
@@ -77,6 +78,8 @@ int perf_evlist__add_newtp(struct perf_evlist *evlist,
 			   const char *sys, const char *name, void *handler);
 
 int perf_evlist__set_filter(struct perf_evlist *evlist, const char *filter);
+int perf_evlist__set_filter_pid(struct perf_evlist *evlist, pid_t pid);
+int perf_evlist__set_filter_pids(struct perf_evlist *evlist, size_t npids, pid_t *pids);
 
 struct perf_evsel *
 perf_evlist__find_tracepoint_by_id(struct perf_evlist *evlist, int id);
@@ -149,7 +152,7 @@ static inline void perf_evlist__set_maps(struct perf_evlist *evlist,
 }
 
 int perf_evlist__create_maps(struct perf_evlist *evlist, struct target *target);
-int perf_evlist__apply_filters(struct perf_evlist *evlist);
+int perf_evlist__apply_filters(struct perf_evlist *evlist, struct perf_evsel **err_evsel);
 
 void __perf_evlist__set_leader(struct list_head *list);
 void perf_evlist__set_leader(struct perf_evlist *evlist);
@@ -186,16 +189,15 @@ size_t perf_evlist__fprintf(struct perf_evlist *evlist, FILE *fp);
 int perf_evlist__strerror_open(struct perf_evlist *evlist, int err, char *buf, size_t size);
 int perf_evlist__strerror_mmap(struct perf_evlist *evlist, int err, char *buf, size_t size);
 
-static inline unsigned int perf_mmap__read_head(struct perf_mmap *mm)
+static inline u64 perf_mmap__read_head(struct perf_mmap *mm)
 {
 	struct perf_event_mmap_page *pc = mm->base;
-	int head = ACCESS_ONCE(pc->data_head);
+	u64 head = ACCESS_ONCE(pc->data_head);
 	rmb();
 	return head;
 }
 
-static inline void perf_mmap__write_tail(struct perf_mmap *md,
-					 unsigned long tail)
+static inline void perf_mmap__write_tail(struct perf_mmap *md, u64 tail)
 {
 	struct perf_event_mmap_page *pc = md->base;
 
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index ea51a90e20a0..33e3fd8c2e68 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -32,8 +32,12 @@ static struct {
 	bool exclude_guest;
 	bool mmap2;
 	bool cloexec;
+	bool clockid;
+	bool clockid_wrong;
 } perf_missing_features;
 
+static clockid_t clockid;
+
 static int perf_evsel__no_extra_init(struct perf_evsel *evsel __maybe_unused)
 {
 	return 0;
@@ -537,13 +541,30 @@ int perf_evsel__group_desc(struct perf_evsel *evsel, char *buf, size_t size)
 }
 
 static void
-perf_evsel__config_callgraph(struct perf_evsel *evsel)
+perf_evsel__config_callgraph(struct perf_evsel *evsel,
+			     struct record_opts *opts)
 {
 	bool function = perf_evsel__is_function_event(evsel);
 	struct perf_event_attr *attr = &evsel->attr;
 
 	perf_evsel__set_sample_bit(evsel, CALLCHAIN);
 
+	if (callchain_param.record_mode == CALLCHAIN_LBR) {
+		if (!opts->branch_stack) {
+			if (attr->exclude_user) {
+				pr_warning("LBR callstack option is only available "
+					   "to get user callchain information. "
+					   "Falling back to framepointers.\n");
+			} else {
+				perf_evsel__set_sample_bit(evsel, BRANCH_STACK);
+				attr->branch_sample_type = PERF_SAMPLE_BRANCH_USER |
+							PERF_SAMPLE_BRANCH_CALL_STACK;
+			}
+		} else
+			 pr_warning("Cannot use LBR callstack with branch stack. "
+				    "Falling back to framepointers.\n");
+	}
+
 	if (callchain_param.record_mode == CALLCHAIN_DWARF) {
 		if (!function) {
 			perf_evsel__set_sample_bit(evsel, REGS_USER);
@@ -667,7 +688,7 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts)
 		evsel->attr.exclude_callchain_user = 1;
 
 	if (callchain_param.enabled && !evsel->no_aux_samples)
-		perf_evsel__config_callgraph(evsel);
+		perf_evsel__config_callgraph(evsel, opts);
 
 	if (opts->sample_intr_regs) {
 		attr->sample_regs_intr = PERF_REGS_MASK;
@@ -717,6 +738,12 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts)
 	if (opts->sample_transaction)
 		perf_evsel__set_sample_bit(evsel, TRANSACTION);
 
+	if (opts->running_time) {
+		evsel->attr.read_format |=
+			PERF_FORMAT_TOTAL_TIME_ENABLED |
+			PERF_FORMAT_TOTAL_TIME_RUNNING;
+	}
+
 	/*
 	 * XXX see the function comment above
 	 *
@@ -738,6 +765,12 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts)
 		attr->disabled = 0;
 		attr->enable_on_exec = 0;
 	}
+
+	clockid = opts->clockid;
+	if (opts->use_clockid) {
+		attr->use_clockid = 1;
+		attr->clockid = opts->clockid;
+	}
 }
 
 static int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
@@ -978,67 +1011,126 @@ static int get_group_fd(struct perf_evsel *evsel, int cpu, int thread)
 	return fd;
 }
 
-#define __PRINT_ATTR(fmt, cast, field)  \
-	fprintf(fp, "  %-19s "fmt"\n", #field, cast attr->field)
+struct bit_names {
+	int bit;
+	const char *name;
+};
 
-#define PRINT_ATTR_U32(field)  __PRINT_ATTR("%u" , , field)
-#define PRINT_ATTR_X32(field)  __PRINT_ATTR("%#x", , field)
-#define PRINT_ATTR_U64(field)  __PRINT_ATTR("%" PRIu64, (uint64_t), field)
-#define PRINT_ATTR_X64(field)  __PRINT_ATTR("%#"PRIx64, (uint64_t), field)
-
-#define PRINT_ATTR2N(name1, field1, name2, field2)	\
-	fprintf(fp, "  %-19s %u    %-19s %u\n",		\
-	name1, attr->field1, name2, attr->field2)
-
-#define PRINT_ATTR2(field1, field2) \
-	PRINT_ATTR2N(#field1, field1, #field2, field2)
-
-static size_t perf_event_attr__fprintf(struct perf_event_attr *attr, FILE *fp)
+static void __p_bits(char *buf, size_t size, u64 value, struct bit_names *bits)
 {
-	size_t ret = 0;
+	bool first_bit = true;
+	int i = 0;
 
-	ret += fprintf(fp, "%.60s\n", graph_dotted_line);
-	ret += fprintf(fp, "perf_event_attr:\n");
+	do {
+		if (value & bits[i].bit) {
+			buf += scnprintf(buf, size, "%s%s", first_bit ? "" : "|", bits[i].name);
+			first_bit = false;
+		}
+	} while (bits[++i].name != NULL);
+}
 
-	ret += PRINT_ATTR_U32(type);
-	ret += PRINT_ATTR_U32(size);
-	ret += PRINT_ATTR_X64(config);
-	ret += PRINT_ATTR_U64(sample_period);
-	ret += PRINT_ATTR_U64(sample_freq);
-	ret += PRINT_ATTR_X64(sample_type);
-	ret += PRINT_ATTR_X64(read_format);
+static void __p_sample_type(char *buf, size_t size, u64 value)
+{
+#define bit_name(n) { PERF_SAMPLE_##n, #n }
+	struct bit_names bits[] = {
+		bit_name(IP), bit_name(TID), bit_name(TIME), bit_name(ADDR),
+		bit_name(READ), bit_name(CALLCHAIN), bit_name(ID), bit_name(CPU),
+		bit_name(PERIOD), bit_name(STREAM_ID), bit_name(RAW),
+		bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER),
+		bit_name(IDENTIFIER), bit_name(REGS_INTR),
+		{ .name = NULL, }
+	};
+#undef bit_name
+	__p_bits(buf, size, value, bits);
+}
 
-	ret += PRINT_ATTR2(disabled, inherit);
-	ret += PRINT_ATTR2(pinned, exclusive);
-	ret += PRINT_ATTR2(exclude_user, exclude_kernel);
-	ret += PRINT_ATTR2(exclude_hv, exclude_idle);
-	ret += PRINT_ATTR2(mmap, comm);
-	ret += PRINT_ATTR2(mmap2, comm_exec);
-	ret += PRINT_ATTR2(freq, inherit_stat);
-	ret += PRINT_ATTR2(enable_on_exec, task);
-	ret += PRINT_ATTR2(watermark, precise_ip);
-	ret += PRINT_ATTR2(mmap_data, sample_id_all);
-	ret += PRINT_ATTR2(exclude_host, exclude_guest);
-	ret += PRINT_ATTR2N("excl.callchain_kern", exclude_callchain_kernel,
-			    "excl.callchain_user", exclude_callchain_user);
+static void __p_read_format(char *buf, size_t size, u64 value)
+{
+#define bit_name(n) { PERF_FORMAT_##n, #n }
+	struct bit_names bits[] = {
+		bit_name(TOTAL_TIME_ENABLED), bit_name(TOTAL_TIME_RUNNING),
+		bit_name(ID), bit_name(GROUP),
+		{ .name = NULL, }
+	};
+#undef bit_name
+	__p_bits(buf, size, value, bits);
+}
 
-	ret += PRINT_ATTR_U32(wakeup_events);
-	ret += PRINT_ATTR_U32(wakeup_watermark);
-	ret += PRINT_ATTR_X32(bp_type);
-	ret += PRINT_ATTR_X64(bp_addr);
-	ret += PRINT_ATTR_X64(config1);
-	ret += PRINT_ATTR_U64(bp_len);
-	ret += PRINT_ATTR_X64(config2);
-	ret += PRINT_ATTR_X64(branch_sample_type);
-	ret += PRINT_ATTR_X64(sample_regs_user);
-	ret += PRINT_ATTR_U32(sample_stack_user);
-	ret += PRINT_ATTR_X64(sample_regs_intr);
+#define BUF_SIZE		1024
 
-	ret += fprintf(fp, "%.60s\n", graph_dotted_line);
+#define p_hex(val)		snprintf(buf, BUF_SIZE, "%"PRIx64, (uint64_t)(val))
+#define p_unsigned(val)		snprintf(buf, BUF_SIZE, "%"PRIu64, (uint64_t)(val))
+#define p_signed(val)		snprintf(buf, BUF_SIZE, "%"PRId64, (int64_t)(val))
+#define p_sample_type(val)	__p_sample_type(buf, BUF_SIZE, val)
+#define p_read_format(val)	__p_read_format(buf, BUF_SIZE, val)
+
+#define PRINT_ATTRn(_n, _f, _p)				\
+do {							\
+	if (attr->_f) {					\
+		_p(attr->_f);				\
+		ret += attr__fprintf(fp, _n, buf, priv);\
+	}						\
+} while (0)
+
+#define PRINT_ATTRf(_f, _p)	PRINT_ATTRn(#_f, _f, _p)
+
+int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
+			     attr__fprintf_f attr__fprintf, void *priv)
+{
+	char buf[BUF_SIZE];
+	int ret = 0;
+
+	PRINT_ATTRf(type, p_unsigned);
+	PRINT_ATTRf(size, p_unsigned);
+	PRINT_ATTRf(config, p_hex);
+	PRINT_ATTRn("{ sample_period, sample_freq }", sample_period, p_unsigned);
+	PRINT_ATTRf(sample_type, p_sample_type);
+	PRINT_ATTRf(read_format, p_read_format);
+
+	PRINT_ATTRf(disabled, p_unsigned);
+	PRINT_ATTRf(inherit, p_unsigned);
+	PRINT_ATTRf(pinned, p_unsigned);
+	PRINT_ATTRf(exclusive, p_unsigned);
+	PRINT_ATTRf(exclude_user, p_unsigned);
+	PRINT_ATTRf(exclude_kernel, p_unsigned);
+	PRINT_ATTRf(exclude_hv, p_unsigned);
+	PRINT_ATTRf(exclude_idle, p_unsigned);
+	PRINT_ATTRf(mmap, p_unsigned);
+	PRINT_ATTRf(comm, p_unsigned);
+	PRINT_ATTRf(freq, p_unsigned);
+	PRINT_ATTRf(inherit_stat, p_unsigned);
+	PRINT_ATTRf(enable_on_exec, p_unsigned);
+	PRINT_ATTRf(task, p_unsigned);
+	PRINT_ATTRf(watermark, p_unsigned);
+	PRINT_ATTRf(precise_ip, p_unsigned);
+	PRINT_ATTRf(mmap_data, p_unsigned);
+	PRINT_ATTRf(sample_id_all, p_unsigned);
+	PRINT_ATTRf(exclude_host, p_unsigned);
+	PRINT_ATTRf(exclude_guest, p_unsigned);
+	PRINT_ATTRf(exclude_callchain_kernel, p_unsigned);
+	PRINT_ATTRf(exclude_callchain_user, p_unsigned);
+	PRINT_ATTRf(mmap2, p_unsigned);
+	PRINT_ATTRf(comm_exec, p_unsigned);
+	PRINT_ATTRf(use_clockid, p_unsigned);
+
+	PRINT_ATTRn("{ wakeup_events, wakeup_watermark }", wakeup_events, p_unsigned);
+	PRINT_ATTRf(bp_type, p_unsigned);
+	PRINT_ATTRn("{ bp_addr, config1 }", bp_addr, p_hex);
+	PRINT_ATTRn("{ bp_len, config2 }", bp_len, p_hex);
+	PRINT_ATTRf(sample_regs_user, p_hex);
+	PRINT_ATTRf(sample_stack_user, p_unsigned);
+	PRINT_ATTRf(clockid, p_signed);
+	PRINT_ATTRf(sample_regs_intr, p_hex);
 
 	return ret;
 }
 
+static int __open_attr__fprintf(FILE *fp, const char *name, const char *val,
+				void *priv __attribute__((unused)))
+{
+	return fprintf(fp, "  %-32s %s\n", name, val);
+}
+
 static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
 			      struct thread_map *threads)
 {
@@ -1062,6 +1154,12 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
 	}
 
 fallback_missing_features:
+	if (perf_missing_features.clockid_wrong)
+		evsel->attr.clockid = CLOCK_MONOTONIC; /* should always work */
+	if (perf_missing_features.clockid) {
+		evsel->attr.use_clockid = 0;
+		evsel->attr.clockid = 0;
+	}
 	if (perf_missing_features.cloexec)
 		flags &= ~(unsigned long)PERF_FLAG_FD_CLOEXEC;
 	if (perf_missing_features.mmap2)
@@ -1072,8 +1170,12 @@ retry_sample_id:
 	if (perf_missing_features.sample_id_all)
 		evsel->attr.sample_id_all = 0;
 
-	if (verbose >= 2)
-		perf_event_attr__fprintf(&evsel->attr, stderr);
+	if (verbose >= 2) {
+		fprintf(stderr, "%.60s\n", graph_dotted_line);
+		fprintf(stderr, "perf_event_attr:\n");
+		perf_event_attr__fprintf(stderr, &evsel->attr, __open_attr__fprintf, NULL);
+		fprintf(stderr, "%.60s\n", graph_dotted_line);
+	}
 
 	for (cpu = 0; cpu < cpus->nr; cpu++) {
 
@@ -1099,6 +1201,17 @@ retry_open:
 				goto try_fallback;
 			}
 			set_rlimit = NO_CHANGE;
+
+			/*
+			 * If we succeeded but had to kill clockid, fail and
+			 * have perf_evsel__open_strerror() print us a nice
+			 * error.
+			 */
+			if (perf_missing_features.clockid ||
+			    perf_missing_features.clockid_wrong) {
+				err = -EINVAL;
+				goto out_close;
+			}
 		}
 	}
 
@@ -1132,7 +1245,17 @@ try_fallback:
 	if (err != -EINVAL || cpu > 0 || thread > 0)
 		goto out_close;
 
-	if (!perf_missing_features.cloexec && (flags & PERF_FLAG_FD_CLOEXEC)) {
+	/*
+	 * Must probe features in the order they were added to the
+	 * perf_event_attr interface.
+	 */
+	if (!perf_missing_features.clockid_wrong && evsel->attr.use_clockid) {
+		perf_missing_features.clockid_wrong = true;
+		goto fallback_missing_features;
+	} else if (!perf_missing_features.clockid && evsel->attr.use_clockid) {
+		perf_missing_features.clockid = true;
+		goto fallback_missing_features;
+	} else if (!perf_missing_features.cloexec && (flags & PERF_FLAG_FD_CLOEXEC)) {
 		perf_missing_features.cloexec = true;
 		goto fallback_missing_features;
 	} else if (!perf_missing_features.mmap2 && evsel->attr.mmap2) {
@@ -1892,7 +2015,7 @@ u64 perf_evsel__intval(struct perf_evsel *evsel, struct perf_sample *sample,
 		value = *(u32 *)ptr;
 		break;
 	case 8:
-		value = *(u64 *)ptr;
+		memcpy(&value, ptr, sizeof(u64));
 		break;
 	default:
 		return 0;
@@ -1933,62 +2056,9 @@ static int comma_fprintf(FILE *fp, bool *first, const char *fmt, ...)
 	return ret;
 }
 
-static int __if_fprintf(FILE *fp, bool *first, const char *field, u64 value)
+static int __print_attr__fprintf(FILE *fp, const char *name, const char *val, void *priv)
 {
-	if (value == 0)
-		return 0;
-
-	return comma_fprintf(fp, first, " %s: %" PRIu64, field, value);
-}
-
-#define if_print(field) printed += __if_fprintf(fp, &first, #field, evsel->attr.field)
-
-struct bit_names {
-	int bit;
-	const char *name;
-};
-
-static int bits__fprintf(FILE *fp, const char *field, u64 value,
-			 struct bit_names *bits, bool *first)
-{
-	int i = 0, printed = comma_fprintf(fp, first, " %s: ", field);
-	bool first_bit = true;
-
-	do {
-		if (value & bits[i].bit) {
-			printed += fprintf(fp, "%s%s", first_bit ? "" : "|", bits[i].name);
-			first_bit = false;
-		}
-	} while (bits[++i].name != NULL);
-
-	return printed;
-}
-
-static int sample_type__fprintf(FILE *fp, bool *first, u64 value)
-{
-#define bit_name(n) { PERF_SAMPLE_##n, #n }
-	struct bit_names bits[] = {
-		bit_name(IP), bit_name(TID), bit_name(TIME), bit_name(ADDR),
-		bit_name(READ), bit_name(CALLCHAIN), bit_name(ID), bit_name(CPU),
-		bit_name(PERIOD), bit_name(STREAM_ID), bit_name(RAW),
-		bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER),
-		bit_name(IDENTIFIER), bit_name(REGS_INTR),
-		{ .name = NULL, }
-	};
-#undef bit_name
-	return bits__fprintf(fp, "sample_type", value, bits, first);
-}
-
-static int read_format__fprintf(FILE *fp, bool *first, u64 value)
-{
-#define bit_name(n) { PERF_FORMAT_##n, #n }
-	struct bit_names bits[] = {
-		bit_name(TOTAL_TIME_ENABLED), bit_name(TOTAL_TIME_RUNNING),
-		bit_name(ID), bit_name(GROUP),
-		{ .name = NULL, }
-	};
-#undef bit_name
-	return bits__fprintf(fp, "read_format", value, bits, first);
+	return comma_fprintf(fp, (bool *)priv, " %s: %s", name, val);
 }
 
 int perf_evsel__fprintf(struct perf_evsel *evsel,
@@ -2017,47 +2087,13 @@ int perf_evsel__fprintf(struct perf_evsel *evsel,
 
 	printed += fprintf(fp, "%s", perf_evsel__name(evsel));
 
-	if (details->verbose || details->freq) {
+	if (details->verbose) {
+		printed += perf_event_attr__fprintf(fp, &evsel->attr,
+						    __print_attr__fprintf, &first);
+	} else if (details->freq) {
 		printed += comma_fprintf(fp, &first, " sample_freq=%" PRIu64,
 					 (u64)evsel->attr.sample_freq);
 	}
-
-	if (details->verbose) {
-		if_print(type);
-		if_print(config);
-		if_print(config1);
-		if_print(config2);
-		if_print(size);
-		printed += sample_type__fprintf(fp, &first, evsel->attr.sample_type);
-		if (evsel->attr.read_format)
-			printed += read_format__fprintf(fp, &first, evsel->attr.read_format);
-		if_print(disabled);
-		if_print(inherit);
-		if_print(pinned);
-		if_print(exclusive);
-		if_print(exclude_user);
-		if_print(exclude_kernel);
-		if_print(exclude_hv);
-		if_print(exclude_idle);
-		if_print(mmap);
-		if_print(mmap2);
-		if_print(comm);
-		if_print(comm_exec);
-		if_print(freq);
-		if_print(inherit_stat);
-		if_print(enable_on_exec);
-		if_print(task);
-		if_print(watermark);
-		if_print(precise_ip);
-		if_print(mmap_data);
-		if_print(sample_id_all);
-		if_print(exclude_host);
-		if_print(exclude_guest);
-		if_print(__reserved_1);
-		if_print(wakeup_events);
-		if_print(bp_type);
-		if_print(branch_sample_type);
-	}
 out:
 	fputc('\n', fp);
 	return ++printed;
@@ -2135,6 +2171,12 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target,
 	"The PMU counters are busy/taken by another profiler.\n"
 	"We found oprofile daemon running, please stop it and try again.");
 		break;
+	case EINVAL:
+		if (perf_missing_features.clockid)
+			return scnprintf(msg, size, "clockid feature not supported.");
+		if (perf_missing_features.clockid_wrong)
+			return scnprintf(msg, size, "wrong clockid (%d).", clockid);
+		break;
 	default:
 		break;
 	}
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 38622747d130..e486151b0308 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -335,6 +335,7 @@ struct perf_attr_details {
 	bool freq;
 	bool verbose;
 	bool event_group;
+	bool force;
 };
 
 int perf_evsel__fprintf(struct perf_evsel *evsel,
@@ -355,4 +356,14 @@ for ((_evsel) = list_entry((_leader)->node.next, struct perf_evsel, node); 	\
      (_evsel) && (_evsel)->leader == (_leader);					\
      (_evsel) = list_entry((_evsel)->node.next, struct perf_evsel, node))
 
+static inline bool has_branch_callstack(struct perf_evsel *evsel)
+{
+	return evsel->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK;
+}
+
+typedef int (*attr__fprintf_f)(FILE *, const char *, const char *, void *);
+
+int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
+			     attr__fprintf_f attr__fprintf, void *priv);
+
 #endif /* __PERF_EVSEL_H */
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 1f407f7352a7..918fd8ae2d80 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -1055,6 +1055,12 @@ error:
 	goto out;
 }
 
+static int __desc_attr__fprintf(FILE *fp, const char *name, const char *val,
+				void *priv __attribute__((unused)))
+{
+	return fprintf(fp, ", %s = %s", name, val);
+}
+
 static void print_event_desc(struct perf_header *ph, int fd, FILE *fp)
 {
 	struct perf_evsel *evsel, *events = read_event_desc(ph, fd);
@@ -1069,26 +1075,6 @@ static void print_event_desc(struct perf_header *ph, int fd, FILE *fp)
 	for (evsel = events; evsel->attr.size; evsel++) {
 		fprintf(fp, "# event : name = %s, ", evsel->name);
 
-		fprintf(fp, "type = %d, config = 0x%"PRIx64
-			    ", config1 = 0x%"PRIx64", config2 = 0x%"PRIx64,
-				evsel->attr.type,
-				(u64)evsel->attr.config,
-				(u64)evsel->attr.config1,
-				(u64)evsel->attr.config2);
-
-		fprintf(fp, ", excl_usr = %d, excl_kern = %d",
-				evsel->attr.exclude_user,
-				evsel->attr.exclude_kernel);
-
-		fprintf(fp, ", excl_host = %d, excl_guest = %d",
-				evsel->attr.exclude_host,
-				evsel->attr.exclude_guest);
-
-		fprintf(fp, ", precise_ip = %d", evsel->attr.precise_ip);
-
-		fprintf(fp, ", attr_mmap2 = %d", evsel->attr.mmap2);
-		fprintf(fp, ", attr_mmap  = %d", evsel->attr.mmap);
-		fprintf(fp, ", attr_mmap_data = %d", evsel->attr.mmap_data);
 		if (evsel->ids) {
 			fprintf(fp, ", id = {");
 			for (j = 0, id = evsel->id; j < evsel->ids; j++, id++) {
@@ -1099,6 +1085,8 @@ static void print_event_desc(struct perf_header *ph, int fd, FILE *fp)
 			fprintf(fp, " }");
 		}
 
+		perf_event_attr__fprintf(fp, &evsel->attr, __desc_attr__fprintf, NULL);
+
 		fputc('\n', fp);
 	}
 
@@ -1266,7 +1254,7 @@ static int __event_process_build_id(struct build_id_event *bev,
 
 		dso__set_build_id(dso, &bev->build_id);
 
-		if (!is_kernel_module(filename, NULL))
+		if (!is_kernel_module(filename))
 			dso->kernel = dso_type;
 
 		build_id__sprintf(dso->build_id, sizeof(dso->build_id),
@@ -2516,8 +2504,11 @@ int perf_session__read_header(struct perf_session *session)
 		if (read_attr(fd, header, &f_attr) < 0)
 			goto out_errno;
 
-		if (header->needs_swap)
+		if (header->needs_swap) {
+			f_attr.ids.size   = bswap_64(f_attr.ids.size);
+			f_attr.ids.offset = bswap_64(f_attr.ids.offset);
 			perf_event__attr_swap(&f_attr.attr);
+		}
 
 		tmp = lseek(fd, 0, SEEK_CUR);
 		evsel = perf_evsel__new(&f_attr.attr);
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 70b48a65064c..cc22b9158b93 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -263,15 +263,9 @@ void hists__decay_entries(struct hists *hists, bool zap_user, bool zap_kernel)
 	while (next) {
 		n = rb_entry(next, struct hist_entry, rb_node);
 		next = rb_next(&n->rb_node);
-		/*
-		 * We may be annotating this, for instance, so keep it here in
-		 * case some it gets new samples, we'll eventually free it when
-		 * the user stops browsing and it agains gets fully decayed.
-		 */
 		if (((zap_user && n->level == '.') ||
 		     (zap_kernel && n->level != '.') ||
-		     hists__decay_entry(hists, n)) &&
-		    !n->used) {
+		     hists__decay_entry(hists, n))) {
 			hists__delete_entry(hists, n);
 		}
 	}
@@ -355,6 +349,7 @@ static struct hist_entry *hist_entry__new(struct hist_entry *template,
 			callchain_init(he->callchain);
 
 		INIT_LIST_HEAD(&he->pairs.node);
+		thread__get(he->thread);
 	}
 
 	return he;
@@ -941,6 +936,7 @@ hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
 
 void hist_entry__delete(struct hist_entry *he)
 {
+	thread__zput(he->thread);
 	zfree(&he->branch_info);
 	zfree(&he->mem_info);
 	zfree(&he->stat_acc);
@@ -1169,6 +1165,7 @@ static void hists__remove_entry_filter(struct hists *hists, struct hist_entry *h
 	/* force fold unfiltered entry for simplicity */
 	h->ms.unfolded = false;
 	h->row_offset = 0;
+	h->nr_rows = 0;
 
 	hists->stats.nr_non_filtered_samples += h->stat.nr_events;
 
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 2b690d028907..9f31b89a527a 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -60,7 +60,7 @@ struct hists {
 	struct rb_root		entries_collapsed;
 	u64			nr_entries;
 	u64			nr_non_filtered_entries;
-	const struct thread	*thread_filter;
+	struct thread		*thread_filter;
 	const struct dso	*dso_filter;
 	const char		*uid_filter_str;
 	const char		*symbol_filter_str;
@@ -303,6 +303,9 @@ struct hist_browser_timer {
 
 #ifdef HAVE_SLANG_SUPPORT
 #include "../ui/keysyms.h"
+int map_symbol__tui_annotate(struct map_symbol *ms, struct perf_evsel *evsel,
+			     struct hist_browser_timer *hbt);
+
 int hist_entry__tui_annotate(struct hist_entry *he, struct perf_evsel *evsel,
 			     struct hist_browser_timer *hbt);
 
@@ -321,6 +324,12 @@ int perf_evlist__tui_browse_hists(struct perf_evlist *evlist __maybe_unused,
 {
 	return 0;
 }
+static inline int map_symbol__tui_annotate(struct map_symbol *ms __maybe_unused,
+					   struct perf_evsel *evsel __maybe_unused,
+					   struct hist_browser_timer *hbt __maybe_unused)
+{
+	return 0;
+}
 
 static inline int hist_entry__tui_annotate(struct hist_entry *he __maybe_unused,
 					   struct perf_evsel *evsel __maybe_unused,
diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h
index cf1d7913783b..ae825d4ec110 100644
--- a/tools/perf/util/kvm-stat.h
+++ b/tools/perf/util/kvm-stat.h
@@ -99,6 +99,7 @@ struct perf_kvm_stat {
 	int timerfd;
 	unsigned int display_time;
 	bool live;
+	bool force;
 };
 
 struct kvm_reg_events_ops {
diff --git a/tools/perf/util/lzma.c b/tools/perf/util/lzma.c
new file mode 100644
index 000000000000..95a1acb61245
--- /dev/null
+++ b/tools/perf/util/lzma.c
@@ -0,0 +1,95 @@
+#include <lzma.h>
+#include <stdio.h>
+#include <linux/compiler.h>
+#include "util.h"
+#include "debug.h"
+
+#define BUFSIZE 8192
+
+static const char *lzma_strerror(lzma_ret ret)
+{
+	switch ((int) ret) {
+	case LZMA_MEM_ERROR:
+		return "Memory allocation failed";
+	case LZMA_OPTIONS_ERROR:
+		return "Unsupported decompressor flags";
+	case LZMA_FORMAT_ERROR:
+		return "The input is not in the .xz format";
+	case LZMA_DATA_ERROR:
+		return "Compressed file is corrupt";
+	case LZMA_BUF_ERROR:
+		return "Compressed file is truncated or otherwise corrupt";
+	default:
+		return "Unknown error, possibly a bug";
+	}
+}
+
+int lzma_decompress_to_file(const char *input, int output_fd)
+{
+	lzma_action action = LZMA_RUN;
+	lzma_stream strm   = LZMA_STREAM_INIT;
+	lzma_ret ret;
+
+	u8 buf_in[BUFSIZE];
+	u8 buf_out[BUFSIZE];
+	FILE *infile;
+
+	infile = fopen(input, "rb");
+	if (!infile) {
+		pr_err("lzma: fopen failed on %s: '%s'\n",
+		       input, strerror(errno));
+		return -1;
+	}
+
+	ret = lzma_stream_decoder(&strm, UINT64_MAX, LZMA_CONCATENATED);
+	if (ret != LZMA_OK) {
+		pr_err("lzma: lzma_stream_decoder failed %s (%d)\n",
+			lzma_strerror(ret), ret);
+		return -1;
+	}
+
+	strm.next_in   = NULL;
+	strm.avail_in  = 0;
+	strm.next_out  = buf_out;
+	strm.avail_out = sizeof(buf_out);
+
+	while (1) {
+		if (strm.avail_in == 0 && !feof(infile)) {
+			strm.next_in  = buf_in;
+			strm.avail_in = fread(buf_in, 1, sizeof(buf_in), infile);
+
+			if (ferror(infile)) {
+				pr_err("lzma: read error: %s\n", strerror(errno));
+				return -1;
+			}
+
+			if (feof(infile))
+				action = LZMA_FINISH;
+		}
+
+		ret = lzma_code(&strm, action);
+
+		if (strm.avail_out == 0 || ret == LZMA_STREAM_END) {
+			ssize_t write_size = sizeof(buf_out) - strm.avail_out;
+
+			if (writen(output_fd, buf_out, write_size) != write_size) {
+				pr_err("lzma: write error: %s\n", strerror(errno));
+				return -1;
+			}
+
+			strm.next_out  = buf_out;
+			strm.avail_out = sizeof(buf_out);
+		}
+
+		if (ret != LZMA_OK) {
+			if (ret == LZMA_STREAM_END)
+				return 0;
+
+			pr_err("lzma: failed %s\n", lzma_strerror(ret));
+			return -1;
+		}
+	}
+
+	fclose(infile);
+	return 0;
+}
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 1bca3a9f2b16..527e032e24f6 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -89,16 +89,6 @@ static void dsos__delete(struct dsos *dsos)
 	}
 }
 
-void machine__delete_dead_threads(struct machine *machine)
-{
-	struct thread *n, *t;
-
-	list_for_each_entry_safe(t, n, &machine->dead_threads, node) {
-		list_del(&t->node);
-		thread__delete(t);
-	}
-}
-
 void machine__delete_threads(struct machine *machine)
 {
 	struct rb_node *nd = rb_first(&machine->threads);
@@ -106,9 +96,8 @@ void machine__delete_threads(struct machine *machine)
 	while (nd) {
 		struct thread *t = rb_entry(nd, struct thread, rb_node);
 
-		rb_erase(&t->rb_node, &machine->threads);
 		nd = rb_next(nd);
-		thread__delete(t);
+		machine__remove_thread(machine, t);
 	}
 }
 
@@ -361,9 +350,13 @@ static struct thread *__machine__findnew_thread(struct machine *machine,
 	 * the full rbtree:
 	 */
 	th = machine->last_match;
-	if (th && th->tid == tid) {
-		machine__update_thread_pid(machine, th, pid);
-		return th;
+	if (th != NULL) {
+		if (th->tid == tid) {
+			machine__update_thread_pid(machine, th, pid);
+			return th;
+		}
+
+		thread__zput(machine->last_match);
 	}
 
 	while (*p != NULL) {
@@ -371,7 +364,7 @@ static struct thread *__machine__findnew_thread(struct machine *machine,
 		th = rb_entry(parent, struct thread, rb_node);
 
 		if (th->tid == tid) {
-			machine->last_match = th;
+			machine->last_match = thread__get(th);
 			machine__update_thread_pid(machine, th, pid);
 			return th;
 		}
@@ -403,8 +396,11 @@ static struct thread *__machine__findnew_thread(struct machine *machine,
 			thread__delete(th);
 			return NULL;
 		}
-
-		machine->last_match = th;
+		/*
+		 * It is now in the rbtree, get a ref
+		 */
+		thread__get(th);
+		machine->last_match = thread__get(th);
 	}
 
 	return th;
@@ -462,30 +458,61 @@ int machine__process_lost_event(struct machine *machine __maybe_unused,
 	return 0;
 }
 
+static struct dso*
+machine__module_dso(struct machine *machine, struct kmod_path *m,
+		    const char *filename)
+{
+	struct dso *dso;
+
+	dso = dsos__find(&machine->kernel_dsos, m->name, true);
+	if (!dso) {
+		dso = dsos__addnew(&machine->kernel_dsos, m->name);
+		if (dso == NULL)
+			return NULL;
+
+		if (machine__is_host(machine))
+			dso->symtab_type = DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE;
+		else
+			dso->symtab_type = DSO_BINARY_TYPE__GUEST_KMODULE;
+
+		/* _KMODULE_COMP should be next to _KMODULE */
+		if (m->kmod && m->comp)
+			dso->symtab_type++;
+
+		dso__set_short_name(dso, strdup(m->name), true);
+		dso__set_long_name(dso, strdup(filename), true);
+	}
+
+	return dso;
+}
+
 struct map *machine__new_module(struct machine *machine, u64 start,
 				const char *filename)
 {
-	struct map *map;
-	struct dso *dso = __dsos__findnew(&machine->kernel_dsos, filename);
-	bool compressed;
+	struct map *map = NULL;
+	struct dso *dso;
+	struct kmod_path m;
 
-	if (dso == NULL)
+	if (kmod_path__parse_name(&m, filename))
 		return NULL;
 
+	map = map_groups__find_by_name(&machine->kmaps, MAP__FUNCTION,
+				       m.name);
+	if (map)
+		goto out;
+
+	dso = machine__module_dso(machine, &m, filename);
+	if (dso == NULL)
+		goto out;
+
 	map = map__new2(start, dso, MAP__FUNCTION);
 	if (map == NULL)
-		return NULL;
-
-	if (machine__is_host(machine))
-		dso->symtab_type = DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE;
-	else
-		dso->symtab_type = DSO_BINARY_TYPE__GUEST_KMODULE;
-
-	/* _KMODULE_COMP should be next to _KMODULE */
-	if (is_kernel_module(filename, &compressed) && compressed)
-		dso->symtab_type++;
+		goto out;
 
 	map_groups__insert(&machine->kmaps, map);
+
+out:
+	free(m.name);
 	return map;
 }
 
@@ -650,6 +677,9 @@ int __machine__create_kernel_maps(struct machine *machine, struct dso *kernel)
 			machine->vmlinux_maps[type]->unmap_ip =
 				identity__map_ip;
 		kmap = map__kmap(machine->vmlinux_maps[type]);
+		if (!kmap)
+			return -1;
+
 		kmap->kmaps = &machine->kmaps;
 		map_groups__insert(&machine->kmaps,
 				   machine->vmlinux_maps[type]);
@@ -671,7 +701,7 @@ void machine__destroy_kernel_maps(struct machine *machine)
 		kmap = map__kmap(machine->vmlinux_maps[type]);
 		map_groups__remove(&machine->kmaps,
 				   machine->vmlinux_maps[type]);
-		if (kmap->ref_reloc_sym) {
+		if (kmap && kmap->ref_reloc_sym) {
 			/*
 			 * ref_reloc_sym is shared among all maps, so free just
 			 * on one of them.
@@ -827,6 +857,39 @@ static char *get_kernel_version(const char *root_dir)
 	return strdup(name);
 }
 
+static bool is_kmod_dso(struct dso *dso)
+{
+	return dso->symtab_type == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE ||
+	       dso->symtab_type == DSO_BINARY_TYPE__GUEST_KMODULE;
+}
+
+static int map_groups__set_module_path(struct map_groups *mg, const char *path,
+				       struct kmod_path *m)
+{
+	struct map *map;
+	char *long_name;
+
+	map = map_groups__find_by_name(mg, MAP__FUNCTION, m->name);
+	if (map == NULL)
+		return 0;
+
+	long_name = strdup(path);
+	if (long_name == NULL)
+		return -ENOMEM;
+
+	dso__set_long_name(map->dso, long_name, true);
+	dso__kernel_module_get_build_id(map->dso, "");
+
+	/*
+	 * Full name could reveal us kmod compression, so
+	 * we need to update the symtab_type if needed.
+	 */
+	if (m->comp && is_kmod_dso(map->dso))
+		map->dso->symtab_type++;
+
+	return 0;
+}
+
 static int map_groups__set_modules_path_dir(struct map_groups *mg,
 				const char *dir_name, int depth)
 {
@@ -865,35 +928,19 @@ static int map_groups__set_modules_path_dir(struct map_groups *mg,
 			if (ret < 0)
 				goto out;
 		} else {
-			char *dot = strrchr(dent->d_name, '.'),
-			     dso_name[PATH_MAX];
-			struct map *map;
-			char *long_name;
+			struct kmod_path m;
 
-			if (dot == NULL)
-				continue;
-
-			/* On some system, modules are compressed like .ko.gz */
-			if (is_supported_compression(dot + 1) &&
-			    is_kmodule_extension(dot - 2))
-				dot -= 3;
-
-			snprintf(dso_name, sizeof(dso_name), "[%.*s]",
-				 (int)(dot - dent->d_name), dent->d_name);
-
-			strxfrchar(dso_name, '-', '_');
-			map = map_groups__find_by_name(mg, MAP__FUNCTION,
-						       dso_name);
-			if (map == NULL)
-				continue;
-
-			long_name = strdup(path);
-			if (long_name == NULL) {
-				ret = -1;
+			ret = kmod_path__parse_name(&m, dent->d_name);
+			if (ret)
+				goto out;
+
+			if (m.kmod)
+				ret = map_groups__set_module_path(mg, path, &m);
+
+			free(m.name);
+
+			if (ret)
 				goto out;
-			}
-			dso__set_long_name(map->dso, long_name, true);
-			dso__kernel_module_get_build_id(map->dso, "");
 		}
 	}
 
@@ -1046,40 +1093,11 @@ static int machine__process_kernel_mmap_event(struct machine *machine,
 				strlen(kmmap_prefix) - 1) == 0;
 	if (event->mmap.filename[0] == '/' ||
 	    (!is_kernel_mmap && event->mmap.filename[0] == '[')) {
-
-		char short_module_name[1024];
-		char *name, *dot;
-
-		if (event->mmap.filename[0] == '/') {
-			name = strrchr(event->mmap.filename, '/');
-			if (name == NULL)
-				goto out_problem;
-
-			++name; /* skip / */
-			dot = strrchr(name, '.');
-			if (dot == NULL)
-				goto out_problem;
-			/* On some system, modules are compressed like .ko.gz */
-			if (is_supported_compression(dot + 1))
-				dot -= 3;
-			if (!is_kmodule_extension(dot + 1))
-				goto out_problem;
-			snprintf(short_module_name, sizeof(short_module_name),
-					"[%.*s]", (int)(dot - name), name);
-			strxfrchar(short_module_name, '-', '_');
-		} else
-			strcpy(short_module_name, event->mmap.filename);
-
 		map = machine__new_module(machine, event->mmap.start,
 					  event->mmap.filename);
 		if (map == NULL)
 			goto out_problem;
 
-		name = strdup(short_module_name);
-		if (name == NULL)
-			goto out_problem;
-
-		dso__set_short_name(map->dso, name, true);
 		map->end = map->start + event->mmap.len;
 	} else if (is_kernel_mmap) {
 		const char *symbol_name = (event->mmap.filename +
@@ -1092,7 +1110,7 @@ static int machine__process_kernel_mmap_event(struct machine *machine,
 		struct dso *dso;
 
 		list_for_each_entry(dso, &machine->kernel_dsos.head, node) {
-			if (is_kernel_module(dso->long_name, NULL))
+			if (is_kernel_module(dso->long_name))
 				continue;
 
 			kernel = dso;
@@ -1236,15 +1254,19 @@ out_problem:
 	return 0;
 }
 
-static void machine__remove_thread(struct machine *machine, struct thread *th)
+void machine__remove_thread(struct machine *machine, struct thread *th)
 {
-	machine->last_match = NULL;
+	if (machine->last_match == th)
+		thread__zput(machine->last_match);
+
 	rb_erase(&th->rb_node, &machine->threads);
 	/*
-	 * We may have references to this thread, for instance in some hist_entry
-	 * instances, so just move them to a separate list.
+	 * Move it first to the dead_threads list, then drop the reference,
+	 * if this is the last reference, then the thread__delete destructor
+	 * will be called and we will remove it from the dead_threads list.
 	 */
 	list_add_tail(&th->node, &machine->dead_threads);
+	thread__put(th);
 }
 
 int machine__process_fork_event(struct machine *machine, union perf_event *event,
@@ -1387,29 +1409,27 @@ struct mem_info *sample__resolve_mem(struct perf_sample *sample,
 static int add_callchain_ip(struct thread *thread,
 			    struct symbol **parent,
 			    struct addr_location *root_al,
-			    bool branch_history,
+			    u8 *cpumode,
 			    u64 ip)
 {
 	struct addr_location al;
 
 	al.filtered = 0;
 	al.sym = NULL;
-	if (branch_history)
+	if (!cpumode) {
 		thread__find_cpumode_addr_location(thread, MAP__FUNCTION,
 						   ip, &al);
-	else {
-		u8 cpumode = PERF_RECORD_MISC_USER;
-
+	} else {
 		if (ip >= PERF_CONTEXT_MAX) {
 			switch (ip) {
 			case PERF_CONTEXT_HV:
-				cpumode = PERF_RECORD_MISC_HYPERVISOR;
+				*cpumode = PERF_RECORD_MISC_HYPERVISOR;
 				break;
 			case PERF_CONTEXT_KERNEL:
-				cpumode = PERF_RECORD_MISC_KERNEL;
+				*cpumode = PERF_RECORD_MISC_KERNEL;
 				break;
 			case PERF_CONTEXT_USER:
-				cpumode = PERF_RECORD_MISC_USER;
+				*cpumode = PERF_RECORD_MISC_USER;
 				break;
 			default:
 				pr_debug("invalid callchain context: "
@@ -1423,8 +1443,8 @@ static int add_callchain_ip(struct thread *thread,
 			}
 			return 0;
 		}
-		thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
-				   ip, &al);
+		thread__find_addr_location(thread, *cpumode, MAP__FUNCTION,
+					   ip, &al);
 	}
 
 	if (al.sym != NULL) {
@@ -1502,18 +1522,102 @@ static int remove_loops(struct branch_entry *l, int nr)
 	return nr;
 }
 
-static int thread__resolve_callchain_sample(struct thread *thread,
-					     struct ip_callchain *chain,
-					     struct branch_stack *branch,
-					     struct symbol **parent,
-					     struct addr_location *root_al,
-					     int max_stack)
+/*
+ * Recolve LBR callstack chain sample
+ * Return:
+ * 1 on success get LBR callchain information
+ * 0 no available LBR callchain information, should try fp
+ * negative error code on other errors.
+ */
+static int resolve_lbr_callchain_sample(struct thread *thread,
+					struct perf_sample *sample,
+					struct symbol **parent,
+					struct addr_location *root_al,
+					int max_stack)
 {
+	struct ip_callchain *chain = sample->callchain;
 	int chain_nr = min(max_stack, (int)chain->nr);
+	u8 cpumode = PERF_RECORD_MISC_USER;
+	int i, j, err;
+	u64 ip;
+
+	for (i = 0; i < chain_nr; i++) {
+		if (chain->ips[i] == PERF_CONTEXT_USER)
+			break;
+	}
+
+	/* LBR only affects the user callchain */
+	if (i != chain_nr) {
+		struct branch_stack *lbr_stack = sample->branch_stack;
+		int lbr_nr = lbr_stack->nr;
+		/*
+		 * LBR callstack can only get user call chain.
+		 * The mix_chain_nr is kernel call chain
+		 * number plus LBR user call chain number.
+		 * i is kernel call chain number,
+		 * 1 is PERF_CONTEXT_USER,
+		 * lbr_nr + 1 is the user call chain number.
+		 * For details, please refer to the comments
+		 * in callchain__printf
+		 */
+		int mix_chain_nr = i + 1 + lbr_nr + 1;
+
+		if (mix_chain_nr > PERF_MAX_STACK_DEPTH + PERF_MAX_BRANCH_DEPTH) {
+			pr_warning("corrupted callchain. skipping...\n");
+			return 0;
+		}
+
+		for (j = 0; j < mix_chain_nr; j++) {
+			if (callchain_param.order == ORDER_CALLEE) {
+				if (j < i + 1)
+					ip = chain->ips[j];
+				else if (j > i + 1)
+					ip = lbr_stack->entries[j - i - 2].from;
+				else
+					ip = lbr_stack->entries[0].to;
+			} else {
+				if (j < lbr_nr)
+					ip = lbr_stack->entries[lbr_nr - j - 1].from;
+				else if (j > lbr_nr)
+					ip = chain->ips[i + 1 - (j - lbr_nr)];
+				else
+					ip = lbr_stack->entries[0].to;
+			}
+
+			err = add_callchain_ip(thread, parent, root_al, &cpumode, ip);
+			if (err)
+				return (err < 0) ? err : 0;
+		}
+		return 1;
+	}
+
+	return 0;
+}
+
+static int thread__resolve_callchain_sample(struct thread *thread,
+					    struct perf_evsel *evsel,
+					    struct perf_sample *sample,
+					    struct symbol **parent,
+					    struct addr_location *root_al,
+					    int max_stack)
+{
+	struct branch_stack *branch = sample->branch_stack;
+	struct ip_callchain *chain = sample->callchain;
+	int chain_nr = min(max_stack, (int)chain->nr);
+	u8 cpumode = PERF_RECORD_MISC_USER;
 	int i, j, err;
 	int skip_idx = -1;
 	int first_call = 0;
 
+	callchain_cursor_reset(&callchain_cursor);
+
+	if (has_branch_callstack(evsel)) {
+		err = resolve_lbr_callchain_sample(thread, sample, parent,
+						   root_al, max_stack);
+		if (err)
+			return (err < 0) ? err : 0;
+	}
+
 	/*
 	 * Based on DWARF debug information, some architectures skip
 	 * a callchain entry saved by the kernel.
@@ -1521,8 +1625,6 @@ static int thread__resolve_callchain_sample(struct thread *thread,
 	if (chain->nr < PERF_MAX_STACK_DEPTH)
 		skip_idx = arch_skip_callchain_idx(thread, chain);
 
-	callchain_cursor_reset(&callchain_cursor);
-
 	/*
 	 * Add branches to call stack for easier browsing. This gives
 	 * more context for a sample than just the callers.
@@ -1568,10 +1670,10 @@ static int thread__resolve_callchain_sample(struct thread *thread,
 
 		for (i = 0; i < nr; i++) {
 			err = add_callchain_ip(thread, parent, root_al,
-					       true, be[i].to);
+					       NULL, be[i].to);
 			if (!err)
 				err = add_callchain_ip(thread, parent, root_al,
-						       true, be[i].from);
+						       NULL, be[i].from);
 			if (err == -EINVAL)
 				break;
 			if (err)
@@ -1600,7 +1702,7 @@ check_calls:
 #endif
 		ip = chain->ips[j];
 
-		err = add_callchain_ip(thread, parent, root_al, false, ip);
+		err = add_callchain_ip(thread, parent, root_al, &cpumode, ip);
 
 		if (err)
 			return (err < 0) ? err : 0;
@@ -1623,9 +1725,9 @@ int thread__resolve_callchain(struct thread *thread,
 			      struct addr_location *root_al,
 			      int max_stack)
 {
-	int ret = thread__resolve_callchain_sample(thread, sample->callchain,
-						   sample->branch_stack,
-						   parent, root_al, max_stack);
+	int ret = thread__resolve_callchain_sample(thread, evsel,
+						   sample, parent,
+						   root_al, max_stack);
 	if (ret)
 		return ret;
 
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index e8b7779a0a3f..6d64cedb9d1e 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -118,9 +118,9 @@ void machines__set_comm_exec(struct machines *machines, bool comm_exec);
 struct machine *machine__new_host(void);
 int machine__init(struct machine *machine, const char *root_dir, pid_t pid);
 void machine__exit(struct machine *machine);
-void machine__delete_dead_threads(struct machine *machine);
 void machine__delete_threads(struct machine *machine);
 void machine__delete(struct machine *machine);
+void machine__remove_thread(struct machine *machine, struct thread *th);
 
 struct branch_info *sample__resolve_bstack(struct perf_sample *sample,
 					   struct addr_location *al);
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 62ca9f2607d5..a14f08f41686 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -778,3 +778,23 @@ struct map *maps__next(struct map *map)
 		return rb_entry(next, struct map, rb_node);
 	return NULL;
 }
+
+struct kmap *map__kmap(struct map *map)
+{
+	if (!map->dso || !map->dso->kernel) {
+		pr_err("Internal error: map__kmap with a non-kernel map\n");
+		return NULL;
+	}
+	return (struct kmap *)(map + 1);
+}
+
+struct map_groups *map__kmaps(struct map *map)
+{
+	struct kmap *kmap = map__kmap(map);
+
+	if (!kmap || !kmap->kmaps) {
+		pr_err("Internal error: map__kmaps with a non-kernel map\n");
+		return NULL;
+	}
+	return kmap->kmaps;
+}
diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h
index 0e42438b1e59..ec19c59ca38e 100644
--- a/tools/perf/util/map.h
+++ b/tools/perf/util/map.h
@@ -76,10 +76,8 @@ static inline struct map_groups *map_groups__get(struct map_groups *mg)
 
 void map_groups__put(struct map_groups *mg);
 
-static inline struct kmap *map__kmap(struct map *map)
-{
-	return (struct kmap *)(map + 1);
-}
+struct kmap *map__kmap(struct map *map);
+struct map_groups *map__kmaps(struct map *map);
 
 static inline u64 map__map_ip(struct map *map, u64 ip)
 {
diff --git a/tools/perf/util/ordered-events.c b/tools/perf/util/ordered-events.c
index fd4be94125fb..52be201b9b25 100644
--- a/tools/perf/util/ordered-events.c
+++ b/tools/perf/util/ordered-events.c
@@ -2,7 +2,6 @@
 #include <linux/compiler.h>
 #include <linux/string.h>
 #include "ordered-events.h"
-#include "evlist.h"
 #include "session.h"
 #include "asm/bug.h"
 #include "debug.h"
@@ -131,8 +130,8 @@ static struct ordered_event *alloc_event(struct ordered_events *oe,
 	return new;
 }
 
-struct ordered_event *
-ordered_events__new(struct ordered_events *oe, u64 timestamp,
+static struct ordered_event *
+ordered_events__new_event(struct ordered_events *oe, u64 timestamp,
 		    union perf_event *event)
 {
 	struct ordered_event *new;
@@ -153,20 +152,47 @@ void ordered_events__delete(struct ordered_events *oe, struct ordered_event *eve
 	free_dup_event(oe, event->event);
 }
 
-static int __ordered_events__flush(struct perf_session *s,
-				   struct perf_tool *tool)
+int ordered_events__queue(struct ordered_events *oe, union perf_event *event,
+			  struct perf_sample *sample, u64 file_offset)
+{
+	u64 timestamp = sample->time;
+	struct ordered_event *oevent;
+
+	if (!timestamp || timestamp == ~0ULL)
+		return -ETIME;
+
+	if (timestamp < oe->last_flush) {
+		pr_oe_time(timestamp,      "out of order event\n");
+		pr_oe_time(oe->last_flush, "last flush, last_flush_type %d\n",
+			   oe->last_flush_type);
+
+		oe->nr_unordered_events++;
+	}
+
+	oevent = ordered_events__new_event(oe, timestamp, event);
+	if (!oevent) {
+		ordered_events__flush(oe, OE_FLUSH__HALF);
+		oevent = ordered_events__new_event(oe, timestamp, event);
+	}
+
+	if (!oevent)
+		return -ENOMEM;
+
+	oevent->file_offset = file_offset;
+	return 0;
+}
+
+static int __ordered_events__flush(struct ordered_events *oe)
 {
-	struct ordered_events *oe = &s->ordered_events;
 	struct list_head *head = &oe->events;
 	struct ordered_event *tmp, *iter;
-	struct perf_sample sample;
 	u64 limit = oe->next_flush;
 	u64 last_ts = oe->last ? oe->last->timestamp : 0ULL;
 	bool show_progress = limit == ULLONG_MAX;
 	struct ui_progress prog;
 	int ret;
 
-	if (!tool->ordered_events || !limit)
+	if (!limit)
 		return 0;
 
 	if (show_progress)
@@ -178,16 +204,9 @@ static int __ordered_events__flush(struct perf_session *s,
 
 		if (iter->timestamp > limit)
 			break;
-
-		ret = perf_evlist__parse_sample(s->evlist, iter->event, &sample);
+		ret = oe->deliver(oe, iter);
 		if (ret)
-			pr_err("Can't parse sample, err = %d\n", ret);
-		else {
-			ret = perf_session__deliver_event(s, iter->event, &sample, tool,
-							  iter->file_offset);
-			if (ret)
-				return ret;
-		}
+			return ret;
 
 		ordered_events__delete(oe, iter);
 		oe->last_flush = iter->timestamp;
@@ -204,10 +223,8 @@ static int __ordered_events__flush(struct perf_session *s,
 	return 0;
 }
 
-int ordered_events__flush(struct perf_session *s, struct perf_tool *tool,
-			  enum oe_flush how)
+int ordered_events__flush(struct ordered_events *oe, enum oe_flush how)
 {
-	struct ordered_events *oe = &s->ordered_events;
 	static const char * const str[] = {
 		"NONE",
 		"FINAL",
@@ -216,6 +233,9 @@ int ordered_events__flush(struct perf_session *s, struct perf_tool *tool,
 	};
 	int err;
 
+	if (oe->nr_events == 0)
+		return 0;
+
 	switch (how) {
 	case OE_FLUSH__FINAL:
 		oe->next_flush = ULLONG_MAX;
@@ -248,7 +268,7 @@ int ordered_events__flush(struct perf_session *s, struct perf_tool *tool,
 		   str[how], oe->nr_events);
 	pr_oe_time(oe->max_timestamp, "max_timestamp\n");
 
-	err = __ordered_events__flush(s, tool);
+	err = __ordered_events__flush(oe);
 
 	if (!err) {
 		if (how == OE_FLUSH__ROUND)
@@ -264,13 +284,14 @@ int ordered_events__flush(struct perf_session *s, struct perf_tool *tool,
 	return err;
 }
 
-void ordered_events__init(struct ordered_events *oe)
+void ordered_events__init(struct ordered_events *oe, ordered_events__deliver_t deliver)
 {
 	INIT_LIST_HEAD(&oe->events);
 	INIT_LIST_HEAD(&oe->cache);
 	INIT_LIST_HEAD(&oe->to_free);
 	oe->max_alloc_size = (u64) -1;
 	oe->cur_alloc_size = 0;
+	oe->deliver	   = deliver;
 }
 
 void ordered_events__free(struct ordered_events *oe)
diff --git a/tools/perf/util/ordered-events.h b/tools/perf/util/ordered-events.h
index 7b8f9b011f38..f403991e3bfd 100644
--- a/tools/perf/util/ordered-events.h
+++ b/tools/perf/util/ordered-events.h
@@ -2,9 +2,8 @@
 #define __ORDERED_EVENTS_H
 
 #include <linux/types.h>
-#include "tool.h"
 
-struct perf_session;
+struct perf_sample;
 
 struct ordered_event {
 	u64			timestamp;
@@ -20,6 +19,11 @@ enum oe_flush {
 	OE_FLUSH__HALF,
 };
 
+struct ordered_events;
+
+typedef int (*ordered_events__deliver_t)(struct ordered_events *oe,
+					 struct ordered_event *event);
+
 struct ordered_events {
 	u64			last_flush;
 	u64			next_flush;
@@ -31,18 +35,19 @@ struct ordered_events {
 	struct list_head	to_free;
 	struct ordered_event	*buffer;
 	struct ordered_event	*last;
+	ordered_events__deliver_t deliver;
 	int			buffer_idx;
 	unsigned int		nr_events;
 	enum oe_flush		last_flush_type;
+	u32			nr_unordered_events;
 	bool                    copy_on_queue;
 };
 
-struct ordered_event *ordered_events__new(struct ordered_events *oe, u64 timestamp,
-					  union perf_event *event);
+int ordered_events__queue(struct ordered_events *oe, union perf_event *event,
+			  struct perf_sample *sample, u64 file_offset);
 void ordered_events__delete(struct ordered_events *oe, struct ordered_event *event);
-int ordered_events__flush(struct perf_session *s, struct perf_tool *tool,
-			  enum oe_flush how);
-void ordered_events__init(struct ordered_events *oe);
+int ordered_events__flush(struct ordered_events *oe, enum oe_flush how);
+void ordered_events__init(struct ordered_events *oe, ordered_events__deliver_t deliver);
 void ordered_events__free(struct ordered_events *oe);
 
 static inline
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 7f8ec6ce2823..be0655388b38 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -20,11 +20,6 @@
 
 #define MAX_NAME_LEN 100
 
-struct event_symbol {
-	const char	*symbol;
-	const char	*alias;
-};
-
 #ifdef PARSER_DEBUG
 extern int parse_events_debug;
 #endif
@@ -39,7 +34,7 @@ static struct perf_pmu_event_symbol *perf_pmu_events_list;
  */
 static int perf_pmu_events_list_num;
 
-static struct event_symbol event_symbols_hw[PERF_COUNT_HW_MAX] = {
+struct event_symbol event_symbols_hw[PERF_COUNT_HW_MAX] = {
 	[PERF_COUNT_HW_CPU_CYCLES] = {
 		.symbol = "cpu-cycles",
 		.alias  = "cycles",
@@ -82,7 +77,7 @@ static struct event_symbol event_symbols_hw[PERF_COUNT_HW_MAX] = {
 	},
 };
 
-static struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = {
+struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = {
 	[PERF_COUNT_SW_CPU_CLOCK] = {
 		.symbol = "cpu-clock",
 		.alias  = "",
@@ -175,9 +170,6 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config)
 	char evt_path[MAXPATHLEN];
 	char dir_path[MAXPATHLEN];
 
-	if (debugfs_valid_mountpoint(tracing_events_path))
-		return NULL;
-
 	sys_dir = opendir(tracing_events_path);
 	if (!sys_dir)
 		return NULL;
@@ -473,12 +465,6 @@ static int add_tracepoint_multi_sys(struct list_head *list, int *idx,
 int parse_events_add_tracepoint(struct list_head *list, int *idx,
 				char *sys, char *event)
 {
-	int ret;
-
-	ret = debugfs_valid_mountpoint(tracing_events_path);
-	if (ret)
-		return ret;
-
 	if (strpbrk(sys, "*?"))
 		return add_tracepoint_multi_sys(list, idx, sys, event);
 	else
@@ -723,6 +709,7 @@ struct event_modifier {
 	int eh;
 	int eH;
 	int eG;
+	int eI;
 	int precise;
 	int exclude_GH;
 	int sample_read;
@@ -737,6 +724,7 @@ static int get_event_modifier(struct event_modifier *mod, char *str,
 	int eh = evsel ? evsel->attr.exclude_hv : 0;
 	int eH = evsel ? evsel->attr.exclude_host : 0;
 	int eG = evsel ? evsel->attr.exclude_guest : 0;
+	int eI = evsel ? evsel->attr.exclude_idle : 0;
 	int precise = evsel ? evsel->attr.precise_ip : 0;
 	int sample_read = 0;
 	int pinned = evsel ? evsel->attr.pinned : 0;
@@ -767,6 +755,8 @@ static int get_event_modifier(struct event_modifier *mod, char *str,
 			if (!exclude_GH)
 				exclude_GH = eG = eH = 1;
 			eH = 0;
+		} else if (*str == 'I') {
+			eI = 1;
 		} else if (*str == 'p') {
 			precise++;
 			/* use of precise requires exclude_guest */
@@ -800,6 +790,7 @@ static int get_event_modifier(struct event_modifier *mod, char *str,
 	mod->eh = eh;
 	mod->eH = eH;
 	mod->eG = eG;
+	mod->eI = eI;
 	mod->precise = precise;
 	mod->exclude_GH = exclude_GH;
 	mod->sample_read = sample_read;
@@ -817,7 +808,7 @@ static int check_modifier(char *str)
 	char *p = str;
 
 	/* The sizeof includes 0 byte as well. */
-	if (strlen(str) > (sizeof("ukhGHpppSD") - 1))
+	if (strlen(str) > (sizeof("ukhGHpppSDI") - 1))
 		return -1;
 
 	while (*p) {
@@ -853,6 +844,7 @@ int parse_events__modifier_event(struct list_head *list, char *str, bool add)
 		evsel->attr.precise_ip     = mod.precise;
 		evsel->attr.exclude_host   = mod.eH;
 		evsel->attr.exclude_guest  = mod.eG;
+		evsel->attr.exclude_idle   = mod.eI;
 		evsel->exclude_GH          = mod.exclude_GH;
 		evsel->sample_read         = mod.sample_read;
 
@@ -1098,6 +1090,14 @@ static const char * const event_type_descriptors[] = {
 	"Hardware breakpoint",
 };
 
+static int cmp_string(const void *a, const void *b)
+{
+	const char * const *as = a;
+	const char * const *bs = b;
+
+	return strcmp(*as, *bs);
+}
+
 /*
  * Print the events from <debugfs_mount_point>/tracing/events
  */
@@ -1109,18 +1109,21 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob,
 	struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent;
 	char evt_path[MAXPATHLEN];
 	char dir_path[MAXPATHLEN];
-	char sbuf[STRERR_BUFSIZE];
-
-	if (debugfs_valid_mountpoint(tracing_events_path)) {
-		printf("  [ Tracepoints not available: %s ]\n",
-			strerror_r(errno, sbuf, sizeof(sbuf)));
-		return;
-	}
+	char **evt_list = NULL;
+	unsigned int evt_i = 0, evt_num = 0;
+	bool evt_num_known = false;
 
+restart:
 	sys_dir = opendir(tracing_events_path);
 	if (!sys_dir)
 		return;
 
+	if (evt_num_known) {
+		evt_list = zalloc(sizeof(char *) * evt_num);
+		if (!evt_list)
+			goto out_close_sys_dir;
+	}
+
 	for_each_subsystem(sys_dir, sys_dirent, sys_next) {
 		if (subsys_glob != NULL &&
 		    !strglobmatch(sys_dirent.d_name, subsys_glob))
@@ -1137,19 +1140,56 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob,
 			    !strglobmatch(evt_dirent.d_name, event_glob))
 				continue;
 
-			if (name_only) {
-				printf("%s:%s ", sys_dirent.d_name, evt_dirent.d_name);
+			if (!evt_num_known) {
+				evt_num++;
 				continue;
 			}
 
 			snprintf(evt_path, MAXPATHLEN, "%s:%s",
 				 sys_dirent.d_name, evt_dirent.d_name);
-			printf("  %-50s [%s]\n", evt_path,
-				event_type_descriptors[PERF_TYPE_TRACEPOINT]);
+
+			evt_list[evt_i] = strdup(evt_path);
+			if (evt_list[evt_i] == NULL)
+				goto out_close_evt_dir;
+			evt_i++;
 		}
 		closedir(evt_dir);
 	}
 	closedir(sys_dir);
+
+	if (!evt_num_known) {
+		evt_num_known = true;
+		goto restart;
+	}
+	qsort(evt_list, evt_num, sizeof(char *), cmp_string);
+	evt_i = 0;
+	while (evt_i < evt_num) {
+		if (name_only) {
+			printf("%s ", evt_list[evt_i++]);
+			continue;
+		}
+		printf("  %-50s [%s]\n", evt_list[evt_i++],
+				event_type_descriptors[PERF_TYPE_TRACEPOINT]);
+	}
+	if (evt_num)
+		printf("\n");
+
+out_free:
+	evt_num = evt_i;
+	for (evt_i = 0; evt_i < evt_num; evt_i++)
+		zfree(&evt_list[evt_i]);
+	zfree(&evt_list);
+	return;
+
+out_close_evt_dir:
+	closedir(evt_dir);
+out_close_sys_dir:
+	closedir(sys_dir);
+
+	printf("FATAL: not enough memory to print %s\n",
+			event_type_descriptors[PERF_TYPE_TRACEPOINT]);
+	if (evt_list)
+		goto out_free;
 }
 
 /*
@@ -1163,9 +1203,6 @@ int is_valid_tracepoint(const char *event_string)
 	char evt_path[MAXPATHLEN];
 	char dir_path[MAXPATHLEN];
 
-	if (debugfs_valid_mountpoint(tracing_events_path))
-		return 0;
-
 	sys_dir = opendir(tracing_events_path);
 	if (!sys_dir)
 		return 0;
@@ -1233,38 +1270,19 @@ static bool is_event_supported(u8 type, unsigned config)
 	return ret;
 }
 
-static void __print_events_type(u8 type, struct event_symbol *syms,
-				unsigned max)
-{
-	char name[64];
-	unsigned i;
-
-	for (i = 0; i < max ; i++, syms++) {
-		if (!is_event_supported(type, i))
-			continue;
-
-		if (strlen(syms->alias))
-			snprintf(name, sizeof(name),  "%s OR %s",
-				 syms->symbol, syms->alias);
-		else
-			snprintf(name, sizeof(name), "%s", syms->symbol);
-
-		printf("  %-50s [%s]\n", name, event_type_descriptors[type]);
-	}
-}
-
-void print_events_type(u8 type)
-{
-	if (type == PERF_TYPE_SOFTWARE)
-		__print_events_type(type, event_symbols_sw, PERF_COUNT_SW_MAX);
-	else
-		__print_events_type(type, event_symbols_hw, PERF_COUNT_HW_MAX);
-}
-
 int print_hwcache_events(const char *event_glob, bool name_only)
 {
-	unsigned int type, op, i, printed = 0;
+	unsigned int type, op, i, evt_i = 0, evt_num = 0;
 	char name[64];
+	char **evt_list = NULL;
+	bool evt_num_known = false;
+
+restart:
+	if (evt_num_known) {
+		evt_list = zalloc(sizeof(char *) * evt_num);
+		if (!evt_list)
+			goto out_enomem;
+	}
 
 	for (type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) {
 		for (op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) {
@@ -1282,27 +1300,66 @@ int print_hwcache_events(const char *event_glob, bool name_only)
 							type | (op << 8) | (i << 16)))
 					continue;
 
-				if (name_only)
-					printf("%s ", name);
-				else
-					printf("  %-50s [%s]\n", name,
-					       event_type_descriptors[PERF_TYPE_HW_CACHE]);
-				++printed;
+				if (!evt_num_known) {
+					evt_num++;
+					continue;
+				}
+
+				evt_list[evt_i] = strdup(name);
+				if (evt_list[evt_i] == NULL)
+					goto out_enomem;
+				evt_i++;
 			}
 		}
 	}
 
-	if (printed)
+	if (!evt_num_known) {
+		evt_num_known = true;
+		goto restart;
+	}
+	qsort(evt_list, evt_num, sizeof(char *), cmp_string);
+	evt_i = 0;
+	while (evt_i < evt_num) {
+		if (name_only) {
+			printf("%s ", evt_list[evt_i++]);
+			continue;
+		}
+		printf("  %-50s [%s]\n", evt_list[evt_i++],
+				event_type_descriptors[PERF_TYPE_HW_CACHE]);
+	}
+	if (evt_num)
 		printf("\n");
-	return printed;
+
+out_free:
+	evt_num = evt_i;
+	for (evt_i = 0; evt_i < evt_num; evt_i++)
+		zfree(&evt_list[evt_i]);
+	zfree(&evt_list);
+	return evt_num;
+
+out_enomem:
+	printf("FATAL: not enough memory to print %s\n", event_type_descriptors[PERF_TYPE_HW_CACHE]);
+	if (evt_list)
+		goto out_free;
+	return evt_num;
 }
 
-static void print_symbol_events(const char *event_glob, unsigned type,
+void print_symbol_events(const char *event_glob, unsigned type,
 				struct event_symbol *syms, unsigned max,
 				bool name_only)
 {
-	unsigned i, printed = 0;
+	unsigned int i, evt_i = 0, evt_num = 0;
 	char name[MAX_NAME_LEN];
+	char **evt_list = NULL;
+	bool evt_num_known = false;
+
+restart:
+	if (evt_num_known) {
+		evt_list = zalloc(sizeof(char *) * evt_num);
+		if (!evt_list)
+			goto out_enomem;
+		syms -= max;
+	}
 
 	for (i = 0; i < max; i++, syms++) {
 
@@ -1314,23 +1371,49 @@ static void print_symbol_events(const char *event_glob, unsigned type,
 		if (!is_event_supported(type, i))
 			continue;
 
-		if (name_only) {
-			printf("%s ", syms->symbol);
+		if (!evt_num_known) {
+			evt_num++;
 			continue;
 		}
 
-		if (strlen(syms->alias))
+		if (!name_only && strlen(syms->alias))
 			snprintf(name, MAX_NAME_LEN, "%s OR %s", syms->symbol, syms->alias);
 		else
 			strncpy(name, syms->symbol, MAX_NAME_LEN);
 
-		printf("  %-50s [%s]\n", name, event_type_descriptors[type]);
-
-		printed++;
+		evt_list[evt_i] = strdup(name);
+		if (evt_list[evt_i] == NULL)
+			goto out_enomem;
+		evt_i++;
 	}
 
-	if (printed)
+	if (!evt_num_known) {
+		evt_num_known = true;
+		goto restart;
+	}
+	qsort(evt_list, evt_num, sizeof(char *), cmp_string);
+	evt_i = 0;
+	while (evt_i < evt_num) {
+		if (name_only) {
+			printf("%s ", evt_list[evt_i++]);
+			continue;
+		}
+		printf("  %-50s [%s]\n", evt_list[evt_i++], event_type_descriptors[type]);
+	}
+	if (evt_num)
 		printf("\n");
+
+out_free:
+	evt_num = evt_i;
+	for (evt_i = 0; evt_i < evt_num; evt_i++)
+		zfree(&evt_list[evt_i]);
+	zfree(&evt_list);
+	return;
+
+out_enomem:
+	printf("FATAL: not enough memory to print %s\n", event_type_descriptors[type]);
+	if (evt_list)
+		goto out_free;
 }
 
 /*
@@ -1338,11 +1421,6 @@ static void print_symbol_events(const char *event_glob, unsigned type,
  */
 void print_events(const char *event_glob, bool name_only)
 {
-	if (!name_only) {
-		printf("\n");
-		printf("List of pre-defined events (to be used in -e):\n");
-	}
-
 	print_symbol_events(event_glob, PERF_TYPE_HARDWARE,
 			    event_symbols_hw, PERF_COUNT_HW_MAX, name_only);
 
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index ff6e1fa4111e..52a2dda4f954 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -116,12 +116,21 @@ void parse_events_update_lists(struct list_head *list_event,
 void parse_events_error(void *data, void *scanner, char const *msg);
 
 void print_events(const char *event_glob, bool name_only);
-void print_events_type(u8 type);
+
+struct event_symbol {
+	const char	*symbol;
+	const char	*alias;
+};
+extern struct event_symbol event_symbols_hw[];
+extern struct event_symbol event_symbols_sw[];
+void print_symbol_events(const char *event_glob, unsigned type,
+				struct event_symbol *syms, unsigned max,
+				bool name_only);
 void print_tracepoint_events(const char *subsys_glob, const char *event_glob,
 			     bool name_only);
 int print_hwcache_events(const char *event_glob, bool name_only);
 extern int is_valid_tracepoint(const char *event_string);
 
-extern int valid_debugfs_mount(const char *debugfs);
+int valid_event_mount(const char *eventfs);
 
 #endif /* __PERF_PARSE_EVENTS_H */
diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
index 94eacb6c1ef7..8895cf3132ab 100644
--- a/tools/perf/util/parse-events.l
+++ b/tools/perf/util/parse-events.l
@@ -101,7 +101,7 @@ num_raw_hex	[a-fA-F0-9]+
 name		[a-zA-Z_*?][a-zA-Z0-9_*?]*
 name_minus	[a-zA-Z_*?][a-zA-Z0-9\-_*?]*
 /* If you add a modifier you need to update check_modifier() */
-modifier_event	[ukhpGHSD]+
+modifier_event	[ukhpGHSDI]+
 modifier_bp	[rwx]{1,3}
 
 %%
diff --git a/tools/perf/util/parse-options.c b/tools/perf/util/parse-options.c
index 4a015f77e2b5..01626be2a8eb 100644
--- a/tools/perf/util/parse-options.c
+++ b/tools/perf/util/parse-options.c
@@ -37,6 +37,7 @@ static int get_value(struct parse_opt_ctx_t *p,
 {
 	const char *s, *arg = NULL;
 	const int unset = flags & OPT_UNSET;
+	int err;
 
 	if (unset && p->opt)
 		return opterror(opt, "takes no value", flags);
@@ -114,13 +115,29 @@ static int get_value(struct parse_opt_ctx_t *p,
 		return 0;
 
 	case OPTION_STRING:
+		err = 0;
 		if (unset)
 			*(const char **)opt->value = NULL;
 		else if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
 			*(const char **)opt->value = (const char *)opt->defval;
 		else
-			return get_arg(p, opt, flags, (const char **)opt->value);
-		return 0;
+			err = get_arg(p, opt, flags, (const char **)opt->value);
+
+		/* PARSE_OPT_NOEMPTY: Allow NULL but disallow empty string. */
+		if (opt->flags & PARSE_OPT_NOEMPTY) {
+			const char *val = *(const char **)opt->value;
+
+			if (!val)
+				return err;
+
+			/* Similar to unset if we are given an empty string. */
+			if (val[0] == '\0') {
+				*(const char **)opt->value = NULL;
+				return 0;
+			}
+		}
+
+		return err;
 
 	case OPTION_CALLBACK:
 		if (unset)
@@ -505,13 +522,18 @@ int parse_options_subcommand(int argc, const char **argv, const struct option *o
 		break;
 	case PARSE_OPT_LIST_OPTS:
 		while (options->type != OPTION_END) {
-			printf("--%s ", options->long_name);
+			if (options->long_name)
+				printf("--%s ", options->long_name);
 			options++;
 		}
+		putchar('\n');
 		exit(130);
 	case PARSE_OPT_LIST_SUBCMDS:
-		for (int i = 0; subcommands[i]; i++)
-			printf("%s ", subcommands[i]);
+		if (subcommands) {
+			for (int i = 0; subcommands[i]; i++)
+				printf("%s ", subcommands[i]);
+		}
+		putchar('\n');
 		exit(130);
 	default: /* PARSE_OPT_UNKNOWN */
 		if (ctx.argv[0][1] == '-') {
diff --git a/tools/perf/util/parse-options.h b/tools/perf/util/parse-options.h
index 97b153fb4999..59561fd86278 100644
--- a/tools/perf/util/parse-options.h
+++ b/tools/perf/util/parse-options.h
@@ -40,6 +40,7 @@ enum parse_opt_option_flags {
 	PARSE_OPT_LASTARG_DEFAULT = 16,
 	PARSE_OPT_DISABLED = 32,
 	PARSE_OPT_EXCLUSIVE = 64,
+	PARSE_OPT_NOEMPTY  = 128,
 };
 
 struct option;
@@ -122,6 +123,7 @@ struct option {
 #define OPT_LONG(s, l, v, h)        { .type = OPTION_LONG, .short_name = (s), .long_name = (l), .value = check_vtype(v, long *), .help = (h) }
 #define OPT_U64(s, l, v, h)         { .type = OPTION_U64, .short_name = (s), .long_name = (l), .value = check_vtype(v, u64 *), .help = (h) }
 #define OPT_STRING(s, l, v, a, h)   { .type = OPTION_STRING,  .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), (a), .help = (h) }
+#define OPT_STRING_NOEMPTY(s, l, v, a, h)   { .type = OPTION_STRING,  .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), (a), .help = (h), .flags = PARSE_OPT_NOEMPTY}
 #define OPT_DATE(s, l, v, h) \
 	{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = "time", .help = (h), .callback = parse_opt_approxidate_cb }
 #define OPT_CALLBACK(s, l, v, a, h, f) \
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 919937eb0be2..30545ce2c712 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -41,6 +41,7 @@
 #include "symbol.h"
 #include "thread.h"
 #include <api/fs/debugfs.h>
+#include <api/fs/tracefs.h>
 #include "trace-event.h"	/* For __maybe_unused */
 #include "probe-event.h"
 #include "probe-finder.h"
@@ -79,6 +80,7 @@ static int init_symbol_maps(bool user_only)
 	int ret;
 
 	symbol_conf.sort_by_name = true;
+	symbol_conf.allow_aliases = true;
 	ret = symbol__init(NULL);
 	if (ret < 0) {
 		pr_debug("Failed to init symbol map.\n");
@@ -133,6 +135,8 @@ static struct ref_reloc_sym *kernel_get_ref_reloc_sym(void)
 		return NULL;
 
 	kmap = map__kmap(host_machine->vmlinux_maps[MAP__FUNCTION]);
+	if (!kmap)
+		return NULL;
 	return kmap->ref_reloc_sym;
 }
 
@@ -150,7 +154,7 @@ static u64 kernel_get_symbol_address_by_name(const char *name, bool reloc)
 		sym = __find_kernel_function_by_name(name, &map);
 		if (sym)
 			return map->unmap_ip(map, sym->start) -
-				(reloc) ? 0 : map->reloc;
+				((reloc) ? 0 : map->reloc);
 	}
 	return 0;
 }
@@ -177,6 +181,25 @@ static struct map *kernel_get_module_map(const char *module)
 	return NULL;
 }
 
+static struct map *get_target_map(const char *target, bool user)
+{
+	/* Init maps of given executable or kernel */
+	if (user)
+		return dso__new_map(target);
+	else
+		return kernel_get_module_map(target);
+}
+
+static void put_target_map(struct map *map, bool user)
+{
+	if (map && user) {
+		/* Only the user map needs to be released */
+		dso__delete(map->dso);
+		map__delete(map);
+	}
+}
+
+
 static struct dso *kernel_get_module_dso(const char *module)
 {
 	struct dso *dso;
@@ -248,6 +271,13 @@ out:
 	return ret;
 }
 
+static void clear_perf_probe_point(struct perf_probe_point *pp)
+{
+	free(pp->file);
+	free(pp->function);
+	free(pp->lazy_line);
+}
+
 static void clear_probe_trace_events(struct probe_trace_event *tevs, int ntevs)
 {
 	int i;
@@ -257,6 +287,103 @@ static void clear_probe_trace_events(struct probe_trace_event *tevs, int ntevs)
 }
 
 #ifdef HAVE_DWARF_SUPPORT
+/*
+ * Some binaries like glibc have special symbols which are on the symbol
+ * table, but not in the debuginfo. If we can find the address of the
+ * symbol from map, we can translate the address back to the probe point.
+ */
+static int find_alternative_probe_point(struct debuginfo *dinfo,
+					struct perf_probe_point *pp,
+					struct perf_probe_point *result,
+					const char *target, bool uprobes)
+{
+	struct map *map = NULL;
+	struct symbol *sym;
+	u64 address = 0;
+	int ret = -ENOENT;
+
+	/* This can work only for function-name based one */
+	if (!pp->function || pp->file)
+		return -ENOTSUP;
+
+	map = get_target_map(target, uprobes);
+	if (!map)
+		return -EINVAL;
+
+	/* Find the address of given function */
+	map__for_each_symbol_by_name(map, pp->function, sym) {
+		if (uprobes)
+			address = sym->start;
+		else
+			address = map->unmap_ip(map, sym->start);
+		break;
+	}
+	if (!address) {
+		ret = -ENOENT;
+		goto out;
+	}
+	pr_debug("Symbol %s address found : %" PRIx64 "\n",
+			pp->function, address);
+
+	ret = debuginfo__find_probe_point(dinfo, (unsigned long)address,
+					  result);
+	if (ret <= 0)
+		ret = (!ret) ? -ENOENT : ret;
+	else {
+		result->offset += pp->offset;
+		result->line += pp->line;
+		ret = 0;
+	}
+
+out:
+	put_target_map(map, uprobes);
+	return ret;
+
+}
+
+static int get_alternative_probe_event(struct debuginfo *dinfo,
+				       struct perf_probe_event *pev,
+				       struct perf_probe_point *tmp,
+				       const char *target)
+{
+	int ret;
+
+	memcpy(tmp, &pev->point, sizeof(*tmp));
+	memset(&pev->point, 0, sizeof(pev->point));
+	ret = find_alternative_probe_point(dinfo, tmp, &pev->point,
+					   target, pev->uprobes);
+	if (ret < 0)
+		memcpy(&pev->point, tmp, sizeof(*tmp));
+
+	return ret;
+}
+
+static int get_alternative_line_range(struct debuginfo *dinfo,
+				      struct line_range *lr,
+				      const char *target, bool user)
+{
+	struct perf_probe_point pp = { .function = lr->function,
+				       .file = lr->file,
+				       .line = lr->start };
+	struct perf_probe_point result;
+	int ret, len = 0;
+
+	memset(&result, 0, sizeof(result));
+
+	if (lr->end != INT_MAX)
+		len = lr->end - lr->start;
+	ret = find_alternative_probe_point(dinfo, &pp, &result,
+					   target, user);
+	if (!ret) {
+		lr->function = result.function;
+		lr->file = result.file;
+		lr->start = result.line;
+		if (lr->end != INT_MAX)
+			lr->end = lr->start + len;
+		clear_perf_probe_point(&pp);
+	}
+	return ret;
+}
 
 /* Open new debuginfo of given module */
 static struct debuginfo *open_debuginfo(const char *module, bool silent)
@@ -465,6 +592,7 @@ static int try_to_find_probe_trace_events(struct perf_probe_event *pev,
 					  int max_tevs, const char *target)
 {
 	bool need_dwarf = perf_probe_event_need_dwarf(pev);
+	struct perf_probe_point tmp;
 	struct debuginfo *dinfo;
 	int ntevs, ret = 0;
 
@@ -481,6 +609,20 @@ static int try_to_find_probe_trace_events(struct perf_probe_event *pev,
 	/* Searching trace events corresponding to a probe event */
 	ntevs = debuginfo__find_trace_events(dinfo, pev, tevs, max_tevs);
 
+	if (ntevs == 0)	{  /* Not found, retry with an alternative */
+		ret = get_alternative_probe_event(dinfo, pev, &tmp, target);
+		if (!ret) {
+			ntevs = debuginfo__find_trace_events(dinfo, pev,
+							     tevs, max_tevs);
+			/*
+			 * Write back to the original probe_event for
+			 * setting appropriate (user given) event name
+			 */
+			clear_perf_probe_point(&pev->point);
+			memcpy(&pev->point, &tmp, sizeof(tmp));
+		}
+	}
+
 	debuginfo__delete(dinfo);
 
 	if (ntevs > 0) {	/* Succeeded to find trace events */
@@ -495,11 +637,9 @@ static int try_to_find_probe_trace_events(struct perf_probe_event *pev,
 	}
 
 	if (ntevs == 0)	{	/* No error but failed to find probe point. */
-		pr_warning("Probe point '%s' not found in debuginfo.\n",
+		pr_warning("Probe point '%s' not found.\n",
 			   synthesize_perf_probe_point(&pev->point));
-		if (need_dwarf)
-			return -ENOENT;
-		return 0;
+		return -ENOENT;
 	}
 	/* Error path : ntevs < 0 */
 	pr_debug("An error occurred in debuginfo analysis (%d).\n", ntevs);
@@ -532,7 +672,7 @@ static int get_real_path(const char *raw_path, const char *comp_dir,
 		else {
 			if (access(raw_path, R_OK) == 0) {
 				*new_path = strdup(raw_path);
-				return 0;
+				return *new_path ? 0 : -ENOMEM;
 			} else
 				return -errno;
 		}
@@ -548,9 +688,11 @@ static int get_real_path(const char *raw_path, const char *comp_dir,
 		if (access(*new_path, R_OK) == 0)
 			return 0;
 
-		if (!symbol_conf.source_prefix)
+		if (!symbol_conf.source_prefix) {
 			/* In case of searching comp_dir, don't retry */
+			zfree(new_path);
 			return -errno;
+		}
 
 		switch (errno) {
 		case ENAMETOOLONG:
@@ -622,7 +764,8 @@ static int _show_one_line(FILE *fp, int l, bool skip, bool show_num)
  * Show line-range always requires debuginfo to find source file and
  * line number.
  */
-static int __show_line_range(struct line_range *lr, const char *module)
+static int __show_line_range(struct line_range *lr, const char *module,
+			     bool user)
 {
 	int l = 1;
 	struct int_node *ln;
@@ -638,6 +781,11 @@ static int __show_line_range(struct line_range *lr, const char *module)
 		return -ENOENT;
 
 	ret = debuginfo__find_line_range(dinfo, lr);
+	if (!ret) {	/* Not found, retry with an alternative */
+		ret = get_alternative_line_range(dinfo, lr, module, user);
+		if (!ret)
+			ret = debuginfo__find_line_range(dinfo, lr);
+	}
 	debuginfo__delete(dinfo);
 	if (ret == 0 || ret == -ENOENT) {
 		pr_warning("Specified source line is not found.\n");
@@ -650,7 +798,11 @@ static int __show_line_range(struct line_range *lr, const char *module)
 	/* Convert source file path */
 	tmp = lr->path;
 	ret = get_real_path(tmp, lr->comp_dir, &lr->path);
-	free(tmp);	/* Free old path */
+
+	/* Free old path when new path is assigned */
+	if (tmp != lr->path)
+		free(tmp);
+
 	if (ret < 0) {
 		pr_warning("Failed to find source file path.\n");
 		return ret;
@@ -707,7 +859,7 @@ int show_line_range(struct line_range *lr, const char *module, bool user)
 	ret = init_symbol_maps(user);
 	if (ret < 0)
 		return ret;
-	ret = __show_line_range(lr, module);
+	ret = __show_line_range(lr, module, user);
 	exit_symbol_maps();
 
 	return ret;
@@ -716,12 +868,13 @@ int show_line_range(struct line_range *lr, const char *module, bool user)
 static int show_available_vars_at(struct debuginfo *dinfo,
 				  struct perf_probe_event *pev,
 				  int max_vls, struct strfilter *_filter,
-				  bool externs)
+				  bool externs, const char *target)
 {
 	char *buf;
 	int ret, i, nvars;
 	struct str_node *node;
 	struct variable_list *vls = NULL, *vl;
+	struct perf_probe_point tmp;
 	const char *var;
 
 	buf = synthesize_perf_probe_point(&pev->point);
@@ -731,6 +884,15 @@ static int show_available_vars_at(struct debuginfo *dinfo,
 
 	ret = debuginfo__find_available_vars_at(dinfo, pev, &vls,
 						max_vls, externs);
+	if (!ret) {  /* Not found, retry with an alternative */
+		ret = get_alternative_probe_event(dinfo, pev, &tmp, target);
+		if (!ret) {
+			ret = debuginfo__find_available_vars_at(dinfo, pev,
+						&vls, max_vls, externs);
+			/* Release the old probe_point */
+			clear_perf_probe_point(&tmp);
+		}
+	}
 	if (ret <= 0) {
 		if (ret == 0 || ret == -ENOENT) {
 			pr_err("Failed to find the address of %s\n", buf);
@@ -793,7 +955,7 @@ int show_available_vars(struct perf_probe_event *pevs, int npevs,
 
 	for (i = 0; i < npevs && ret >= 0; i++)
 		ret = show_available_vars_at(dinfo, &pevs[i], max_vls, _filter,
-					     externs);
+					     externs, module);
 
 	debuginfo__delete(dinfo);
 out:
@@ -1739,15 +1901,13 @@ static int convert_to_perf_probe_event(struct probe_trace_event *tev,
 
 void clear_perf_probe_event(struct perf_probe_event *pev)
 {
-	struct perf_probe_point *pp = &pev->point;
 	struct perf_probe_arg_field *field, *next;
 	int i;
 
 	free(pev->event);
 	free(pev->group);
-	free(pp->file);
-	free(pp->function);
-	free(pp->lazy_line);
+	free(pev->target);
+	clear_perf_probe_point(&pev->point);
 
 	for (i = 0; i < pev->nargs; i++) {
 		free(pev->args[i].name);
@@ -1805,7 +1965,7 @@ static void print_open_warning(int err, bool is_kprobe)
 			   " - please rebuild kernel with %s.\n",
 			   is_kprobe ? 'k' : 'u', config);
 	} else if (err == -ENOTSUP)
-		pr_warning("Debugfs is not mounted.\n");
+		pr_warning("Tracefs or debugfs is not mounted.\n");
 	else
 		pr_warning("Failed to open %cprobe_events: %s\n",
 			   is_kprobe ? 'k' : 'u',
@@ -1816,7 +1976,7 @@ static void print_both_open_warning(int kerr, int uerr)
 {
 	/* Both kprobes and uprobes are disabled, warn it. */
 	if (kerr == -ENOTSUP && uerr == -ENOTSUP)
-		pr_warning("Debugfs is not mounted.\n");
+		pr_warning("Tracefs or debugfs is not mounted.\n");
 	else if (kerr == -ENOENT && uerr == -ENOENT)
 		pr_warning("Please rebuild kernel with CONFIG_KPROBE_EVENTS "
 			   "or/and CONFIG_UPROBE_EVENTS.\n");
@@ -1833,13 +1993,20 @@ static int open_probe_events(const char *trace_file, bool readwrite)
 {
 	char buf[PATH_MAX];
 	const char *__debugfs;
+	const char *tracing_dir = "";
 	int ret;
 
-	__debugfs = debugfs_find_mountpoint();
-	if (__debugfs == NULL)
-		return -ENOTSUP;
+	__debugfs = tracefs_find_mountpoint();
+	if (__debugfs == NULL) {
+		tracing_dir = "tracing/";
 
-	ret = e_snprintf(buf, PATH_MAX, "%s/%s", __debugfs, trace_file);
+		__debugfs = debugfs_find_mountpoint();
+		if (__debugfs == NULL)
+			return -ENOTSUP;
+	}
+
+	ret = e_snprintf(buf, PATH_MAX, "%s/%s%s",
+			 __debugfs, tracing_dir, trace_file);
 	if (ret >= 0) {
 		pr_debug("Opening %s write=%d\n", buf, readwrite);
 		if (readwrite && !probe_event_dry_run)
@@ -1855,12 +2022,12 @@ static int open_probe_events(const char *trace_file, bool readwrite)
 
 static int open_kprobe_events(bool readwrite)
 {
-	return open_probe_events("tracing/kprobe_events", readwrite);
+	return open_probe_events("kprobe_events", readwrite);
 }
 
 static int open_uprobe_events(bool readwrite)
 {
-	return open_probe_events("tracing/uprobe_events", readwrite);
+	return open_probe_events("uprobe_events", readwrite);
 }
 
 /* Get raw string list of current kprobe_events  or uprobe_events */
@@ -1895,6 +2062,95 @@ static struct strlist *get_probe_trace_command_rawlist(int fd)
 	return sl;
 }
 
+struct kprobe_blacklist_node {
+	struct list_head list;
+	unsigned long start;
+	unsigned long end;
+	char *symbol;
+};
+
+static void kprobe_blacklist__delete(struct list_head *blacklist)
+{
+	struct kprobe_blacklist_node *node;
+
+	while (!list_empty(blacklist)) {
+		node = list_first_entry(blacklist,
+					struct kprobe_blacklist_node, list);
+		list_del(&node->list);
+		free(node->symbol);
+		free(node);
+	}
+}
+
+static int kprobe_blacklist__load(struct list_head *blacklist)
+{
+	struct kprobe_blacklist_node *node;
+	const char *__debugfs = debugfs_find_mountpoint();
+	char buf[PATH_MAX], *p;
+	FILE *fp;
+	int ret;
+
+	if (__debugfs == NULL)
+		return -ENOTSUP;
+
+	ret = e_snprintf(buf, PATH_MAX, "%s/kprobes/blacklist", __debugfs);
+	if (ret < 0)
+		return ret;
+
+	fp = fopen(buf, "r");
+	if (!fp)
+		return -errno;
+
+	ret = 0;
+	while (fgets(buf, PATH_MAX, fp)) {
+		node = zalloc(sizeof(*node));
+		if (!node) {
+			ret = -ENOMEM;
+			break;
+		}
+		INIT_LIST_HEAD(&node->list);
+		list_add_tail(&node->list, blacklist);
+		if (sscanf(buf, "0x%lx-0x%lx", &node->start, &node->end) != 2) {
+			ret = -EINVAL;
+			break;
+		}
+		p = strchr(buf, '\t');
+		if (p) {
+			p++;
+			if (p[strlen(p) - 1] == '\n')
+				p[strlen(p) - 1] = '\0';
+		} else
+			p = (char *)"unknown";
+		node->symbol = strdup(p);
+		if (!node->symbol) {
+			ret = -ENOMEM;
+			break;
+		}
+		pr_debug2("Blacklist: 0x%lx-0x%lx, %s\n",
+			  node->start, node->end, node->symbol);
+		ret++;
+	}
+	if (ret < 0)
+		kprobe_blacklist__delete(blacklist);
+	fclose(fp);
+
+	return ret;
+}
+
+static struct kprobe_blacklist_node *
+kprobe_blacklist__find_by_address(struct list_head *blacklist,
+				  unsigned long address)
+{
+	struct kprobe_blacklist_node *node;
+
+	list_for_each_entry(node, blacklist, list) {
+		if (node->start <= address && address <= node->end)
+			return node;
+	}
+
+	return NULL;
+}
+
 /* Show an event */
 static int show_perf_probe_event(struct perf_probe_event *pev,
 				 const char *module)
@@ -2100,6 +2356,27 @@ static int get_new_event_name(char *buf, size_t len, const char *base,
 	return ret;
 }
 
+/* Warn if the current kernel's uprobe implementation is old */
+static void warn_uprobe_event_compat(struct probe_trace_event *tev)
+{
+	int i;
+	char *buf = synthesize_probe_trace_command(tev);
+
+	/* Old uprobe event doesn't support memory dereference */
+	if (!tev->uprobes || tev->nargs == 0 || !buf)
+		goto out;
+
+	for (i = 0; i < tev->nargs; i++)
+		if (strglobmatch(tev->args[i].value, "[$@+-]*")) {
+			pr_warning("Please upgrade your kernel to at least "
+				   "3.14 to have access to feature %s\n",
+				   tev->args[i].value);
+			break;
+		}
+out:
+	free(buf);
+}
+
 static int __add_probe_trace_events(struct perf_probe_event *pev,
 				     struct probe_trace_event *tevs,
 				     int ntevs, bool allow_suffix)
@@ -2109,6 +2386,8 @@ static int __add_probe_trace_events(struct perf_probe_event *pev,
 	char buf[64];
 	const char *event, *group;
 	struct strlist *namelist;
+	LIST_HEAD(blacklist);
+	struct kprobe_blacklist_node *node;
 
 	if (pev->uprobes)
 		fd = open_uprobe_events(true);
@@ -2126,11 +2405,25 @@ static int __add_probe_trace_events(struct perf_probe_event *pev,
 		pr_debug("Failed to get current event list.\n");
 		return -EIO;
 	}
+	/* Get kprobe blacklist if exists */
+	if (!pev->uprobes) {
+		ret = kprobe_blacklist__load(&blacklist);
+		if (ret < 0)
+			pr_debug("No kprobe blacklist support, ignored\n");
+	}
 
 	ret = 0;
 	pr_info("Added new event%s\n", (ntevs > 1) ? "s:" : ":");
 	for (i = 0; i < ntevs; i++) {
 		tev = &tevs[i];
+		/* Ensure that the address is NOT blacklisted */
+		node = kprobe_blacklist__find_by_address(&blacklist,
+							 tev->point.address);
+		if (node) {
+			pr_warning("Warning: Skipped probing on blacklisted function: %s\n", node->symbol);
+			continue;
+		}
+
 		if (pev->event)
 			event = pev->event;
 		else
@@ -2180,14 +2473,18 @@ static int __add_probe_trace_events(struct perf_probe_event *pev,
 		 */
 		allow_suffix = true;
 	}
+	if (ret == -EINVAL && pev->uprobes)
+		warn_uprobe_event_compat(tev);
 
-	if (ret >= 0) {
+	/* Note that it is possible to skip all events because of blacklist */
+	if (ret >= 0 && tev->event) {
 		/* Show how to use the event. */
 		pr_info("\nYou can now use it in all perf tools, such as:\n\n");
 		pr_info("\tperf record -e %s:%s -aR sleep 1\n\n", tev->group,
 			 tev->event);
 	}
 
+	kprobe_blacklist__delete(&blacklist);
 	strlist__delete(namelist);
 	close(fd);
 	return ret;
@@ -2199,8 +2496,7 @@ static int find_probe_functions(struct map *map, char *name)
 	struct symbol *sym;
 
 	map__for_each_symbol_by_name(map, name, sym) {
-		if (sym->binding == STB_GLOBAL || sym->binding == STB_LOCAL)
-			found++;
+		found++;
 	}
 
 	return found;
@@ -2218,7 +2514,6 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
 					    int max_tevs, const char *target)
 {
 	struct map *map = NULL;
-	struct kmap *kmap = NULL;
 	struct ref_reloc_sym *reloc_sym = NULL;
 	struct symbol *sym;
 	struct probe_trace_event *tev;
@@ -2227,11 +2522,7 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
 	int num_matched_functions;
 	int ret, i;
 
-	/* Init maps of given executable or kernel */
-	if (pev->uprobes)
-		map = dso__new_map(target);
-	else
-		map = kernel_get_module_map(target);
+	map = get_target_map(target, pev->uprobes);
 	if (!map) {
 		ret = -EINVAL;
 		goto out;
@@ -2255,8 +2546,7 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
 	}
 
 	if (!pev->uprobes && !pp->retprobe) {
-		kmap = map__kmap(map);
-		reloc_sym = kmap->ref_reloc_sym;
+		reloc_sym = kernel_get_ref_reloc_sym();
 		if (!reloc_sym) {
 			pr_warning("Relocated base symbol is not found!\n");
 			ret = -EINVAL;
@@ -2324,11 +2614,7 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
 	}
 
 out:
-	if (map && pev->uprobes) {
-		/* Only when using uprobe(exec) map needs to be released */
-		dso__delete(map->dso);
-		map__delete(map);
-	}
+	put_target_map(map, pev->uprobes);
 	return ret;
 
 nomem_out:
@@ -2369,7 +2655,7 @@ struct __event_package {
 };
 
 int add_perf_probe_events(struct perf_probe_event *pevs, int npevs,
-			  int max_tevs, const char *target, bool force_add)
+			  int max_tevs, bool force_add)
 {
 	int i, j, ret;
 	struct __event_package *pkgs;
@@ -2393,7 +2679,7 @@ int add_perf_probe_events(struct perf_probe_event *pevs, int npevs,
 		ret  = convert_to_probe_trace_events(pkgs[i].pev,
 						     &pkgs[i].tevs,
 						     max_tevs,
-						     target);
+						     pkgs[i].pev->target);
 		if (ret < 0)
 			goto end;
 		pkgs[i].ntevs = ret;
@@ -2568,8 +2854,7 @@ static struct strfilter *available_func_filter;
 static int filter_available_functions(struct map *map __maybe_unused,
 				      struct symbol *sym)
 {
-	if ((sym->binding == STB_GLOBAL || sym->binding == STB_LOCAL) &&
-	    strfilter__compare(available_func_filter, sym->name))
+	if (strfilter__compare(available_func_filter, sym->name))
 		return 0;
 	return 1;
 }
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index e01e9943139f..d6b783447be9 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -73,7 +73,8 @@ struct perf_probe_event {
 	char			*group;	/* Group name */
 	struct perf_probe_point	point;	/* Probe point */
 	int			nargs;	/* Number of arguments */
-	bool			uprobes;
+	bool			uprobes;	/* Uprobe event flag */
+	char			*target;	/* Target binary */
 	struct perf_probe_arg	*args;	/* Arguments */
 };
 
@@ -124,8 +125,7 @@ extern int line_range__init(struct line_range *lr);
 extern const char *kernel_get_module_path(const char *module);
 
 extern int add_perf_probe_events(struct perf_probe_event *pevs, int npevs,
-				 int max_probe_points, const char *module,
-				 bool force_add);
+				 int max_probe_points, bool force_add);
 extern int del_perf_probe_events(struct strlist *dellist);
 extern int show_perf_probe_events(void);
 extern int show_line_range(struct line_range *lr, const char *module,
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index d32cf93565da..e3074230f236 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -460,7 +460,8 @@ static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname,
 			       " nor array.\n", varname);
 			return -EINVAL;
 		}
-		if (field->ref) {
+		/* While prcessing unnamed field, we don't care about this */
+		if (field->ref && dwarf_diename(vr_die)) {
 			pr_err("Semantic error: %s must be referred by '.'\n",
 			       field->name);
 			return -EINVAL;
@@ -491,6 +492,11 @@ static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname,
 	}
 	ref->offset += (long)offs;
 
+	/* If this member is unnamed, we need to reuse this field */
+	if (!dwarf_diename(die_mem))
+		return convert_variable_fields(die_mem, varname, field,
+						&ref, die_mem);
+
 next:
 	/* Converting next field */
 	if (field->next)
@@ -915,17 +921,13 @@ static int probe_point_search_cb(Dwarf_Die *sp_die, void *data)
 		dwarf_decl_line(sp_die, &pf->lno);
 		pf->lno += pp->line;
 		param->retval = find_probe_point_by_line(pf);
-	} else if (!dwarf_func_inline(sp_die)) {
+	} else if (die_is_func_instance(sp_die)) {
+		/* Instances always have the entry address */
+		dwarf_entrypc(sp_die, &pf->addr);
 		/* Real function */
 		if (pp->lazy_line)
 			param->retval = find_probe_point_lazy(sp_die, pf);
 		else {
-			if (dwarf_entrypc(sp_die, &pf->addr) != 0) {
-				pr_warning("Failed to get entry address of "
-					   "%s.\n", dwarf_diename(sp_die));
-				param->retval = -ENOENT;
-				return DWARF_CB_ABORT;
-			}
 			pf->addr += pp->offset;
 			/* TODO: Check the address in this function */
 			param->retval = call_probe_finder(sp_die, pf);
@@ -1349,11 +1351,8 @@ int debuginfo__find_probe_point(struct debuginfo *dbg, unsigned long addr,
 	const char *fname = NULL, *func = NULL, *basefunc = NULL, *tmp;
 	int baseline = 0, lineno = 0, ret = 0;
 
-	/* Adjust address with bias */
-	addr += dbg->bias;
-
 	/* Find cu die */
-	if (!dwarf_addrdie(dbg->dbg, (Dwarf_Addr)addr - dbg->bias, &cudie)) {
+	if (!dwarf_addrdie(dbg->dbg, (Dwarf_Addr)addr, &cudie)) {
 		pr_warning("Failed to find debug information for address %lx\n",
 			   addr);
 		ret = -EINVAL;
@@ -1536,7 +1535,7 @@ static int line_range_search_cb(Dwarf_Die *sp_die, void *data)
 		pr_debug("New line range: %d to %d\n", lf->lno_s, lf->lno_e);
 		lr->start = lf->lno_s;
 		lr->end = lf->lno_e;
-		if (dwarf_func_inline(sp_die))
+		if (!die_is_func_instance(sp_die))
 			param->retval = die_walk_instances(sp_die,
 						line_range_inline_cb, lf);
 		else
diff --git a/tools/perf/util/python-ext-sources b/tools/perf/util/python-ext-sources
index 6c6a6953fa93..4d28624a1eca 100644
--- a/tools/perf/util/python-ext-sources
+++ b/tools/perf/util/python-ext-sources
@@ -17,6 +17,5 @@ util/xyarray.c
 util/cgroup.c
 util/rblist.c
 util/strlist.c
-../lib/api/fs/fs.c
 util/trace-event.c
 ../../lib/rbtree.c
diff --git a/tools/perf/util/scripting-engines/Build b/tools/perf/util/scripting-engines/Build
new file mode 100644
index 000000000000..6516e220c247
--- /dev/null
+++ b/tools/perf/util/scripting-engines/Build
@@ -0,0 +1,6 @@
+libperf-$(CONFIG_LIBPERL)   += trace-event-perl.o
+libperf-$(CONFIG_LIBPYTHON) += trace-event-python.o
+
+CFLAGS_trace-event-perl.o += $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow -Wno-undef -Wno-switch-default
+
+CFLAGS_trace-event-python.o += $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow
diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c
index 22ebc46226e7..430b5d27828e 100644
--- a/tools/perf/util/scripting-engines/trace-event-perl.c
+++ b/tools/perf/util/scripting-engines/trace-event-perl.c
@@ -214,6 +214,11 @@ static void define_event_symbols(struct event_format *event,
 		define_event_symbols(event, ev_name, args->hex.field);
 		define_event_symbols(event, ev_name, args->hex.size);
 		break;
+	case PRINT_INT_ARRAY:
+		define_event_symbols(event, ev_name, args->int_array.field);
+		define_event_symbols(event, ev_name, args->int_array.count);
+		define_event_symbols(event, ev_name, args->int_array.el_size);
+		break;
 	case PRINT_BSTRING:
 	case PRINT_DYNAMIC_ARRAY:
 	case PRINT_STRING:
@@ -355,10 +360,9 @@ static void perl_process_event_generic(union perf_event *event,
 static void perl_process_event(union perf_event *event,
 			       struct perf_sample *sample,
 			       struct perf_evsel *evsel,
-			       struct thread *thread,
-			       struct addr_location *al __maybe_unused)
+			       struct addr_location *al)
 {
-	perl_process_tracepoint(sample, evsel, thread);
+	perl_process_tracepoint(sample, evsel, al->thread);
 	perl_process_event_generic(event, sample, evsel);
 }
 
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index 0c815a40a6e8..5544b8cdd1ee 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -231,6 +231,11 @@ static void define_event_symbols(struct event_format *event,
 		define_event_symbols(event, ev_name, args->hex.field);
 		define_event_symbols(event, ev_name, args->hex.size);
 		break;
+	case PRINT_INT_ARRAY:
+		define_event_symbols(event, ev_name, args->int_array.field);
+		define_event_symbols(event, ev_name, args->int_array.count);
+		define_event_symbols(event, ev_name, args->int_array.el_size);
+		break;
 	case PRINT_STRING:
 		break;
 	case PRINT_TYPE:
@@ -376,7 +381,6 @@ exit:
 
 static void python_process_tracepoint(struct perf_sample *sample,
 				      struct perf_evsel *evsel,
-				      struct thread *thread,
 				      struct addr_location *al)
 {
 	struct event_format *event = evsel->tp_format;
@@ -390,7 +394,7 @@ static void python_process_tracepoint(struct perf_sample *sample,
 	int cpu = sample->cpu;
 	void *data = sample->raw_data;
 	unsigned long long nsecs = sample->time;
-	const char *comm = thread__comm_str(thread);
+	const char *comm = thread__comm_str(al->thread);
 
 	t = PyTuple_New(MAX_FIELDS);
 	if (!t)
@@ -675,7 +679,7 @@ static int python_export_sample(struct db_export *dbe,
 	tuple_set_u64(t, 0, es->db_id);
 	tuple_set_u64(t, 1, es->evsel->db_id);
 	tuple_set_u64(t, 2, es->al->machine->db_id);
-	tuple_set_u64(t, 3, es->thread->db_id);
+	tuple_set_u64(t, 3, es->al->thread->db_id);
 	tuple_set_u64(t, 4, es->comm_db_id);
 	tuple_set_u64(t, 5, es->dso_db_id);
 	tuple_set_u64(t, 6, es->sym_db_id);
@@ -761,7 +765,6 @@ static int python_process_call_return(struct call_return *cr, void *data)
 
 static void python_process_general_event(struct perf_sample *sample,
 					 struct perf_evsel *evsel,
-					 struct thread *thread,
 					 struct addr_location *al)
 {
 	PyObject *handler, *t, *dict, *callchain, *dict_sample;
@@ -811,7 +814,7 @@ static void python_process_general_event(struct perf_sample *sample,
 	pydict_set_item_string_decref(dict, "raw_buf", PyString_FromStringAndSize(
 			(const char *)sample->raw_data, sample->raw_size));
 	pydict_set_item_string_decref(dict, "comm",
-			PyString_FromString(thread__comm_str(thread)));
+			PyString_FromString(thread__comm_str(al->thread)));
 	if (al->map) {
 		pydict_set_item_string_decref(dict, "dso",
 			PyString_FromString(al->map->dso->name));
@@ -838,22 +841,20 @@ exit:
 static void python_process_event(union perf_event *event,
 				 struct perf_sample *sample,
 				 struct perf_evsel *evsel,
-				 struct thread *thread,
 				 struct addr_location *al)
 {
 	struct tables *tables = &tables_global;
 
 	switch (evsel->attr.type) {
 	case PERF_TYPE_TRACEPOINT:
-		python_process_tracepoint(sample, evsel, thread, al);
+		python_process_tracepoint(sample, evsel, al);
 		break;
 	/* Reserve for future process_hw/sw/raw APIs */
 	default:
 		if (tables->db_export_mode)
-			db_export__sample(&tables->dbe, event, sample, evsel,
-					  thread, al);
+			db_export__sample(&tables->dbe, event, sample, evsel, al);
 		else
-			python_process_general_event(sample, evsel, thread, al);
+			python_process_general_event(sample, evsel, al);
 	}
 }
 
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 0baf75f12b7c..0c74012575ac 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -16,6 +16,12 @@
 #include "perf_regs.h"
 #include "asm/bug.h"
 
+static int machines__deliver_event(struct machines *machines,
+				   struct perf_evlist *evlist,
+				   union perf_event *event,
+				   struct perf_sample *sample,
+				   struct perf_tool *tool, u64 file_offset);
+
 static int perf_session__open(struct perf_session *session)
 {
 	struct perf_data_file *file = session->file;
@@ -86,6 +92,23 @@ static void perf_session__set_comm_exec(struct perf_session *session)
 	machines__set_comm_exec(&session->machines, comm_exec);
 }
 
+static int ordered_events__deliver_event(struct ordered_events *oe,
+					 struct ordered_event *event)
+{
+	struct perf_sample sample;
+	struct perf_session *session = container_of(oe, struct perf_session,
+						    ordered_events);
+	int ret = perf_evlist__parse_sample(session->evlist, event->event, &sample);
+
+	if (ret) {
+		pr_err("Can't parse sample, err = %d\n", ret);
+		return ret;
+	}
+
+	return machines__deliver_event(&session->machines, session->evlist, event->event,
+				       &sample, session->tool, event->file_offset);
+}
+
 struct perf_session *perf_session__new(struct perf_data_file *file,
 				       bool repipe, struct perf_tool *tool)
 {
@@ -95,8 +118,9 @@ struct perf_session *perf_session__new(struct perf_data_file *file,
 		goto out;
 
 	session->repipe = repipe;
-	ordered_events__init(&session->ordered_events);
+	session->tool   = tool;
 	machines__init(&session->machines);
+	ordered_events__init(&session->ordered_events, ordered_events__deliver_event);
 
 	if (file) {
 		if (perf_data_file__open(file))
@@ -138,11 +162,6 @@ struct perf_session *perf_session__new(struct perf_data_file *file,
 	return NULL;
 }
 
-static void perf_session__delete_dead_threads(struct perf_session *session)
-{
-	machine__delete_dead_threads(&session->machines.host);
-}
-
 static void perf_session__delete_threads(struct perf_session *session)
 {
 	machine__delete_threads(&session->machines.host);
@@ -167,7 +186,6 @@ static void perf_session_env__delete(struct perf_session_env *env)
 void perf_session__delete(struct perf_session *session)
 {
 	perf_session__destroy_kernel_maps(session);
-	perf_session__delete_dead_threads(session);
 	perf_session__delete_threads(session);
 	perf_session_env__delete(&session->header.env);
 	machines__exit(&session->machines);
@@ -215,10 +233,17 @@ static int process_event_stub(struct perf_tool *tool __maybe_unused,
 	return 0;
 }
 
+static int process_build_id_stub(struct perf_tool *tool __maybe_unused,
+				 union perf_event *event __maybe_unused,
+				 struct perf_session *session __maybe_unused)
+{
+	dump_printf(": unhandled!\n");
+	return 0;
+}
+
 static int process_finished_round_stub(struct perf_tool *tool __maybe_unused,
 				       union perf_event *event __maybe_unused,
-				       struct perf_session *perf_session
-				       __maybe_unused)
+				       struct ordered_events *oe __maybe_unused)
 {
 	dump_printf(": unhandled!\n");
 	return 0;
@@ -226,7 +251,7 @@ static int process_finished_round_stub(struct perf_tool *tool __maybe_unused,
 
 static int process_finished_round(struct perf_tool *tool,
 				  union perf_event *event,
-				  struct perf_session *session);
+				  struct ordered_events *oe);
 
 static int process_id_index_stub(struct perf_tool *tool __maybe_unused,
 				 union perf_event *event __maybe_unused,
@@ -264,7 +289,7 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
 	if (tool->tracing_data == NULL)
 		tool->tracing_data = process_event_synth_tracing_data_stub;
 	if (tool->build_id == NULL)
-		tool->build_id = process_finished_round_stub;
+		tool->build_id = process_build_id_stub;
 	if (tool->finished_round == NULL) {
 		if (tool->ordered_events)
 			tool->finished_round = process_finished_round;
@@ -514,54 +539,80 @@ static perf_event__swap_op perf_event__swap_ops[] = {
  *      Flush every events below timestamp 7
  *      etc...
  */
-static int process_finished_round(struct perf_tool *tool,
+static int process_finished_round(struct perf_tool *tool __maybe_unused,
 				  union perf_event *event __maybe_unused,
-				  struct perf_session *session)
+				  struct ordered_events *oe)
 {
-	return ordered_events__flush(session, tool, OE_FLUSH__ROUND);
+	return ordered_events__flush(oe, OE_FLUSH__ROUND);
 }
 
-int perf_session_queue_event(struct perf_session *s, union perf_event *event,
-			     struct perf_tool *tool, struct perf_sample *sample,
-			     u64 file_offset)
+int perf_session__queue_event(struct perf_session *s, union perf_event *event,
+			      struct perf_sample *sample, u64 file_offset)
 {
-	struct ordered_events *oe = &s->ordered_events;
-	u64 timestamp = sample->time;
-	struct ordered_event *new;
-
-	if (!timestamp || timestamp == ~0ULL)
-		return -ETIME;
-
-	if (timestamp < oe->last_flush) {
-		pr_oe_time(timestamp,      "out of order event\n");
-		pr_oe_time(oe->last_flush, "last flush, last_flush_type %d\n",
-			   oe->last_flush_type);
-
-		s->stats.nr_unordered_events++;
-	}
-
-	new = ordered_events__new(oe, timestamp, event);
-	if (!new) {
-		ordered_events__flush(s, tool, OE_FLUSH__HALF);
-		new = ordered_events__new(oe, timestamp, event);
-	}
-
-	if (!new)
-		return -ENOMEM;
-
-	new->file_offset = file_offset;
-	return 0;
+	return ordered_events__queue(&s->ordered_events, event, sample, file_offset);
 }
 
-static void callchain__printf(struct perf_sample *sample)
+static void callchain__lbr_callstack_printf(struct perf_sample *sample)
 {
+	struct ip_callchain *callchain = sample->callchain;
+	struct branch_stack *lbr_stack = sample->branch_stack;
+	u64 kernel_callchain_nr = callchain->nr;
 	unsigned int i;
 
-	printf("... chain: nr:%" PRIu64 "\n", sample->callchain->nr);
+	for (i = 0; i < kernel_callchain_nr; i++) {
+		if (callchain->ips[i] == PERF_CONTEXT_USER)
+			break;
+	}
+
+	if ((i != kernel_callchain_nr) && lbr_stack->nr) {
+		u64 total_nr;
+		/*
+		 * LBR callstack can only get user call chain,
+		 * i is kernel call chain number,
+		 * 1 is PERF_CONTEXT_USER.
+		 *
+		 * The user call chain is stored in LBR registers.
+		 * LBR are pair registers. The caller is stored
+		 * in "from" register, while the callee is stored
+		 * in "to" register.
+		 * For example, there is a call stack
+		 * "A"->"B"->"C"->"D".
+		 * The LBR registers will recorde like
+		 * "C"->"D", "B"->"C", "A"->"B".
+		 * So only the first "to" register and all "from"
+		 * registers are needed to construct the whole stack.
+		 */
+		total_nr = i + 1 + lbr_stack->nr + 1;
+		kernel_callchain_nr = i + 1;
+
+		printf("... LBR call chain: nr:%" PRIu64 "\n", total_nr);
+
+		for (i = 0; i < kernel_callchain_nr; i++)
+			printf("..... %2d: %016" PRIx64 "\n",
+			       i, callchain->ips[i]);
 
-	for (i = 0; i < sample->callchain->nr; i++)
 		printf("..... %2d: %016" PRIx64 "\n",
-		       i, sample->callchain->ips[i]);
+		       (int)(kernel_callchain_nr), lbr_stack->entries[0].to);
+		for (i = 0; i < lbr_stack->nr; i++)
+			printf("..... %2d: %016" PRIx64 "\n",
+			       (int)(i + kernel_callchain_nr + 1), lbr_stack->entries[i].from);
+	}
+}
+
+static void callchain__printf(struct perf_evsel *evsel,
+			      struct perf_sample *sample)
+{
+	unsigned int i;
+	struct ip_callchain *callchain = sample->callchain;
+
+	if (has_branch_callstack(evsel))
+		callchain__lbr_callstack_printf(sample);
+
+	printf("... FP chain: nr:%" PRIu64 "\n", callchain->nr);
+
+	for (i = 0; i < callchain->nr; i++)
+		printf("..... %2d: %016" PRIx64 "\n",
+		       i, callchain->ips[i]);
 }
 
 static void branch_stack__printf(struct perf_sample *sample)
@@ -636,14 +687,14 @@ static void stack_user__printf(struct stack_dump *dump)
 	       dump->size, dump->offset);
 }
 
-static void perf_session__print_tstamp(struct perf_session *session,
+static void perf_evlist__print_tstamp(struct perf_evlist *evlist,
 				       union perf_event *event,
 				       struct perf_sample *sample)
 {
-	u64 sample_type = __perf_evlist__combined_sample_type(session->evlist);
+	u64 sample_type = __perf_evlist__combined_sample_type(evlist);
 
 	if (event->header.type != PERF_RECORD_SAMPLE &&
-	    !perf_evlist__sample_id_all(session->evlist)) {
+	    !perf_evlist__sample_id_all(evlist)) {
 		fputs("-1 -1 ", stdout);
 		return;
 	}
@@ -685,7 +736,7 @@ static void sample_read__printf(struct perf_sample *sample, u64 read_format)
 			sample->read.one.id, sample->read.one.value);
 }
 
-static void dump_event(struct perf_session *session, union perf_event *event,
+static void dump_event(struct perf_evlist *evlist, union perf_event *event,
 		       u64 file_offset, struct perf_sample *sample)
 {
 	if (!dump_trace)
@@ -697,7 +748,7 @@ static void dump_event(struct perf_session *session, union perf_event *event,
 	trace_event(event);
 
 	if (sample)
-		perf_session__print_tstamp(session, event, sample);
+		perf_evlist__print_tstamp(evlist, event, sample);
 
 	printf("%#" PRIx64 " [%#x]: PERF_RECORD_%s", file_offset,
 	       event->header.size, perf_event__name(event->header.type));
@@ -718,9 +769,9 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event,
 	sample_type = evsel->attr.sample_type;
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN)
-		callchain__printf(sample);
+		callchain__printf(evsel, sample);
 
-	if (sample_type & PERF_SAMPLE_BRANCH_STACK)
+	if ((sample_type & PERF_SAMPLE_BRANCH_STACK) && !has_branch_callstack(evsel))
 		branch_stack__printf(sample);
 
 	if (sample_type & PERF_SAMPLE_REGS_USER)
@@ -745,8 +796,7 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event,
 		sample_read__printf(sample, evsel->attr.read_format);
 }
 
-static struct machine *
-	perf_session__find_machine_for_cpumode(struct perf_session *session,
+static struct machine *machines__find_for_cpumode(struct machines *machines,
 					       union perf_event *event,
 					       struct perf_sample *sample)
 {
@@ -764,26 +814,24 @@ static struct machine *
 		else
 			pid = sample->pid;
 
-		machine = perf_session__find_machine(session, pid);
+		machine = machines__find(machines, pid);
 		if (!machine)
-			machine = perf_session__findnew_machine(session,
-						DEFAULT_GUEST_KERNEL_ID);
+			machine = machines__find(machines, DEFAULT_GUEST_KERNEL_ID);
 		return machine;
 	}
 
-	return &session->machines.host;
+	return &machines->host;
 }
 
-static int deliver_sample_value(struct perf_session *session,
+static int deliver_sample_value(struct perf_evlist *evlist,
 				struct perf_tool *tool,
 				union perf_event *event,
 				struct perf_sample *sample,
 				struct sample_read_value *v,
 				struct machine *machine)
 {
-	struct perf_sample_id *sid;
+	struct perf_sample_id *sid = perf_evlist__id2sid(evlist, v->id);
 
-	sid = perf_evlist__id2sid(session->evlist, v->id);
 	if (sid) {
 		sample->id     = v->id;
 		sample->period = v->value - sid->period;
@@ -791,14 +839,14 @@ static int deliver_sample_value(struct perf_session *session,
 	}
 
 	if (!sid || sid->evsel == NULL) {
-		++session->stats.nr_unknown_id;
+		++evlist->stats.nr_unknown_id;
 		return 0;
 	}
 
 	return tool->sample(tool, event, sample, sid->evsel, machine);
 }
 
-static int deliver_sample_group(struct perf_session *session,
+static int deliver_sample_group(struct perf_evlist *evlist,
 				struct perf_tool *tool,
 				union  perf_event *event,
 				struct perf_sample *sample,
@@ -808,7 +856,7 @@ static int deliver_sample_group(struct perf_session *session,
 	u64 i;
 
 	for (i = 0; i < sample->read.group.nr; i++) {
-		ret = deliver_sample_value(session, tool, event, sample,
+		ret = deliver_sample_value(evlist, tool, event, sample,
 					   &sample->read.group.values[i],
 					   machine);
 		if (ret)
@@ -819,7 +867,7 @@ static int deliver_sample_group(struct perf_session *session,
 }
 
 static int
-perf_session__deliver_sample(struct perf_session *session,
+ perf_evlist__deliver_sample(struct perf_evlist *evlist,
 			     struct perf_tool *tool,
 			     union  perf_event *event,
 			     struct perf_sample *sample,
@@ -836,41 +884,40 @@ perf_session__deliver_sample(struct perf_session *session,
 
 	/* For PERF_SAMPLE_READ we have either single or group mode. */
 	if (read_format & PERF_FORMAT_GROUP)
-		return deliver_sample_group(session, tool, event, sample,
+		return deliver_sample_group(evlist, tool, event, sample,
 					    machine);
 	else
-		return deliver_sample_value(session, tool, event, sample,
+		return deliver_sample_value(evlist, tool, event, sample,
 					    &sample->read.one, machine);
 }
 
-int perf_session__deliver_event(struct perf_session *session,
-				union perf_event *event,
-				struct perf_sample *sample,
-				struct perf_tool *tool, u64 file_offset)
+static int machines__deliver_event(struct machines *machines,
+				   struct perf_evlist *evlist,
+				   union perf_event *event,
+				   struct perf_sample *sample,
+				   struct perf_tool *tool, u64 file_offset)
 {
 	struct perf_evsel *evsel;
 	struct machine *machine;
 
-	dump_event(session, event, file_offset, sample);
+	dump_event(evlist, event, file_offset, sample);
 
-	evsel = perf_evlist__id2evsel(session->evlist, sample->id);
+	evsel = perf_evlist__id2evsel(evlist, sample->id);
 
-	machine = perf_session__find_machine_for_cpumode(session, event,
-							 sample);
+	machine = machines__find_for_cpumode(machines, event, sample);
 
 	switch (event->header.type) {
 	case PERF_RECORD_SAMPLE:
 		dump_sample(evsel, event, sample);
 		if (evsel == NULL) {
-			++session->stats.nr_unknown_id;
+			++evlist->stats.nr_unknown_id;
 			return 0;
 		}
 		if (machine == NULL) {
-			++session->stats.nr_unprocessable_samples;
+			++evlist->stats.nr_unprocessable_samples;
 			return 0;
 		}
-		return perf_session__deliver_sample(session, tool, event,
-						    sample, evsel, machine);
+		return perf_evlist__deliver_sample(evlist, tool, event, sample, evsel, machine);
 	case PERF_RECORD_MMAP:
 		return tool->mmap(tool, event, sample, machine);
 	case PERF_RECORD_MMAP2:
@@ -883,7 +930,7 @@ int perf_session__deliver_event(struct perf_session *session,
 		return tool->exit(tool, event, sample, machine);
 	case PERF_RECORD_LOST:
 		if (tool->lost == perf_event__process_lost)
-			session->stats.total_lost += event->lost.lost;
+			evlist->stats.total_lost += event->lost.lost;
 		return tool->lost(tool, event, sample, machine);
 	case PERF_RECORD_READ:
 		return tool->read(tool, event, sample, evsel, machine);
@@ -892,20 +939,21 @@ int perf_session__deliver_event(struct perf_session *session,
 	case PERF_RECORD_UNTHROTTLE:
 		return tool->unthrottle(tool, event, sample, machine);
 	default:
-		++session->stats.nr_unknown_events;
+		++evlist->stats.nr_unknown_events;
 		return -1;
 	}
 }
 
 static s64 perf_session__process_user_event(struct perf_session *session,
 					    union perf_event *event,
-					    struct perf_tool *tool,
 					    u64 file_offset)
 {
+	struct ordered_events *oe = &session->ordered_events;
+	struct perf_tool *tool = session->tool;
 	int fd = perf_data_file__fd(session->file);
 	int err;
 
-	dump_event(session, event, file_offset, NULL);
+	dump_event(session->evlist, event, file_offset, NULL);
 
 	/* These events are processed right away */
 	switch (event->header.type) {
@@ -929,7 +977,7 @@ static s64 perf_session__process_user_event(struct perf_session *session,
 	case PERF_RECORD_HEADER_BUILD_ID:
 		return tool->build_id(tool, event, session);
 	case PERF_RECORD_FINISHED_ROUND:
-		return tool->finished_round(tool, event, session);
+		return tool->finished_round(tool, event, oe);
 	case PERF_RECORD_ID_INDEX:
 		return tool->id_index(tool, event, session);
 	default:
@@ -939,15 +987,17 @@ static s64 perf_session__process_user_event(struct perf_session *session,
 
 int perf_session__deliver_synth_event(struct perf_session *session,
 				      union perf_event *event,
-				      struct perf_sample *sample,
-				      struct perf_tool *tool)
+				      struct perf_sample *sample)
 {
-	events_stats__inc(&session->stats, event->header.type);
+	struct perf_evlist *evlist = session->evlist;
+	struct perf_tool *tool = session->tool;
+
+	events_stats__inc(&evlist->stats, event->header.type);
 
 	if (event->header.type >= PERF_RECORD_USER_TYPE_START)
-		return perf_session__process_user_event(session, event, tool, 0);
+		return perf_session__process_user_event(session, event, 0);
 
-	return perf_session__deliver_event(session, event, sample, tool, 0);
+	return machines__deliver_event(&session->machines, evlist, event, sample, tool, 0);
 }
 
 static void event_swap(union perf_event *event, bool sample_id_all)
@@ -1015,40 +1065,39 @@ out_parse_sample:
 }
 
 static s64 perf_session__process_event(struct perf_session *session,
-				       union perf_event *event,
-				       struct perf_tool *tool,
-				       u64 file_offset)
+				       union perf_event *event, u64 file_offset)
 {
+	struct perf_evlist *evlist = session->evlist;
+	struct perf_tool *tool = session->tool;
 	struct perf_sample sample;
 	int ret;
 
 	if (session->header.needs_swap)
-		event_swap(event, perf_evlist__sample_id_all(session->evlist));
+		event_swap(event, perf_evlist__sample_id_all(evlist));
 
 	if (event->header.type >= PERF_RECORD_HEADER_MAX)
 		return -EINVAL;
 
-	events_stats__inc(&session->stats, event->header.type);
+	events_stats__inc(&evlist->stats, event->header.type);
 
 	if (event->header.type >= PERF_RECORD_USER_TYPE_START)
-		return perf_session__process_user_event(session, event, tool, file_offset);
+		return perf_session__process_user_event(session, event, file_offset);
 
 	/*
 	 * For all kernel events we get the sample data
 	 */
-	ret = perf_evlist__parse_sample(session->evlist, event, &sample);
+	ret = perf_evlist__parse_sample(evlist, event, &sample);
 	if (ret)
 		return ret;
 
 	if (tool->ordered_events) {
-		ret = perf_session_queue_event(session, event, tool, &sample,
-					       file_offset);
+		ret = perf_session__queue_event(session, event, &sample, file_offset);
 		if (ret != -ETIME)
 			return ret;
 	}
 
-	return perf_session__deliver_event(session, event, &sample, tool,
-					   file_offset);
+	return machines__deliver_event(&session->machines, evlist, event,
+				       &sample, tool, file_offset);
 }
 
 void perf_event_header__bswap(struct perf_event_header *hdr)
@@ -1076,54 +1125,57 @@ static struct thread *perf_session__register_idle_thread(struct perf_session *se
 	return thread;
 }
 
-static void perf_session__warn_about_errors(const struct perf_session *session,
-					    const struct perf_tool *tool)
+static void perf_session__warn_about_errors(const struct perf_session *session)
 {
-	if (tool->lost == perf_event__process_lost &&
-	    session->stats.nr_events[PERF_RECORD_LOST] != 0) {
+	const struct events_stats *stats = &session->evlist->stats;
+	const struct ordered_events *oe = &session->ordered_events;
+
+	if (session->tool->lost == perf_event__process_lost &&
+	    stats->nr_events[PERF_RECORD_LOST] != 0) {
 		ui__warning("Processed %d events and lost %d chunks!\n\n"
 			    "Check IO/CPU overload!\n\n",
-			    session->stats.nr_events[0],
-			    session->stats.nr_events[PERF_RECORD_LOST]);
+			    stats->nr_events[0],
+			    stats->nr_events[PERF_RECORD_LOST]);
 	}
 
-	if (session->stats.nr_unknown_events != 0) {
+	if (stats->nr_unknown_events != 0) {
 		ui__warning("Found %u unknown events!\n\n"
 			    "Is this an older tool processing a perf.data "
 			    "file generated by a more recent tool?\n\n"
 			    "If that is not the case, consider "
 			    "reporting to linux-kernel@vger.kernel.org.\n\n",
-			    session->stats.nr_unknown_events);
+			    stats->nr_unknown_events);
 	}
 
-	if (session->stats.nr_unknown_id != 0) {
+	if (stats->nr_unknown_id != 0) {
 		ui__warning("%u samples with id not present in the header\n",
-			    session->stats.nr_unknown_id);
+			    stats->nr_unknown_id);
 	}
 
- 	if (session->stats.nr_invalid_chains != 0) {
- 		ui__warning("Found invalid callchains!\n\n"
- 			    "%u out of %u events were discarded for this reason.\n\n"
- 			    "Consider reporting to linux-kernel@vger.kernel.org.\n\n",
- 			    session->stats.nr_invalid_chains,
- 			    session->stats.nr_events[PERF_RECORD_SAMPLE]);
- 	}
+	if (stats->nr_invalid_chains != 0) {
+		ui__warning("Found invalid callchains!\n\n"
+			    "%u out of %u events were discarded for this reason.\n\n"
+			    "Consider reporting to linux-kernel@vger.kernel.org.\n\n",
+			    stats->nr_invalid_chains,
+			    stats->nr_events[PERF_RECORD_SAMPLE]);
+	}
 
-	if (session->stats.nr_unprocessable_samples != 0) {
+	if (stats->nr_unprocessable_samples != 0) {
 		ui__warning("%u unprocessable samples recorded.\n"
 			    "Do you have a KVM guest running and not using 'perf kvm'?\n",
-			    session->stats.nr_unprocessable_samples);
+			    stats->nr_unprocessable_samples);
 	}
 
-	if (session->stats.nr_unordered_events != 0)
-		ui__warning("%u out of order events recorded.\n", session->stats.nr_unordered_events);
+	if (oe->nr_unordered_events != 0)
+		ui__warning("%u out of order events recorded.\n", oe->nr_unordered_events);
 }
 
 volatile int session_done;
 
-static int __perf_session__process_pipe_events(struct perf_session *session,
-					       struct perf_tool *tool)
+static int __perf_session__process_pipe_events(struct perf_session *session)
 {
+	struct ordered_events *oe = &session->ordered_events;
+	struct perf_tool *tool = session->tool;
 	int fd = perf_data_file__fd(session->file);
 	union perf_event *event;
 	uint32_t size, cur_size = 0;
@@ -1187,7 +1239,7 @@ more:
 		}
 	}
 
-	if ((skip = perf_session__process_event(session, event, tool, head)) < 0) {
+	if ((skip = perf_session__process_event(session, event, head)) < 0) {
 		pr_err("%#" PRIx64 " [%#x]: failed to process type: %d\n",
 		       head, event->header.size, event->header.type);
 		err = -EINVAL;
@@ -1203,10 +1255,10 @@ more:
 		goto more;
 done:
 	/* do the final flush for ordered samples */
-	err = ordered_events__flush(session, tool, OE_FLUSH__FINAL);
+	err = ordered_events__flush(oe, OE_FLUSH__FINAL);
 out_err:
 	free(buf);
-	perf_session__warn_about_errors(session, tool);
+	perf_session__warn_about_errors(session);
 	ordered_events__free(&session->ordered_events);
 	return err;
 }
@@ -1253,8 +1305,10 @@ fetch_mmaped_event(struct perf_session *session,
 
 static int __perf_session__process_events(struct perf_session *session,
 					  u64 data_offset, u64 data_size,
-					  u64 file_size, struct perf_tool *tool)
+					  u64 file_size)
 {
+	struct ordered_events *oe = &session->ordered_events;
+	struct perf_tool *tool = session->tool;
 	int fd = perf_data_file__fd(session->file);
 	u64 head, page_offset, file_offset, file_pos, size;
 	int err, mmap_prot, mmap_flags, map_idx = 0;
@@ -1323,8 +1377,7 @@ more:
 	size = event->header.size;
 
 	if (size < sizeof(struct perf_event_header) ||
-	    (skip = perf_session__process_event(session, event, tool, file_pos))
-									< 0) {
+	    (skip = perf_session__process_event(session, event, file_pos)) < 0) {
 		pr_err("%#" PRIx64 " [%#x]: failed to process type: %d\n",
 		       file_offset + head, event->header.size,
 		       event->header.type);
@@ -1348,17 +1401,16 @@ more:
 
 out:
 	/* do the final flush for ordered samples */
-	err = ordered_events__flush(session, tool, OE_FLUSH__FINAL);
+	err = ordered_events__flush(oe, OE_FLUSH__FINAL);
 out_err:
 	ui_progress__finish();
-	perf_session__warn_about_errors(session, tool);
+	perf_session__warn_about_errors(session);
 	ordered_events__free(&session->ordered_events);
 	session->one_mmap = false;
 	return err;
 }
 
-int perf_session__process_events(struct perf_session *session,
-				 struct perf_tool *tool)
+int perf_session__process_events(struct perf_session *session)
 {
 	u64 size = perf_data_file__size(session->file);
 	int err;
@@ -1369,10 +1421,9 @@ int perf_session__process_events(struct perf_session *session,
 	if (!perf_data_file__is_pipe(session->file))
 		err = __perf_session__process_events(session,
 						     session->header.data_offset,
-						     session->header.data_size,
-						     size, tool);
+						     session->header.data_size, size);
 	else
-		err = __perf_session__process_pipe_events(session, tool);
+		err = __perf_session__process_pipe_events(session);
 
 	return err;
 }
@@ -1415,6 +1466,9 @@ int maps__set_kallsyms_ref_reloc_sym(struct map **maps,
 
 	for (i = 0; i < MAP__NR_TYPES; ++i) {
 		struct kmap *kmap = map__kmap(maps[i]);
+
+		if (!kmap)
+			continue;
 		kmap->ref_reloc_sym = ref;
 	}
 
@@ -1436,7 +1490,7 @@ size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp)
 {
 	size_t ret = fprintf(fp, "Aggregated stats:\n");
 
-	ret += events_stats__fprintf(&session->stats, fp);
+	ret += events_stats__fprintf(&session->evlist->stats, fp);
 	return ret;
 }
 
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 6d663dc76404..d5fa7b7916ef 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -20,13 +20,13 @@ struct perf_session {
 	struct machines		machines;
 	struct perf_evlist	*evlist;
 	struct trace_event	tevent;
-	struct events_stats	stats;
 	bool			repipe;
 	bool			one_mmap;
 	void			*one_mmap_addr;
 	u64			one_mmap_offset;
 	struct ordered_events	ordered_events;
 	struct perf_data_file	*file;
+	struct perf_tool	*tool;
 };
 
 #define PRINT_IP_OPT_IP		(1<<0)
@@ -49,20 +49,13 @@ int perf_session__peek_event(struct perf_session *session, off_t file_offset,
 			     union perf_event **event_ptr,
 			     struct perf_sample *sample);
 
-int perf_session__process_events(struct perf_session *session,
-				 struct perf_tool *tool);
+int perf_session__process_events(struct perf_session *session);
 
-int perf_session_queue_event(struct perf_session *s, union perf_event *event,
-			     struct perf_tool *tool, struct perf_sample *sample,
-			     u64 file_offset);
+int perf_session__queue_event(struct perf_session *s, union perf_event *event,
+			      struct perf_sample *sample, u64 file_offset);
 
 void perf_tool__fill_defaults(struct perf_tool *tool);
 
-int perf_session__deliver_event(struct perf_session *session,
-				union perf_event *event,
-				struct perf_sample *sample,
-				struct perf_tool *tool, u64 file_offset);
-
 int perf_session__resolve_callchain(struct perf_session *session,
 				    struct perf_evsel *evsel,
 				    struct thread *thread,
@@ -126,8 +119,7 @@ extern volatile int session_done;
 
 int perf_session__deliver_synth_event(struct perf_session *session,
 				      union perf_event *event,
-				      struct perf_sample *sample,
-				      struct perf_tool *tool);
+				      struct perf_sample *sample);
 
 int perf_event__process_id_index(struct perf_tool *tool,
 				 union perf_event *event,
diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py
index d0aee4b9dfd4..1833103768cb 100644
--- a/tools/perf/util/setup.py
+++ b/tools/perf/util/setup.py
@@ -25,7 +25,7 @@ cflags += ['-fno-strict-aliasing', '-Wno-write-strings', '-Wno-unused-parameter'
 build_lib = getenv('PYTHON_EXTBUILD_LIB')
 build_tmp = getenv('PYTHON_EXTBUILD_TMP')
 libtraceevent = getenv('LIBTRACEEVENT')
-libapikfs = getenv('LIBAPIKFS')
+libapikfs = getenv('LIBAPI')
 
 ext_sources = [f.strip() for f in file('util/python-ext-sources')
 				if len(f.strip()) > 0 and f[0] != '#']
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 7a39c1ed8d37..4593f36ecc4c 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -1463,6 +1463,15 @@ int sort_dimension__add(const char *tok)
 			sort__has_parent = 1;
 		} else if (sd->entry == &sort_sym) {
 			sort__has_sym = 1;
+			/*
+			 * perf diff displays the performance difference amongst
+			 * two or more perf.data files. Those files could come
+			 * from different binaries. So we should not compare
+			 * their ips, but the name of symbol.
+			 */
+			if (sort__mode == SORT_MODE__DIFF)
+				sd->entry->se_collapse = sort__sym_sort;
+
 		} else if (sd->entry == &sort_dso) {
 			sort__has_dso = 1;
 		}
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index c03e4ff8beff..846036a921dc 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -44,6 +44,7 @@ extern struct sort_entry sort_dso_to;
 extern struct sort_entry sort_sym_from;
 extern struct sort_entry sort_sym_to;
 extern enum sort_type sort__first_dimension;
+extern const char default_mem_sort_order[];
 
 struct he_stat {
 	u64			period;
@@ -102,7 +103,6 @@ struct hist_entry {
 
 	bool			init_have_children;
 	char			level;
-	bool			used;
 	u8			filtered;
 	char			*srcline;
 	struct symbol		*parent;
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index 33b7a2aef713..a7ab6063e038 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -74,6 +74,10 @@ static inline uint8_t elf_sym__type(const GElf_Sym *sym)
 	return GELF_ST_TYPE(sym->st_info);
 }
 
+#ifndef STT_GNU_IFUNC
+#define STT_GNU_IFUNC 10
+#endif
+
 static inline int elf_sym__is_function(const GElf_Sym *sym)
 {
 	return (elf_sym__type(sym) == STT_FUNC ||
@@ -575,32 +579,37 @@ static int dso__swap_init(struct dso *dso, unsigned char eidata)
 static int decompress_kmodule(struct dso *dso, const char *name,
 			      enum dso_binary_type type)
 {
-	int fd;
-	const char *ext = strrchr(name, '.');
+	int fd = -1;
 	char tmpbuf[] = "/tmp/perf-kmod-XXXXXX";
+	struct kmod_path m;
 
 	if (type != DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE_COMP &&
 	    type != DSO_BINARY_TYPE__GUEST_KMODULE_COMP &&
 	    type != DSO_BINARY_TYPE__BUILD_ID_CACHE)
 		return -1;
 
-	if (!ext || !is_supported_compression(ext + 1)) {
-		ext = strrchr(dso->name, '.');
-		if (!ext || !is_supported_compression(ext + 1))
-			return -1;
-	}
+	if (type == DSO_BINARY_TYPE__BUILD_ID_CACHE)
+		name = dso->long_name;
 
-	fd = mkstemp(tmpbuf);
-	if (fd < 0)
+	if (kmod_path__parse_ext(&m, name) || !m.comp)
 		return -1;
 
-	if (!decompress_to_file(ext + 1, name, fd)) {
+	fd = mkstemp(tmpbuf);
+	if (fd < 0) {
+		dso->load_errno = errno;
+		goto out;
+	}
+
+	if (!decompress_to_file(m.ext, name, fd)) {
+		dso->load_errno = DSO_LOAD_ERRNO__DECOMPRESSION_FAILURE;
 		close(fd);
 		fd = -1;
 	}
 
 	unlink(tmpbuf);
 
+out:
+	free(m.ext);
 	return fd;
 }
 
@@ -629,37 +638,49 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name,
 	Elf *elf;
 	int fd;
 
-	if (dso__needs_decompress(dso))
+	if (dso__needs_decompress(dso)) {
 		fd = decompress_kmodule(dso, name, type);
-	else
+		if (fd < 0)
+			return -1;
+	} else {
 		fd = open(name, O_RDONLY);
-
-	if (fd < 0)
-		return -1;
+		if (fd < 0) {
+			dso->load_errno = errno;
+			return -1;
+		}
+	}
 
 	elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL);
 	if (elf == NULL) {
 		pr_debug("%s: cannot read %s ELF file.\n", __func__, name);
+		dso->load_errno = DSO_LOAD_ERRNO__INVALID_ELF;
 		goto out_close;
 	}
 
 	if (gelf_getehdr(elf, &ehdr) == NULL) {
+		dso->load_errno = DSO_LOAD_ERRNO__INVALID_ELF;
 		pr_debug("%s: cannot get elf header.\n", __func__);
 		goto out_elf_end;
 	}
 
-	if (dso__swap_init(dso, ehdr.e_ident[EI_DATA]))
+	if (dso__swap_init(dso, ehdr.e_ident[EI_DATA])) {
+		dso->load_errno = DSO_LOAD_ERRNO__INTERNAL_ERROR;
 		goto out_elf_end;
+	}
 
 	/* Always reject images with a mismatched build-id: */
 	if (dso->has_build_id) {
 		u8 build_id[BUILD_ID_SIZE];
 
-		if (elf_read_build_id(elf, build_id, BUILD_ID_SIZE) < 0)
+		if (elf_read_build_id(elf, build_id, BUILD_ID_SIZE) < 0) {
+			dso->load_errno = DSO_LOAD_ERRNO__CANNOT_READ_BUILDID;
 			goto out_elf_end;
+		}
 
-		if (!dso__build_id_equal(dso, build_id))
+		if (!dso__build_id_equal(dso, build_id)) {
+			dso->load_errno = DSO_LOAD_ERRNO__MISMATCHING_BUILDID;
 			goto out_elf_end;
+		}
 	}
 
 	ss->is_64_bit = (gelf_getclass(elf) == ELFCLASS64);
@@ -695,8 +716,10 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name,
 	}
 
 	ss->name   = strdup(name);
-	if (!ss->name)
+	if (!ss->name) {
+		dso->load_errno = errno;
 		goto out_elf_end;
+	}
 
 	ss->elf    = elf;
 	ss->fd     = fd;
@@ -753,6 +776,7 @@ int dso__load_sym(struct dso *dso, struct map *map,
 		  symbol_filter_t filter, int kmodule)
 {
 	struct kmap *kmap = dso->kernel ? map__kmap(map) : NULL;
+	struct map_groups *kmaps = kmap ? map__kmaps(map) : NULL;
 	struct map *curr_map = map;
 	struct dso *curr_dso = dso;
 	Elf_Data *symstrs, *secstrs;
@@ -768,6 +792,9 @@ int dso__load_sym(struct dso *dso, struct map *map,
 	int nr = 0;
 	bool remap_kernel = false, adjust_kernel_syms = false;
 
+	if (kmap && !kmaps)
+		return -1;
+
 	dso->symtab_type = syms_ss->type;
 	dso->is_64_bit = syms_ss->is_64_bit;
 	dso->rel = syms_ss->ehdr.e_type == ET_REL;
@@ -864,10 +891,9 @@ int dso__load_sym(struct dso *dso, struct map *map,
 		/* Reject ARM ELF "mapping symbols": these aren't unique and
 		 * don't identify functions, so will confuse the profile
 		 * output: */
-		if (ehdr.e_machine == EM_ARM) {
-			if (!strcmp(elf_name, "$a") ||
-			    !strcmp(elf_name, "$d") ||
-			    !strcmp(elf_name, "$t"))
+		if (ehdr.e_machine == EM_ARM || ehdr.e_machine == EM_AARCH64) {
+			if (elf_name[0] == '$' && strchr("adtx", elf_name[1])
+			    && (elf_name[2] == '\0' || elf_name[2] == '.'))
 				continue;
 		}
 
@@ -936,8 +962,10 @@ int dso__load_sym(struct dso *dso, struct map *map,
 					map->map_ip = map__map_ip;
 					map->unmap_ip = map__unmap_ip;
 					/* Ensure maps are correctly ordered */
-					map_groups__remove(kmap->kmaps, map);
-					map_groups__insert(kmap->kmaps, map);
+					if (kmaps) {
+						map_groups__remove(kmaps, map);
+						map_groups__insert(kmaps, map);
+					}
 				}
 
 				/*
@@ -961,7 +989,7 @@ int dso__load_sym(struct dso *dso, struct map *map,
 			snprintf(dso_name, sizeof(dso_name),
 				 "%s%s", dso->short_name, section_name);
 
-			curr_map = map_groups__find_by_name(kmap->kmaps, map->type, dso_name);
+			curr_map = map_groups__find_by_name(kmaps, map->type, dso_name);
 			if (curr_map == NULL) {
 				u64 start = sym.st_value;
 
@@ -991,7 +1019,7 @@ int dso__load_sym(struct dso *dso, struct map *map,
 					curr_map->unmap_ip = identity__map_ip;
 				}
 				curr_dso->symtab_type = dso->symtab_type;
-				map_groups__insert(kmap->kmaps, curr_map);
+				map_groups__insert(kmaps, curr_map);
 				/*
 				 * The new DSO should go to the kernel DSOS
 				 */
@@ -1045,14 +1073,15 @@ new_symbol:
 	 * For misannotated, zeroed, ASM function sizes.
 	 */
 	if (nr > 0) {
-		symbols__fixup_duplicate(&dso->symbols[map->type]);
+		if (!symbol_conf.allow_aliases)
+			symbols__fixup_duplicate(&dso->symbols[map->type]);
 		symbols__fixup_end(&dso->symbols[map->type]);
 		if (kmap) {
 			/*
 			 * We need to fixup this here too because we create new
 			 * maps here, for things like vsyscall sections.
 			 */
-			__map_groups__fixup_end(kmap->kmaps, map->type);
+			__map_groups__fixup_end(kmaps, map->type);
 		}
 	}
 	err = nr;
diff --git a/tools/perf/util/symbol-minimal.c b/tools/perf/util/symbol-minimal.c
index d7efb03b3f9a..fd8477cacf88 100644
--- a/tools/perf/util/symbol-minimal.c
+++ b/tools/perf/util/symbol-minimal.c
@@ -246,13 +246,12 @@ out:
 	return ret;
 }
 
-int symsrc__init(struct symsrc *ss, struct dso *dso __maybe_unused,
-		 const char *name,
+int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name,
 	         enum dso_binary_type type)
 {
 	int fd = open(name, O_RDONLY);
 	if (fd < 0)
-		return -1;
+		goto out_errno;
 
 	ss->name = strdup(name);
 	if (!ss->name)
@@ -264,6 +263,8 @@ int symsrc__init(struct symsrc *ss, struct dso *dso __maybe_unused,
 	return 0;
 out_close:
 	close(fd);
+out_errno:
+	dso->load_errno = errno;
 	return -1;
 }
 
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index a69066865a55..201f6c4ca738 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -15,6 +15,7 @@
 #include "machine.h"
 #include "symbol.h"
 #include "strlist.h"
+#include "intlist.h"
 #include "header.h"
 
 #include <elf.h>
@@ -629,13 +630,16 @@ static int dso__load_all_kallsyms(struct dso *dso, const char *filename,
 static int dso__split_kallsyms_for_kcore(struct dso *dso, struct map *map,
 					 symbol_filter_t filter)
 {
-	struct map_groups *kmaps = map__kmap(map)->kmaps;
+	struct map_groups *kmaps = map__kmaps(map);
 	struct map *curr_map;
 	struct symbol *pos;
 	int count = 0, moved = 0;
 	struct rb_root *root = &dso->symbols[map->type];
 	struct rb_node *next = rb_first(root);
 
+	if (!kmaps)
+		return -1;
+
 	while (next) {
 		char *module;
 
@@ -681,8 +685,8 @@ static int dso__split_kallsyms_for_kcore(struct dso *dso, struct map *map,
 static int dso__split_kallsyms(struct dso *dso, struct map *map, u64 delta,
 			       symbol_filter_t filter)
 {
-	struct map_groups *kmaps = map__kmap(map)->kmaps;
-	struct machine *machine = kmaps->machine;
+	struct map_groups *kmaps = map__kmaps(map);
+	struct machine *machine;
 	struct map *curr_map = map;
 	struct symbol *pos;
 	int count = 0, moved = 0;
@@ -690,6 +694,11 @@ static int dso__split_kallsyms(struct dso *dso, struct map *map, u64 delta,
 	struct rb_node *next = rb_first(root);
 	int kernel_range = 0;
 
+	if (!kmaps)
+		return -1;
+
+	machine = kmaps->machine;
+
 	while (next) {
 		char *module;
 
@@ -1024,9 +1033,12 @@ static bool filename_from_kallsyms_filename(char *filename,
 static int validate_kcore_modules(const char *kallsyms_filename,
 				  struct map *map)
 {
-	struct map_groups *kmaps = map__kmap(map)->kmaps;
+	struct map_groups *kmaps = map__kmaps(map);
 	char modules_filename[PATH_MAX];
 
+	if (!kmaps)
+		return -EINVAL;
+
 	if (!filename_from_kallsyms_filename(modules_filename, "modules",
 					     kallsyms_filename))
 		return -EINVAL;
@@ -1042,6 +1054,9 @@ static int validate_kcore_addresses(const char *kallsyms_filename,
 {
 	struct kmap *kmap = map__kmap(map);
 
+	if (!kmap)
+		return -EINVAL;
+
 	if (kmap->ref_reloc_sym && kmap->ref_reloc_sym->name) {
 		u64 start;
 
@@ -1080,8 +1095,8 @@ static int kcore_mapfn(u64 start, u64 len, u64 pgoff, void *data)
 static int dso__load_kcore(struct dso *dso, struct map *map,
 			   const char *kallsyms_filename)
 {
-	struct map_groups *kmaps = map__kmap(map)->kmaps;
-	struct machine *machine = kmaps->machine;
+	struct map_groups *kmaps = map__kmaps(map);
+	struct machine *machine;
 	struct kcore_mapfn_data md;
 	struct map *old_map, *new_map, *replacement_map = NULL;
 	bool is_64_bit;
@@ -1089,6 +1104,11 @@ static int dso__load_kcore(struct dso *dso, struct map *map,
 	char kcore_filename[PATH_MAX];
 	struct symbol *sym;
 
+	if (!kmaps)
+		return -EINVAL;
+
+	machine = kmaps->machine;
+
 	/* This function requires that the map is the kernel map */
 	if (map != machine->vmlinux_maps[map->type])
 		return -EINVAL;
@@ -1201,6 +1221,9 @@ static int kallsyms__delta(struct map *map, const char *filename, u64 *delta)
 	struct kmap *kmap = map__kmap(map);
 	u64 addr;
 
+	if (!kmap)
+		return -1;
+
 	if (!kmap->ref_reloc_sym || !kmap->ref_reloc_sym->name)
 		return 0;
 
@@ -1859,6 +1882,20 @@ int setup_list(struct strlist **list, const char *list_str,
 	return 0;
 }
 
+int setup_intlist(struct intlist **list, const char *list_str,
+		  const char *list_name)
+{
+	if (list_str == NULL)
+		return 0;
+
+	*list = intlist__new(list_str);
+	if (!*list) {
+		pr_err("problems parsing %s list\n", list_name);
+		return -1;
+	}
+	return 0;
+}
+
 static bool symbol__read_kptr_restrict(void)
 {
 	bool value = false;
@@ -1909,9 +1946,17 @@ int symbol__init(struct perf_session_env *env)
 		       symbol_conf.comm_list_str, "comm") < 0)
 		goto out_free_dso_list;
 
+	if (setup_intlist(&symbol_conf.pid_list,
+		       symbol_conf.pid_list_str, "pid") < 0)
+		goto out_free_comm_list;
+
+	if (setup_intlist(&symbol_conf.tid_list,
+		       symbol_conf.tid_list_str, "tid") < 0)
+		goto out_free_pid_list;
+
 	if (setup_list(&symbol_conf.sym_list,
 		       symbol_conf.sym_list_str, "symbol") < 0)
-		goto out_free_comm_list;
+		goto out_free_tid_list;
 
 	/*
 	 * A path to symbols of "/" is identical to ""
@@ -1930,6 +1975,10 @@ int symbol__init(struct perf_session_env *env)
 	symbol_conf.initialized = true;
 	return 0;
 
+out_free_tid_list:
+	intlist__delete(symbol_conf.tid_list);
+out_free_pid_list:
+	intlist__delete(symbol_conf.pid_list);
 out_free_comm_list:
 	strlist__delete(symbol_conf.comm_list);
 out_free_dso_list:
@@ -1944,6 +1993,8 @@ void symbol__exit(void)
 	strlist__delete(symbol_conf.sym_list);
 	strlist__delete(symbol_conf.dso_list);
 	strlist__delete(symbol_conf.comm_list);
+	intlist__delete(symbol_conf.tid_list);
+	intlist__delete(symbol_conf.pid_list);
 	vmlinux_path__exit();
 	symbol_conf.sym_list = symbol_conf.dso_list = symbol_conf.comm_list = NULL;
 	symbol_conf.initialized = false;
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 1650dcb3a67b..09561500164a 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -78,6 +78,7 @@ static inline size_t symbol__size(const struct symbol *sym)
 }
 
 struct strlist;
+struct intlist;
 
 struct symbol_conf {
 	unsigned short	priv_size;
@@ -87,6 +88,7 @@ struct symbol_conf {
 			ignore_vmlinux_buildid,
 			show_kernel_path,
 			use_modules,
+			allow_aliases,
 			sort_by_name,
 			show_nr_samples,
 			show_total_period,
@@ -114,6 +116,8 @@ struct symbol_conf {
 	const char	*guestmount;
 	const char	*dso_list_str,
 			*comm_list_str,
+			*pid_list_str,
+			*tid_list_str,
 			*sym_list_str,
 			*col_width_list_str;
        struct strlist	*dso_list,
@@ -123,6 +127,8 @@ struct symbol_conf {
 			*dso_to_list,
 			*sym_from_list,
 			*sym_to_list;
+	struct intlist	*pid_list,
+			*tid_list;
 	const char	*symfs;
 };
 
@@ -294,5 +300,7 @@ int compare_proc_modules(const char *from, const char *to);
 
 int setup_list(struct strlist **list, const char *list_str,
 	       const char *list_name);
+int setup_intlist(struct intlist **list, const char *list_str,
+		  const char *list_name);
 
 #endif /* __PERF_SYMBOL */
diff --git a/tools/perf/util/target.c b/tools/perf/util/target.c
index e74c5963dc7a..a53603b27e52 100644
--- a/tools/perf/util/target.c
+++ b/tools/perf/util/target.c
@@ -123,11 +123,8 @@ int target__strerror(struct target *target, int errnum,
 	if (errnum >= 0) {
 		const char *err = strerror_r(errnum, buf, buflen);
 
-		if (err != buf) {
-			size_t len = strlen(err);
-			memcpy(buf, err, min(buflen - 1, len));
-			*(buf + min(buflen - 1, len)) = '\0';
-		}
+		if (err != buf)
+			scnprintf(buf, buflen, "%s", err);
 
 		return 0;
 	}
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index 9ebc8b1f9be5..1c8fbc9588c5 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -82,6 +82,20 @@ void thread__delete(struct thread *thread)
 	free(thread);
 }
 
+struct thread *thread__get(struct thread *thread)
+{
+	++thread->refcnt;
+	return thread;
+}
+
+void thread__put(struct thread *thread)
+{
+	if (thread && --thread->refcnt == 0) {
+		list_del_init(&thread->node);
+		thread__delete(thread);
+	}
+}
+
 struct comm *thread__comm(const struct thread *thread)
 {
 	if (list_empty(&thread->comm_list))
@@ -192,7 +206,6 @@ int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp)
 		err = thread__set_comm(thread, comm, timestamp);
 		if (err)
 			return err;
-		thread->comm_set = true;
 	}
 
 	thread->ppid = parent->tid;
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index 160fd066a7d1..9b8a54dc34a8 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -7,6 +7,7 @@
 #include <sys/types.h>
 #include "symbol.h"
 #include <strlist.h>
+#include <intlist.h>
 
 struct thread_stack;
 
@@ -20,6 +21,7 @@ struct thread {
 	pid_t			tid;
 	pid_t			ppid;
 	int			cpu;
+	int			refcnt;
 	char			shortname[3];
 	bool			comm_set;
 	bool			dead; /* if set thread has exited */
@@ -37,6 +39,18 @@ struct comm;
 struct thread *thread__new(pid_t pid, pid_t tid);
 int thread__init_map_groups(struct thread *thread, struct machine *machine);
 void thread__delete(struct thread *thread);
+
+struct thread *thread__get(struct thread *thread);
+void thread__put(struct thread *thread);
+
+static inline void __thread__zput(struct thread **thread)
+{
+	thread__put(*thread);
+	*thread = NULL;
+}
+
+#define thread__zput(thread) __thread__zput(&thread)
+
 static inline void thread__exited(struct thread *thread)
 {
 	thread->dead = true;
@@ -87,6 +101,16 @@ static inline bool thread__is_filtered(struct thread *thread)
 		return true;
 	}
 
+	if (symbol_conf.pid_list &&
+	    !intlist__has_entry(symbol_conf.pid_list, thread->pid_)) {
+		return true;
+	}
+
+	if (symbol_conf.tid_list &&
+	    !intlist__has_entry(symbol_conf.tid_list, thread->tid)) {
+		return true;
+	}
+
 	return false;
 }
 
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
index bb2708bbfaca..51d9e56c0f84 100644
--- a/tools/perf/util/tool.h
+++ b/tools/perf/util/tool.h
@@ -10,6 +10,7 @@ struct perf_evsel;
 struct perf_sample;
 struct perf_tool;
 struct machine;
+struct ordered_events;
 
 typedef int (*event_sample)(struct perf_tool *tool, union perf_event *event,
 			    struct perf_sample *sample,
@@ -25,6 +26,9 @@ typedef int (*event_attr_op)(struct perf_tool *tool,
 typedef int (*event_op2)(struct perf_tool *tool, union perf_event *event,
 			 struct perf_session *session);
 
+typedef int (*event_oe)(struct perf_tool *tool, union perf_event *event,
+			struct ordered_events *oe);
+
 struct perf_tool {
 	event_sample	sample,
 			read;
@@ -38,8 +42,8 @@ struct perf_tool {
 			unthrottle;
 	event_attr_op	attr;
 	event_op2	tracing_data;
-	event_op2	finished_round,
-			build_id,
+	event_oe	finished_round;
+	event_op2	build_id,
 			id_index;
 	bool		ordered_events;
 	bool		ordering_requires_timestamps;
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index c36636fd825b..25d6c737be3e 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -112,8 +112,8 @@ unsigned long long read_size(struct event_format *event, void *ptr, int size)
 	return pevent_read_number(event->pevent, ptr, size);
 }
 
-void event_format__print(struct event_format *event,
-			 int cpu, void *data, int size)
+void event_format__fprintf(struct event_format *event,
+			   int cpu, void *data, int size, FILE *fp)
 {
 	struct pevent_record record;
 	struct trace_seq s;
@@ -125,10 +125,16 @@ void event_format__print(struct event_format *event,
 
 	trace_seq_init(&s);
 	pevent_event_info(&s, event, &record);
-	trace_seq_do_printf(&s);
+	trace_seq_do_fprintf(&s, fp);
 	trace_seq_destroy(&s);
 }
 
+void event_format__print(struct event_format *event,
+			 int cpu, void *data, int size)
+{
+	return event_format__fprintf(event, cpu, data, size, stdout);
+}
+
 void parse_proc_kallsyms(struct pevent *pevent,
 			 char *file, unsigned int size __maybe_unused)
 {
diff --git a/tools/perf/util/trace-event-scripting.c b/tools/perf/util/trace-event-scripting.c
index 5c9bdd1591a9..9df61059a85d 100644
--- a/tools/perf/util/trace-event-scripting.c
+++ b/tools/perf/util/trace-event-scripting.c
@@ -43,7 +43,6 @@ static int stop_script_unsupported(void)
 static void process_event_unsupported(union perf_event *event __maybe_unused,
 				      struct perf_sample *sample __maybe_unused,
 				      struct perf_evsel *evsel __maybe_unused,
-				      struct thread *thread __maybe_unused,
 				      struct addr_location *al __maybe_unused)
 {
 }
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index 52aaa19e1eb1..d5168f0be4ec 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -23,6 +23,9 @@ trace_event__tp_format(const char *sys, const char *name);
 
 int bigendian(void);
 
+void event_format__fprintf(struct event_format *event,
+			   int cpu, void *data, int size, FILE *fp);
+
 void event_format__print(struct event_format *event,
 			 int cpu, void *data, int size);
 
@@ -69,8 +72,7 @@ struct scripting_ops {
 	void (*process_event) (union perf_event *event,
 			       struct perf_sample *sample,
 			       struct perf_evsel *evsel,
-			       struct thread *thread,
-				   struct addr_location *al);
+			       struct addr_location *al);
 	int (*generate_script) (struct pevent *pevent, const char *outfile);
 };
 
diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c
index e3c40a520a25..7b09a443a280 100644
--- a/tools/perf/util/unwind-libunwind.c
+++ b/tools/perf/util/unwind-libunwind.c
@@ -266,7 +266,7 @@ static int read_unwind_spec_eh_frame(struct dso *dso, struct machine *machine,
 				     u64 *fde_count)
 {
 	int ret = -EINVAL, fd;
-	u64 offset = dso->data.frame_offset;
+	u64 offset = dso->data.eh_frame_hdr_offset;
 
 	if (offset == 0) {
 		fd = dso__data_fd(dso, machine);
@@ -275,7 +275,7 @@ static int read_unwind_spec_eh_frame(struct dso *dso, struct machine *machine,
 
 		/* Check the .eh_frame section for unwinding info */
 		offset = elf_section_offset(fd, ".eh_frame_hdr");
-		dso->data.frame_offset = offset;
+		dso->data.eh_frame_hdr_offset = offset;
 	}
 
 	if (offset)
@@ -291,7 +291,7 @@ static int read_unwind_spec_debug_frame(struct dso *dso,
 					struct machine *machine, u64 *offset)
 {
 	int fd;
-	u64 ofs = dso->data.frame_offset;
+	u64 ofs = dso->data.debug_frame_offset;
 
 	if (ofs == 0) {
 		fd = dso__data_fd(dso, machine);
@@ -300,7 +300,7 @@ static int read_unwind_spec_debug_frame(struct dso *dso,
 
 		/* Check the .debug_frame section for unwinding info */
 		ofs = elf_section_offset(fd, ".debug_frame");
-		dso->data.frame_offset = ofs;
+		dso->data.debug_frame_offset = ofs;
 	}
 
 	*offset = ofs;
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index b86744f29eef..4ee6d0d4c993 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -269,6 +269,13 @@ void dump_stack(void)
 void dump_stack(void) {}
 #endif
 
+void sighandler_dump_stack(int sig)
+{
+	psignal(sig, "perf");
+	dump_stack();
+	exit(sig);
+}
+
 void get_term_dimensions(struct winsize *ws)
 {
 	char *s = getenv("LINES");
@@ -303,13 +310,26 @@ void set_term_quiet_input(struct termios *old)
 	tcsetattr(0, TCSANOW, &tc);
 }
 
-static void set_tracing_events_path(const char *mountpoint)
+static void set_tracing_events_path(const char *tracing, const char *mountpoint)
 {
-	snprintf(tracing_events_path, sizeof(tracing_events_path), "%s/%s",
-		 mountpoint, "tracing/events");
+	snprintf(tracing_events_path, sizeof(tracing_events_path), "%s/%s%s",
+		 mountpoint, tracing, "events");
 }
 
-const char *perf_debugfs_mount(const char *mountpoint)
+static const char *__perf_tracefs_mount(const char *mountpoint)
+{
+	const char *mnt;
+
+	mnt = tracefs_mount(mountpoint);
+	if (!mnt)
+		return NULL;
+
+	set_tracing_events_path("", mnt);
+
+	return mnt;
+}
+
+static const char *__perf_debugfs_mount(const char *mountpoint)
 {
 	const char *mnt;
 
@@ -317,7 +337,20 @@ const char *perf_debugfs_mount(const char *mountpoint)
 	if (!mnt)
 		return NULL;
 
-	set_tracing_events_path(mnt);
+	set_tracing_events_path("tracing/", mnt);
+
+	return mnt;
+}
+
+const char *perf_debugfs_mount(const char *mountpoint)
+{
+	const char *mnt;
+
+	mnt = __perf_tracefs_mount(mountpoint);
+	if (mnt)
+		return mnt;
+
+	mnt = __perf_debugfs_mount(mountpoint);
 
 	return mnt;
 }
@@ -325,12 +358,19 @@ const char *perf_debugfs_mount(const char *mountpoint)
 void perf_debugfs_set_path(const char *mntpt)
 {
 	snprintf(debugfs_mountpoint, strlen(debugfs_mountpoint), "%s", mntpt);
-	set_tracing_events_path(mntpt);
+	set_tracing_events_path("tracing/", mntpt);
+}
+
+static const char *find_tracefs(void)
+{
+	const char *path = __perf_tracefs_mount(NULL);
+
+	return path;
 }
 
 static const char *find_debugfs(void)
 {
-	const char *path = perf_debugfs_mount(NULL);
+	const char *path = __perf_debugfs_mount(NULL);
 
 	if (!path)
 		fprintf(stderr, "Your kernel does not support the debugfs filesystem");
@@ -344,6 +384,7 @@ static const char *find_debugfs(void)
  */
 const char *find_tracing_dir(void)
 {
+	const char *tracing_dir = "";
 	static char *tracing;
 	static int tracing_found;
 	const char *debugfs;
@@ -351,11 +392,15 @@ const char *find_tracing_dir(void)
 	if (tracing_found)
 		return tracing;
 
-	debugfs = find_debugfs();
-	if (!debugfs)
-		return NULL;
+	debugfs = find_tracefs();
+	if (!debugfs) {
+		tracing_dir = "/tracing";
+		debugfs = find_debugfs();
+		if (!debugfs)
+			return NULL;
+	}
 
-	if (asprintf(&tracing, "%s/tracing", debugfs) < 0)
+	if (asprintf(&tracing, "%s%s", debugfs, tracing_dir) < 0)
 		return NULL;
 
 	tracing_found = 1;
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 027a5153495c..1ff23e04ad27 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -75,6 +75,7 @@
 #include <linux/types.h>
 #include <sys/ttydefaults.h>
 #include <api/fs/debugfs.h>
+#include <api/fs/tracefs.h>
 #include <termios.h>
 #include <linux/bitops.h>
 #include <termios.h>
@@ -276,6 +277,7 @@ char *ltrim(char *s);
 char *rtrim(char *s);
 
 void dump_stack(void);
+void sighandler_dump_stack(int sig);
 
 extern unsigned int page_size;
 extern int cacheline_size;
@@ -327,4 +329,8 @@ bool find_process(const char *name);
 int gzip_decompress_to_file(const char *input, int output_fd);
 #endif
 
+#ifdef HAVE_LZMA_SUPPORT
+int lzma_decompress_to_file(const char *input, int output_fd);
+#endif
+
 #endif /* GIT_COMPAT_UTIL_H */