diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index 7fc70ae21185..dc7d0a95bd36 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -648,7 +648,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) * Per-cpu breakpoints are not supported by our stepping * mechanism. */ - if (!bp->hw.bp_target) + if (!bp->hw.target) return -EINVAL; /* diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 98bbe06e469c..e7d934d3afe0 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -527,7 +527,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) * Disallow per-task kernel breakpoints since these would * complicate the stepping code. */ - if (info->ctrl.privilege == AARCH64_BREAKPOINT_EL1 && bp->hw.bp_target) + if (info->ctrl.privilege == AARCH64_BREAKPOINT_EL1 && bp->hw.target) return -EINVAL; return 0; diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 7c4f6690533a..7fd60dcb2cb0 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -124,7 +124,7 @@ static unsigned long ebb_switch_in(bool ebb, struct cpu_hw_events *cpuhw) static inline void power_pmu_bhrb_enable(struct perf_event *event) {} static inline void power_pmu_bhrb_disable(struct perf_event *event) {} -static void power_pmu_flush_branch_stack(void) {} +static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) {} static inline void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) {} static void pmao_restore_workaround(bool ebb) { } #endif /* CONFIG_PPC32 */ @@ -350,6 +350,7 @@ static void power_pmu_bhrb_enable(struct perf_event *event) cpuhw->bhrb_context = event->ctx; } cpuhw->bhrb_users++; + perf_sched_cb_inc(event->ctx->pmu); } static void power_pmu_bhrb_disable(struct perf_event *event) @@ -361,6 +362,7 @@ static void power_pmu_bhrb_disable(struct perf_event *event) 
cpuhw->bhrb_users--; WARN_ON_ONCE(cpuhw->bhrb_users < 0); + perf_sched_cb_dec(event->ctx->pmu); if (!cpuhw->disabled && !cpuhw->bhrb_users) { /* BHRB cannot be turned off when other @@ -375,9 +377,12 @@ static void power_pmu_bhrb_disable(struct perf_event *event) /* Called from ctxsw to prevent one process's branch entries to * mingle with the other process's entries during context switch. */ -static void power_pmu_flush_branch_stack(void) +static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) { - if (ppmu->bhrb_nr) + if (!ppmu->bhrb_nr) + return; + + if (sched_in) power_pmu_bhrb_reset(); } /* Calculate the to address for a branch */ @@ -1901,7 +1906,7 @@ static struct pmu power_pmu = { .cancel_txn = power_pmu_cancel_txn, .commit_txn = power_pmu_commit_txn, .event_idx = power_pmu_event_idx, - .flush_branch_stack = power_pmu_flush_branch_stack, + .sched_task = power_pmu_sched_task, }; /* diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 854c04b3c9c2..7ee9b94d9921 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -12,7 +12,7 @@ #include #endif -#define NCAPINTS 11 /* N 32-bit words worth of info */ +#define NCAPINTS 13 /* N 32-bit words worth of info */ #define NBUGINTS 1 /* N 32-bit bug flags */ /* @@ -195,6 +195,7 @@ #define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */ #define X86_FEATURE_HWP_EPP ( 7*32+13) /* Intel HWP_EPP */ #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ +#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ @@ -226,6 +227,7 @@ #define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ #define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ #define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ +#define X86_FEATURE_CQM ( 9*32+12) 
/* Cache QoS Monitoring */ #define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ @@ -244,6 +246,12 @@ #define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ #define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ +#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ + +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ +#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ + /* * BUG word(s) */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d2203b5d9538..23ba6765b718 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -109,6 +109,9 @@ struct cpuinfo_x86 { /* in KB - valid for CPUS which support this call: */ int x86_cache_size; int x86_cache_alignment; /* In bytes */ + /* Cache QoS architectural values: */ + int x86_cache_max_rmid; /* max index */ + int x86_cache_occ_scale; /* scale to bytes */ int x86_power; unsigned long loops_per_jiffy; /* cpuid returned max cores value: */ diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index 3ce079136c11..1a4eae695ca8 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -74,6 +74,24 @@ #define MSR_IA32_PERF_CAPABILITIES 0x00000345 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 +#define MSR_IA32_RTIT_CTL 0x00000570 +#define RTIT_CTL_TRACEEN BIT(0) +#define RTIT_CTL_OS BIT(2) +#define RTIT_CTL_USR BIT(3) +#define RTIT_CTL_CR3EN BIT(7) +#define RTIT_CTL_TOPA BIT(8) +#define RTIT_CTL_TSC_EN BIT(10) +#define RTIT_CTL_DISRETC BIT(11) +#define RTIT_CTL_BRANCH_EN BIT(13) +#define MSR_IA32_RTIT_STATUS 0x00000571 +#define RTIT_STATUS_CONTEXTEN BIT(1) +#define RTIT_STATUS_TRIGGEREN 
BIT(2) +#define RTIT_STATUS_ERROR BIT(4) +#define RTIT_STATUS_STOPPED BIT(5) +#define MSR_IA32_RTIT_CR3_MATCH 0x00000572 +#define MSR_IA32_RTIT_OUTPUT_BASE 0x00000560 +#define MSR_IA32_RTIT_OUTPUT_MASK 0x00000561 + #define MSR_MTRRfix64K_00000 0x00000250 #define MSR_MTRRfix16K_80000 0x00000258 #define MSR_MTRRfix16K_A0000 0x00000259 diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 80091ae54c2b..9bff68798836 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -39,7 +39,8 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o perf_event_intel_bts.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ perf_event_intel_uncore_snb.o \ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3f70538012e2..a62cf04dac8a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -646,6 +646,30 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[10] = eax; } + /* Additional Intel-defined flags: level 0x0000000F */ + if (c->cpuid_level >= 0x0000000F) { + u32 eax, ebx, ecx, edx; + + /* QoS sub-leaf, EAX=0Fh, ECX=0 */ + cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx); + c->x86_capability[11] = edx; + if (cpu_has(c, X86_FEATURE_CQM_LLC)) { + /* will be overridden if occupancy monitoring exists */ + c->x86_cache_max_rmid = ebx; + + /* QoS sub-leaf, EAX=0Fh, ECX=1 */ + cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx); + c->x86_capability[12] = edx; + if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) { + c->x86_cache_max_rmid = ecx; + c->x86_cache_occ_scale = ebx; + } + } else { + 
c->x86_cache_max_rmid = -1; + c->x86_cache_occ_scale = -1; + } + } + /* AMD-defined flags: level 0x80000001 */ xlvl = cpuid_eax(0x80000000); c->extended_cpuid_level = xlvl; @@ -834,6 +858,20 @@ static void generic_identify(struct cpuinfo_x86 *c) detect_nopl(c); } +static void x86_init_cache_qos(struct cpuinfo_x86 *c) +{ + /* + * The heavy lifting of max_rmid and cache_occ_scale are handled + * in get_cpu_cap(). Here we just set the max_rmid for the boot_cpu + * in case CQM bits really aren't there in this CPU. + */ + if (c != &boot_cpu_data) { + boot_cpu_data.x86_cache_max_rmid = + min(boot_cpu_data.x86_cache_max_rmid, + c->x86_cache_max_rmid); + } +} + /* * This does the hard work of actually picking apart the CPU stuff... */ @@ -923,6 +961,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) init_hypervisor(c); x86_init_rdrand(c); + x86_init_cache_qos(c); /* * Clear/Set all flags overriden by options, need do it diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h new file mode 100644 index 000000000000..1c338b0eba05 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_pt.h @@ -0,0 +1,131 @@ +/* + * Intel(R) Processor Trace PMU driver for perf + * Copyright (c) 2013-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * Intel PT is specified in the Intel Architecture Instruction Set Extensions + * Programming Reference: + * http://software.intel.com/en-us/intel-isa-extensions + */ + +#ifndef __INTEL_PT_H__ +#define __INTEL_PT_H__ + +/* + * Single-entry ToPA: when this close to region boundary, switch + * buffers to avoid losing data. + */ +#define TOPA_PMI_MARGIN 512 + +/* + * Table of Physical Addresses bits + */ +enum topa_sz { + TOPA_4K = 0, + TOPA_8K, + TOPA_16K, + TOPA_32K, + TOPA_64K, + TOPA_128K, + TOPA_256K, + TOPA_512K, + TOPA_1MB, + TOPA_2MB, + TOPA_4MB, + TOPA_8MB, + TOPA_16MB, + TOPA_32MB, + TOPA_64MB, + TOPA_128MB, + TOPA_SZ_END, +}; + +static inline unsigned int sizes(enum topa_sz tsz) +{ + return 1U << (tsz + 12); /* region size in bytes: 4K << tsz */ +} + +struct topa_entry { + u64 end : 1; + u64 rsvd0 : 1; + u64 intr : 1; + u64 rsvd1 : 1; + u64 stop : 1; + u64 rsvd2 : 1; + u64 size : 4; + u64 rsvd3 : 2; + u64 base : 36; + u64 rsvd4 : 16; +}; + +#define TOPA_SHIFT 12 +#define PT_CPUID_LEAVES 2 + +enum pt_capabilities { + PT_CAP_max_subleaf = 0, + PT_CAP_cr3_filtering, + PT_CAP_topa_output, + PT_CAP_topa_multiple_entries, + PT_CAP_payloads_lip, +}; + +struct pt_pmu { + struct pmu pmu; + u32 caps[4 * PT_CPUID_LEAVES]; +}; + +/** + * struct pt_buffer - buffer configuration; one buffer per task_struct or + * cpu, depending on perf event configuration + * @cpu: cpu for per-cpu allocation + * @tables: list of ToPA tables in this buffer + * @first: shorthand for first topa table + * @last: shorthand for last topa table + * @cur: current topa table + * @nr_pages: buffer size in pages + * @cur_idx: current output region's index within @cur table + * @output_off: offset within the current output region + * @data_size: running total of the amount of data in this buffer + * @lost: if data was lost/truncated + * @head: logical write offset inside the buffer + * @snapshot: if this is for a snapshot/overwrite counter + * @stop_pos: STOP topa entry in the buffer + * @intr_pos: INT topa entry in the buffer + * 
@data_pages: array of pages from perf + * @topa_index: table of topa entries indexed by page offset + */ +struct pt_buffer { + int cpu; + struct list_head tables; + struct topa *first, *last, *cur; + unsigned int cur_idx; + size_t output_off; + unsigned long nr_pages; + local_t data_size; + local_t lost; + local64_t head; + bool snapshot; + unsigned long stop_pos, intr_pos; + void **data_pages; + struct topa_entry *topa_index[0]; +}; + +/** + * struct pt - per-cpu pt context + * @handle: perf output handle + * @handle_nmi: do handle PT PMI on this cpu, there's an active event + */ +struct pt { + struct perf_output_handle handle; + int handle_nmi; +}; + +#endif /* __INTEL_PT_H__ */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e2888a3ad1e3..87848ebe2bb7 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -263,6 +263,14 @@ static void hw_perf_event_destroy(struct perf_event *event) } } +void hw_perf_lbr_event_destroy(struct perf_event *event) +{ + hw_perf_event_destroy(event); + + /* undo the lbr/bts event accounting */ + x86_del_exclusive(x86_lbr_exclusive_lbr); +} + static inline int x86_pmu_initialized(void) { return x86_pmu.handle_irq != NULL; @@ -302,6 +310,35 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) return x86_pmu_extra_regs(val, event); } +/* + * Check if we can create event of a certain type (that no conflicting events + * are present). 
+ */ +int x86_add_exclusive(unsigned int what) +{ + int ret = -EBUSY, i; + + if (atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) + return 0; + + mutex_lock(&pmc_reserve_mutex); + for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) + if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i])) + goto out; + + atomic_inc(&x86_pmu.lbr_exclusive[what]); + ret = 0; + +out: + mutex_unlock(&pmc_reserve_mutex); + return ret; +} + +void x86_del_exclusive(unsigned int what) +{ + atomic_dec(&x86_pmu.lbr_exclusive[what]); +} + int x86_setup_perfctr(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; @@ -346,6 +383,12 @@ int x86_setup_perfctr(struct perf_event *event) /* BTS is currently only allowed for user-mode. */ if (!attr->exclude_kernel) return -EOPNOTSUPP; + + /* disallow bts if conflicting events are present */ + if (x86_add_exclusive(x86_lbr_exclusive_lbr)) + return -EBUSY; + + event->destroy = hw_perf_lbr_event_destroy; } hwc->config |= config; @@ -399,39 +442,41 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip > precise) return -EOPNOTSUPP; - /* - * check that PEBS LBR correction does not conflict with - * whatever the user is asking with attr->branch_sample_type - */ - if (event->attr.precise_ip > 1 && - x86_pmu.intel_cap.pebs_format < 2) { - u64 *br_type = &event->attr.branch_sample_type; + } + /* + * check that PEBS LBR correction does not conflict with + * whatever the user is asking with attr->branch_sample_type + */ + if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) { + u64 *br_type = &event->attr.branch_sample_type; - if (has_branch_stack(event)) { - if (!precise_br_compat(event)) - return -EOPNOTSUPP; + if (has_branch_stack(event)) { + if (!precise_br_compat(event)) + return -EOPNOTSUPP; - /* branch_sample_type is compatible */ + /* branch_sample_type is compatible */ - } else { - /* - * user did not specify branch_sample_type - * - * For PEBS fixups, we capture all - * the branches at 
the priv level of the - * event. - */ - *br_type = PERF_SAMPLE_BRANCH_ANY; + } else { + /* + * user did not specify branch_sample_type + * + * For PEBS fixups, we capture all + * the branches at the priv level of the + * event. + */ + *br_type = PERF_SAMPLE_BRANCH_ANY; - if (!event->attr.exclude_user) - *br_type |= PERF_SAMPLE_BRANCH_USER; + if (!event->attr.exclude_user) + *br_type |= PERF_SAMPLE_BRANCH_USER; - if (!event->attr.exclude_kernel) - *br_type |= PERF_SAMPLE_BRANCH_KERNEL; - } + if (!event->attr.exclude_kernel) + *br_type |= PERF_SAMPLE_BRANCH_KERNEL; } } + if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK) + event->attach_state |= PERF_ATTACH_TASK_DATA; + /* * Generate PMC IRQs: * (keep 'enabled' bit clear for now) @@ -449,6 +494,12 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.type == PERF_TYPE_RAW) event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; + if (event->attr.sample_period && x86_pmu.limit_period) { + if (x86_pmu.limit_period(event, event->attr.sample_period) > + event->attr.sample_period) + return -EINVAL; + } + return x86_setup_perfctr(event); } @@ -728,14 +779,17 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) struct event_constraint *c; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; struct perf_event *e; - int i, wmin, wmax, num = 0; + int i, wmin, wmax, unsched = 0; struct hw_perf_event *hwc; bitmap_zero(used_mask, X86_PMC_IDX_MAX); + if (x86_pmu.start_scheduling) + x86_pmu.start_scheduling(cpuc); + for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { hwc = &cpuc->event_list[i]->hw; - c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); + c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); hwc->constraint = c; wmin = min(wmin, c->weight); @@ -768,24 +822,30 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) /* slow path */ if (i != n) - num = perf_assign_events(cpuc->event_list, n, wmin, - wmax, 
assign); + unsched = perf_assign_events(cpuc->event_list, n, wmin, + wmax, assign); /* - * Mark the event as committed, so we do not put_constraint() - * in case new events are added and fail scheduling. + * In case of success (unsched = 0), mark events as committed, + * so we do not put_constraint() in case new events are added + * and fail to be scheduled + * + * We invoke the lower level commit callback to lock the resource + * + * We do not need to do all of this in case we are called to + * validate an event group (assign == NULL) */ - if (!num && assign) { + if (!unsched && assign) { for (i = 0; i < n; i++) { e = cpuc->event_list[i]; e->hw.flags |= PERF_X86_EVENT_COMMITTED; + if (x86_pmu.commit_scheduling) + x86_pmu.commit_scheduling(cpuc, e, assign[i]); } } - /* - * scheduling failed or is just a simulation, - * free resources if necessary - */ - if (!assign || num) { + + if (!assign || unsched) { + for (i = 0; i < n; i++) { e = cpuc->event_list[i]; /* @@ -795,11 +855,18 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if ((e->hw.flags & PERF_X86_EVENT_COMMITTED)) continue; + /* + * release events that failed scheduling + */ if (x86_pmu.put_event_constraints) x86_pmu.put_event_constraints(cpuc, e); } } - return num ? -EINVAL : 0; + + if (x86_pmu.stop_scheduling) + x86_pmu.stop_scheduling(cpuc); + + return unsched ? 
-EINVAL : 0; } /* @@ -986,6 +1053,9 @@ int x86_perf_event_set_period(struct perf_event *event) if (left > x86_pmu.max_period) left = x86_pmu.max_period; + if (x86_pmu.limit_period) + left = x86_pmu.limit_period(event, left); + per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; /* @@ -1033,7 +1103,6 @@ static int x86_pmu_add(struct perf_event *event, int flags) hwc = &event->hw; - perf_pmu_disable(event->pmu); n0 = cpuc->n_events; ret = n = collect_events(cpuc, event, false); if (ret < 0) @@ -1071,7 +1140,6 @@ done_collect: ret = 0; out: - perf_pmu_enable(event->pmu); return ret; } @@ -1103,7 +1171,7 @@ static void x86_pmu_start(struct perf_event *event, int flags) void perf_event_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; - u64 pebs; + u64 pebs, debugctl; struct cpu_hw_events *cpuc; unsigned long flags; int cpu, idx; @@ -1121,14 +1189,20 @@ void perf_event_print_debug(void) rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); - rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); pr_info("\n"); pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); pr_info("CPU#%d: status: %016llx\n", cpu, status); pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); - pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); + if (x86_pmu.pebs_constraints) { + rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); + pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); + } + if (x86_pmu.lbr_nr) { + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl); + } } pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); @@ -1321,11 +1395,12 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - int ret = NOTIFY_OK; + int i, ret = NOTIFY_OK; switch (action & ~CPU_TASKS_FROZEN) { case 
CPU_UP_PREPARE: - cpuc->kfree_on_online = NULL; + for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) + cpuc->kfree_on_online[i] = NULL; if (x86_pmu.cpu_prepare) ret = x86_pmu.cpu_prepare(cpu); break; @@ -1336,7 +1411,10 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) break; case CPU_ONLINE: - kfree(cpuc->kfree_on_online); + for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) { + kfree(cpuc->kfree_on_online[i]); + cpuc->kfree_on_online[i] = NULL; + } break; case CPU_DYING: @@ -1712,7 +1790,7 @@ static int validate_event(struct perf_event *event) if (IS_ERR(fake_cpuc)) return PTR_ERR(fake_cpuc); - c = x86_pmu.get_event_constraints(fake_cpuc, event); + c = x86_pmu.get_event_constraints(fake_cpuc, -1, event); if (!c || !c->weight) ret = -EINVAL; @@ -1914,10 +1992,10 @@ static const struct attribute_group *x86_pmu_attr_groups[] = { NULL, }; -static void x86_pmu_flush_branch_stack(void) +static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) { - if (x86_pmu.flush_branch_stack) - x86_pmu.flush_branch_stack(); + if (x86_pmu.sched_task) + x86_pmu.sched_task(ctx, sched_in); } void perf_check_microcode(void) @@ -1949,7 +2027,8 @@ static struct pmu pmu = { .commit_txn = x86_pmu_commit_txn, .event_idx = x86_pmu_event_idx, - .flush_branch_stack = x86_pmu_flush_branch_stack, + .sched_task = x86_pmu_sched_task, + .task_ctx_size = sizeof(struct x86_perf_task_context), }; void arch_perf_update_userpage(struct perf_event *event, @@ -1968,13 +2047,23 @@ void arch_perf_update_userpage(struct perf_event *event, data = cyc2ns_read_begin(); + /* + * Internal timekeeping for enabled/running/stopped times + * is always in the local_clock domain. 
+ */ userpg->cap_user_time = 1; userpg->time_mult = data->cyc2ns_mul; userpg->time_shift = data->cyc2ns_shift; userpg->time_offset = data->cyc2ns_offset - now; - userpg->cap_user_time_zero = 1; - userpg->time_zero = data->cyc2ns_offset; + /* + * cap_user_time_zero doesn't make sense when we're using a different + * time base for the records. + */ + if (event->clock == &local_clock) { + userpg->cap_user_time_zero = 1; + userpg->time_zero = data->cyc2ns_offset; + } cyc2ns_read_end(data); } diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index df525d2be1e8..329f0356ad4a 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -71,6 +71,8 @@ struct event_constraint { #define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */ #define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */ #define PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */ +#define PERF_X86_EVENT_EXCL 0x40 /* HT exclusivity on counter */ +#define PERF_X86_EVENT_DYNAMIC 0x80 /* dynamic alloc'd constraint */ -#define PERF_X86_EVENT_RDPMC_ALLOWED 0x40 /* grant rdpmc permission */ +#define PERF_X86_EVENT_RDPMC_ALLOWED 0x100 /* grant rdpmc permission; moved off 0x40, now PERF_X86_EVENT_EXCL */ @@ -123,8 +125,37 @@ struct intel_shared_regs { unsigned core_id; /* per-core: core id */ }; +enum intel_excl_state_type { + INTEL_EXCL_UNUSED = 0, /* counter is unused */ + INTEL_EXCL_SHARED = 1, /* counter can be used by both threads */ + INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */ +}; + +struct intel_excl_states { + enum intel_excl_state_type init_state[X86_PMC_IDX_MAX]; + enum intel_excl_state_type state[X86_PMC_IDX_MAX]; + int num_alloc_cntrs;/* #counters allocated */ + int max_alloc_cntrs;/* max #counters allowed */ + bool sched_started; /* true if scheduling has started */ +}; + +struct intel_excl_cntrs { + raw_spinlock_t lock; + + struct intel_excl_states states[2]; + + int refcnt; /* per-core: #HT threads */ + unsigned core_id; /* per-core: core id */ +}; + #define MAX_LBR_ENTRIES 16 +enum { + 
X86_PERF_KFREE_SHARED = 0, + X86_PERF_KFREE_EXCL = 1, + X86_PERF_KFREE_MAX +}; + struct cpu_hw_events { /* * Generic x86 PMC bits @@ -179,6 +210,12 @@ struct cpu_hw_events { * used on Intel NHM/WSM/SNB */ struct intel_shared_regs *shared_regs; + /* + * manage exclusive counter access between hyperthread + */ + struct event_constraint *constraint_list; /* in enable order */ + struct intel_excl_cntrs *excl_cntrs; + int excl_thread_id; /* 0 or 1 */ /* * AMD specific bits @@ -187,7 +224,7 @@ struct cpu_hw_events { /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ u64 perf_ctr_virt_mask; - void *kfree_on_online; + void *kfree_on_online[X86_PERF_KFREE_MAX]; }; #define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\ @@ -202,6 +239,10 @@ struct cpu_hw_events { #define EVENT_CONSTRAINT(c, n, m) \ __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0) +#define INTEL_EXCLEVT_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\ + 0, PERF_X86_EVENT_EXCL) + /* * The overlap flag marks event constraints with overlapping counter * masks. 
This is the case if the counter mask of such an event is not @@ -259,6 +300,10 @@ struct cpu_hw_events { #define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) +#define INTEL_EXCLUEVT_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ + HWEIGHT(n), 0, PERF_X86_EVENT_EXCL) + #define INTEL_PLD_CONSTRAINT(c, n) \ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) @@ -283,22 +328,40 @@ struct cpu_hw_events { /* Check flags and event code, and set the HSW load flag */ #define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \ - __EVENT_CONSTRAINT(code, n, \ + __EVENT_CONSTRAINT(code, n, \ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) +#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, \ + PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL) + /* Check flags and event code/umask, and set the HSW store flag */ #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \ __EVENT_CONSTRAINT(code, n, \ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) +#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, \ + PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL) + /* Check flags and event code/umask, and set the HSW load flag */ #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \ __EVENT_CONSTRAINT(code, n, \ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) +#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, \ + PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL) + /* Check flags and event code/umask, and set the HSW 
N/A flag */ #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \ __EVENT_CONSTRAINT(code, n, \ @@ -408,6 +471,13 @@ union x86_pmu_config { #define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value +enum { + x86_lbr_exclusive_lbr, + x86_lbr_exclusive_bts, + x86_lbr_exclusive_pt, + x86_lbr_exclusive_max, +}; + /* * struct x86_pmu - generic x86 pmu */ @@ -443,14 +513,25 @@ struct x86_pmu { u64 max_period; struct event_constraint * (*get_event_constraints)(struct cpu_hw_events *cpuc, + int idx, struct perf_event *event); void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); + + void (*commit_scheduling)(struct cpu_hw_events *cpuc, + struct perf_event *event, + int cntr); + + void (*start_scheduling)(struct cpu_hw_events *cpuc); + + void (*stop_scheduling)(struct cpu_hw_events *cpuc); + struct event_constraint *event_constraints; struct x86_pmu_quirk *quirks; int perfctr_second_write; bool late_ack; + unsigned (*limit_period)(struct perf_event *event, unsigned l); /* * sysfs attrs @@ -472,7 +553,8 @@ struct x86_pmu { void (*cpu_dead)(int cpu); void (*check_microcode)(void); - void (*flush_branch_stack)(void); + void (*sched_task)(struct perf_event_context *ctx, + bool sched_in); /* * Intel Arch Perfmon v2+ @@ -503,11 +585,16 @@ struct x86_pmu { const int *lbr_sel_map; /* lbr_select mappings */ bool lbr_double_abort; /* duplicated lbr aborts */ + /* + * Intel PT/LBR/BTS are exclusive + */ + atomic_t lbr_exclusive[x86_lbr_exclusive_max]; + /* * Extra registers for events */ struct extra_reg *extra_regs; - unsigned int er_flags; + unsigned int flags; /* * Intel host/guest support (KVM) @@ -515,6 +602,13 @@ struct x86_pmu { struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); }; +struct x86_perf_task_context { + u64 lbr_from[MAX_LBR_ENTRIES]; + u64 lbr_to[MAX_LBR_ENTRIES]; + int lbr_callstack_users; + int lbr_stack_state; +}; + #define x86_add_quirk(func_) \ do { \ static struct x86_pmu_quirk __quirk __initdata = { 
\ @@ -524,8 +618,13 @@ do { \ x86_pmu.quirks = &__quirk; \ } while (0) -#define ERF_NO_HT_SHARING 1 -#define ERF_HAS_RSP_1 2 +/* + * x86_pmu flags + */ +#define PMU_FL_NO_HT_SHARING 0x1 /* no hyper-threading resource sharing */ +#define PMU_FL_HAS_RSP_1 0x2 /* has 2 equivalent offcore_rsp regs */ +#define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */ +#define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr @@ -546,6 +645,12 @@ static struct perf_pmu_events_attr event_attr_##v = { \ extern struct x86_pmu x86_pmu __read_mostly; +static inline bool x86_pmu_has_lbr_callstack(void) +{ + return x86_pmu.lbr_sel_map && + x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0; +} + DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); int x86_perf_event_set_period(struct perf_event *event); @@ -588,6 +693,12 @@ static inline int x86_pmu_rdpmc_index(int index) return x86_pmu.rdpmc_index ? 
x86_pmu.rdpmc_index(index) : index; } +int x86_add_exclusive(unsigned int what); + +void x86_del_exclusive(unsigned int what); + +void hw_perf_lbr_event_destroy(struct perf_event *event); + int x86_setup_perfctr(struct perf_event *event); int x86_pmu_hw_config(struct perf_event *event); @@ -674,10 +785,34 @@ static inline int amd_pmu_init(void) #ifdef CONFIG_CPU_SUP_INTEL +static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) +{ + /* user explicitly requested branch sampling */ + if (has_branch_stack(event)) + return true; + + /* implicit branch sampling to correct PEBS skid */ + if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 && + x86_pmu.intel_cap.pebs_format < 2) + return true; + + return false; +} + +static inline bool intel_pmu_has_bts(struct perf_event *event) +{ + if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && + !event->attr.freq && event->hw.sample_period == 1) + return true; + + return false; +} + int intel_pmu_save_and_restart(struct perf_event *event); struct event_constraint * -x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event); +x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event); struct intel_shared_regs *allocate_shared_regs(int cpu); @@ -727,13 +862,15 @@ void intel_pmu_pebs_disable_all(void); void intel_ds_init(void); +void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); + void intel_pmu_lbr_reset(void); void intel_pmu_lbr_enable(struct perf_event *event); void intel_pmu_lbr_disable(struct perf_event *event); -void intel_pmu_lbr_enable_all(void); +void intel_pmu_lbr_enable_all(bool pmi); void intel_pmu_lbr_disable_all(void); @@ -747,8 +884,18 @@ void intel_pmu_lbr_init_atom(void); void intel_pmu_lbr_init_snb(void); +void intel_pmu_lbr_init_hsw(void); + int intel_pmu_setup_lbr_filter(struct perf_event *event); +void intel_pt_interrupt(void); + +int intel_bts_interrupt(void); + +void intel_bts_enable_local(void); 
+ +void intel_bts_disable_local(void); + int p4_pmu_init(void); int p6_pmu_init(void); @@ -758,6 +905,10 @@ int knc_pmu_init(void); ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); +static inline int is_ht_workaround_enabled(void) +{ + return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED); +} #else /* CONFIG_CPU_SUP_INTEL */ static inline void reserve_ds_buffers(void) diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 28926311aac1..1cee5d2d7ece 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -382,6 +382,7 @@ static int amd_pmu_cpu_prepare(int cpu) static void amd_pmu_cpu_starting(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED]; struct amd_nb *nb; int i, nb_id; @@ -399,7 +400,7 @@ static void amd_pmu_cpu_starting(int cpu) continue; if (nb->nb_id == nb_id) { - cpuc->kfree_on_online = cpuc->amd_nb; + *onln = cpuc->amd_nb; cpuc->amd_nb = nb; break; } @@ -429,7 +430,8 @@ static void amd_pmu_cpu_dead(int cpu) } static struct event_constraint * -amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) { /* * if not NB event or no NB, then no constraints @@ -537,7 +539,8 @@ static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); static struct event_constraint * -amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event) +amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; unsigned int event_code = amd_get_event_code(hwc); diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 
a61f5c6911da..989d3c215d2b 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -796,7 +796,7 @@ static int setup_ibs_ctl(int ibs_eilvt_off) * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that * is using the new offset. */ -static int force_ibs_eilvt_setup(void) +static void force_ibs_eilvt_setup(void) { int offset; int ret; @@ -811,26 +811,24 @@ static int force_ibs_eilvt_setup(void) if (offset == APIC_EILVT_NR_MAX) { printk(KERN_DEBUG "No EILVT entry available\n"); - return -EBUSY; + return; } ret = setup_ibs_ctl(offset); if (ret) goto out; - if (!ibs_eilvt_valid()) { - ret = -EFAULT; + if (!ibs_eilvt_valid()) goto out; - } pr_info("IBS: LVT offset %d assigned\n", offset); - return 0; + return; out: preempt_disable(); put_eilvt(offset); preempt_enable(); - return ret; + return; } static void ibs_eilvt_setup(void) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 258990688a5e..9da2400c2ec3 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -113,6 +114,12 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly = INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */ INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ + + INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + EVENT_CONSTRAINT_END }; @@ -131,15 +138,12 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly = INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* 
CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ - /* - * Errata BV98 -- MEM_*_RETIRED events can leak between counters of SMT - * siblings; disable these events because they can corrupt unrelated - * counters. - */ - INTEL_EVENT_CONSTRAINT(0xd0, 0x0), /* MEM_UOPS_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xd1, 0x0), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xd2, 0x0), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xd3, 0x0), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + + INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + EVENT_CONSTRAINT_END }; @@ -217,6 +221,21 @@ static struct event_constraint intel_hsw_event_constraints[] = { INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), + + INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_bdw_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ + INTEL_EVENT_CONSTRAINT(0xa3, 0x4), /* CYCLE_ACTIVITY.* */ EVENT_CONSTRAINT_END }; @@ -415,6 +434,202 @@ static __initconst const u64 snb_hw_cache_event_ids }; +/* + * Notes on the events: + * - data reads do not 
include code reads (comparable to earlier tables) + * - data counts include speculative execution (except L1 write, dtlb, bpu) + * - remote node access includes remote memory, remote cache, remote mmio. + * - prefetches are not included in the counts because they are not + * reliably counted. + */ + +#define HSW_DEMAND_DATA_RD BIT_ULL(0) +#define HSW_DEMAND_RFO BIT_ULL(1) +#define HSW_ANY_RESPONSE BIT_ULL(16) +#define HSW_SUPPLIER_NONE BIT_ULL(17) +#define HSW_L3_MISS_LOCAL_DRAM BIT_ULL(22) +#define HSW_L3_MISS_REMOTE_HOP0 BIT_ULL(27) +#define HSW_L3_MISS_REMOTE_HOP1 BIT_ULL(28) +#define HSW_L3_MISS_REMOTE_HOP2P BIT_ULL(29) +#define HSW_L3_MISS (HSW_L3_MISS_LOCAL_DRAM| \ + HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \ + HSW_L3_MISS_REMOTE_HOP2P) +#define HSW_SNOOP_NONE BIT_ULL(31) +#define HSW_SNOOP_NOT_NEEDED BIT_ULL(32) +#define HSW_SNOOP_MISS BIT_ULL(33) +#define HSW_SNOOP_HIT_NO_FWD BIT_ULL(34) +#define HSW_SNOOP_HIT_WITH_FWD BIT_ULL(35) +#define HSW_SNOOP_HITM BIT_ULL(36) +#define HSW_SNOOP_NON_DRAM BIT_ULL(37) +#define HSW_ANY_SNOOP (HSW_SNOOP_NONE| \ + HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \ + HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \ + HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM) +#define HSW_SNOOP_DRAM (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM) +#define HSW_DEMAND_READ HSW_DEMAND_DATA_RD +#define HSW_DEMAND_WRITE HSW_DEMAND_RFO +#define HSW_L3_MISS_REMOTE (HSW_L3_MISS_REMOTE_HOP0|\ + HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P) +#define HSW_LLC_ACCESS HSW_ANY_RESPONSE + +#define BDW_L3_MISS_LOCAL BIT(26) +#define BDW_L3_MISS (BDW_L3_MISS_LOCAL| \ + HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \ + HSW_L3_MISS_REMOTE_HOP2P) + + +static __initconst const u64 hsw_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */ + }, + [ C(OP_WRITE) ] = 
{ + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */ + [ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, 
+ }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, +}; + +static __initconst const u64 hsw_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ| + HSW_LLC_ACCESS, + [ C(RESULT_MISS) ] = HSW_DEMAND_READ| + HSW_L3_MISS|HSW_ANY_SNOOP, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE| + HSW_LLC_ACCESS, + [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE| + HSW_L3_MISS|HSW_ANY_SNOOP, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ| + HSW_L3_MISS_LOCAL_DRAM| + HSW_SNOOP_DRAM, + [ C(RESULT_MISS) ] = HSW_DEMAND_READ| + HSW_L3_MISS_REMOTE| + HSW_SNOOP_DRAM, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE| + HSW_L3_MISS_LOCAL_DRAM| + HSW_SNOOP_DRAM, + [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE| + HSW_L3_MISS_REMOTE| + HSW_SNOOP_DRAM, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, +}; + static __initconst const u64 westmere_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -1029,21 +1244,10 @@ static __initconst const u64 slm_hw_cache_event_ids }, }; -static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) -{ - /* user explicitly requested branch sampling */ - if (has_branch_stack(event)) - return true; - - /* implicit branch sampling to correct PEBS skid */ - if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 && - x86_pmu.intel_cap.pebs_format < 2) 
- return true; - - return false; -} - -static void intel_pmu_disable_all(void) +/* + * Use from PMIs where the LBRs are already disabled. + */ +static void __intel_pmu_disable_all(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1051,17 +1255,24 @@ static void intel_pmu_disable_all(void) if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) intel_pmu_disable_bts(); + else + intel_bts_disable_local(); intel_pmu_pebs_disable_all(); +} + +static void intel_pmu_disable_all(void) +{ + __intel_pmu_disable_all(); intel_pmu_lbr_disable_all(); } -static void intel_pmu_enable_all(int added) +static void __intel_pmu_enable_all(int added, bool pmi) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); intel_pmu_pebs_enable_all(); - intel_pmu_lbr_enable_all(); + intel_pmu_lbr_enable_all(pmi); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); @@ -1073,7 +1284,13 @@ static void intel_pmu_enable_all(int added) return; intel_pmu_enable_bts(event->hw.config); - } + } else + intel_bts_enable_local(); +} + +static void intel_pmu_enable_all(int added) +{ + __intel_pmu_enable_all(added, false); } /* @@ -1207,7 +1424,7 @@ static void intel_pmu_disable_event(struct perf_event *event) * must disable before any actual event * because any event may be combined with LBR */ - if (intel_pmu_needs_lbr_smpl(event)) + if (needs_branch_stack(event)) intel_pmu_lbr_disable(event); if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { @@ -1268,7 +1485,7 @@ static void intel_pmu_enable_event(struct perf_event *event) * must enabled before any actual event * because any event may be combined with LBR */ - if (intel_pmu_needs_lbr_smpl(event)) + if (needs_branch_stack(event)) intel_pmu_lbr_enable(event); if (event->attr.exclude_host) @@ -1334,6 +1551,18 @@ static void intel_pmu_reset(void) if (ds) ds->bts_index = ds->bts_buffer_base; + /* Ack all overflows and disable fixed counters */ + if (x86_pmu.version >= 2) { + 
intel_pmu_ack_status(intel_pmu_get_status()); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + } + + /* Reset LBRs and LBR freezing */ + if (x86_pmu.lbr_nr) { + update_debugctlmsr(get_debugctlmsr() & + ~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR)); + } + local_irq_restore(flags); } @@ -1357,8 +1586,9 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) */ if (!x86_pmu.late_ack) apic_write(APIC_LVTPC, APIC_DM_NMI); - intel_pmu_disable_all(); + __intel_pmu_disable_all(); handled = intel_pmu_drain_bts_buffer(); + handled += intel_bts_interrupt(); status = intel_pmu_get_status(); if (!status) goto done; @@ -1398,6 +1628,14 @@ again: x86_pmu.drain_pebs(regs); } + /* + * Intel PT + */ + if (__test_and_clear_bit(55, (unsigned long *)&status)) { + handled++; + intel_pt_interrupt(); + } + /* * Checkpointed counters can lead to 'spurious' PMIs because the * rollback caused by the PMI will have cleared the overflow status @@ -1433,7 +1671,7 @@ again: goto again; done: - intel_pmu_enable_all(0); + __intel_pmu_enable_all(0, true); /* * Only unmask the NMI after the overflow counters * have been reset. 
This avoids spurious NMIs on @@ -1464,7 +1702,7 @@ intel_bts_constraints(struct perf_event *event) static int intel_alt_er(int idx) { - if (!(x86_pmu.er_flags & ERF_HAS_RSP_1)) + if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1)) return idx; if (idx == EXTRA_REG_RSP_0) @@ -1624,7 +1862,8 @@ intel_shared_regs_constraints(struct cpu_hw_events *cpuc, } struct event_constraint * -x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) { struct event_constraint *c; @@ -1641,7 +1880,8 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) } static struct event_constraint * -intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) { struct event_constraint *c; @@ -1657,7 +1897,278 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event if (c) return c; - return x86_get_event_constraints(cpuc, event); + return x86_get_event_constraints(cpuc, idx, event); +} + +static void +intel_start_scheduling(struct cpu_hw_events *cpuc) +{ + struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; + struct intel_excl_states *xl, *xlo; + int tid = cpuc->excl_thread_id; + int o_tid = 1 - tid; /* sibling thread */ + + /* + * nothing needed if in group validation mode + */ + if (cpuc->is_fake || !is_ht_workaround_enabled()) + return; + + /* + * no exclusion needed + */ + if (!excl_cntrs) + return; + + xlo = &excl_cntrs->states[o_tid]; + xl = &excl_cntrs->states[tid]; + + xl->sched_started = true; + xl->num_alloc_cntrs = 0; + /* + * lock shared state until we are done scheduling + * in stop_event_scheduling() + * makes scheduling appear as a transaction + */ + WARN_ON_ONCE(!irqs_disabled()); + raw_spin_lock(&excl_cntrs->lock); + + /* + * save initial state of sibling thread + */ + memcpy(xlo->init_state, 
xlo->state, sizeof(xlo->init_state)); +} + +static void +intel_stop_scheduling(struct cpu_hw_events *cpuc) +{ + struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; + struct intel_excl_states *xl, *xlo; + int tid = cpuc->excl_thread_id; + int o_tid = 1 - tid; /* sibling thread */ + + /* + * nothing needed if in group validation mode + */ + if (cpuc->is_fake || !is_ht_workaround_enabled()) + return; + /* + * no exclusion needed + */ + if (!excl_cntrs) + return; + + xlo = &excl_cntrs->states[o_tid]; + xl = &excl_cntrs->states[tid]; + + /* + * make new sibling thread state visible + */ + memcpy(xlo->state, xlo->init_state, sizeof(xlo->state)); + + xl->sched_started = false; + /* + * release shared state lock (acquired in intel_start_scheduling()) + */ + raw_spin_unlock(&excl_cntrs->lock); +} + +static struct event_constraint * +intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, + int idx, struct event_constraint *c) +{ + struct event_constraint *cx; + struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; + struct intel_excl_states *xl, *xlo; + int is_excl, i; + int tid = cpuc->excl_thread_id; + int o_tid = 1 - tid; /* alternate */ + + /* + * validating a group does not require + * enforcing cross-thread exclusion + */ + if (cpuc->is_fake || !is_ht_workaround_enabled()) + return c; + + /* + * no exclusion needed + */ + if (!excl_cntrs) + return c; + /* + * event requires exclusive counter access + * across HT threads + */ + is_excl = c->flags & PERF_X86_EVENT_EXCL; + + /* + * xl = state of current HT + * xlo = state of sibling HT + */ + xl = &excl_cntrs->states[tid]; + xlo = &excl_cntrs->states[o_tid]; + + /* + * do not allow scheduling of more than max_alloc_cntrs + * which is set to half the available generic counters. + * this helps avoid counter starvation of sibling thread + * by ensuring at most half the counters can be in + * exclusive mode. There are no designated counters for the + * limits. Any N/2 counters can be used.
This helps with + * events with specific counter constraints + */ + if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs) + return &emptyconstraint; + + cx = c; + + /* + * because we modify the constraint, we need + * to make a copy. Static constraints come + * from static const tables. + * + * only needed when constraint has not yet + * been cloned (marked dynamic) + */ + if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) { + + /* sanity check */ + if (idx < 0) + return &emptyconstraint; + + /* + * grab pre-allocated constraint entry + */ + cx = &cpuc->constraint_list[idx]; + + /* + * initialize dynamic constraint + * with static constraint + */ + memcpy(cx, c, sizeof(*cx)); + + /* + * mark constraint as dynamic, so we + * can free it later on + */ + cx->flags |= PERF_X86_EVENT_DYNAMIC; + } + + /* + * From here on, the constraint is dynamic. + * Either it was just allocated above, or it + * was allocated during an earlier invocation + * of this function + */ + + /* + * Modify static constraint with current dynamic + * state of thread + * + * EXCLUSIVE: sibling counter measuring exclusive event + * SHARED : sibling counter measuring non-exclusive event + * UNUSED : sibling counter unused + */ + for_each_set_bit(i, cx->idxmsk, X86_PMC_IDX_MAX) { + /* + * exclusive event in sibling counter + * our corresponding counter cannot be used + * regardless of our event + */ + if (xl->state[i] == INTEL_EXCL_EXCLUSIVE) + __clear_bit(i, cx->idxmsk); + /* + * if measuring an exclusive event, sibling + * measuring non-exclusive, then counter cannot + * be used + */ + if (is_excl && xl->state[i] == INTEL_EXCL_SHARED) + __clear_bit(i, cx->idxmsk); + } + + /* + * recompute actual bit weight for scheduling algorithm + */ + cx->weight = hweight64(cx->idxmsk64); + + /* + * if we return an empty mask, then switch + * back to static empty constraint to avoid + * the cost of freeing later on + */ + if (cx->weight == 0) + cx = &emptyconstraint; + + return cx; +} + +static struct event_constraint *
+intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c1 = event->hw.constraint; + struct event_constraint *c2; + + /* + * first time only + * - static constraint: no change across incremental scheduling calls + * - dynamic constraint: handled by intel_get_excl_constraints() + */ + c2 = __intel_get_event_constraints(cpuc, idx, event); + if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) { + bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX); + c1->weight = c2->weight; + c2 = c1; + } + + if (cpuc->excl_cntrs) + return intel_get_excl_constraints(cpuc, event, idx, c2); + + return c2; +} + +static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; + struct intel_excl_states *xlo, *xl; + unsigned long flags = 0; /* keep compiler happy */ + int tid = cpuc->excl_thread_id; + int o_tid = 1 - tid; + + /* + * nothing needed if in group validation mode + */ + if (cpuc->is_fake) + return; + + WARN_ON_ONCE(!excl_cntrs); + + if (!excl_cntrs) + return; + + xl = &excl_cntrs->states[tid]; + xlo = &excl_cntrs->states[o_tid]; + + /* + * put_constraint may be called from x86_schedule_events() + * which already has the lock held so here make locking + * conditional + */ + if (!xl->sched_started) + raw_spin_lock_irqsave(&excl_cntrs->lock, flags); + + /* + * if event was actually assigned, then mark the + * counter state as unused now + */ + if (hwc->idx >= 0) + xlo->state[hwc->idx] = INTEL_EXCL_UNUSED; + + if (!xl->sched_started) + raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags); } static void @@ -1678,7 +2189,57 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, static void intel_put_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { + struct event_constraint *c = event->hw.constraint; + intel_put_shared_regs_event_constraints(cpuc, 
event); + + /* + * if PMU has exclusive counter restrictions, then + * all events are subject to and must call the + * put_excl_constraints() routine + */ + if (c && cpuc->excl_cntrs) + intel_put_excl_constraints(cpuc, event); + + /* cleanup dynamic constraint */ + if (c && (c->flags & PERF_X86_EVENT_DYNAMIC)) + event->hw.constraint = NULL; +} + +static void intel_commit_scheduling(struct cpu_hw_events *cpuc, + struct perf_event *event, int cntr) +{ + struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; + struct event_constraint *c = event->hw.constraint; + struct intel_excl_states *xlo, *xl; + int tid = cpuc->excl_thread_id; + int o_tid = 1 - tid; + int is_excl; + + if (cpuc->is_fake || !c) + return; + + is_excl = c->flags & PERF_X86_EVENT_EXCL; + + if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) + return; + + WARN_ON_ONCE(!excl_cntrs); + + if (!excl_cntrs) + return; + + xl = &excl_cntrs->states[tid]; + xlo = &excl_cntrs->states[o_tid]; + + WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock)); + + if (cntr >= 0) { + if (is_excl) + xlo->init_state[cntr] = INTEL_EXCL_EXCLUSIVE; + else + xlo->init_state[cntr] = INTEL_EXCL_SHARED; + } } static void intel_pebs_aliases_core2(struct perf_event *event) @@ -1747,10 +2308,21 @@ static int intel_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip && x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); - if (intel_pmu_needs_lbr_smpl(event)) { + if (needs_branch_stack(event)) { ret = intel_pmu_setup_lbr_filter(event); if (ret) return ret; + + /* + * BTS is set up earlier in this path, so don't account twice + */ + if (!intel_pmu_has_bts(event)) { + /* disallow lbr if conflicting events are present */ + if (x86_add_exclusive(x86_lbr_exclusive_lbr)) + return -EBUSY; + + event->destroy = hw_perf_lbr_event_destroy; + } } if (event->attr.type != PERF_TYPE_RAW) @@ -1891,9 +2463,12 @@ static struct event_constraint counter2_constraint = EVENT_CONSTRAINT(0, 0x4, 0); static struct event_constraint *
-hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) { - struct event_constraint *c = intel_get_event_constraints(cpuc, event); + struct event_constraint *c; + + c = intel_get_event_constraints(cpuc, idx, event); /* Handle special quirk on in_tx_checkpointed only in counter 2 */ if (event->hw.config & HSW_IN_TX_CHECKPOINTED) { @@ -1905,6 +2480,32 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) return c; } +/* + * Broadwell: + * + * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared + * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine + * the two to enforce a minimum period of 128 (the smallest value that has bits + * 0-5 cleared and >= 100). + * + * Because of how the code in x86_perf_event_set_period() works, the truncation + * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period + * to make up for the 'lost' events due to carrying the 'error' in period_left. + * + * Therefore the effective (average) period matches the requested period, + * despite coarser hardware granularity. 
+ */ +static unsigned bdw_limit_period(struct perf_event *event, unsigned left) +{ + if ((event->hw.config & INTEL_ARCH_EVENT_MASK) == + X86_CONFIG(.event=0xc0, .umask=0x01)) { + if (left < 128) + left = 128; + left &= ~0x3fu; + } + return left; +} + PMU_FORMAT_ATTR(event, "config:0-7" ); PMU_FORMAT_ATTR(umask, "config:8-15" ); PMU_FORMAT_ATTR(edge, "config:18" ); @@ -1979,16 +2580,52 @@ struct intel_shared_regs *allocate_shared_regs(int cpu) return regs; } +static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu) +{ + struct intel_excl_cntrs *c; + int i; + + c = kzalloc_node(sizeof(struct intel_excl_cntrs), + GFP_KERNEL, cpu_to_node(cpu)); + if (c) { + raw_spin_lock_init(&c->lock); + for (i = 0; i < X86_PMC_IDX_MAX; i++) { + c->states[0].state[i] = INTEL_EXCL_UNUSED; + c->states[0].init_state[i] = INTEL_EXCL_UNUSED; + + c->states[1].state[i] = INTEL_EXCL_UNUSED; + c->states[1].init_state[i] = INTEL_EXCL_UNUSED; + } + c->core_id = -1; + } + return c; +} + static int intel_pmu_cpu_prepare(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map)) - return NOTIFY_OK; + if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) { + cpuc->shared_regs = allocate_shared_regs(cpu); + if (!cpuc->shared_regs) + return NOTIFY_BAD; + } - cpuc->shared_regs = allocate_shared_regs(cpu); - if (!cpuc->shared_regs) - return NOTIFY_BAD; + if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { + size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint); + + cpuc->constraint_list = kzalloc(sz, GFP_KERNEL); + if (!cpuc->constraint_list) + return NOTIFY_BAD; + + cpuc->excl_cntrs = allocate_excl_cntrs(cpu); + if (!cpuc->excl_cntrs) { + kfree(cpuc->constraint_list); + kfree(cpuc->shared_regs); + return NOTIFY_BAD; + } + cpuc->excl_thread_id = 0; + } return NOTIFY_OK; } @@ -2010,13 +2647,15 @@ static void intel_pmu_cpu_starting(int cpu) if (!cpuc->shared_regs) return; - if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) { + if (!(x86_pmu.flags & 
PMU_FL_NO_HT_SHARING)) { + void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED]; + for_each_cpu(i, topology_thread_cpumask(cpu)) { struct intel_shared_regs *pc; pc = per_cpu(cpu_hw_events, i).shared_regs; if (pc && pc->core_id == core_id) { - cpuc->kfree_on_online = cpuc->shared_regs; + *onln = cpuc->shared_regs; cpuc->shared_regs = pc; break; } @@ -2027,6 +2666,44 @@ static void intel_pmu_cpu_starting(int cpu) if (x86_pmu.lbr_sel_map) cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; + + if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { + int h = x86_pmu.num_counters >> 1; + + for_each_cpu(i, topology_thread_cpumask(cpu)) { + struct intel_excl_cntrs *c; + + c = per_cpu(cpu_hw_events, i).excl_cntrs; + if (c && c->core_id == core_id) { + cpuc->kfree_on_online[1] = cpuc->excl_cntrs; + cpuc->excl_cntrs = c; + cpuc->excl_thread_id = 1; + break; + } + } + cpuc->excl_cntrs->core_id = core_id; + cpuc->excl_cntrs->refcnt++; + /* + * set hard limit to half the number of generic counters + */ + cpuc->excl_cntrs->states[0].max_alloc_cntrs = h; + cpuc->excl_cntrs->states[1].max_alloc_cntrs = h; + } +} + +static void free_excl_cntrs(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct intel_excl_cntrs *c; + + c = cpuc->excl_cntrs; + if (c) { + if (c->core_id == -1 || --c->refcnt == 0) + kfree(c); + cpuc->excl_cntrs = NULL; + kfree(cpuc->constraint_list); + cpuc->constraint_list = NULL; + } } static void intel_pmu_cpu_dying(int cpu) @@ -2041,19 +2718,9 @@ static void intel_pmu_cpu_dying(int cpu) cpuc->shared_regs = NULL; } - fini_debug_store_on_cpu(cpu); -} + free_excl_cntrs(cpu); -static void intel_pmu_flush_branch_stack(void) -{ - /* - * Intel LBR does not tag entries with the - * PID of the current task, then we need to - * flush it on ctxsw - * For now, we simply reset it - */ - if (x86_pmu.lbr_nr) - intel_pmu_lbr_reset(); + fini_debug_store_on_cpu(cpu); } PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); @@ -2107,7 +2774,7 @@ static __initconst 
const struct x86_pmu intel_pmu = { .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, .guest_get_msrs = intel_guest_get_msrs, - .flush_branch_stack = intel_pmu_flush_branch_stack, + .sched_task = intel_pmu_lbr_sched_task, }; static __init void intel_clovertown_quirk(void) @@ -2264,6 +2931,27 @@ static __init void intel_nehalem_quirk(void) } } +/* + * enable software workaround for errata: + * SNB: BJ122 + * IVB: BV98 + * HSW: HSD29 + * + * Only needed when HT is enabled. However detecting + * if HT is enabled is difficult (model specific). So instead, + * we enable the workaround in the early boot, and verify if + * it is needed in a later initcall phase once we have valid + * topology information to check if HT is actually enabled + */ +static __init void intel_ht_bug(void) +{ + x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED; + + x86_pmu.commit_scheduling = intel_commit_scheduling; + x86_pmu.start_scheduling = intel_start_scheduling; + x86_pmu.stop_scheduling = intel_stop_scheduling; +} + EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3"); EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82") @@ -2443,7 +3131,7 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_slm_event_constraints; x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; x86_pmu.extra_regs = intel_slm_extra_regs; - x86_pmu.er_flags |= ERF_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; pr_cont("Silvermont events, "); break; @@ -2461,7 +3149,7 @@ __init int intel_pmu_init(void) x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; x86_pmu.extra_regs = intel_westmere_extra_regs; - x86_pmu.er_flags |= ERF_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.cpu_events = nhm_events_attrs; @@ -2478,6 +3166,7 @@ __init int intel_pmu_init(void) case 42: /* 32nm SandyBridge */ case 45: /* 32nm SandyBridge-E/EN/EP */ x86_add_quirk(intel_sandybridge_quirk); + 
x86_add_quirk(intel_ht_bug); memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, @@ -2492,9 +3181,11 @@ __init int intel_pmu_init(void) x86_pmu.extra_regs = intel_snbep_extra_regs; else x86_pmu.extra_regs = intel_snb_extra_regs; + + /* all extra regs are per-cpu when HT is on */ - x86_pmu.er_flags |= ERF_HAS_RSP_1; - x86_pmu.er_flags |= ERF_NO_HT_SHARING; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; x86_pmu.cpu_events = snb_events_attrs; @@ -2510,6 +3201,7 @@ __init int intel_pmu_init(void) case 58: /* 22nm IvyBridge */ case 62: /* 22nm IvyBridge-EP/EX */ + x86_add_quirk(intel_ht_bug); memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); /* dTLB-load-misses on IVB is different than SNB */ @@ -2528,8 +3220,8 @@ __init int intel_pmu_init(void) else x86_pmu.extra_regs = intel_snb_extra_regs; /* all extra regs are per-cpu when HT is on */ - x86_pmu.er_flags |= ERF_HAS_RSP_1; - x86_pmu.er_flags |= ERF_NO_HT_SHARING; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; x86_pmu.cpu_events = snb_events_attrs; @@ -2545,19 +3237,20 @@ __init int intel_pmu_init(void) case 63: /* 22nm Haswell Server */ case 69: /* 22nm Haswell ULT */ case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + x86_add_quirk(intel_ht_bug); x86_pmu.late_ack = true; - memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); - intel_pmu_lbr_init_snb(); + intel_pmu_lbr_init_hsw(); x86_pmu.event_constraints = intel_hsw_event_constraints; x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; x86_pmu.extra_regs = intel_snbep_extra_regs; x86_pmu.pebs_aliases = 
intel_pebs_aliases_snb; /* all extra regs are per-cpu when HT is on */ - x86_pmu.er_flags |= ERF_HAS_RSP_1; - x86_pmu.er_flags |= ERF_NO_HT_SHARING; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; @@ -2566,6 +3259,39 @@ __init int intel_pmu_init(void) pr_cont("Haswell events, "); break; + case 61: /* 14nm Broadwell Core-M */ + case 86: /* 14nm Broadwell Xeon D */ + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + + /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */ + hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ | + BDW_L3_MISS|HSW_SNOOP_DRAM; + hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS| + HSW_SNOOP_DRAM; + hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ| + BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM; + hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE| + BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM; + + intel_pmu_lbr_init_snb(); + + x86_pmu.event_constraints = intel_bdw_event_constraints; + x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; + x86_pmu.extra_regs = intel_snbep_extra_regs; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = hsw_get_event_constraints; + x86_pmu.cpu_events = hsw_events_attrs; + x86_pmu.limit_period = bdw_limit_period; + pr_cont("Broadwell events, "); + break; + default: switch (x86_pmu.version) { case 1: @@ -2651,3 +3377,47 @@ __init int intel_pmu_init(void) return 0; } + +/* + * HT bug: phase 2 init + * Called once we have valid topology information to check + * 
whether or not HT is enabled + * If HT is off, then we disable the workaround + */ +static __init int fixup_ht_bug(void) +{ + int cpu = smp_processor_id(); + int w, c; + /* + * problem not present on this CPU model, nothing to do + */ + if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED)) + return 0; + + w = cpumask_weight(topology_thread_cpumask(cpu)); + if (w > 1) { + pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n"); + return 0; + } + + watchdog_nmi_disable_all(); + + x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED); + + x86_pmu.commit_scheduling = NULL; + x86_pmu.start_scheduling = NULL; + x86_pmu.stop_scheduling = NULL; + + watchdog_nmi_enable_all(); + + get_online_cpus(); + + for_each_online_cpu(c) { + free_excl_cntrs(c); + } + + put_online_cpus(); + pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n"); + return 0; +} +subsys_initcall(fixup_ht_bug) diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c new file mode 100644 index 000000000000..ac1f0c55f379 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c @@ -0,0 +1,525 @@ +/* + * BTS PMU driver for perf + * Copyright (c) 2013-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#undef DEBUG + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "perf_event.h" + +struct bts_ctx { + struct perf_output_handle handle; + struct debug_store ds_back; + int started; +}; + +static DEFINE_PER_CPU(struct bts_ctx, bts_ctx); + +#define BTS_RECORD_SIZE 24 +#define BTS_SAFETY_MARGIN 4080 + +struct bts_phys { + struct page *page; + unsigned long size; + unsigned long offset; + unsigned long displacement; +}; + +struct bts_buffer { + size_t real_size; /* multiple of BTS_RECORD_SIZE */ + unsigned int nr_pages; + unsigned int nr_bufs; + unsigned int cur_buf; + bool snapshot; + local_t data_size; + local_t lost; + local_t head; + unsigned long end; + void **data_pages; + struct bts_phys buf[0]; +}; + +struct pmu bts_pmu; + +void intel_pmu_enable_bts(u64 config); +void intel_pmu_disable_bts(void); + +static size_t buf_size(struct page *page) +{ + return 1 << (PAGE_SHIFT + page_private(page)); +} + +static void * +bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite) +{ + struct bts_buffer *buf; + struct page *page; + int node = (cpu == -1) ? 
cpu : cpu_to_node(cpu); + unsigned long offset; + size_t size = nr_pages << PAGE_SHIFT; + int pg, nbuf, pad; + + /* count all the high order buffers */ + for (pg = 0, nbuf = 0; pg < nr_pages;) { + page = virt_to_page(pages[pg]); + if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1)) + return NULL; + pg += 1 << page_private(page); + nbuf++; + } + + /* + * to avoid interrupts in overwrite mode, only allow one physical buffer + */ + if (overwrite && nbuf > 1) + return NULL; + + buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node); + if (!buf) + return NULL; + + buf->nr_pages = nr_pages; + buf->nr_bufs = nbuf; + buf->snapshot = overwrite; + buf->data_pages = pages; + buf->real_size = size - size % BTS_RECORD_SIZE; + + for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) { + unsigned int __nr_pages; + + page = virt_to_page(pages[pg]); + __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1; + buf->buf[nbuf].page = page; + buf->buf[nbuf].offset = offset; + buf->buf[nbuf].displacement = (pad ? 
BTS_RECORD_SIZE - pad : 0); + buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement; + pad = buf->buf[nbuf].size % BTS_RECORD_SIZE; + buf->buf[nbuf].size -= pad; + + pg += __nr_pages; + offset += __nr_pages << PAGE_SHIFT; + } + + return buf; +} + +static void bts_buffer_free_aux(void *data) +{ + kfree(data); +} + +static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx) +{ + return buf->buf[idx].offset + buf->buf[idx].displacement; +} + +static void +bts_config_buffer(struct bts_buffer *buf) +{ + int cpu = raw_smp_processor_id(); + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + struct bts_phys *phys = &buf->buf[buf->cur_buf]; + unsigned long index, thresh = 0, end = phys->size; + struct page *page = phys->page; + + index = local_read(&buf->head); + + if (!buf->snapshot) { + if (buf->end < phys->offset + buf_size(page)) + end = buf->end - phys->offset - phys->displacement; + + index -= phys->offset + phys->displacement; + + if (end - index > BTS_SAFETY_MARGIN) + thresh = end - BTS_SAFETY_MARGIN; + else if (end - index > BTS_RECORD_SIZE) + thresh = end - BTS_RECORD_SIZE; + else + thresh = end; + } + + ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement; + ds->bts_index = ds->bts_buffer_base + index; + ds->bts_absolute_maximum = ds->bts_buffer_base + end; + ds->bts_interrupt_threshold = !buf->snapshot + ? 
ds->bts_buffer_base + thresh + : ds->bts_absolute_maximum + BTS_RECORD_SIZE; +} + +static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head) +{ + unsigned long index = head - phys->offset; + + memset(page_address(phys->page) + index, 0, phys->size - index); /* NOTE(review): index is measured from phys->offset, so it includes phys->displacement, whereas phys->size excludes it; confirm the length should not be phys->displacement + phys->size - index */ +} + +static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts) +{ + if (buf->snapshot) /* snapshot (overwrite) mode never reports full */ + return false; + + if (local_read(&buf->data_size) >= bts->handle.size || + bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE) + return true; + + return false; +} + +static void bts_update(struct bts_ctx *bts) +{ + int cpu = raw_smp_processor_id(); + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + struct bts_buffer *buf = perf_get_aux(&bts->handle); + unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head; + + if (!buf) + return; + + head = index + bts_buffer_offset(buf, buf->cur_buf); + old = local_xchg(&buf->head, head); + + if (!buf->snapshot) { + if (old == head) + return; + + if (ds->bts_index >= ds->bts_absolute_maximum) + local_inc(&buf->lost); + + /* + * old and head are always in the same physical buffer, so we + * can subtract them to get the data size. 
+ */ + local_add(head - old, &buf->data_size); + } else { + local_set(&buf->data_size, head); + } +} + +static void __bts_event_start(struct perf_event *event) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_buffer *buf = perf_get_aux(&bts->handle); + u64 config = 0; + + if (!buf || bts_buffer_is_full(buf, bts)) + return; + + event->hw.state = 0; + + if (!buf->snapshot) + config |= ARCH_PERFMON_EVENTSEL_INT; + if (!event->attr.exclude_kernel) + config |= ARCH_PERFMON_EVENTSEL_OS; + if (!event->attr.exclude_user) + config |= ARCH_PERFMON_EVENTSEL_USR; + + bts_config_buffer(buf); + + /* + * local barrier to make sure that ds configuration made it + * before we enable BTS + */ + wmb(); + + intel_pmu_enable_bts(config); +} + +static void bts_event_start(struct perf_event *event, int flags) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + + __bts_event_start(event); + + /* PMI handler: this counter is running and likely generating PMIs */ + ACCESS_ONCE(bts->started) = 1; +} + +static void __bts_event_stop(struct perf_event *event) +{ + /* + * No extra synchronization is mandated by the documentation to have + * BTS data stores globally visible. 
+ */ + intel_pmu_disable_bts(); + + if (event->hw.state & PERF_HES_STOPPED) + return; + + ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED; +} + +static void bts_event_stop(struct perf_event *event, int flags) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + + /* PMI handler: don't restart this counter */ + ACCESS_ONCE(bts->started) = 0; + + __bts_event_stop(event); + + if (flags & PERF_EF_UPDATE) + bts_update(bts); +} + +void intel_bts_enable_local(void) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + + if (bts->handle.event && bts->started) + __bts_event_start(bts->handle.event); +} + +void intel_bts_disable_local(void) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + + if (bts->handle.event) + __bts_event_stop(bts->handle.event); +} + +static int +bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) +{ + unsigned long head, space, next_space, pad, gap, skip, wakeup; + unsigned int next_buf; + struct bts_phys *phys, *next_phys; + int ret; + + if (buf->snapshot) + return 0; + + head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1); + if (WARN_ON_ONCE(head != local_read(&buf->head))) + return -EINVAL; + + phys = &buf->buf[buf->cur_buf]; + space = phys->offset + phys->displacement + phys->size - head; + pad = space; + if (space > handle->size) { + space = handle->size; + space -= space % BTS_RECORD_SIZE; + } + if (space <= BTS_SAFETY_MARGIN) { + /* See if next phys buffer has more space */ + next_buf = buf->cur_buf + 1; + if (next_buf >= buf->nr_bufs) + next_buf = 0; + next_phys = &buf->buf[next_buf]; + gap = buf_size(phys->page) - phys->displacement - phys->size + + next_phys->displacement; + skip = pad + gap; + if (handle->size >= skip) { + next_space = next_phys->size; + if (next_space + skip > handle->size) { + next_space = handle->size - skip; + next_space -= next_space % BTS_RECORD_SIZE; + } + if (next_space > space || !space) { + if (pad) + bts_buffer_pad_out(phys, head); + ret = perf_aux_output_skip(handle, skip); + if 
(ret) + return ret; + /* Advance to next phys buffer */ + phys = next_phys; + space = next_space; + head = phys->offset + phys->displacement; + /* + * After this, cur_buf and head won't match ds + * anymore, so we must not be racing with + * bts_update(). + */ + buf->cur_buf = next_buf; + local_set(&buf->head, head); + } + } + } + + /* Don't go far beyond wakeup watermark */ + wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup - + handle->head; + if (space > wakeup) { + space = wakeup; + space -= space % BTS_RECORD_SIZE; + } + + buf->end = head + space; + + /* + * If we have no space, the lost notification would have been sent when + * we hit absolute_maximum - see bts_update() + */ + if (!space) + return -ENOSPC; + + return 0; +} + +int intel_bts_interrupt(void) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct perf_event *event = bts->handle.event; + struct bts_buffer *buf; + s64 old_head; + int err; + + if (!event || !bts->started) + return 0; + + buf = perf_get_aux(&bts->handle); + /* + * Skip snapshot counters: they don't use the interrupt, but + * there's no other way of telling, because the pointer will + * keep moving + */ + if (!buf || buf->snapshot) + return 0; + + old_head = local_read(&buf->head); + bts_update(bts); + + /* no new data */ + if (old_head == local_read(&buf->head)) + return 0; + + perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), + !!local_xchg(&buf->lost, 0)); + + buf = perf_aux_output_begin(&bts->handle, event); + if (!buf) + return 1; + + err = bts_buffer_reset(buf, &bts->handle); + if (err) + perf_aux_output_end(&bts->handle, 0, false); + + return 1; +} + +static void bts_event_del(struct perf_event *event, int mode) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_buffer *buf = perf_get_aux(&bts->handle); + + bts_event_stop(event, PERF_EF_UPDATE); + + if (buf) { + if (buf->snapshot) + bts->handle.head = + 
local_xchg(&buf->data_size, + buf->nr_pages << PAGE_SHIFT); + perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), + !!local_xchg(&buf->lost, 0)); + } + + cpuc->ds->bts_index = bts->ds_back.bts_buffer_base; /* ds_back keeps no index: rewind to the restored buffer base */ + cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base; + cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum; + cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold; +} + +static int bts_event_add(struct perf_event *event, int mode) +{ + struct bts_buffer *buf; + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int ret = -EBUSY; + + event->hw.state = PERF_HES_STOPPED; + + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + return -EBUSY; + + if (bts->handle.event) + return -EBUSY; + + buf = perf_aux_output_begin(&bts->handle, event); + if (!buf) + return -EINVAL; + + ret = bts_buffer_reset(buf, &bts->handle); + if (ret) { + perf_aux_output_end(&bts->handle, 0, false); + return ret; + } + + bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base; + bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum; + bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold; + + if (mode & PERF_EF_START) { + bts_event_start(event, 0); + if (hwc->state & PERF_HES_STOPPED) { /* start left the event STOPPED: undo the add */ + bts_event_del(event, 0); + return -EBUSY; + } + } + + return 0; +} + +static void bts_event_destroy(struct perf_event *event) +{ + x86_del_exclusive(x86_lbr_exclusive_bts); +} + +static int bts_event_init(struct perf_event *event) +{ + if (event->attr.type != bts_pmu.type) + return -ENOENT; + + if (x86_add_exclusive(x86_lbr_exclusive_bts)) /* dropped again in bts_event_destroy() */ + return -EBUSY; + + event->destroy = bts_event_destroy; + + return 0; +} + +static void bts_event_read(struct perf_event *event) +{ +} + +static __init int bts_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) + return -ENODEV; + + bts_pmu.capabilities = 
PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE; + bts_pmu.task_ctx_nr = perf_sw_context; + bts_pmu.event_init = bts_event_init; + bts_pmu.add = bts_event_add; + bts_pmu.del = bts_event_del; + bts_pmu.start = bts_event_start; + bts_pmu.stop = bts_event_stop; + bts_pmu.read = bts_event_read; + bts_pmu.setup_aux = bts_buffer_setup_aux; + bts_pmu.free_aux = bts_buffer_free_aux; + + return perf_pmu_register(&bts_pmu, "intel_bts", -1); +} + +module_init(bts_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c new file mode 100644 index 000000000000..e4d1b8b738fa --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -0,0 +1,1379 @@ +/* + * Intel Cache Quality-of-Service Monitoring (CQM) support. + * + * Based very, very heavily on work by Peter Zijlstra. + */ + +#include +#include +#include +#include "perf_event.h" + +#define MSR_IA32_PQR_ASSOC 0x0c8f +#define MSR_IA32_QM_CTR 0x0c8e +#define MSR_IA32_QM_EVTSEL 0x0c8d + +static unsigned int cqm_max_rmid = -1; +static unsigned int cqm_l3_scale; /* supposedly cacheline size */ + +struct intel_cqm_state { + raw_spinlock_t lock; + int rmid; + int cnt; +}; + +static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state); + +/* + * Protects cache_groups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. + * Also protects event->hw.cqm_rmid + * + * Hold either for stability, both for modification of ->hw.cqm_rmid. + */ +static DEFINE_MUTEX(cache_mutex); +static DEFINE_RAW_SPINLOCK(cache_lock); + +/* + * Groups of events that have the same target(s), one RMID per group. + */ +static LIST_HEAD(cache_groups); + +/* + * Mask of CPUs for reading CQM values. We only need one per-socket. + */ +static cpumask_t cqm_cpumask; + +#define RMID_VAL_ERROR (1ULL << 63) +#define RMID_VAL_UNAVAIL (1ULL << 62) + +#define QOS_L3_OCCUP_EVENT_ID (1 << 0) + +#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID + +/* + * This is central to the rotation algorithm in __intel_cqm_rmid_rotate(). 
+ * + * This rmid is always free and is guaranteed to have an associated + * near-zero occupancy value, i.e. no cachelines are tagged with this + * RMID, once __intel_cqm_rmid_rotate() returns. + */ +static unsigned int intel_cqm_rotation_rmid; + +#define INVALID_RMID (-1) + +/* + * Is @rmid valid for programming the hardware? + * + * rmid 0 is reserved by the hardware for all non-monitored tasks, which + * means that we should never come across an rmid with that value. + * Likewise, an rmid value of -1 is used to indicate "no rmid currently + * assigned" and is used as part of the rotation code. + */ +static inline bool __rmid_valid(unsigned int rmid) +{ + if (!rmid || rmid == INVALID_RMID) /* (-1) converts to UINT_MAX for this unsigned compare */ + return false; + + return true; +} + +static u64 __rmid_read(unsigned int rmid) +{ + u64 val; + + /* + * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt, + * it just says that to increase confusion. + */ + wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid); + rdmsrl(MSR_IA32_QM_CTR, val); + + /* + * Aside from the ERROR and UNAVAIL bits, assume this thing returns + * the number of cachelines tagged with @rmid. + */ + return val; +} + +enum rmid_recycle_state { + RMID_YOUNG = 0, /* set by __put_rmid(): minimum queue time not yet served */ + RMID_AVAILABLE, /* minimum queue time has elapsed */ + RMID_DIRTY, /* occupancy read back above __intel_cqm_threshold */ +}; + +struct cqm_rmid_entry { + unsigned int rmid; + enum rmid_recycle_state state; + struct list_head list; + unsigned long queue_time; +}; + +/* + * cqm_rmid_free_lru - A least recently used list of RMIDs. + * + * Oldest entry at the head, newest (most recently used) entry at the + * tail. This list is never traversed, it's only used to keep track of + * the lru order. That is, we only pick entries off the head or insert + * them on the tail. + * + * All entries on the list are 'free', and their RMIDs are not currently + * in use. To mark an RMID as in use, remove its entry from the lru + * list. + * + * + * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs. 
+ * + * This list contains RMIDs that no one is currently using but that + * may have a non-zero occupancy value associated with them. The + * rotation worker moves RMIDs from the limbo list to the free list once + * the occupancy value drops below __intel_cqm_threshold. + * + * Both lists are protected by cache_mutex. + */ +static LIST_HEAD(cqm_rmid_free_lru); +static LIST_HEAD(cqm_rmid_limbo_lru); + +/* + * We use a simple array of pointers so that we can look up a struct + * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid() + * and __put_rmid() from having to worry about dealing with struct + * cqm_rmid_entry - they just deal with rmids, i.e. integers. + * + * Once this array is initialized it is read-only. No locks are required + * to access it. + * + * All entries for all RMIDs can be looked up in this array at all + * times. + */ +static struct cqm_rmid_entry **cqm_rmid_ptrs; + +static inline struct cqm_rmid_entry *__rmid_entry(int rmid) +{ + struct cqm_rmid_entry *entry; + + entry = cqm_rmid_ptrs[rmid]; + WARN_ON(entry->rmid != rmid); + + return entry; +} + +/* + * Returns < 0 on fail. + * + * We expect to be called with cache_mutex held. 
+ */ +static int __get_rmid(void) +{ + struct cqm_rmid_entry *entry; + + lockdep_assert_held(&cache_mutex); + + if (list_empty(&cqm_rmid_free_lru)) + return INVALID_RMID; + + entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list); + list_del(&entry->list); /* off the free list means in use */ + + return entry->rmid; +} + +static void __put_rmid(unsigned int rmid) +{ + struct cqm_rmid_entry *entry; + + lockdep_assert_held(&cache_mutex); + + WARN_ON(!__rmid_valid(rmid)); + entry = __rmid_entry(rmid); + + entry->queue_time = jiffies; + entry->state = RMID_YOUNG; + + list_add_tail(&entry->list, &cqm_rmid_limbo_lru); /* parked in limbo, not returned to the free list */ +} + +static int intel_cqm_setup_rmid_cache(void) +{ + struct cqm_rmid_entry *entry; + unsigned int nr_rmids; + int r = 0; + + nr_rmids = cqm_max_rmid + 1; + cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) * + nr_rmids, GFP_KERNEL); + if (!cqm_rmid_ptrs) + return -ENOMEM; + + for (; r <= cqm_max_rmid; r++) { + struct cqm_rmid_entry *entry; /* NOTE(review): shadows the function-scope 'entry' */ + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + goto fail; + + INIT_LIST_HEAD(&entry->list); + entry->rmid = r; + cqm_rmid_ptrs[r] = entry; + + list_add_tail(&entry->list, &cqm_rmid_free_lru); + } + + /* + * RMID 0 is special and is always allocated. It's used for all + * tasks that are not monitored. + */ + entry = __rmid_entry(0); + list_del(&entry->list); + + mutex_lock(&cache_mutex); + intel_cqm_rotation_rmid = __get_rmid(); /* take one free RMID as the rotation RMID */ + mutex_unlock(&cache_mutex); + + return 0; +fail: + while (r--) + kfree(cqm_rmid_ptrs[r]); /* NOTE(review): entries are freed while still linked on cqm_rmid_free_lru */ + + kfree(cqm_rmid_ptrs); + return -ENOMEM; +} + +/* + * Determine if @a and @b measure the same set of tasks. + * + * If @a and @b measure the same set of tasks then we want to share a + * single RMID. 
+ */ +static bool __match_event(struct perf_event *a, struct perf_event *b) +{ + /* Per-cpu and task events don't mix */ + if ((a->attach_state & PERF_ATTACH_TASK) != + (b->attach_state & PERF_ATTACH_TASK)) + return false; + +#ifdef CONFIG_CGROUP_PERF + if (a->cgrp != b->cgrp) + return false; +#endif + + /* If not a task event, we're machine wide */ + if (!(b->attach_state & PERF_ATTACH_TASK)) + return true; + + /* + * Events that target the same task are placed into the same cache group. + */ + if (a->hw.target == b->hw.target) + return true; + + /* + * Is @b an inherited event whose parent is @a? + */ + if (b->parent == a) + return true; + + return false; +} + +#ifdef CONFIG_CGROUP_PERF +static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) +{ + if (event->attach_state & PERF_ATTACH_TASK) + return perf_cgroup_from_task(event->hw.target); + + return event->cgrp; +} +#endif + +/* + * Determine if @a's tasks intersect with @b's tasks + * + * There are combinations of events that we explicitly prohibit, + * + * PROHIBITS + * system-wide -> cgroup and task + * cgroup -> system-wide + * -> task in cgroup + * task -> system-wide + * -> task in cgroup + * + * Call this function before allocating an RMID. + */ +static bool __conflict_event(struct perf_event *a, struct perf_event *b) +{ +#ifdef CONFIG_CGROUP_PERF + /* + * We can have any number of cgroups but only one system-wide + * event at a time. + */ + if (a->cgrp && b->cgrp) { + struct perf_cgroup *ac = a->cgrp; + struct perf_cgroup *bc = b->cgrp; + + /* + * This condition should have been caught in + * __match_event() and we should be sharing an RMID. 
+ */ + WARN_ON_ONCE(ac == bc); + + if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || + cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) + return true; + + return false; + } + + if (a->cgrp || b->cgrp) { + struct perf_cgroup *ac, *bc; + + /* + * cgroup and system-wide events are mutually exclusive + */ + if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) || + (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK))) + return true; + + /* + * Ensure neither event is part of the other's cgroup + */ + ac = event_to_cgroup(a); + bc = event_to_cgroup(b); + if (ac == bc) + return true; + + /* + * Must have cgroup and non-intersecting task events. + */ + if (!ac || !bc) + return false; + + /* + * We have cgroup and task events, and the task belongs + * to a cgroup. Check for overlap. + */ + if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || + cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) + return true; + + return false; + } +#endif + /* + * If one of them is not a task, same story as above with cgroups. + */ + if (!(a->attach_state & PERF_ATTACH_TASK) || + !(b->attach_state & PERF_ATTACH_TASK)) + return true; + + /* + * Must be non-overlapping. + */ + return false; +} + +struct rmid_read { + unsigned int rmid; + atomic64_t value; +}; + +static void __intel_cqm_event_count(void *info); + +/* + * Exchange the RMID of a group of events; returns the group's previous RMID. + */ +static unsigned int +intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid) +{ + struct perf_event *event; + unsigned int old_rmid = group->hw.cqm_rmid; + struct list_head *head = &group->hw.cqm_group_entry; + + lockdep_assert_held(&cache_mutex); + + /* + * If our RMID is being deallocated, perform a read now. 
+ */ + if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) { + struct rmid_read rr = { + .value = ATOMIC64_INIT(0), + .rmid = old_rmid, + }; + + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, + &rr, 1); + local64_set(&group->count, atomic64_read(&rr.value)); + } + + raw_spin_lock_irq(&cache_lock); + + group->hw.cqm_rmid = rmid; + list_for_each_entry(event, head, hw.cqm_group_entry) + event->hw.cqm_rmid = rmid; + + raw_spin_unlock_irq(&cache_lock); + + return old_rmid; +} + +/* + * If we fail to assign a new RMID for intel_cqm_rotation_rmid because + * cachelines are still tagged with RMIDs in limbo, we progressively + * increment the threshold until we find an RMID in limbo with <= + * __intel_cqm_threshold lines tagged. This is designed to mitigate the + * problem where cachelines tagged with an RMID are not steadily being + * evicted. + * + * On successful rotations we decrease the threshold back towards zero. + * + * __intel_cqm_max_threshold provides an upper bound on the threshold, + * and is measured in bytes because it's exposed to userland. + */ +static unsigned int __intel_cqm_threshold; +static unsigned int __intel_cqm_max_threshold; + +/* + * Mark limbo RMIDs read above __intel_cqm_threshold on this cpu RMID_DIRTY. + */ +static void intel_cqm_stable(void *arg) +{ + struct cqm_rmid_entry *entry; + + list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { + if (entry->state != RMID_AVAILABLE) + break; /* NOTE(review): an entry another cpu already marked RMID_DIRTY also ends this walk - confirm that is intended */ + + if (__rmid_read(entry->rmid) > __intel_cqm_threshold) + entry->state = RMID_DIRTY; + } +} + +/* + * If we have group events waiting for an RMID that don't conflict with + * events already running, assign @rmid. 
+ */ +static bool intel_cqm_sched_in_event(unsigned int rmid) +{ + struct perf_event *leader, *event; + + lockdep_assert_held(&cache_mutex); + + leader = list_first_entry(&cache_groups, struct perf_event, + hw.cqm_groups_entry); + event = leader; /* the walk below starts at the group after leader */ + + list_for_each_entry_continue(event, &cache_groups, + hw.cqm_groups_entry) { + if (__rmid_valid(event->hw.cqm_rmid)) + continue; + + if (__conflict_event(event, leader)) + continue; + + intel_cqm_xchg_rmid(event, rmid); + return true; + } + + return false; +} + +/* + * Initially use this constant for both the limbo queue time and the + * rotation timer interval, pmu::hrtimer_interval_ms. + * + * They don't need to be the same, but the two are related since if you + * rotate faster than you recycle RMIDs, you may run out of available + * RMIDs. + */ +#define RMID_DEFAULT_QUEUE_TIME 250 /* ms */ + +static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME; + +/* + * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list + * @available: number of freeable RMIDs on the limbo list + * + * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no + * cachelines are tagged with those RMIDs. After this we can reuse them + * and know that the current set of active RMIDs is stable. + * + * Return %true or %false depending on whether stabilization needs to be + * reattempted. + * + * If we return %true then @available is updated to indicate the + * number of RMIDs on the limbo list that have been queued for the + * minimum queue time (RMID_AVAILABLE), but whose data occupancy values + * are above __intel_cqm_threshold. + */ +static bool intel_cqm_rmid_stabilize(unsigned int *available) +{ + struct cqm_rmid_entry *entry, *tmp; + + lockdep_assert_held(&cache_mutex); + + *available = 0; + list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { + unsigned long min_queue_time; + unsigned long now = jiffies; + + /* + * We hold RMIDs placed into limbo for a minimum queue + * time. 
Before the minimum queue time has elapsed we do + * not recycle RMIDs. + * + * The reasoning is that until a sufficient time has + * passed since we stopped using an RMID, any RMID + * placed onto the limbo list will likely still have + * data tagged in the cache, which means we'll probably + * fail to recycle it anyway. + * + * We can save ourselves an expensive IPI by skipping + * any RMIDs that have not been queued for the minimum + * time. + */ + min_queue_time = entry->queue_time + + msecs_to_jiffies(__rmid_queue_time_ms); + + if (time_after(min_queue_time, now)) + break; + + entry->state = RMID_AVAILABLE; + (*available)++; + } + + /* + * Fast return if none of the RMIDs on the limbo list have been + * sitting on the queue for the minimum queue time. + */ + if (!*available) + return false; + + /* + * Test whether an RMID is free for each package. + */ + on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true); + + list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) { + /* + * Exhausted all RMIDs that have waited min queue time. + */ + if (entry->state == RMID_YOUNG) + break; + + if (entry->state == RMID_DIRTY) + continue; + + list_del(&entry->list); /* remove from limbo */ + + /* + * The rotation RMID gets priority if it's + * currently invalid. In which case, skip adding + * the RMID to the free lru. + */ + if (!__rmid_valid(intel_cqm_rotation_rmid)) { + intel_cqm_rotation_rmid = entry->rmid; + continue; + } + + /* + * If we have groups waiting for RMIDs, hand + * them one now provided they don't conflict. + */ + if (intel_cqm_sched_in_event(entry->rmid)) + continue; + + /* + * Otherwise place it onto the free list. + */ + list_add_tail(&entry->list, &cqm_rmid_free_lru); + } + + + return __rmid_valid(intel_cqm_rotation_rmid); +} + +/* + * Pick a victim group and move it to the tail of the group list.
+ * @next: The first group without an RMID + */ +static void __intel_cqm_pick_and_rotate(struct perf_event *next) +{ + struct perf_event *rotor; + unsigned int rmid; + + lockdep_assert_held(&cache_mutex); + + rotor = list_first_entry(&cache_groups, struct perf_event, + hw.cqm_groups_entry); + + /* + * The group at the front of the list should always have a valid + * RMID. If it doesn't then no groups have RMIDs assigned and we + * don't need to rotate the list. + */ + if (next == rotor) + return; + + rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID); + __put_rmid(rmid); + + list_rotate_left(&cache_groups); +} + +/* + * Deallocate the RMIDs from any events that conflict with @event, and + * place them on the back of the group list. + */ +static void intel_cqm_sched_out_conflicting_events(struct perf_event *event) +{ + struct perf_event *group, *g; + unsigned int rmid; + + lockdep_assert_held(&cache_mutex); + + list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) { + if (group == event) + continue; + + rmid = group->hw.cqm_rmid; + + /* + * Skip events that don't have a valid RMID. + */ + if (!__rmid_valid(rmid)) + continue; + + /* + * No conflict? No problem! Leave the event alone. + */ + if (!__conflict_event(group, event)) + continue; + + intel_cqm_xchg_rmid(group, INVALID_RMID); + __put_rmid(rmid); + } +} + +/* + * Attempt to rotate the groups and assign new RMIDs. + * + * We rotate for two reasons, + * 1. To handle the scheduling of conflicting events + * 2. To recycle RMIDs + * + * Rotating RMIDs is complicated because the hardware doesn't give us + * any clues. + * + * There are problems with the hardware interface; when you change the + * task:RMID map cachelines retain their 'old' tags, giving a skewed + * picture. In order to work around this, we must always keep one free + * RMID - intel_cqm_rotation_rmid. + * + * Rotation works by taking away an RMID from a group (the old RMID), + * and assigning the free RMID to another group (the new RMID).
We must + * then wait for the old RMID to not be used (no cachelines tagged). + * This ensures that all cachelines are tagged with 'active' RMIDs. At + * this point we can start reading values for the new RMID and treat the + * old RMID as the free RMID for the next rotation. + * + * Return %true or %false depending on whether we did any rotating. + */ +static bool __intel_cqm_rmid_rotate(void) +{ + struct perf_event *group, *start = NULL; + unsigned int threshold_limit; + unsigned int nr_needed = 0; + unsigned int nr_available; + bool rotated = false; + + mutex_lock(&cache_mutex); + +again: + /* + * Fast path through this function if there are no groups and no + * RMIDs that need cleaning. + */ + if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru)) + goto out; + + list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) { + if (!__rmid_valid(group->hw.cqm_rmid)) { + if (!start) + start = group; + nr_needed++; + } + } + + /* + * We have some event groups, but they all have RMIDs assigned + * and no RMIDs need cleaning. + */ + if (!nr_needed && list_empty(&cqm_rmid_limbo_lru)) + goto out; + + if (!nr_needed) + goto stabilize; + + /* + * We have more event groups without RMIDs than available RMIDs, + * or we have event groups that conflict with the ones currently + * scheduled. + * + * We force deallocate the rmid of the group at the head of + * cache_groups. The first event group without an RMID then gets + * assigned intel_cqm_rotation_rmid. This ensures we always make + * forward progress. + * + * Rotate the cache_groups list so the previous head is now the + * tail. + */ + __intel_cqm_pick_and_rotate(start); + + /* + * If the rotation is going to succeed, reduce the threshold so + * that we don't needlessly reuse dirty RMIDs.
+ */ + if (__rmid_valid(intel_cqm_rotation_rmid)) { + intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid); + intel_cqm_rotation_rmid = __get_rmid(); + + intel_cqm_sched_out_conflicting_events(start); + + if (__intel_cqm_threshold) + __intel_cqm_threshold--; + } + + rotated = true; + +stabilize: + /* + * We now need to stabilize the RMID we freed above (if any) to + * ensure that the next time we rotate we have an RMID with zero + * occupancy value. + * + * Alternatively, if we didn't need to perform any rotation, + * we'll have a bunch of RMIDs in limbo that need stabilizing. + */ + threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale; + + while (intel_cqm_rmid_stabilize(&nr_available) && + __intel_cqm_threshold < threshold_limit) { + unsigned int steal_limit; + + /* + * Don't spin if nobody is actively waiting for an RMID, + * the rotation worker will be kicked as soon as an + * event needs an RMID anyway. + */ + if (!nr_needed) + break; + + /* Allow max 25% of RMIDs to be in limbo. */ + steal_limit = (cqm_max_rmid + 1) / 4; + + /* + * We failed to stabilize any RMIDs so our rotation + * logic is now stuck. In order to make forward progress + * we have a few options: + * + * 1. rotate ("steal") another RMID + * 2. increase the threshold + * 3. do nothing + * + * We do both of 1. and 2. until we hit the steal limit. + * + * The steal limit prevents all RMIDs ending up on the + * limbo list. This can happen if every RMID has a + * non-zero occupancy above threshold_limit, and the + * occupancy values aren't dropping fast enough. + * + * Note that there is prioritisation at work here - we'd + * rather increase the number of RMIDs on the limbo list + * than increase the threshold, because increasing the + * threshold skews the event data (because we reuse + * dirty RMIDs) - threshold bumps are a last resort.
+ */ + if (nr_available < steal_limit) + goto again; + + __intel_cqm_threshold++; + } + +out: + mutex_unlock(&cache_mutex); + return rotated; +} + +static void intel_cqm_rmid_rotate(struct work_struct *work); + +static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate); + +static struct pmu intel_cqm_pmu; + +static void intel_cqm_rmid_rotate(struct work_struct *work) +{ + unsigned long delay; + + __intel_cqm_rmid_rotate(); + + delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms); + schedule_delayed_work(&intel_cqm_rmid_work, delay); +} + +/* + * Find a group and setup RMID. + * + * If we're part of a group, we use the group's RMID. + */ +static void intel_cqm_setup_event(struct perf_event *event, + struct perf_event **group) +{ + struct perf_event *iter; + unsigned int rmid; + bool conflict = false; + + list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { + rmid = iter->hw.cqm_rmid; + + if (__match_event(iter, event)) { + /* All tasks in a group share an RMID */ + event->hw.cqm_rmid = rmid; + *group = iter; + return; + } + + /* + * We only care about conflicts for events that are + * actually scheduled in (and hence have a valid RMID). + */ + if (__conflict_event(iter, event) && __rmid_valid(rmid)) + conflict = true; + } + + if (conflict) + rmid = INVALID_RMID; + else + rmid = __get_rmid(); + + event->hw.cqm_rmid = rmid; +} + +static void intel_cqm_event_read(struct perf_event *event) +{ + unsigned long flags; + unsigned int rmid; + u64 val; + + /* + * Task events are handled by intel_cqm_event_count(). + */ + if (event->cpu == -1) + return; + + raw_spin_lock_irqsave(&cache_lock, flags); + rmid = event->hw.cqm_rmid; + + if (!__rmid_valid(rmid)) + goto out; + + val = __rmid_read(rmid); + + /* + * Ignore this reading on error states and do not update the value. 
+ */ + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) + goto out; + + local64_set(&event->count, val); +out: + raw_spin_unlock_irqrestore(&cache_lock, flags); +} + +static void __intel_cqm_event_count(void *info) +{ + struct rmid_read *rr = info; + u64 val; + + val = __rmid_read(rr->rmid); + + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) + return; + + atomic64_add(val, &rr->value); +} + +static inline bool cqm_group_leader(struct perf_event *event) +{ + return !list_empty(&event->hw.cqm_groups_entry); +} + +static u64 intel_cqm_event_count(struct perf_event *event) +{ + unsigned long flags; + struct rmid_read rr = { + .value = ATOMIC64_INIT(0), + }; + + /* + * We only need to worry about task events. System-wide events + * are handled like usual, i.e. entirely with + * intel_cqm_event_read(). + */ + if (event->cpu != -1) + return __perf_event_count(event); + + /* + * Only the group leader gets to report values. This stops us + * reporting duplicate values to userspace, and gives us a clear + * rule for which task gets to report the values. + * + * Note that it is impossible to attribute these values to + * specific packages - we forfeit that ability when we create + * task events. + */ + if (!cqm_group_leader(event)) + return 0; + + /* + * Notice that we don't perform the reading of an RMID + * atomically, because we can't hold a spin lock across the + * IPIs. + * + * Speculatively perform the read, since @event might be + * assigned a different (possibly invalid) RMID while we're + * busy performing the IPI calls. It's therefore necessary to + * check @event's RMID afterwards, and if it has changed, + * discard the result of the read.
+ */ + rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid); + + if (!__rmid_valid(rr.rmid)) + goto out; + + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1); + + raw_spin_lock_irqsave(&cache_lock, flags); + if (event->hw.cqm_rmid == rr.rmid) + local64_set(&event->count, atomic64_read(&rr.value)); + raw_spin_unlock_irqrestore(&cache_lock, flags); +out: + return __perf_event_count(event); +} + +static void intel_cqm_event_start(struct perf_event *event, int mode) +{ + struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); + unsigned int rmid = event->hw.cqm_rmid; + unsigned long flags; + + if (!(event->hw.cqm_state & PERF_HES_STOPPED)) + return; + + event->hw.cqm_state &= ~PERF_HES_STOPPED; + + raw_spin_lock_irqsave(&state->lock, flags); + + if (state->cnt++) + WARN_ON_ONCE(state->rmid != rmid); + else + WARN_ON_ONCE(state->rmid); + + state->rmid = rmid; + wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid); + + raw_spin_unlock_irqrestore(&state->lock, flags); +} + +static void intel_cqm_event_stop(struct perf_event *event, int mode) +{ + struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); + unsigned long flags; + + if (event->hw.cqm_state & PERF_HES_STOPPED) + return; + + event->hw.cqm_state |= PERF_HES_STOPPED; + + raw_spin_lock_irqsave(&state->lock, flags); + intel_cqm_event_read(event); + + if (!--state->cnt) { + state->rmid = 0; + wrmsrl(MSR_IA32_PQR_ASSOC, 0); + } else { + WARN_ON_ONCE(!state->rmid); + } + + raw_spin_unlock_irqrestore(&state->lock, flags); +} + +static int intel_cqm_event_add(struct perf_event *event, int mode) +{ + unsigned long flags; + unsigned int rmid; + + raw_spin_lock_irqsave(&cache_lock, flags); + + event->hw.cqm_state = PERF_HES_STOPPED; + rmid = event->hw.cqm_rmid; + + if (__rmid_valid(rmid) && (mode & PERF_EF_START)) + intel_cqm_event_start(event, mode); + + raw_spin_unlock_irqrestore(&cache_lock, flags); + + return 0; +} + +static void intel_cqm_event_del(struct perf_event *event, int mode) +{ + intel_cqm_event_stop(event, mode); 
+} + +static void intel_cqm_event_destroy(struct perf_event *event) +{ + struct perf_event *group_other = NULL; + + mutex_lock(&cache_mutex); + + /* + * If there's another event in this group... + */ + if (!list_empty(&event->hw.cqm_group_entry)) { + group_other = list_first_entry(&event->hw.cqm_group_entry, + struct perf_event, + hw.cqm_group_entry); + list_del(&event->hw.cqm_group_entry); + } + + /* + * And we're the group leader.. + */ + if (cqm_group_leader(event)) { + /* + * If there was a group_other, make that leader, otherwise + * destroy the group and return the RMID. + */ + if (group_other) { + list_replace(&event->hw.cqm_groups_entry, + &group_other->hw.cqm_groups_entry); + } else { + unsigned int rmid = event->hw.cqm_rmid; + + if (__rmid_valid(rmid)) + __put_rmid(rmid); + list_del(&event->hw.cqm_groups_entry); + } + } + + mutex_unlock(&cache_mutex); +} + +static int intel_cqm_event_init(struct perf_event *event) +{ + struct perf_event *group = NULL; + bool rotate = false; + + if (event->attr.type != intel_cqm_pmu.type) + return -ENOENT; + + if (event->attr.config & ~QOS_EVENT_MASK) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + INIT_LIST_HEAD(&event->hw.cqm_group_entry); + INIT_LIST_HEAD(&event->hw.cqm_groups_entry); + + event->destroy = intel_cqm_event_destroy; + + mutex_lock(&cache_mutex); + + /* Will also set rmid */ + intel_cqm_setup_event(event, &group); + + if (group) { + list_add_tail(&event->hw.cqm_group_entry, + &group->hw.cqm_group_entry); + } else { + list_add_tail(&event->hw.cqm_groups_entry, + &cache_groups); + + /* + * All RMIDs are either in use or have recently been + * used. Kick the rotation worker to clean/free some. 
+ * + * We only do this for the group leader, rather than for + * every event in a group to save on needless work. + */ + if (!__rmid_valid(event->hw.cqm_rmid)) + rotate = true; + } + + mutex_unlock(&cache_mutex); + + if (rotate) + schedule_delayed_work(&intel_cqm_rmid_work, 0); + + return 0; +} + +EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01"); +EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1"); +EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes"); +EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL); +EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1"); + +static struct attribute *intel_cqm_events_attr[] = { + EVENT_PTR(intel_cqm_llc), + EVENT_PTR(intel_cqm_llc_pkg), + EVENT_PTR(intel_cqm_llc_unit), + EVENT_PTR(intel_cqm_llc_scale), + EVENT_PTR(intel_cqm_llc_snapshot), + NULL, +}; + +static struct attribute_group intel_cqm_events_group = { + .name = "events", + .attrs = intel_cqm_events_attr, +}; + +PMU_FORMAT_ATTR(event, "config:0-7"); +static struct attribute *intel_cqm_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group intel_cqm_format_group = { + .name = "format", + .attrs = intel_cqm_formats_attr, +}; + +static ssize_t +max_recycle_threshold_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + ssize_t rv; + + mutex_lock(&cache_mutex); + rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold); + mutex_unlock(&cache_mutex); + + return rv; +} + +static ssize_t +max_recycle_threshold_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned int bytes, cachelines; + int ret; + + ret = kstrtouint(buf, 0, &bytes); + if (ret) + return ret; + + mutex_lock(&cache_mutex); + + __intel_cqm_max_threshold = bytes; + cachelines = bytes / cqm_l3_scale; + + /* + * The new maximum takes effect immediately. 
+ */ + if (__intel_cqm_threshold > cachelines) + __intel_cqm_threshold = cachelines; + + mutex_unlock(&cache_mutex); + + return count; +} + +static DEVICE_ATTR_RW(max_recycle_threshold); + +static struct attribute *intel_cqm_attrs[] = { + &dev_attr_max_recycle_threshold.attr, + NULL, +}; + +static const struct attribute_group intel_cqm_group = { + .attrs = intel_cqm_attrs, +}; + +static const struct attribute_group *intel_cqm_attr_groups[] = { + &intel_cqm_events_group, + &intel_cqm_format_group, + &intel_cqm_group, + NULL, +}; + +static struct pmu intel_cqm_pmu = { + .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME, + .attr_groups = intel_cqm_attr_groups, + .task_ctx_nr = perf_sw_context, + .event_init = intel_cqm_event_init, + .add = intel_cqm_event_add, + .del = intel_cqm_event_del, + .start = intel_cqm_event_start, + .stop = intel_cqm_event_stop, + .read = intel_cqm_event_read, + .count = intel_cqm_event_count, +}; + +static inline void cqm_pick_event_reader(int cpu) +{ + int phys_id = topology_physical_package_id(cpu); + int i; + + for_each_cpu(i, &cqm_cpumask) { + if (phys_id == topology_physical_package_id(i)) + return; /* already got reader for this socket */ + } + + cpumask_set_cpu(cpu, &cqm_cpumask); +} + +static void intel_cqm_cpu_prepare(unsigned int cpu) +{ + struct intel_cqm_state *state = &per_cpu(cqm_state, cpu); + struct cpuinfo_x86 *c = &cpu_data(cpu); + + raw_spin_lock_init(&state->lock); + state->rmid = 0; + state->cnt = 0; + + WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); + WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); +} + +static void intel_cqm_cpu_exit(unsigned int cpu) +{ + int phys_id = topology_physical_package_id(cpu); + int i; + + /* + * Is @cpu a designated cqm reader? 
+ */ + if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask)) + return; + + for_each_online_cpu(i) { + if (i == cpu) + continue; + + if (phys_id == topology_physical_package_id(i)) { + cpumask_set_cpu(i, &cqm_cpumask); + break; + } + } +} + +static int intel_cqm_cpu_notifier(struct notifier_block *nb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + intel_cqm_cpu_prepare(cpu); + break; + case CPU_DOWN_PREPARE: + intel_cqm_cpu_exit(cpu); + break; + case CPU_STARTING: + cqm_pick_event_reader(cpu); + break; + } + + return NOTIFY_OK; +} + +static const struct x86_cpu_id intel_cqm_match[] = { + { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC }, + {} +}; + +static int __init intel_cqm_init(void) +{ + char *str, scale[20]; + int i, cpu, ret; + + if (!x86_match_cpu(intel_cqm_match)) + return -ENODEV; + + cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; + + /* + * It's possible that not all resources support the same number + * of RMIDs. Instead of making scheduling much more complicated + * (where we have to match a task's RMID to a cpu that supports + * that many RMIDs) just find the minimum RMIDs supported across + * all cpus. + * + * Also, check that the scales match on all cpus. + */ + cpu_notifier_register_begin(); + + for_each_online_cpu(cpu) { + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->x86_cache_max_rmid < cqm_max_rmid) + cqm_max_rmid = c->x86_cache_max_rmid; + + if (c->x86_cache_occ_scale != cqm_l3_scale) { + pr_err("Multiple LLC scale values, disabling\n"); + ret = -EINVAL; + goto out; + } + } + + /* + * A reasonable upper limit on the max threshold is the number + * of lines tagged per RMID if all RMIDs have the same number of + * lines tagged in the LLC. + * + * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. 
+ */ + __intel_cqm_max_threshold = + boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1); + + snprintf(scale, sizeof(scale), "%u", cqm_l3_scale); + str = kstrdup(scale, GFP_KERNEL); + if (!str) { + ret = -ENOMEM; + goto out; + } + + event_attr_intel_cqm_llc_scale.event_str = str; + + ret = intel_cqm_setup_rmid_cache(); + if (ret) + goto out; + + for_each_online_cpu(i) { + intel_cqm_cpu_prepare(i); + cqm_pick_event_reader(i); + } + + __perf_cpu_notifier(intel_cqm_cpu_notifier); + + ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); + if (ret) + pr_err("Intel CQM perf registration failed: %d\n", ret); + else + pr_info("Intel CQM monitoring enabled\n"); + +out: + cpu_notifier_register_done(); + + return ret; +} +device_initcall(intel_cqm_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 073983398364..ca69ea56c712 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -461,7 +461,8 @@ void intel_pmu_enable_bts(u64 config) debugctlmsr |= DEBUGCTLMSR_TR; debugctlmsr |= DEBUGCTLMSR_BTS; - debugctlmsr |= DEBUGCTLMSR_BTINT; + if (config & ARCH_PERFMON_EVENTSEL_INT) + debugctlmsr |= DEBUGCTLMSR_BTINT; if (!(config & ARCH_PERFMON_EVENTSEL_OS)) debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS; @@ -611,6 +612,10 @@ struct event_constraint intel_snb_pebs_event_constraints[] = { INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). 
*/ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ /* Allow all events as PEBS with no flags */ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), EVENT_CONSTRAINT_END @@ -622,6 +627,10 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = { INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ /* Allow all events as PEBS with no flags */ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), EVENT_CONSTRAINT_END @@ -633,16 +642,16 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = { /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). 
*/ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */ /* Allow all events as PEBS with no flags */ 
INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), EVENT_CONSTRAINT_END diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 58f1a94beaf0..94e5b506caa6 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -39,6 +39,7 @@ static enum { #define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */ #define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */ #define LBR_FAR_BIT 8 /* do not capture far branches */ +#define LBR_CALL_STACK_BIT 9 /* enable call stack */ #define LBR_KERNEL (1 << LBR_KERNEL_BIT) #define LBR_USER (1 << LBR_USER_BIT) @@ -49,6 +50,7 @@ static enum { #define LBR_REL_JMP (1 << LBR_REL_JMP_BIT) #define LBR_IND_JMP (1 << LBR_IND_JMP_BIT) #define LBR_FAR (1 << LBR_FAR_BIT) +#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT) #define LBR_PLM (LBR_KERNEL | LBR_USER) @@ -69,33 +71,31 @@ static enum { #define LBR_FROM_FLAG_IN_TX (1ULL << 62) #define LBR_FROM_FLAG_ABORT (1ULL << 61) -#define for_each_branch_sample_type(x) \ - for ((x) = PERF_SAMPLE_BRANCH_USER; \ - (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1) - /* * x86control flow change classification * x86control flow changes include branches, interrupts, traps, faults */ enum { - X86_BR_NONE = 0, /* unknown */ + X86_BR_NONE = 0, /* unknown */ - X86_BR_USER = 1 << 0, /* branch target is user */ - X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ + X86_BR_USER = 1 << 0, /* branch target is user */ + X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ - X86_BR_CALL = 1 << 2, /* call */ - X86_BR_RET = 1 << 3, /* return */ - X86_BR_SYSCALL = 1 << 4, /* syscall */ - X86_BR_SYSRET = 1 << 5, /* syscall return */ - X86_BR_INT = 1 << 6, /* sw interrupt */ - X86_BR_IRET = 1 << 7, /* return from interrupt */ - X86_BR_JCC = 1 << 8, /* conditional */ - X86_BR_JMP = 1 << 9, /* jump */ - X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ - X86_BR_IND_CALL = 1 << 11,/* indirect calls */ - X86_BR_ABORT = 1 << 12,/* 
transaction abort */ - X86_BR_IN_TX = 1 << 13,/* in transaction */ - X86_BR_NO_TX = 1 << 14,/* not in transaction */ + X86_BR_CALL = 1 << 2, /* call */ + X86_BR_RET = 1 << 3, /* return */ + X86_BR_SYSCALL = 1 << 4, /* syscall */ + X86_BR_SYSRET = 1 << 5, /* syscall return */ + X86_BR_INT = 1 << 6, /* sw interrupt */ + X86_BR_IRET = 1 << 7, /* return from interrupt */ + X86_BR_JCC = 1 << 8, /* conditional */ + X86_BR_JMP = 1 << 9, /* jump */ + X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ + X86_BR_IND_CALL = 1 << 11,/* indirect calls */ + X86_BR_ABORT = 1 << 12,/* transaction abort */ + X86_BR_IN_TX = 1 << 13,/* in transaction */ + X86_BR_NO_TX = 1 << 14,/* not in transaction */ + X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ + X86_BR_CALL_STACK = 1 << 16,/* call stack */ }; #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) @@ -112,13 +112,15 @@ enum { X86_BR_JMP |\ X86_BR_IRQ |\ X86_BR_ABORT |\ - X86_BR_IND_CALL) + X86_BR_IND_CALL |\ + X86_BR_ZERO_CALL) #define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) #define X86_BR_ANY_CALL \ (X86_BR_CALL |\ X86_BR_IND_CALL |\ + X86_BR_ZERO_CALL |\ X86_BR_SYSCALL |\ X86_BR_IRQ |\ X86_BR_INT) @@ -130,17 +132,32 @@ static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); * otherwise it becomes near impossible to get a reliable stack. */ -static void __intel_pmu_lbr_enable(void) +static void __intel_pmu_lbr_enable(bool pmi) { - u64 debugctl; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 debugctl, lbr_select = 0, orig_debugctl; - if (cpuc->lbr_sel) - wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config); + /* + * No need to reprogram LBR_SELECT in a PMI, as it + * did not change. 
+ */ + if (cpuc->lbr_sel && !pmi) { + lbr_select = cpuc->lbr_sel->config; + wrmsrl(MSR_LBR_SELECT, lbr_select); + } rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); - debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + orig_debugctl = debugctl; + debugctl |= DEBUGCTLMSR_LBR; + /* + * LBR callstack does not work well with FREEZE_LBRS_ON_PMI. + * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions + * may cause superfluous increase/decrease of LBR_TOS. + */ + if (!(lbr_select & LBR_CALL_STACK)) + debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + if (orig_debugctl != debugctl) + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); } static void __intel_pmu_lbr_disable(void) @@ -181,9 +198,116 @@ void intel_pmu_lbr_reset(void) intel_pmu_lbr_reset_64(); } +/* + * TOS = most recently recorded branch + */ +static inline u64 intel_pmu_lbr_tos(void) +{ + u64 tos; + + rdmsrl(x86_pmu.lbr_tos, tos); + return tos; +} + +enum { + LBR_NONE, + LBR_VALID, +}; + +static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) +{ + int i; + unsigned lbr_idx, mask; + u64 tos; + + if (task_ctx->lbr_callstack_users == 0 || + task_ctx->lbr_stack_state == LBR_NONE) { + intel_pmu_lbr_reset(); + return; + } + + mask = x86_pmu.lbr_nr - 1; + tos = intel_pmu_lbr_tos(); + for (i = 0; i < x86_pmu.lbr_nr; i++) { + lbr_idx = (tos - i) & mask; + wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); + wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); + } + task_ctx->lbr_stack_state = LBR_NONE; +} + +static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) +{ + int i; + unsigned lbr_idx, mask; + u64 tos; + + if (task_ctx->lbr_callstack_users == 0) { + task_ctx->lbr_stack_state = LBR_NONE; + return; + } + + mask = x86_pmu.lbr_nr - 1; + tos = intel_pmu_lbr_tos(); + for (i = 0; i < x86_pmu.lbr_nr; i++) { + lbr_idx = (tos - i) & mask; + rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); + rdmsrl(x86_pmu.lbr_to + 
lbr_idx, task_ctx->lbr_to[i]); + } + task_ctx->lbr_stack_state = LBR_VALID; +} + +void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx; + + if (!x86_pmu.lbr_nr) + return; + + /* + * If LBR callstack feature is enabled and the stack was saved when + * the task was scheduled out, restore the stack. Otherwise flush + * the LBR stack. + */ + task_ctx = ctx ? ctx->task_ctx_data : NULL; + if (task_ctx) { + if (sched_in) { + __intel_pmu_lbr_restore(task_ctx); + cpuc->lbr_context = ctx; + } else { + __intel_pmu_lbr_save(task_ctx); + } + return; + } + + /* + * When sampling the branch stack in system-wide, it may be + * necessary to flush the stack on context switch. This happens + * when the branch stack does not tag its entries with the pid + * of the current task. Otherwise it becomes impossible to + * associate a branch entry with a task. This ambiguity is more + * likely to appear when the branch stack supports priv level + * filtering and the user sets it to monitor only at the user + * level (which could be a useful measurement in system-wide + * mode). In that case, the risk is high of having a branch + * stack with branches from multiple tasks.
+ */ + if (sched_in) { + intel_pmu_lbr_reset(); + cpuc->lbr_context = ctx; + } +} + +static inline bool branch_user_callstack(unsigned br_sel) +{ + return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK); +} + void intel_pmu_lbr_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx; if (!x86_pmu.lbr_nr) return; @@ -198,18 +322,33 @@ void intel_pmu_lbr_enable(struct perf_event *event) } cpuc->br_sel = event->hw.branch_reg.reg; + if (branch_user_callstack(cpuc->br_sel) && event->ctx && + event->ctx->task_ctx_data) { + task_ctx = event->ctx->task_ctx_data; + task_ctx->lbr_callstack_users++; + } + cpuc->lbr_users++; + perf_sched_cb_inc(event->ctx->pmu); } void intel_pmu_lbr_disable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx; if (!x86_pmu.lbr_nr) return; + if (branch_user_callstack(cpuc->br_sel) && event->ctx && + event->ctx->task_ctx_data) { + task_ctx = event->ctx->task_ctx_data; + task_ctx->lbr_callstack_users--; + } + cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); + perf_sched_cb_dec(event->ctx->pmu); if (cpuc->enabled && !cpuc->lbr_users) { __intel_pmu_lbr_disable(); @@ -218,12 +357,12 @@ void intel_pmu_lbr_disable(struct perf_event *event) } } -void intel_pmu_lbr_enable_all(void) +void intel_pmu_lbr_enable_all(bool pmi) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (cpuc->lbr_users) - __intel_pmu_lbr_enable(); + __intel_pmu_lbr_enable(pmi); } void intel_pmu_lbr_disable_all(void) @@ -234,18 +373,6 @@ void intel_pmu_lbr_disable_all(void) __intel_pmu_lbr_disable(); } -/* - * TOS = most recently recorded branch - */ -static inline u64 intel_pmu_lbr_tos(void) -{ - u64 tos; - - rdmsrl(x86_pmu.lbr_tos, tos); - - return tos; -} - static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) { unsigned long mask = x86_pmu.lbr_nr - 1; @@ -350,7 +477,7 @@ void 
intel_pmu_lbr_read(void) * - in case there is no HW filter * - in case the HW filter has errata or limitations */ -static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event) +static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) { u64 br_type = event->attr.branch_sample_type; int mask = 0; @@ -387,11 +514,21 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event) if (br_type & PERF_SAMPLE_BRANCH_COND) mask |= X86_BR_JCC; + if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) { + if (!x86_pmu_has_lbr_callstack()) + return -EOPNOTSUPP; + if (mask & ~(X86_BR_USER | X86_BR_KERNEL)) + return -EINVAL; + mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET | + X86_BR_CALL_STACK; + } + /* * stash actual user request into reg, it may * be used by fixup code for some CPU */ event->hw.branch_reg.reg = mask; + return 0; } /* @@ -403,14 +540,14 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) { struct hw_perf_event_extra *reg; u64 br_type = event->attr.branch_sample_type; - u64 mask = 0, m; - u64 v; + u64 mask = 0, v; + int i; - for_each_branch_sample_type(m) { - if (!(br_type & m)) + for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) { + if (!(br_type & (1ULL << i))) continue; - v = x86_pmu.lbr_sel_map[m]; + v = x86_pmu.lbr_sel_map[i]; if (v == LBR_NOT_SUPP) return -EOPNOTSUPP; @@ -420,8 +557,12 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) reg = &event->hw.branch_reg; reg->idx = EXTRA_REG_LBR; - /* LBR_SELECT operates in suppress mode so invert mask */ - reg->config = ~mask & x86_pmu.lbr_sel_mask; + /* + * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate + * in suppress mode. 
So LBR_SELECT should be set to + * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK) + */ + reg->config = mask ^ x86_pmu.lbr_sel_mask; return 0; } @@ -439,7 +580,9 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event) /* * setup SW LBR filter */ - intel_pmu_setup_sw_lbr_filter(event); + ret = intel_pmu_setup_sw_lbr_filter(event); + if (ret) + return ret; /* * setup HW LBR filter, if any @@ -568,6 +711,12 @@ static int branch_type(unsigned long from, unsigned long to, int abort) ret = X86_BR_INT; break; case 0xe8: /* call near rel */ + insn_get_immediate(&insn); + if (insn.immediate1.value == 0) { + /* zero length call */ + ret = X86_BR_ZERO_CALL; + break; + } case 0x9a: /* call far absolute */ ret = X86_BR_CALL; break; @@ -678,35 +827,49 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) /* * Map interface branch filters onto LBR filters */ -static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { - [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, - [PERF_SAMPLE_BRANCH_USER] = LBR_USER, - [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, - [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, - [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP - | LBR_IND_JMP | LBR_FAR, +static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_REL_JMP + | LBR_IND_JMP | LBR_FAR, /* * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches */ - [PERF_SAMPLE_BRANCH_ANY_CALL] = + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR, /* * NHM/WSM erratum: must include IND_JMP to capture IND_CALL */ - [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP, - [PERF_SAMPLE_BRANCH_COND] = LBR_JCC, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = 
LBR_JCC, }; -static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { - [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, - [PERF_SAMPLE_BRANCH_USER] = LBR_USER, - [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, - [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, - [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR, - [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL - | LBR_FAR, - [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL, - [PERF_SAMPLE_BRANCH_COND] = LBR_JCC, +static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL + | LBR_FAR, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, +}; + +static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL + | LBR_FAR, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, + [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL + | LBR_RETURN | LBR_CALL_STACK, }; /* core */ @@ -765,6 +928,20 @@ void __init intel_pmu_lbr_init_snb(void) pr_cont("16-deep LBR, "); } +/* haswell */ +void intel_pmu_lbr_init_hsw(void) +{ + x86_pmu.lbr_nr = 16; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + + pr_cont("16-deep LBR, "); +} + /* atom */ void __init 
intel_pmu_lbr_init_atom(void) { diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c new file mode 100644 index 000000000000..f2770641c0fd --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -0,0 +1,1103 @@ +/* + * Intel(R) Processor Trace PMU driver for perf + * Copyright (c) 2013-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * Intel PT is specified in the Intel Architecture Instruction Set Extensions + * Programming Reference: + * http://software.intel.com/en-us/intel-isa-extensions + */ + +#undef DEBUG + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +#include +#include +#include + +#include "perf_event.h" +#include "intel_pt.h" + +static DEFINE_PER_CPU(struct pt, pt_ctx); + +static struct pt_pmu pt_pmu; + +enum cpuid_regs { + CR_EAX = 0, + CR_ECX, + CR_EDX, + CR_EBX +}; + +/* + * Capabilities of Intel PT hardware, such as number of address bits or + * supported output schemes, are cached and exported to userspace as "caps" + * attribute group of pt pmu device + * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store + * relevant bits together with intel_pt traces. + * + * These are necessary for both trace decoding (payloads_lip, contains address + * width encoded in IP-related packets), and event configuration (bitmasks with + * permitted values for certain bit fields). 
+ */ +#define PT_CAP(_n, _l, _r, _m) \ + [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l, \ + .reg = _r, .mask = _m } + +static struct pt_cap_desc { + const char *name; + u32 leaf; + u8 reg; + u32 mask; +} pt_caps[] = { + PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff), + PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)), + PT_CAP(topa_output, 0, CR_ECX, BIT(0)), + PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)), + PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)), +}; + +static u32 pt_cap_get(enum pt_capabilities cap) +{ + struct pt_cap_desc *cd = &pt_caps[cap]; + u32 c = pt_pmu.caps[cd->leaf * 4 + cd->reg]; + unsigned int shift = __ffs(cd->mask); + + return (c & cd->mask) >> shift; +} + +static ssize_t pt_cap_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + struct dev_ext_attribute *ea = + container_of(attr, struct dev_ext_attribute, attr); + enum pt_capabilities cap = (long)ea->var; + + return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap)); +} + +static struct attribute_group pt_cap_group = { + .name = "caps", +}; + +PMU_FORMAT_ATTR(tsc, "config:10" ); +PMU_FORMAT_ATTR(noretcomp, "config:11" ); + +static struct attribute *pt_formats_attr[] = { + &format_attr_tsc.attr, + &format_attr_noretcomp.attr, + NULL, +}; + +static struct attribute_group pt_format_group = { + .name = "format", + .attrs = pt_formats_attr, +}; + +static const struct attribute_group *pt_attr_groups[] = { + &pt_cap_group, + &pt_format_group, + NULL, +}; + +static int __init pt_pmu_hw_init(void) +{ + struct dev_ext_attribute *de_attrs; + struct attribute **attrs; + size_t size; + int ret; + long i; + + attrs = NULL; + ret = -ENODEV; + if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) + goto fail; + + for (i = 0; i < PT_CPUID_LEAVES; i++) { + cpuid_count(20, i, + &pt_pmu.caps[CR_EAX + i*4], + &pt_pmu.caps[CR_EBX + i*4], + &pt_pmu.caps[CR_ECX + i*4], + &pt_pmu.caps[CR_EDX + i*4]); + } + + ret = -ENOMEM; + size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1); + attrs 
= kzalloc(size, GFP_KERNEL); + if (!attrs) + goto fail; + + size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1); + de_attrs = kzalloc(size, GFP_KERNEL); + if (!de_attrs) + goto fail; + + for (i = 0; i < ARRAY_SIZE(pt_caps); i++) { + struct dev_ext_attribute *de_attr = de_attrs + i; + + de_attr->attr.attr.name = pt_caps[i].name; + + sysfs_attr_init(&de_attrs->attr.attr); + + de_attr->attr.attr.mode = S_IRUGO; + de_attr->attr.show = pt_cap_show; + de_attr->var = (void *)i; + + attrs[i] = &de_attr->attr.attr; + } + + pt_cap_group.attrs = attrs; + + return 0; + +fail: + kfree(attrs); + + return ret; +} + +#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC) + +static bool pt_event_valid(struct perf_event *event) +{ + u64 config = event->attr.config; + + if ((config & PT_CONFIG_MASK) != config) + return false; + + return true; +} + +/* + * PT configuration helpers + * These all are cpu affine and operate on a local PT + */ + +static bool pt_is_running(void) +{ + u64 ctl; + + rdmsrl(MSR_IA32_RTIT_CTL, ctl); + + return !!(ctl & RTIT_CTL_TRACEEN); +} + +static void pt_config(struct perf_event *event) +{ + u64 reg; + + reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN; + + if (!event->attr.exclude_kernel) + reg |= RTIT_CTL_OS; + if (!event->attr.exclude_user) + reg |= RTIT_CTL_USR; + + reg |= (event->attr.config & PT_CONFIG_MASK); + + wrmsrl(MSR_IA32_RTIT_CTL, reg); +} + +static void pt_config_start(bool start) +{ + u64 ctl; + + rdmsrl(MSR_IA32_RTIT_CTL, ctl); + if (start) + ctl |= RTIT_CTL_TRACEEN; + else + ctl &= ~RTIT_CTL_TRACEEN; + wrmsrl(MSR_IA32_RTIT_CTL, ctl); + + /* + * A wrmsr that disables trace generation serializes other PT + * registers and causes all data packets to be written to memory, + * but a fence is required for the data to become globally visible. + * + * The below WMB, separating data store and aux_head store matches + * the consumer's RMB that separates aux_head load and data load. 
+ */ + if (!start) + wmb(); +} + +static void pt_config_buffer(void *buf, unsigned int topa_idx, + unsigned int output_off) +{ + u64 reg; + + wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf)); + + reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32); + + wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg); +} + +/* + * Keep ToPA table-related metadata on the same page as the actual table, + * taking up a few words from the top + */ + +#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1) + +/** + * struct topa - page-sized ToPA table with metadata at the top + * @table: actual ToPA table entries, as understood by PT hardware + * @list: linkage to struct pt_buffer's list of tables + * @phys: physical address of this page + * @offset: offset of the first entry in this table in the buffer + * @size: total size of all entries in this table + * @last: index of the last initialized entry in this table + */ +struct topa { + struct topa_entry table[TENTS_PER_PAGE]; + struct list_head list; + u64 phys; + u64 offset; + size_t size; + int last; +}; + +/* make -1 stand for the last table entry */ +#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)]) + +/** + * topa_alloc() - allocate page-sized ToPA table + * @cpu: CPU on which to allocate. + * @gfp: Allocation flags. + * + * Return: On success, return the pointer to ToPA table page. 
+ */ +static struct topa *topa_alloc(int cpu, gfp_t gfp) +{ + int node = cpu_to_node(cpu); + struct topa *topa; + struct page *p; + + p = alloc_pages_node(node, gfp | __GFP_ZERO, 0); + if (!p) + return NULL; + + topa = page_address(p); + topa->last = 0; + topa->phys = page_to_phys(p); + + /* + * In case of singe-entry ToPA, always put the self-referencing END + * link as the 2nd entry in the table + */ + if (!pt_cap_get(PT_CAP_topa_multiple_entries)) { + TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT; + TOPA_ENTRY(topa, 1)->end = 1; + } + + return topa; +} + +/** + * topa_free() - free a page-sized ToPA table + * @topa: Table to deallocate. + */ +static void topa_free(struct topa *topa) +{ + free_page((unsigned long)topa); +} + +/** + * topa_insert_table() - insert a ToPA table into a buffer + * @buf: PT buffer that's being extended. + * @topa: New topa table to be inserted. + * + * If it's the first table in this buffer, set up buffer's pointers + * accordingly; otherwise, add a END=1 link entry to @topa to the current + * "last" table and adjust the last table pointer to @topa. + */ +static void topa_insert_table(struct pt_buffer *buf, struct topa *topa) +{ + struct topa *last = buf->last; + + list_add_tail(&topa->list, &buf->tables); + + if (!buf->first) { + buf->first = buf->last = buf->cur = topa; + return; + } + + topa->offset = last->offset + last->size; + buf->last = topa; + + if (!pt_cap_get(PT_CAP_topa_multiple_entries)) + return; + + BUG_ON(last->last != TENTS_PER_PAGE - 1); + + TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT; + TOPA_ENTRY(last, -1)->end = 1; +} + +/** + * topa_table_full() - check if a ToPA table is filled up + * @topa: ToPA table. 
+ */ +static bool topa_table_full(struct topa *topa) +{ + /* single-entry ToPA is a special case */ + if (!pt_cap_get(PT_CAP_topa_multiple_entries)) + return !!topa->last; + + return topa->last == TENTS_PER_PAGE - 1; +} + +/** + * topa_insert_pages() - create a list of ToPA tables + * @buf: PT buffer being initialized. + * @gfp: Allocation flags. + * + * This initializes a list of ToPA tables with entries from + * the data_pages provided by rb_alloc_aux(). + * + * Return: 0 on success or error code. + */ +static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp) +{ + struct topa *topa = buf->last; + int order = 0; + struct page *p; + + p = virt_to_page(buf->data_pages[buf->nr_pages]); + if (PagePrivate(p)) + order = page_private(p); + + if (topa_table_full(topa)) { + topa = topa_alloc(buf->cpu, gfp); + if (!topa) + return -ENOMEM; + + topa_insert_table(buf, topa); + } + + TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT; + TOPA_ENTRY(topa, -1)->size = order; + if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) { + TOPA_ENTRY(topa, -1)->intr = 1; + TOPA_ENTRY(topa, -1)->stop = 1; + } + + topa->last++; + topa->size += sizes(order); + + buf->nr_pages += 1ul << order; + + return 0; +} + +/** + * pt_topa_dump() - print ToPA tables and their entries + * @buf: PT buffer. + */ +static void pt_topa_dump(struct pt_buffer *buf) +{ + struct topa *topa; + + list_for_each_entry(topa, &buf->tables, list) { + int i; + + pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table, + topa->phys, topa->offset, topa->size); + for (i = 0; i < TENTS_PER_PAGE; i++) { + pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n", + &topa->table[i], + (unsigned long)topa->table[i].base << TOPA_SHIFT, + sizes(topa->table[i].size), + topa->table[i].end ? 'E' : ' ', + topa->table[i].intr ? 'I' : ' ', + topa->table[i].stop ? 
'S' : ' ', + *(u64 *)&topa->table[i]); + if ((pt_cap_get(PT_CAP_topa_multiple_entries) && + topa->table[i].stop) || + topa->table[i].end) + break; + } + } +} + +/** + * pt_buffer_advance() - advance to the next output region + * @buf: PT buffer. + * + * Advance the current pointers in the buffer to the next ToPA entry. + */ +static void pt_buffer_advance(struct pt_buffer *buf) +{ + buf->output_off = 0; + buf->cur_idx++; + + if (buf->cur_idx == buf->cur->last) { + if (buf->cur == buf->last) + buf->cur = buf->first; + else + buf->cur = list_entry(buf->cur->list.next, struct topa, + list); + buf->cur_idx = 0; + } +} + +/** + * pt_update_head() - calculate current offsets and sizes + * @pt: Per-cpu pt context. + * + * Update buffer's current write pointer position and data size. + */ +static void pt_update_head(struct pt *pt) +{ + struct pt_buffer *buf = perf_get_aux(&pt->handle); + u64 topa_idx, base, old; + + /* offset of the first region in this table from the beginning of buf */ + base = buf->cur->offset + buf->output_off; + + /* offset of the current output region within this table */ + for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++) + base += sizes(buf->cur->table[topa_idx].size); + + if (buf->snapshot) { + local_set(&buf->data_size, base); + } else { + old = (local64_xchg(&buf->head, base) & + ((buf->nr_pages << PAGE_SHIFT) - 1)); + if (base < old) + base += buf->nr_pages << PAGE_SHIFT; + + local_add(base - old, &buf->data_size); + } +} + +/** + * pt_buffer_region() - obtain current output region's address + * @buf: PT buffer. + */ +static void *pt_buffer_region(struct pt_buffer *buf) +{ + return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT); +} + +/** + * pt_buffer_region_size() - obtain current output region's size + * @buf: PT buffer. 
+ */ +static size_t pt_buffer_region_size(struct pt_buffer *buf) +{ + return sizes(buf->cur->table[buf->cur_idx].size); +} + +/** + * pt_handle_status() - take care of possible status conditions + * @pt: Per-cpu pt context. + */ +static void pt_handle_status(struct pt *pt) +{ + struct pt_buffer *buf = perf_get_aux(&pt->handle); + int advance = 0; + u64 status; + + rdmsrl(MSR_IA32_RTIT_STATUS, status); + + if (status & RTIT_STATUS_ERROR) { + pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n"); + pt_topa_dump(buf); + status &= ~RTIT_STATUS_ERROR; + } + + if (status & RTIT_STATUS_STOPPED) { + status &= ~RTIT_STATUS_STOPPED; + + /* + * On systems that only do single-entry ToPA, hitting STOP + * means we are already losing data; need to let the decoder + * know. + */ + if (!pt_cap_get(PT_CAP_topa_multiple_entries) || + buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) { + local_inc(&buf->lost); + advance++; + } + } + + /* + * Also on single-entry ToPA implementations, interrupt will come + * before the output reaches its output region's boundary. + */ + if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot && + pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) { + void *head = pt_buffer_region(buf); + + /* everything within this margin needs to be zeroed out */ + memset(head + buf->output_off, 0, + pt_buffer_region_size(buf) - + buf->output_off); + advance++; + } + + if (advance) + pt_buffer_advance(buf); + + wrmsrl(MSR_IA32_RTIT_STATUS, status); +} + +/** + * pt_read_offset() - translate registers into buffer pointers + * @buf: PT buffer. + * + * Set buffer's output pointers from MSR values. 
+ */ +static void pt_read_offset(struct pt_buffer *buf) +{ + u64 offset, base_topa; + + rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa); + buf->cur = phys_to_virt(base_topa); + + rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset); + /* offset within current output region */ + buf->output_off = offset >> 32; + /* index of current output region within this table */ + buf->cur_idx = (offset & 0xffffff80) >> 7; +} + +/** + * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry + * @buf: PT buffer. + * @pg: Page offset in the buffer. + * + * When advancing to the next output region (ToPA entry), given a page offset + * into the buffer, we need to find the offset of the first page in the next + * region. + */ +static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg) +{ + struct topa_entry *te = buf->topa_index[pg]; + + /* one region */ + if (buf->first == buf->last && buf->first->last == 1) + return pg; + + do { + pg++; + pg &= buf->nr_pages - 1; + } while (buf->topa_index[pg] == te); + + return pg; +} + +/** + * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer + * @buf: PT buffer. + * @handle: Current output handle. + * + * Place INT and STOP marks to prevent overwriting old data that the consumer + * hasn't yet collected. 
+ */ +static int pt_buffer_reset_markers(struct pt_buffer *buf, + struct perf_output_handle *handle) + +{ + unsigned long idx, npages, end; + + if (buf->snapshot) + return 0; + + /* can't stop in the middle of an output region */ + if (buf->output_off + handle->size + 1 < + sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) + return -EINVAL; + + + /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */ + if (!pt_cap_get(PT_CAP_topa_multiple_entries)) + return 0; + + /* clear STOP and INT from current entry */ + buf->topa_index[buf->stop_pos]->stop = 0; + buf->topa_index[buf->intr_pos]->intr = 0; + + if (pt_cap_get(PT_CAP_topa_multiple_entries)) { + npages = (handle->size + 1) >> PAGE_SHIFT; + end = (local64_read(&buf->head) >> PAGE_SHIFT) + npages; + /*if (end > handle->wakeup >> PAGE_SHIFT) + end = handle->wakeup >> PAGE_SHIFT;*/ + idx = end & (buf->nr_pages - 1); + buf->stop_pos = idx; + idx = (local64_read(&buf->head) >> PAGE_SHIFT) + npages - 1; + idx &= buf->nr_pages - 1; + buf->intr_pos = idx; + } + + buf->topa_index[buf->stop_pos]->stop = 1; + buf->topa_index[buf->intr_pos]->intr = 1; + + return 0; +} + +/** + * pt_buffer_setup_topa_index() - build topa_index[] table of regions + * @buf: PT buffer. + * + * topa_index[] references output regions indexed by offset into the + * buffer for purposes of quick reverse lookup. 
+ */ +static void pt_buffer_setup_topa_index(struct pt_buffer *buf) +{ + struct topa *cur = buf->first, *prev = buf->last; + struct topa_entry *te_cur = TOPA_ENTRY(cur, 0), + *te_prev = TOPA_ENTRY(prev, prev->last - 1); + int pg = 0, idx = 0, ntopa = 0; + + while (pg < buf->nr_pages) { + int tidx; + + /* pages within one topa entry */ + for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++) + buf->topa_index[pg] = te_prev; + + te_prev = te_cur; + + if (idx == cur->last - 1) { + /* advance to next topa table */ + idx = 0; + cur = list_entry(cur->list.next, struct topa, list); + ntopa++; + } else + idx++; + te_cur = TOPA_ENTRY(cur, idx); + } + +} + +/** + * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head + * @buf: PT buffer. + * @head: Write pointer (aux_head) from AUX buffer. + * + * Find the ToPA table and entry corresponding to given @head and set buffer's + * "current" pointers accordingly. + */ +static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head) +{ + int pg; + + if (buf->snapshot) + head &= (buf->nr_pages << PAGE_SHIFT) - 1; + + pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1); + pg = pt_topa_next_entry(buf, pg); + + buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK); + buf->cur_idx = ((unsigned long)buf->topa_index[pg] - + (unsigned long)buf->cur) / sizeof(struct topa_entry); + buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1); + + local64_set(&buf->head, head); + local_set(&buf->data_size, 0); +} + +/** + * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer + * @buf: PT buffer. + */ +static void pt_buffer_fini_topa(struct pt_buffer *buf) +{ + struct topa *topa, *iter; + + list_for_each_entry_safe(topa, iter, &buf->tables, list) { + /* + * right now, this is in free_aux() path only, so + * no need to unlink this table from the list + */ + topa_free(topa); + } +} + +/** + * pt_buffer_init_topa() - initialize ToPA table for pt buffer + * @buf: PT buffer. 
+ * @size: Total size of all regions within this ToPA. + * @gfp: Allocation flags. + */ +static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages, + gfp_t gfp) +{ + struct topa *topa; + int err; + + topa = topa_alloc(buf->cpu, gfp); + if (!topa) + return -ENOMEM; + + topa_insert_table(buf, topa); + + while (buf->nr_pages < nr_pages) { + err = topa_insert_pages(buf, gfp); + if (err) { + pt_buffer_fini_topa(buf); + return -ENOMEM; + } + } + + pt_buffer_setup_topa_index(buf); + + /* link last table to the first one, unless we're double buffering */ + if (pt_cap_get(PT_CAP_topa_multiple_entries)) { + TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT; + TOPA_ENTRY(buf->last, -1)->end = 1; + } + + pt_topa_dump(buf); + return 0; +} + +/** + * pt_buffer_setup_aux() - set up topa tables for a PT buffer + * @cpu: Cpu on which to allocate, -1 means current. + * @pages: Array of pointers to buffer pages passed from perf core. + * @nr_pages: Number of pages in the buffer. + * @snapshot: If this is a snapshot/overwrite counter. + * + * This is a pmu::setup_aux callback that sets up ToPA tables and all the + * bookkeeping for an AUX buffer. + * + * Return: Our private PT buffer structure. + */ +static void * +pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot) +{ + struct pt_buffer *buf; + int node, ret; + + if (!nr_pages) + return NULL; + + if (cpu == -1) + cpu = raw_smp_processor_id(); + node = cpu_to_node(cpu); + + buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]), + GFP_KERNEL, node); + if (!buf) + return NULL; + + buf->cpu = cpu; + buf->snapshot = snapshot; + buf->data_pages = pages; + + INIT_LIST_HEAD(&buf->tables); + + ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL); + if (ret) { + kfree(buf); + return NULL; + } + + return buf; +} + +/** + * pt_buffer_free_aux() - perf AUX deallocation path callback + * @data: PT buffer. 
+ */ +static void pt_buffer_free_aux(void *data) +{ + struct pt_buffer *buf = data; + + pt_buffer_fini_topa(buf); + kfree(buf); +} + +/** + * pt_buffer_is_full() - check if the buffer is full + * @buf: PT buffer. + * @pt: Per-cpu pt handle. + * + * If the user hasn't read data from the output region that aux_head + * points to, the buffer is considered full: the user needs to read at + * least this region and update aux_tail to point past it. + */ +static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt) +{ + if (buf->snapshot) + return false; + + if (local_read(&buf->data_size) >= pt->handle.size) + return true; + + return false; +} + +/** + * intel_pt_interrupt() - PT PMI handler + */ +void intel_pt_interrupt(void) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + struct pt_buffer *buf; + struct perf_event *event = pt->handle.event; + + /* + * There may be a dangling PT bit in the interrupt status register + * after PT has been disabled by pt_event_stop(). Make sure we don't + * do anything (particularly, re-enable) for this event here. 
+ */ + if (!ACCESS_ONCE(pt->handle_nmi)) + return; + + pt_config_start(false); + + if (!event) + return; + + buf = perf_get_aux(&pt->handle); + if (!buf) + return; + + pt_read_offset(buf); + + pt_handle_status(pt); + + pt_update_head(pt); + + perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0), + local_xchg(&buf->lost, 0)); + + if (!event->hw.state) { + int ret; + + buf = perf_aux_output_begin(&pt->handle, event); + if (!buf) { + event->hw.state = PERF_HES_STOPPED; + return; + } + + pt_buffer_reset_offsets(buf, pt->handle.head); + ret = pt_buffer_reset_markers(buf, &pt->handle); + if (ret) { + perf_aux_output_end(&pt->handle, 0, true); + return; + } + + pt_config_buffer(buf->cur->table, buf->cur_idx, + buf->output_off); + wrmsrl(MSR_IA32_RTIT_STATUS, 0); + pt_config(event); + } +} + +/* + * PMU callbacks + */ + +static void pt_event_start(struct perf_event *event, int mode) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + struct pt_buffer *buf = perf_get_aux(&pt->handle); + + if (pt_is_running() || !buf || pt_buffer_is_full(buf, pt)) { + event->hw.state = PERF_HES_STOPPED; + return; + } + + ACCESS_ONCE(pt->handle_nmi) = 1; + event->hw.state = 0; + + pt_config_buffer(buf->cur->table, buf->cur_idx, + buf->output_off); + wrmsrl(MSR_IA32_RTIT_STATUS, 0); + pt_config(event); +} + +static void pt_event_stop(struct perf_event *event, int mode) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + + /* + * Protect against the PMI racing with disabling wrmsr, + * see comment in intel_pt_interrupt(). 
+ */ + ACCESS_ONCE(pt->handle_nmi) = 0; + pt_config_start(false); + + if (event->hw.state == PERF_HES_STOPPED) + return; + + event->hw.state = PERF_HES_STOPPED; + + if (mode & PERF_EF_UPDATE) { + struct pt *pt = this_cpu_ptr(&pt_ctx); + struct pt_buffer *buf = perf_get_aux(&pt->handle); + + if (!buf) + return; + + if (WARN_ON_ONCE(pt->handle.event != event)) + return; + + pt_read_offset(buf); + + pt_handle_status(pt); + + pt_update_head(pt); + } +} + +static void pt_event_del(struct perf_event *event, int mode) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + struct pt_buffer *buf; + + pt_event_stop(event, PERF_EF_UPDATE); + + buf = perf_get_aux(&pt->handle); + + if (buf) { + if (buf->snapshot) + pt->handle.head = + local_xchg(&buf->data_size, + buf->nr_pages << PAGE_SHIFT); + perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0), + local_xchg(&buf->lost, 0)); + } +} + +static int pt_event_add(struct perf_event *event, int mode) +{ + struct pt_buffer *buf; + struct pt *pt = this_cpu_ptr(&pt_ctx); + struct hw_perf_event *hwc = &event->hw; + int ret = -EBUSY; + + if (pt->handle.event) + goto out; + + buf = perf_aux_output_begin(&pt->handle, event); + if (!buf) { + ret = -EINVAL; + goto out; + } + + pt_buffer_reset_offsets(buf, pt->handle.head); + if (!buf->snapshot) { + ret = pt_buffer_reset_markers(buf, &pt->handle); + if (ret) { + perf_aux_output_end(&pt->handle, 0, true); + goto out; + } + } + + if (mode & PERF_EF_START) { + pt_event_start(event, 0); + if (hwc->state == PERF_HES_STOPPED) { + pt_event_del(event, 0); + ret = -EBUSY; + } + } else { + hwc->state = PERF_HES_STOPPED; + } + + ret = 0; +out: + + if (ret) + hwc->state = PERF_HES_STOPPED; + + return ret; +} + +static void pt_event_read(struct perf_event *event) +{ +} + +static void pt_event_destroy(struct perf_event *event) +{ + x86_del_exclusive(x86_lbr_exclusive_pt); +} + +static int pt_event_init(struct perf_event *event) +{ + if (event->attr.type != pt_pmu.pmu.type) + return -ENOENT; + + if 
(!pt_event_valid(event)) + return -EINVAL; + + if (x86_add_exclusive(x86_lbr_exclusive_pt)) + return -EBUSY; + + event->destroy = pt_event_destroy; + + return 0; +} + +static __init int pt_init(void) +{ + int ret, cpu, prior_warn = 0; + + BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE); + get_online_cpus(); + for_each_online_cpu(cpu) { + u64 ctl; + + ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl); + if (!ret && (ctl & RTIT_CTL_TRACEEN)) + prior_warn++; + } + put_online_cpus(); + + if (prior_warn) { + x86_add_exclusive(x86_lbr_exclusive_pt); + pr_warn("PT is enabled at boot time, doing nothing\n"); + + return -EBUSY; + } + + ret = pt_pmu_hw_init(); + if (ret) + return ret; + + if (!pt_cap_get(PT_CAP_topa_output)) { + pr_warn("ToPA output is not supported on this CPU\n"); + return -ENODEV; + } + + if (!pt_cap_get(PT_CAP_topa_multiple_entries)) + pt_pmu.pmu.capabilities = + PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF; + + pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE; + pt_pmu.pmu.attr_groups = pt_attr_groups; + pt_pmu.pmu.task_ctx_nr = perf_sw_context; + pt_pmu.pmu.event_init = pt_event_init; + pt_pmu.pmu.add = pt_event_add; + pt_pmu.pmu.del = pt_event_del; + pt_pmu.pmu.start = pt_event_start; + pt_pmu.pmu.stop = pt_event_stop; + pt_pmu.pmu.read = pt_event_read; + pt_pmu.pmu.setup_aux = pt_buffer_setup_aux; + pt_pmu.pmu.free_aux = pt_buffer_free_aux; + ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1); + + return ret; +} + +module_init(pt_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index 21af6149edf2..12d9548457e7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -1132,8 +1132,7 @@ static int snbep_pci2phy_map_init(int devid) } } - if (ubox_dev) - pci_dev_put(ubox_dev); + pci_dev_put(ubox_dev); return err ? 
pcibios_err_to_errno(err) : 0; } diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 60639093d536..3d423a101fae 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -41,6 +41,7 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 }, { X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 }, { X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 }, + { X86_FEATURE_INTEL_PT, CR_EBX,25, 0x00000007, 0 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 24d079604fd5..1deffe6cc873 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -354,6 +354,7 @@ int __copy_instruction(u8 *dest, u8 *src) { struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; + int length; unsigned long recovered_insn = recover_probed_instruction(buf, (unsigned long)src); @@ -361,16 +362,18 @@ int __copy_instruction(u8 *dest, u8 *src) return 0; kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); insn_get_length(&insn); + length = insn.length; + /* Another subsystem puts a breakpoint, failed to recover */ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) return 0; - memcpy(dest, insn.kaddr, insn.length); + memcpy(dest, insn.kaddr, length); #ifdef CONFIG_X86_64 if (insn_rip_relative(&insn)) { s64 newdisp; u8 *disp; - kernel_insn_init(&insn, dest, insn.length); + kernel_insn_init(&insn, dest, length); insn_get_displacement(&insn); /* * The copied instruction uses the %rip-relative addressing @@ -394,7 +397,7 @@ int __copy_instruction(u8 *dest, u8 *src) *(s32 *) disp = (s32) newdisp; } #endif - return insn.length; + return length; } static int arch_copy_kprobe(struct kprobe *p) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 
bbfceb756452..c2e21113ecc0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -113,8 +113,6 @@ struct bpf_prog_type_list { enum bpf_prog_type type; }; -void bpf_register_prog_type(struct bpf_prog_type_list *tl); - struct bpf_prog; struct bpf_prog_aux { @@ -129,11 +127,25 @@ struct bpf_prog_aux { }; #ifdef CONFIG_BPF_SYSCALL +void bpf_register_prog_type(struct bpf_prog_type_list *tl); + void bpf_prog_put(struct bpf_prog *prog); -#else -static inline void bpf_prog_put(struct bpf_prog *prog) {} -#endif struct bpf_prog *bpf_prog_get(u32 ufd); +#else +static inline void bpf_register_prog_type(struct bpf_prog_type_list *tl) +{ +} + +static inline struct bpf_prog *bpf_prog_get(u32 ufd) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void bpf_prog_put(struct bpf_prog *prog) +{ +} +#endif + /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog *fp, union bpf_attr *attr); diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 112cf49d9576..46e83c2156c6 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -13,6 +13,7 @@ struct trace_array; struct trace_buffer; struct tracer; struct dentry; +struct bpf_prog; struct trace_print_flags { unsigned long mask; @@ -252,6 +253,7 @@ enum { TRACE_EVENT_FL_WAS_ENABLED_BIT, TRACE_EVENT_FL_USE_CALL_FILTER_BIT, TRACE_EVENT_FL_TRACEPOINT_BIT, + TRACE_EVENT_FL_KPROBE_BIT, }; /* @@ -265,6 +267,7 @@ enum { * it is best to clear the buffers that used it). 
* USE_CALL_FILTER - For ftrace internal events, don't use file filter * TRACEPOINT - Event is a tracepoint + * KPROBE - Event is a kprobe */ enum { TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), @@ -274,6 +277,7 @@ enum { TRACE_EVENT_FL_WAS_ENABLED = (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT), TRACE_EVENT_FL_USE_CALL_FILTER = (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT), TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT), + TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), }; struct ftrace_event_call { @@ -303,6 +307,7 @@ struct ftrace_event_call { #ifdef CONFIG_PERF_EVENTS int perf_refcount; struct hlist_head __percpu *perf_events; + struct bpf_prog *prog; int (*perf_perm)(struct ftrace_event_call *, struct perf_event *); @@ -548,6 +553,15 @@ event_trigger_unlock_commit_regs(struct ftrace_event_file *file, event_triggers_post_call(file, tt); } +#ifdef CONFIG_BPF_SYSCALL +unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx); +#else +static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +{ + return 1; +} +#endif + enum { FILTER_OTHER = 0, FILTER_STATIC_STRING, diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2b621982938d..61992cf2e977 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -53,6 +53,7 @@ struct perf_guest_info_callbacks { #include #include #include +#include #include struct perf_callchain_entry { @@ -118,10 +119,19 @@ struct hw_perf_event { struct hrtimer hrtimer; }; struct { /* tracepoint */ - struct task_struct *tp_target; /* for tp_event->class */ struct list_head tp_list; }; + struct { /* intel_cqm */ + int cqm_state; + int cqm_rmid; + struct list_head cqm_events_entry; + struct list_head cqm_groups_entry; + struct list_head cqm_group_entry; + }; + struct { /* itrace */ + int itrace_started; + }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ /* @@ -129,12 +139,12 @@ struct hw_perf_event { * problem hw_breakpoint has with context 
* creation and event initalization. */ - struct task_struct *bp_target; struct arch_hw_breakpoint info; struct list_head bp_list; }; #endif }; + struct task_struct *target; int state; local64_t prev_count; u64 sample_period; @@ -166,6 +176,11 @@ struct perf_event; * pmu::capabilities flags */ #define PERF_PMU_CAP_NO_INTERRUPT 0x01 +#define PERF_PMU_CAP_NO_NMI 0x02 +#define PERF_PMU_CAP_AUX_NO_SG 0x04 +#define PERF_PMU_CAP_AUX_SW_DOUBLEBUF 0x08 +#define PERF_PMU_CAP_EXCLUSIVE 0x10 +#define PERF_PMU_CAP_ITRACE 0x20 /** * struct pmu - generic performance monitoring unit @@ -186,6 +201,7 @@ struct pmu { int * __percpu pmu_disable_count; struct perf_cpu_context * __percpu pmu_cpu_context; + atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ int task_ctx_nr; int hrtimer_interval_ms; @@ -262,9 +278,32 @@ struct pmu { int (*event_idx) (struct perf_event *event); /*optional */ /* - * flush branch stack on context-switches (needed in cpu-wide mode) + * context-switches callback */ - void (*flush_branch_stack) (void); + void (*sched_task) (struct perf_event_context *ctx, + bool sched_in); + /* + * PMU specific data size + */ + size_t task_ctx_size; + + + /* + * Return the count value for a counter. 
+ */ + u64 (*count) (struct perf_event *event); /*optional*/ + + /* + * Set up pmu-private data structures for an AUX area + */ + void *(*setup_aux) (int cpu, void **pages, + int nr_pages, bool overwrite); + /* optional */ + + /* + * Free pmu-private AUX data structures + */ + void (*free_aux) (void *aux); /* optional */ }; /** @@ -300,6 +339,7 @@ struct swevent_hlist { #define PERF_ATTACH_CONTEXT 0x01 #define PERF_ATTACH_GROUP 0x02 #define PERF_ATTACH_TASK 0x04 +#define PERF_ATTACH_TASK_DATA 0x08 struct perf_cgroup; struct ring_buffer; @@ -438,6 +478,7 @@ struct perf_event { struct pid_namespace *ns; u64 id; + u64 (*clock)(void); perf_overflow_handler_t overflow_handler; void *overflow_handler_context; @@ -504,7 +545,7 @@ struct perf_event_context { u64 generation; int pin_count; int nr_cgroups; /* cgroup evts */ - int nr_branch_stack; /* branch_stack evt */ + void *task_ctx_data; /* pmu specific data */ struct rcu_head rcu_head; struct delayed_work orphans_remove; @@ -536,12 +577,52 @@ struct perf_output_handle { struct ring_buffer *rb; unsigned long wakeup; unsigned long size; - void *addr; + union { + void *addr; + unsigned long head; + }; int page; }; +#ifdef CONFIG_CGROUP_PERF + +/* + * perf_cgroup_info keeps track of time_enabled for a cgroup. + * This is a per-cpu dynamically allocated data structure. + */ +struct perf_cgroup_info { + u64 time; + u64 timestamp; +}; + +struct perf_cgroup { + struct cgroup_subsys_state css; + struct perf_cgroup_info __percpu *info; +}; + +/* + * Must ensure cgroup is pinned (css_get) before calling + * this function. In other words, we cannot call this function + * if there is no cgroup event for the current CPU context. 
+ */ +static inline struct perf_cgroup * +perf_cgroup_from_task(struct task_struct *task) +{ + return container_of(task_css(task, perf_event_cgrp_id), + struct perf_cgroup, css); +} +#endif /* CONFIG_CGROUP_PERF */ + #ifdef CONFIG_PERF_EVENTS +extern void *perf_aux_output_begin(struct perf_output_handle *handle, + struct perf_event *event); +extern void perf_aux_output_end(struct perf_output_handle *handle, + unsigned long size, bool truncated); +extern int perf_aux_output_skip(struct perf_output_handle *handle, + unsigned long size); +extern void *perf_get_aux(struct perf_output_handle *handle); + extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); extern void perf_pmu_unregister(struct pmu *pmu); @@ -558,6 +639,8 @@ extern void perf_event_delayed_put(struct task_struct *task); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); extern void perf_pmu_enable(struct pmu *pmu); +extern void perf_sched_cb_dec(struct pmu *pmu); +extern void perf_sched_cb_inc(struct pmu *pmu); extern int perf_event_task_disable(void); extern int perf_event_task_enable(void); extern int perf_event_refresh(struct perf_event *event, int refresh); @@ -731,6 +814,11 @@ static inline void perf_event_task_sched_out(struct task_struct *prev, __perf_event_task_sched_out(prev, next); } +static inline u64 __perf_event_count(struct perf_event *event) +{ + return local64_read(&event->count) + atomic64_read(&event->child_count); +} + extern void perf_event_mmap(struct vm_area_struct *vma); extern struct perf_guest_info_callbacks *perf_guest_cbs; extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); @@ -800,6 +888,16 @@ static inline bool has_branch_stack(struct perf_event *event) return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; } +static inline bool needs_branch_stack(struct perf_event *event) +{ + return event->attr.branch_sample_type != 0; +} + +static inline bool has_aux(struct perf_event 
*event) +{ + return event->pmu->setup_aux; +} + extern int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size); extern void perf_output_end(struct perf_output_handle *handle); @@ -815,6 +913,17 @@ extern void perf_event_disable(struct perf_event *event); extern int __perf_event_disable(void *info); extern void perf_event_task_tick(void); #else /* !CONFIG_PERF_EVENTS: */ +static inline void * +perf_aux_output_begin(struct perf_output_handle *handle, + struct perf_event *event) { return NULL; } +static inline void +perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, + bool truncated) { } +static inline int +perf_aux_output_skip(struct perf_output_handle *handle, + unsigned long size) { return -EINVAL; } +static inline void * +perf_get_aux(struct perf_output_handle *handle) { return NULL; } static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { } diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index 395b70e0eccf..a746bf5216f8 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h @@ -137,4 +137,12 @@ extern int watchdog_init_timeout(struct watchdog_device *wdd, extern int watchdog_register_device(struct watchdog_device *); extern void watchdog_unregister_device(struct watchdog_device *); +#ifdef CONFIG_HARDLOCKUP_DETECTOR +void watchdog_nmi_disable_all(void); +void watchdog_nmi_enable_all(void); +#else +static inline void watchdog_nmi_disable_all(void) {} +static inline void watchdog_nmi_enable_all(void) {} +#endif + #endif /* ifndef _LINUX_WATCHDOG_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 45da7ec7d274..cc47ef41076a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -118,6 +118,7 @@ enum bpf_map_type { enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, + BPF_PROG_TYPE_KPROBE, }; /* flags for BPF_MAP_UPDATE_ELEM command */ @@ -151,6 +152,7 @@ union 
bpf_attr { __u32 log_level; /* verbosity level of verifier */ __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied buffer */ + __u32 kern_version; /* checked when prog_type=kprobe */ }; } __attribute__((aligned(8))); @@ -162,6 +164,9 @@ enum bpf_func_id { BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */ BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ + BPF_FUNC_probe_read, /* int bpf_probe_read(void *dst, int size, void *src) */ + BPF_FUNC_ktime_get_ns, /* u64 bpf_ktime_get_ns(void) */ + BPF_FUNC_trace_printk, /* int bpf_trace_printk(const char *fmt, int fmt_size, ...) */ __BPF_FUNC_MAX_ID, }; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 9b79abbd1ab8..309211b3eb67 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -152,21 +152,42 @@ enum perf_event_sample_format { * The branch types can be combined, however BRANCH_ANY covers all types * of branches and therefore it supersedes all the other types. 
*/ +enum perf_branch_sample_type_shift { + PERF_SAMPLE_BRANCH_USER_SHIFT = 0, /* user branches */ + PERF_SAMPLE_BRANCH_KERNEL_SHIFT = 1, /* kernel branches */ + PERF_SAMPLE_BRANCH_HV_SHIFT = 2, /* hypervisor branches */ + + PERF_SAMPLE_BRANCH_ANY_SHIFT = 3, /* any branch types */ + PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT = 4, /* any call branch */ + PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT = 5, /* any return branch */ + PERF_SAMPLE_BRANCH_IND_CALL_SHIFT = 6, /* indirect calls */ + PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT = 7, /* transaction aborts */ + PERF_SAMPLE_BRANCH_IN_TX_SHIFT = 8, /* in transaction */ + PERF_SAMPLE_BRANCH_NO_TX_SHIFT = 9, /* not in transaction */ + PERF_SAMPLE_BRANCH_COND_SHIFT = 10, /* conditional branches */ + + PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = 11, /* call/ret stack */ + + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ +}; + enum perf_branch_sample_type { - PERF_SAMPLE_BRANCH_USER = 1U << 0, /* user branches */ - PERF_SAMPLE_BRANCH_KERNEL = 1U << 1, /* kernel branches */ - PERF_SAMPLE_BRANCH_HV = 1U << 2, /* hypervisor branches */ + PERF_SAMPLE_BRANCH_USER = 1U << PERF_SAMPLE_BRANCH_USER_SHIFT, + PERF_SAMPLE_BRANCH_KERNEL = 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT, + PERF_SAMPLE_BRANCH_HV = 1U << PERF_SAMPLE_BRANCH_HV_SHIFT, - PERF_SAMPLE_BRANCH_ANY = 1U << 3, /* any branch types */ - PERF_SAMPLE_BRANCH_ANY_CALL = 1U << 4, /* any call branch */ - PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << 5, /* any return branch */ - PERF_SAMPLE_BRANCH_IND_CALL = 1U << 6, /* indirect calls */ - PERF_SAMPLE_BRANCH_ABORT_TX = 1U << 7, /* transaction aborts */ - PERF_SAMPLE_BRANCH_IN_TX = 1U << 8, /* in transaction */ - PERF_SAMPLE_BRANCH_NO_TX = 1U << 9, /* not in transaction */ - PERF_SAMPLE_BRANCH_COND = 1U << 10, /* conditional branches */ + PERF_SAMPLE_BRANCH_ANY = 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT, + PERF_SAMPLE_BRANCH_ANY_CALL = 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT, + PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT, + PERF_SAMPLE_BRANCH_IND_CALL = 
1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT, + PERF_SAMPLE_BRANCH_ABORT_TX = 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT, + PERF_SAMPLE_BRANCH_IN_TX = 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT, + PERF_SAMPLE_BRANCH_NO_TX = 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT, + PERF_SAMPLE_BRANCH_COND = 1U << PERF_SAMPLE_BRANCH_COND_SHIFT, - PERF_SAMPLE_BRANCH_MAX = 1U << 11, /* non-ABI */ + PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT, + + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; #define PERF_SAMPLE_BRANCH_PLM_ALL \ @@ -240,6 +261,7 @@ enum perf_event_read_format { #define PERF_ATTR_SIZE_VER3 96 /* add: sample_regs_user */ /* add: sample_stack_user */ #define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */ +#define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */ /* * Hardware event_id to monitor via a performance monitoring event: @@ -305,7 +327,8 @@ struct perf_event_attr { exclude_callchain_user : 1, /* exclude user callchains */ mmap2 : 1, /* include mmap with inode data */ comm_exec : 1, /* flag comm events that are due to an exec */ - __reserved_1 : 39; + use_clockid : 1, /* use @clockid for time fields */ + __reserved_1 : 38; union { __u32 wakeup_events; /* wakeup every n events */ @@ -334,8 +357,7 @@ struct perf_event_attr { */ __u32 sample_stack_user; - /* Align to u64. */ - __u32 __reserved_2; + __s32 clockid; /* * Defines set of regs to dump for each sample * state captured on: @@ -345,6 +367,12 @@ struct perf_event_attr { * See asm/perf_regs.h for details. 
*/ __u64 sample_regs_intr; + + /* + * Wakeup watermark for AUX area + */ + __u32 aux_watermark; + __u32 __reserved_2; /* align to __u64 */ }; #define perf_flags(attr) (*(&(attr)->read_format + 1)) @@ -360,6 +388,7 @@ struct perf_event_attr { #define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) #define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) #define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) +#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) enum perf_event_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, @@ -500,9 +529,30 @@ struct perf_event_mmap_page { * In this case the kernel will not over-write unread data. * * See perf_output_put_handle() for the data ordering. + * + * data_{offset,size} indicate the location and size of the perf record + * buffer within the mmapped area. */ __u64 data_head; /* head in the data section */ __u64 data_tail; /* user-space written tail */ + __u64 data_offset; /* where the buffer starts */ + __u64 data_size; /* data buffer size */ + + /* + * AUX area is defined by aux_{offset,size} fields that should be set + * by the userspace, so that + * + * aux_offset >= data_offset + data_size + * + * prior to mmap()ing it. Size of the mmap()ed area should be aux_size. + * + * Ring buffer pointers aux_{head,tail} have the same semantics as + * data_{head,tail} and same ordering rules apply. + */ + __u64 aux_head; + __u64 aux_tail; + __u64 aux_offset; + __u64 aux_size; }; #define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0) @@ -725,6 +775,31 @@ enum perf_event_type { */ PERF_RECORD_MMAP2 = 10, + /* + * Records that new data landed in the AUX buffer part. 
+ * + * struct { + * struct perf_event_header header; + * + * u64 aux_offset; + * u64 aux_size; + * u64 flags; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_AUX = 11, + + /* + * Indicates that instruction trace has started + * + * struct { + * struct perf_event_header header; + * u32 pid; + * u32 tid; + * }; + */ + PERF_RECORD_ITRACE_START = 12, + PERF_RECORD_MAX, /* non-ABI */ }; @@ -742,6 +817,12 @@ enum perf_callchain_context { PERF_CONTEXT_MAX = (__u64)-4095, }; +/** + * PERF_RECORD_AUX::flags bits + */ +#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ +#define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ + #define PERF_FLAG_FD_NO_GROUP (1UL << 0) #define PERF_FLAG_FD_OUTPUT (1UL << 1) #define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ diff --git a/init/Kconfig b/init/Kconfig index 9a0592516f48..a905b7301e10 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1526,7 +1526,7 @@ config EVENTFD # syscall, maps, verifier config BPF_SYSCALL - bool "Enable bpf() system call" if EXPERT + bool "Enable bpf() system call" select ANON_INODES select BPF default n diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 536edc2be307..504c10b990ef 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -16,6 +16,7 @@ #include #include #include +#include static LIST_HEAD(bpf_map_types); @@ -467,7 +468,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD log_buf +#define BPF_PROG_LOAD_LAST_FIELD kern_version static int bpf_prog_load(union bpf_attr *attr) { @@ -492,6 +493,10 @@ static int bpf_prog_load(union bpf_attr *attr) if (attr->insn_cnt >= BPF_MAXINSNS) return -EINVAL; + if (type == BPF_PROG_TYPE_KPROBE && + attr->kern_version != LINUX_VERSION_CODE) + return -EINVAL; + /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); if (!prog) diff --git 
a/kernel/events/core.c b/kernel/events/core.c index 2fabc0627165..06917d537302 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -34,14 +34,16 @@ #include #include #include +#include #include #include #include #include -#include #include #include #include +#include +#include #include "internal.h" @@ -153,7 +155,7 @@ enum event_type_t { */ struct static_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); -static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); +static DEFINE_PER_CPU(int, perf_sched_cb_usages); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -327,6 +329,11 @@ static inline u64 perf_clock(void) return local_clock(); } +static inline u64 perf_event_clock(struct perf_event *event) +{ + return event->clock(); +} + static inline struct perf_cpu_context * __get_cpu_context(struct perf_event_context *ctx) { @@ -351,32 +358,6 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, #ifdef CONFIG_CGROUP_PERF -/* - * perf_cgroup_info keeps track of time_enabled for a cgroup. - * This is a per-cpu dynamically allocated data structure. - */ -struct perf_cgroup_info { - u64 time; - u64 timestamp; -}; - -struct perf_cgroup { - struct cgroup_subsys_state css; - struct perf_cgroup_info __percpu *info; -}; - -/* - * Must ensure cgroup is pinned (css_get) before calling - * this function. In other words, we cannot call this function - * if there is no cgroup event for the current CPU context. 
- */ -static inline struct perf_cgroup * -perf_cgroup_from_task(struct task_struct *task) -{ - return container_of(task_css(task, perf_event_cgrp_id), - struct perf_cgroup, css); -} - static inline bool perf_cgroup_match(struct perf_event *event) { @@ -905,6 +886,15 @@ static void get_ctx(struct perf_event_context *ctx) WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); } +static void free_ctx(struct rcu_head *head) +{ + struct perf_event_context *ctx; + + ctx = container_of(head, struct perf_event_context, rcu_head); + kfree(ctx->task_ctx_data); + kfree(ctx); +} + static void put_ctx(struct perf_event_context *ctx) { if (atomic_dec_and_test(&ctx->refcount)) { @@ -912,7 +902,7 @@ static void put_ctx(struct perf_event_context *ctx) put_ctx(ctx->parent_ctx); if (ctx->task) put_task_struct(ctx->task); - kfree_rcu(ctx, rcu_head); + call_rcu(&ctx->rcu_head, free_ctx); } } @@ -1239,9 +1229,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) if (is_cgroup_event(event)) ctx->nr_cgroups++; - if (has_branch_stack(event)) - ctx->nr_branch_stack++; - list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; if (event->attr.inherit_stat) @@ -1408,9 +1395,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) cpuctx->cgrp = NULL; } - if (has_branch_stack(event)) - ctx->nr_branch_stack--; - ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; @@ -1847,6 +1831,7 @@ static void perf_set_shadow_time(struct perf_event *event, #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); +static void perf_log_itrace_start(struct perf_event *event); static int event_sched_in(struct perf_event *event, @@ -1881,6 +1866,12 @@ event_sched_in(struct perf_event *event, perf_pmu_disable(event->pmu); + event->tstamp_running += tstamp - event->tstamp_stopped; + + perf_set_shadow_time(event, ctx, tstamp); + + perf_log_itrace_start(event); + if (event->pmu->add(event, PERF_EF_START)) { 
event->state = PERF_EVENT_STATE_INACTIVE; event->oncpu = -1; @@ -1888,10 +1879,6 @@ event_sched_in(struct perf_event *event, goto out; } - event->tstamp_running += tstamp - event->tstamp_stopped; - - perf_set_shadow_time(event, ctx, tstamp); - if (!is_software_event(event)) cpuctx->active_oncpu++; if (!ctx->nr_active++) @@ -2559,6 +2546,9 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, next->perf_event_ctxp[ctxn] = ctx; ctx->task = next; next_ctx->task = task; + + swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + do_switch = 0; perf_event_sync_stat(ctx, next_ctx); @@ -2577,6 +2567,56 @@ unlock: } } +void perf_sched_cb_dec(struct pmu *pmu) +{ + this_cpu_dec(perf_sched_cb_usages); +} + +void perf_sched_cb_inc(struct pmu *pmu) +{ + this_cpu_inc(perf_sched_cb_usages); +} + +/* + * This function provides the context switch callback to the lower code + * layer. It is invoked ONLY when the context switch callback is enabled. + */ +static void perf_pmu_sched_task(struct task_struct *prev, + struct task_struct *next, + bool sched_in) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + if (prev == next) + return; + + local_irq_save(flags); + + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + if (pmu->sched_task) { + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + + perf_pmu_disable(pmu); + + pmu->sched_task(cpuctx->task_ctx, sched_in); + + perf_pmu_enable(pmu); + + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); + } + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + #define for_each_task_context_nr(ctxn) \ for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) @@ -2596,6 +2636,9 @@ void __perf_event_task_sched_out(struct task_struct *task, { int ctxn; + if (__this_cpu_read(perf_sched_cb_usages)) + perf_pmu_sched_task(task, next, false); + for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); @@ -2754,64 +2797,6 
@@ static void perf_event_context_sched_in(struct perf_event_context *ctx, perf_ctx_unlock(cpuctx, ctx); } -/* - * When sampling the branck stack in system-wide, it may be necessary - * to flush the stack on context switch. This happens when the branch - * stack does not tag its entries with the pid of the current task. - * Otherwise it becomes impossible to associate a branch entry with a - * task. This ambiguity is more likely to appear when the branch stack - * supports priv level filtering and the user sets it to monitor only - * at the user level (which could be a useful measurement in system-wide - * mode). In that case, the risk is high of having a branch stack with - * branch from multiple tasks. Flushing may mean dropping the existing - * entries or stashing them somewhere in the PMU specific code layer. - * - * This function provides the context switch callback to the lower code - * layer. It is invoked ONLY when there is at least one system-wide context - * with at least one active event using taken branch sampling. - */ -static void perf_branch_stack_sched_in(struct task_struct *prev, - struct task_struct *task) -{ - struct perf_cpu_context *cpuctx; - struct pmu *pmu; - unsigned long flags; - - /* no need to flush branch stack if not changing task */ - if (prev == task) - return; - - local_irq_save(flags); - - rcu_read_lock(); - - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - /* - * check if the context has at least one - * event using PERF_SAMPLE_BRANCH_STACK - */ - if (cpuctx->ctx.nr_branch_stack > 0 - && pmu->flush_branch_stack) { - - perf_ctx_lock(cpuctx, cpuctx->task_ctx); - - perf_pmu_disable(pmu); - - pmu->flush_branch_stack(); - - perf_pmu_enable(pmu); - - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - } - } - - rcu_read_unlock(); - - local_irq_restore(flags); -} - /* * Called from scheduler to add the events of the current task * with interrupts disabled. 
@@ -2844,9 +2829,8 @@ void __perf_event_task_sched_in(struct task_struct *prev, if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) perf_cgroup_sched_in(prev, task); - /* check for system-wide branch_stack events */ - if (atomic_read(this_cpu_ptr(&perf_branch_stack_events))) - perf_branch_stack_sched_in(prev, task); + if (__this_cpu_read(perf_sched_cb_usages)) + perf_pmu_sched_task(prev, task, true); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -3220,7 +3204,10 @@ static void __perf_event_read(void *info) static inline u64 perf_event_count(struct perf_event *event) { - return local64_read(&event->count) + atomic64_read(&event->child_count); + if (event->pmu->count) + return event->pmu->count(event); + + return __perf_event_count(event); } static u64 perf_event_read(struct perf_event *event) @@ -3321,12 +3308,15 @@ errout: * Returns a matching context with refcount and pincount. */ static struct perf_event_context * -find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) +find_get_context(struct pmu *pmu, struct task_struct *task, + struct perf_event *event) { struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_cpu_context *cpuctx; + void *task_ctx_data = NULL; unsigned long flags; int ctxn, err; + int cpu = event->cpu; if (!task) { /* Must be root to operate on a CPU event: */ @@ -3354,11 +3344,24 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) if (ctxn < 0) goto errout; + if (event->attach_state & PERF_ATTACH_TASK_DATA) { + task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL); + if (!task_ctx_data) { + err = -ENOMEM; + goto errout; + } + } + retry: ctx = perf_lock_task_context(task, ctxn, &flags); if (ctx) { clone_ctx = unclone_ctx(ctx); ++ctx->pin_count; + + if (task_ctx_data && !ctx->task_ctx_data) { + ctx->task_ctx_data = task_ctx_data; + task_ctx_data = NULL; + } raw_spin_unlock_irqrestore(&ctx->lock, flags); if (clone_ctx) @@ -3369,6 +3372,11 @@ retry: if (!ctx) goto 
errout; + if (task_ctx_data) { + ctx->task_ctx_data = task_ctx_data; + task_ctx_data = NULL; + } + err = 0; mutex_lock(&task->perf_event_mutex); /* @@ -3395,13 +3403,16 @@ retry: } } + kfree(task_ctx_data); return ctx; errout: + kfree(task_ctx_data); return ERR_PTR(err); } static void perf_event_free_filter(struct perf_event *event); +static void perf_event_free_bpf_prog(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) { @@ -3411,10 +3422,10 @@ static void free_event_rcu(struct rcu_head *head) if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); + perf_event_free_bpf_prog(event); kfree(event); } -static void ring_buffer_put(struct ring_buffer *rb); static void ring_buffer_attach(struct perf_event *event, struct ring_buffer *rb); @@ -3423,10 +3434,6 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu) if (event->parent) return; - if (has_branch_stack(event)) { - if (!(event->attach_state & PERF_ATTACH_TASK)) - atomic_dec(&per_cpu(perf_branch_stack_events, cpu)); - } if (is_cgroup_event(event)) atomic_dec(&per_cpu(perf_cgroup_events, cpu)); } @@ -3454,6 +3461,91 @@ static void unaccount_event(struct perf_event *event) unaccount_event_cpu(event, event->cpu); } +/* + * The following implement mutual exclusion of events on "exclusive" pmus + * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled + * at a time, so we disallow creating events that might conflict, namely: + * + * 1) cpu-wide events in the presence of per-task events, + * 2) per-task events in the presence of cpu-wide events, + * 3) two matching events on the same context. + * + * The former two cases are handled in the allocation path (perf_event_alloc(), + * __free_event()), the latter -- before the first perf_install_in_context(). 
+ */ +static int exclusive_event_init(struct perf_event *event) +{ + struct pmu *pmu = event->pmu; + + if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + return 0; + + /* + * Prevent co-existence of per-task and cpu-wide events on the + * same exclusive pmu. + * + * Negative pmu::exclusive_cnt means there are cpu-wide + * events on this "exclusive" pmu, positive means there are + * per-task events. + * + * Since this is called in perf_event_alloc() path, event::ctx + * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK + * to mean "per-task event", because unlike other attach states it + * never gets cleared. + */ + if (event->attach_state & PERF_ATTACH_TASK) { + if (!atomic_inc_unless_negative(&pmu->exclusive_cnt)) + return -EBUSY; + } else { + if (!atomic_dec_unless_positive(&pmu->exclusive_cnt)) + return -EBUSY; + } + + return 0; +} + +static void exclusive_event_destroy(struct perf_event *event) +{ + struct pmu *pmu = event->pmu; + + if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + return; + + /* see comment in exclusive_event_init() */ + if (event->attach_state & PERF_ATTACH_TASK) + atomic_dec(&pmu->exclusive_cnt); + else + atomic_inc(&pmu->exclusive_cnt); +} + +static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) +{ + if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && + (e1->cpu == e2->cpu || + e1->cpu == -1 || + e2->cpu == -1)) + return true; + return false; +} + +/* Called under the same ctx::mutex as perf_install_in_context() */ +static bool exclusive_event_installable(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *iter_event; + struct pmu *pmu = event->pmu; + + if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + return true; + + list_for_each_entry(iter_event, &ctx->event_list, event_entry) { + if (exclusive_event_match(iter_event, event)) + return false; + } + + return true; +} + static void __free_event(struct perf_event *event) { if (!event->parent) { @@ -3467,8 
+3559,10 @@ static void __free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); - if (event->pmu) + if (event->pmu) { + exclusive_event_destroy(event); module_put(event->pmu->module); + } call_rcu(&event->rcu_head, free_event_rcu); } @@ -3927,6 +4021,7 @@ static inline int perf_fget_light(int fd, struct fd *p) static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { @@ -3980,6 +4075,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon case PERF_EVENT_IOC_SET_FILTER: return perf_event_set_filter(event, (void __user *)arg); + case PERF_EVENT_IOC_SET_BPF: + return perf_event_set_bpf_prog(event, arg); + default: return -ENOTTY; } @@ -4096,6 +4194,8 @@ static void perf_event_init_userpage(struct perf_event *event) /* Allow new userspace to detect that bit 0 is deprecated */ userpg->cap_bit0_is_deprecated = 1; userpg->size = offsetof(struct perf_event_mmap_page, __reserved); + userpg->data_offset = PAGE_SIZE; + userpg->data_size = perf_data_size(rb); unlock: rcu_read_unlock(); @@ -4263,7 +4363,7 @@ static void rb_free_rcu(struct rcu_head *rcu_head) rb_free(rb); } -static struct ring_buffer *ring_buffer_get(struct perf_event *event) +struct ring_buffer *ring_buffer_get(struct perf_event *event) { struct ring_buffer *rb; @@ -4278,7 +4378,7 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) return rb; } -static void ring_buffer_put(struct ring_buffer *rb) +void ring_buffer_put(struct ring_buffer *rb) { if (!atomic_dec_and_test(&rb->refcount)) return; @@ -4295,6 +4395,9 @@ static void perf_mmap_open(struct vm_area_struct *vma) atomic_inc(&event->mmap_count); atomic_inc(&event->rb->mmap_count); + if (vma->vm_pgoff) + 
atomic_inc(&event->rb->aux_mmap_count); + if (event->pmu->event_mapped) event->pmu->event_mapped(event); } @@ -4319,6 +4422,20 @@ static void perf_mmap_close(struct vm_area_struct *vma) if (event->pmu->event_unmapped) event->pmu->event_unmapped(event); + /* + * rb->aux_mmap_count will always drop before rb->mmap_count and + * event->mmap_count, so it is ok to use event->mmap_mutex to + * serialize with perf_mmap here. + */ + if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && + atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { + atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); + vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; + + rb_free_aux(rb); + mutex_unlock(&event->mmap_mutex); + } + atomic_dec(&rb->mmap_count); if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) @@ -4392,7 +4509,7 @@ out_put: static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, - .close = perf_mmap_close, + .close = perf_mmap_close, /* non mergable */ .fault = perf_mmap_fault, .page_mkwrite = perf_mmap_fault, }; @@ -4403,10 +4520,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); unsigned long locked, lock_limit; - struct ring_buffer *rb; + struct ring_buffer *rb = NULL; unsigned long vma_size; unsigned long nr_pages; - long user_extra, extra; + long user_extra = 0, extra = 0; int ret = 0, flags = 0; /* @@ -4421,7 +4538,66 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; vma_size = vma->vm_end - vma->vm_start; - nr_pages = (vma_size / PAGE_SIZE) - 1; + + if (vma->vm_pgoff == 0) { + nr_pages = (vma_size / PAGE_SIZE) - 1; + } else { + /* + * AUX area mapping: if rb->aux_nr_pages != 0, it's already + * mapped, all subsequent mappings should have the same size + * and offset. Must be above the normal perf buffer. 
+ */ + u64 aux_offset, aux_size; + + if (!event->rb) + return -EINVAL; + + nr_pages = vma_size / PAGE_SIZE; + + mutex_lock(&event->mmap_mutex); + ret = -EINVAL; + + rb = event->rb; + if (!rb) + goto aux_unlock; + + aux_offset = ACCESS_ONCE(rb->user_page->aux_offset); + aux_size = ACCESS_ONCE(rb->user_page->aux_size); + + if (aux_offset < perf_data_size(rb) + PAGE_SIZE) + goto aux_unlock; + + if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) + goto aux_unlock; + + /* already mapped with a different offset */ + if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) + goto aux_unlock; + + if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) + goto aux_unlock; + + /* already mapped with a different size */ + if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) + goto aux_unlock; + + if (!is_power_of_2(nr_pages)) + goto aux_unlock; + + if (!atomic_inc_not_zero(&rb->mmap_count)) + goto aux_unlock; + + if (rb_has_aux(rb)) { + atomic_inc(&rb->aux_mmap_count); + ret = 0; + goto unlock; + } + + atomic_set(&rb->aux_mmap_count, 1); + user_extra = nr_pages; + + goto accounting; + } /* * If we have rb pages ensure they're a power-of-two number, so we @@ -4433,9 +4609,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma_size != PAGE_SIZE * (1 + nr_pages)) return -EINVAL; - if (vma->vm_pgoff != 0) - return -EINVAL; - WARN_ON_ONCE(event->ctx->parent_ctx); again: mutex_lock(&event->mmap_mutex); @@ -4459,6 +4632,8 @@ again: } user_extra = nr_pages + 1; + +accounting: user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); /* @@ -4468,7 +4643,6 @@ again: user_locked = atomic_long_read(&user->locked_vm) + user_extra; - extra = 0; if (user_locked > user_lock_limit) extra = user_locked - user_lock_limit; @@ -4482,35 +4656,46 @@ again: goto unlock; } - WARN_ON(event->rb); + WARN_ON(!rb && event->rb); if (vma->vm_flags & VM_WRITE) flags |= RING_BUFFER_WRITABLE; - rb = rb_alloc(nr_pages, - event->attr.watermark ? 
event->attr.wakeup_watermark : 0, - event->cpu, flags); - if (!rb) { - ret = -ENOMEM; - goto unlock; + rb = rb_alloc(nr_pages, + event->attr.watermark ? event->attr.wakeup_watermark : 0, + event->cpu, flags); + + if (!rb) { + ret = -ENOMEM; + goto unlock; + } + + atomic_set(&rb->mmap_count, 1); + rb->mmap_user = get_current_user(); + rb->mmap_locked = extra; + + ring_buffer_attach(event, rb); + + perf_event_init_userpage(event); + perf_event_update_userpage(event); + } else { + ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, + event->attr.aux_watermark, flags); + if (!ret) + rb->aux_mmap_locked = extra; } - atomic_set(&rb->mmap_count, 1); - rb->mmap_locked = extra; - rb->mmap_user = get_current_user(); - - atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->pinned_vm += extra; - - ring_buffer_attach(event, rb); - - perf_event_init_userpage(event); - perf_event_update_userpage(event); - unlock: - if (!ret) + if (!ret) { + atomic_long_add(user_extra, &user->locked_vm); + vma->vm_mm->pinned_vm += extra; + atomic_inc(&event->mmap_count); + } else if (rb) { + atomic_dec(&rb->mmap_count); + } +aux_unlock: mutex_unlock(&event->mmap_mutex); /* @@ -4766,7 +4951,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header, } if (sample_type & PERF_SAMPLE_TIME) - data->time = perf_clock(); + data->time = perf_event_clock(event); if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) data->id = primary_event_id(event); @@ -5344,6 +5529,8 @@ static void perf_event_task_output(struct perf_event *event, task_event->event_id.tid = perf_event_tid(event, task); task_event->event_id.ptid = perf_event_tid(event, current); + task_event->event_id.time = perf_event_clock(event); + perf_output_put(&handle, task_event->event_id); perf_event__output_id_sample(event, &handle, &sample); @@ -5377,7 +5564,7 @@ static void perf_event_task(struct task_struct *task, /* .ppid */ /* .tid */ /* .ptid */ - .time = perf_clock(), + /* .time */ }, }; @@ -5732,6 
+5919,40 @@ void perf_event_mmap(struct vm_area_struct *vma) perf_event_mmap_event(&mmap_event); } +void perf_event_aux_event(struct perf_event *event, unsigned long head, + unsigned long size, u64 flags) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + struct perf_aux_event { + struct perf_event_header header; + u64 offset; + u64 size; + u64 flags; + } rec = { + .header = { + .type = PERF_RECORD_AUX, + .misc = 0, + .size = sizeof(rec), + }, + .offset = head, + .size = size, + .flags = flags, + }; + int ret; + + perf_event_header__init_id(&rec.header, &sample, event); + ret = perf_output_begin(&handle, event, rec.header.size); + + if (ret) + return; + + perf_output_put(&handle, rec); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + /* * IRQ throttle logging */ @@ -5753,7 +5974,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) .misc = 0, .size = sizeof(throttle_event), }, - .time = perf_clock(), + .time = perf_event_clock(event), .id = primary_event_id(event), .stream_id = event->id, }; @@ -5773,6 +5994,44 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_output_end(&handle); } +static void perf_log_itrace_start(struct perf_event *event) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + struct perf_aux_event { + struct perf_event_header header; + u32 pid; + u32 tid; + } rec; + int ret; + + if (event->parent) + event = event->parent; + + if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || + event->hw.itrace_started) + return; + + event->hw.itrace_started = 1; + + rec.header.type = PERF_RECORD_ITRACE_START; + rec.header.misc = 0; + rec.header.size = sizeof(rec); + rec.pid = perf_event_pid(event, current); + rec.tid = perf_event_tid(event, current); + + perf_event_header__init_id(&rec.header, &sample, event); + ret = perf_output_begin(&handle, event, rec.header.size); + + if (ret) + return; + + perf_output_put(&handle, 
rec); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + /* * Generic event overflow handling, sampling. */ @@ -6133,6 +6392,7 @@ static int perf_swevent_add(struct perf_event *event, int flags) } hlist_add_head_rcu(&event->hlist_entry, head); + perf_event_update_userpage(event); return 0; } @@ -6296,6 +6556,8 @@ static int perf_swevent_init(struct perf_event *event) static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_NMI, + .event_init = perf_swevent_init, .add = perf_swevent_add, .del = perf_swevent_del, @@ -6449,6 +6711,49 @@ static void perf_event_free_filter(struct perf_event *event) ftrace_profile_free_filter(event); } +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +{ + struct bpf_prog *prog; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + + if (event->tp_event->prog) + return -EEXIST; + + if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) + /* bpf programs can only be attached to kprobes */ + return -EINVAL; + + prog = bpf_prog_get(prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->aux->prog_type != BPF_PROG_TYPE_KPROBE) { + /* valid fd, but invalid bpf program type */ + bpf_prog_put(prog); + return -EINVAL; + } + + event->tp_event->prog = prog; + + return 0; +} + +static void perf_event_free_bpf_prog(struct perf_event *event) +{ + struct bpf_prog *prog; + + if (!event->tp_event) + return; + + prog = event->tp_event->prog; + if (prog) { + event->tp_event->prog = NULL; + bpf_prog_put(prog); + } +} + #else static inline void perf_tp_register(void) @@ -6464,6 +6769,14 @@ static void perf_event_free_filter(struct perf_event *event) { } +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +{ + return -ENOENT; +} + +static void perf_event_free_bpf_prog(struct perf_event *event) +{ +} #endif /* CONFIG_EVENT_TRACING */ #ifdef CONFIG_HAVE_HW_BREAKPOINT @@ -6602,6 +6915,7 @@ static int 
cpu_clock_event_add(struct perf_event *event, int flags) { if (flags & PERF_EF_START) cpu_clock_event_start(event, flags); + perf_event_update_userpage(event); return 0; } @@ -6638,6 +6952,8 @@ static int cpu_clock_event_init(struct perf_event *event) static struct pmu perf_cpu_clock = { .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_NMI, + .event_init = cpu_clock_event_init, .add = cpu_clock_event_add, .del = cpu_clock_event_del, @@ -6676,6 +6992,7 @@ static int task_clock_event_add(struct perf_event *event, int flags) { if (flags & PERF_EF_START) task_clock_event_start(event, flags); + perf_event_update_userpage(event); return 0; } @@ -6716,6 +7033,8 @@ static int task_clock_event_init(struct perf_event *event) static struct pmu perf_task_clock = { .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_NMI, + .event_init = task_clock_event_init, .add = task_clock_event_add, .del = task_clock_event_del, @@ -6993,6 +7312,7 @@ got_cpu_context: pmu->event_idx = perf_event_idx_default; list_add_rcu(&pmu->entry, &pmus); + atomic_set(&pmu->exclusive_cnt, 0); ret = 0; unlock: mutex_unlock(&pmus_lock); @@ -7037,12 +7357,23 @@ EXPORT_SYMBOL_GPL(perf_pmu_unregister); static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) { + struct perf_event_context *ctx = NULL; int ret; if (!try_module_get(pmu->module)) return -ENODEV; + + if (event->group_leader != event) { + ctx = perf_event_ctx_lock(event->group_leader); + BUG_ON(!ctx); + } + event->pmu = pmu; ret = pmu->event_init(event); + + if (ctx) + perf_event_ctx_unlock(event->group_leader, ctx); + if (ret) module_put(pmu->module); @@ -7089,10 +7420,6 @@ static void account_event_cpu(struct perf_event *event, int cpu) if (event->parent) return; - if (has_branch_stack(event)) { - if (!(event->attach_state & PERF_ATTACH_TASK)) - atomic_inc(&per_cpu(perf_branch_stack_events, cpu)); - } if (is_cgroup_event(event)) atomic_inc(&per_cpu(perf_cgroup_events, cpu)); } @@ -7131,7 +7458,7 @@ 
perf_event_alloc(struct perf_event_attr *attr, int cpu, struct perf_event *group_leader, struct perf_event *parent_event, perf_overflow_handler_t overflow_handler, - void *context) + void *context, int cgroup_fd) { struct pmu *pmu; struct perf_event *event; @@ -7186,18 +7513,18 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (task) { event->attach_state = PERF_ATTACH_TASK; - - if (attr->type == PERF_TYPE_TRACEPOINT) - event->hw.tp_target = task; -#ifdef CONFIG_HAVE_HW_BREAKPOINT /* - * hw_breakpoint is a bit difficult here.. + * XXX pmu::event_init needs to know what task to account to + * and we cannot use the ctx information because we need the + * pmu before we get a ctx. */ - else if (attr->type == PERF_TYPE_BREAKPOINT) - event->hw.bp_target = task; -#endif + event->hw.target = task; } + event->clock = &local_clock; + if (parent_event) + event->clock = parent_event->clock; + if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; @@ -7224,6 +7551,15 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) goto err_ns; + if (!has_branch_stack(event)) + event->attr.branch_sample_type = 0; + + if (cgroup_fd != -1) { + err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); + if (err) + goto err_ns; + } + pmu = perf_init_event(event); if (!pmu) goto err_ns; @@ -7232,21 +7568,30 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, goto err_ns; } + err = exclusive_event_init(event); + if (err) + goto err_pmu; + if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(); if (err) - goto err_pmu; + goto err_per_task; } } return event; +err_per_task: + exclusive_event_destroy(event); + err_pmu: if (event->destroy) event->destroy(event); module_put(pmu->module); err_ns: + if (is_cgroup_event(event)) + perf_detach_cgroup(event); if (event->ns) 
put_pid_ns(event->ns); kfree(event); @@ -7409,6 +7754,19 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) if (output_event->cpu == -1 && output_event->ctx != event->ctx) goto out; + /* + * Mixing clocks in the same buffer is trouble you don't need. + */ + if (output_event->clock != event->clock) + goto out; + + /* + * If both events generate aux data, they must be on the same PMU + */ + if (has_aux(event) && has_aux(output_event) && + event->pmu != output_event->pmu) + goto out; + set: mutex_lock(&event->mmap_mutex); /* Can't redirect output if we've got an active mmap() */ @@ -7441,6 +7799,43 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b) mutex_lock_nested(b, SINGLE_DEPTH_NESTING); } +static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) +{ + bool nmi_safe = false; + + switch (clk_id) { + case CLOCK_MONOTONIC: + event->clock = &ktime_get_mono_fast_ns; + nmi_safe = true; + break; + + case CLOCK_MONOTONIC_RAW: + event->clock = &ktime_get_raw_fast_ns; + nmi_safe = true; + break; + + case CLOCK_REALTIME: + event->clock = &ktime_get_real_ns; + break; + + case CLOCK_BOOTTIME: + event->clock = &ktime_get_boot_ns; + break; + + case CLOCK_TAI: + event->clock = &ktime_get_tai_ns; + break; + + default: + return -EINVAL; + } + + if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)) + return -EINVAL; + + return 0; +} + /** * sys_perf_event_open - open a performance event, associate it to a task/cpu * @@ -7465,6 +7860,7 @@ SYSCALL_DEFINE5(perf_event_open, int move_group = 0; int err; int f_flags = O_RDWR; + int cgroup_fd = -1; /* for future expandability... 
*/ if (flags & ~PERF_FLAG_ALL) @@ -7530,21 +7926,16 @@ SYSCALL_DEFINE5(perf_event_open, get_online_cpus(); + if (flags & PERF_FLAG_PID_CGROUP) + cgroup_fd = pid; + event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, - NULL, NULL); + NULL, NULL, cgroup_fd); if (IS_ERR(event)) { err = PTR_ERR(event); goto err_cpus; } - if (flags & PERF_FLAG_PID_CGROUP) { - err = perf_cgroup_connect(pid, event, &attr, group_leader); - if (err) { - __free_event(event); - goto err_cpus; - } - } - if (is_sampling_event(event)) { if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { err = -ENOTSUPP; @@ -7560,6 +7951,12 @@ SYSCALL_DEFINE5(perf_event_open, */ pmu = event->pmu; + if (attr.use_clockid) { + err = perf_event_set_clock(event, attr.clockid); + if (err) + goto err_alloc; + } + if (group_leader && (is_software_event(event) != is_software_event(group_leader))) { if (is_software_event(event)) { @@ -7586,12 +7983,17 @@ SYSCALL_DEFINE5(perf_event_open, /* * Get the target context (task or percpu): */ - ctx = find_get_context(pmu, task, event->cpu); + ctx = find_get_context(pmu, task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_alloc; } + if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) { + err = -EBUSY; + goto err_context; + } + if (task) { put_task_struct(task); task = NULL; @@ -7609,6 +8011,11 @@ SYSCALL_DEFINE5(perf_event_open, */ if (group_leader->group_leader != group_leader) goto err_context; + + /* All events in a group should have the same clock */ + if (group_leader->clock != event->clock) + goto err_context; + /* * Do not allow to attach to a group in a different * task or CPU context: @@ -7709,6 +8116,13 @@ SYSCALL_DEFINE5(perf_event_open, get_ctx(ctx); } + if (!exclusive_event_installable(event, ctx)) { + err = -EBUSY; + mutex_unlock(&ctx->mutex); + fput(event_file); + goto err_context; + } + perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); @@ -7781,7 +8195,7 @@ perf_event_create_kernel_counter(struct 
perf_event_attr *attr, int cpu, */ event = perf_event_alloc(attr, cpu, task, NULL, NULL, - overflow_handler, context); + overflow_handler, context, -1); if (IS_ERR(event)) { err = PTR_ERR(event); goto err; @@ -7792,7 +8206,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, account_event(event); - ctx = find_get_context(event->pmu, task, cpu); + ctx = find_get_context(event->pmu, task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_free; @@ -7800,6 +8214,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); + if (!exclusive_event_installable(event, ctx)) { + mutex_unlock(&ctx->mutex); + perf_unpin_context(ctx); + put_ctx(ctx); + err = -EBUSY; + goto err_free; + } + perf_install_in_context(ctx, event, cpu); perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); @@ -8142,7 +8564,7 @@ inherit_event(struct perf_event *parent_event, parent_event->cpu, child, group_leader, parent_event, - NULL, NULL); + NULL, NULL, -1); if (IS_ERR(child_event)) return child_event; diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 9803a6600d49..92ce5f4ccc26 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -116,12 +116,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) */ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) { - struct task_struct *tsk = bp->hw.bp_target; + struct task_struct *tsk = bp->hw.target; struct perf_event *iter; int count = 0; list_for_each_entry(iter, &bp_task_head, hw.bp_list) { - if (iter->hw.bp_target == tsk && + if (iter->hw.target == tsk && find_slot_idx(iter) == type && (iter->cpu < 0 || cpu == iter->cpu)) count += hw_breakpoint_weight(iter); @@ -153,7 +153,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, int nr; nr = info->cpu_pinned; - if (!bp->hw.bp_target) + if (!bp->hw.target) nr += 
max_task_bp_pinned(cpu, type); else nr += task_bp_pinned(cpu, bp, type); @@ -210,7 +210,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, weight = -weight; /* Pinned counter cpu profiling */ - if (!bp->hw.bp_target) { + if (!bp->hw.target) { get_bp_info(bp->cpu, type)->cpu_pinned += weight; return; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 569b218782ad..9f6ce9ba4a04 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -27,6 +27,7 @@ struct ring_buffer { local_t lost; /* nr records lost */ long watermark; /* wakeup watermark */ + long aux_watermark; /* poll crap */ spinlock_t event_lock; struct list_head event_list; @@ -35,6 +36,20 @@ struct ring_buffer { unsigned long mmap_locked; struct user_struct *mmap_user; + /* AUX area */ + local_t aux_head; + local_t aux_nest; + local_t aux_wakeup; + unsigned long aux_pgoff; + int aux_nr_pages; + int aux_overwrite; + atomic_t aux_mmap_count; + unsigned long aux_mmap_locked; + void (*free_aux)(void *); + atomic_t aux_refcount; + void **aux_pages; + void *aux_priv; + struct perf_event_mmap_page *user_page; void *data_pages[0]; }; @@ -43,6 +58,19 @@ extern void rb_free(struct ring_buffer *rb); extern struct ring_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); +extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, + pgoff_t pgoff, int nr_pages, long watermark, int flags); +extern void rb_free_aux(struct ring_buffer *rb); +extern struct ring_buffer *ring_buffer_get(struct perf_event *event); +extern void ring_buffer_put(struct ring_buffer *rb); + +static inline bool rb_has_aux(struct ring_buffer *rb) +{ + return !!rb->aux_nr_pages; +} + +void perf_event_aux_event(struct perf_event *event, unsigned long head, + unsigned long size, u64 flags); extern void perf_event_header__init_id(struct perf_event_header *header, @@ -81,6 +109,11 @@ static inline unsigned 
long perf_data_size(struct ring_buffer *rb) return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); } +static inline unsigned long perf_aux_size(struct ring_buffer *rb) +{ + return rb->aux_nr_pages << PAGE_SHIFT; +} + #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ static inline unsigned long \ func_name(struct perf_output_handle *handle, \ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index eadb95ce7aac..232f00f273cb 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -243,14 +243,317 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) spin_lock_init(&rb->event_lock); } +/* + * This is called before hardware starts writing to the AUX area to + * obtain an output handle and make sure there's room in the buffer. + * When the capture completes, call perf_aux_output_end() to commit + * the recorded data to the buffer. + * + * The ordering is similar to that of perf_output_{begin,end}, with + * the exception of (B), which should be taken care of by the pmu + * driver, since ordering rules will differ depending on hardware. + */ +void *perf_aux_output_begin(struct perf_output_handle *handle, + struct perf_event *event) +{ + struct perf_event *output_event = event; + unsigned long aux_head, aux_tail; + struct ring_buffer *rb; + + if (output_event->parent) + output_event = output_event->parent; + + /* + * Since this will typically be open across pmu::add/pmu::del, we + * grab ring_buffer's refcount instead of holding rcu read lock + * to make sure it doesn't disappear under us. 
+ */ + rb = ring_buffer_get(output_event); + if (!rb) + return NULL; + + if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount)) + goto err; + + /* + * Nesting is not supported for AUX area, make sure nested + * writers are caught early + */ + if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1))) + goto err_put; + + aux_head = local_read(&rb->aux_head); + + handle->rb = rb; + handle->event = event; + handle->head = aux_head; + handle->size = 0; + + /* + * In overwrite mode, AUX data stores do not depend on aux_tail, + * therefore (A) control dependency barrier does not exist. The + * (B) <-> (C) ordering is still observed by the pmu driver. + */ + if (!rb->aux_overwrite) { + aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); + handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark; + if (aux_head - aux_tail < perf_aux_size(rb)) + handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); + + /* + * handle->size computation depends on aux_tail load; this forms a + * control dependency barrier separating aux_tail load from aux data + * store that will be enabled on successful return + */ + if (!handle->size) { /* A, matches D */ + event->pending_disable = 1; + perf_output_wakeup(handle); + local_set(&rb->aux_nest, 0); + goto err_put; + } + } + + return handle->rb->aux_priv; + +err_put: + rb_free_aux(rb); + +err: + ring_buffer_put(rb); + handle->event = NULL; + + return NULL; +} + +/* + * Commit the data written by hardware into the ring buffer by adjusting + * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the + * pmu driver's responsibility to observe ordering rules of the hardware, + * so that all the data is externally visible before this is called. 
+ */ +void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, + bool truncated) +{ + struct ring_buffer *rb = handle->rb; + unsigned long aux_head; + u64 flags = 0; + + if (truncated) + flags |= PERF_AUX_FLAG_TRUNCATED; + + /* in overwrite mode, driver provides aux_head via handle */ + if (rb->aux_overwrite) { + flags |= PERF_AUX_FLAG_OVERWRITE; + + aux_head = handle->head; + local_set(&rb->aux_head, aux_head); + } else { + aux_head = local_read(&rb->aux_head); + local_add(size, &rb->aux_head); + } + + if (size || flags) { + /* + * Only send RECORD_AUX if we have something useful to communicate + */ + + perf_event_aux_event(handle->event, aux_head, size, flags); + } + + aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); + + if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { + perf_output_wakeup(handle); + local_add(rb->aux_watermark, &rb->aux_wakeup); + } + handle->event = NULL; + + local_set(&rb->aux_nest, 0); + rb_free_aux(rb); + ring_buffer_put(rb); +} + +/* + * Skip over a given number of bytes in the AUX buffer, due to, for example, + * hardware's alignment constraints. 
+ */ +int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) +{ + struct ring_buffer *rb = handle->rb; + unsigned long aux_head; + + if (size > handle->size) + return -ENOSPC; + + local_add(size, &rb->aux_head); + + aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); + if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { + perf_output_wakeup(handle); + local_add(rb->aux_watermark, &rb->aux_wakeup); + handle->wakeup = local_read(&rb->aux_wakeup) + + rb->aux_watermark; + } + + handle->head = aux_head; + handle->size -= size; + + return 0; +} + +void *perf_get_aux(struct perf_output_handle *handle) +{ + /* this is only valid between perf_aux_output_begin and *_end */ + if (!handle->event) + return NULL; + + return handle->rb->aux_priv; +} + +#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) + +static struct page *rb_alloc_aux_page(int node, int order) +{ + struct page *page; + + if (order > MAX_ORDER) + order = MAX_ORDER; + + do { + page = alloc_pages_node(node, PERF_AUX_GFP, order); + } while (!page && order--); + + if (page && order) { + /* + * Communicate the allocation size to the driver + */ + split_page(page, order); + SetPagePrivate(page); + set_page_private(page, order); + } + + return page; +} + +static void rb_free_aux_page(struct ring_buffer *rb, int idx) +{ + struct page *page = virt_to_page(rb->aux_pages[idx]); + + ClearPagePrivate(page); + page->mapping = NULL; + __free_page(page); +} + +int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, + pgoff_t pgoff, int nr_pages, long watermark, int flags) +{ + bool overwrite = !(flags & RING_BUFFER_WRITABLE); + int node = (event->cpu == -1) ? 
-1 : cpu_to_node(event->cpu); + int ret = -ENOMEM, max_order = 0; + + if (!has_aux(event)) + return -ENOTSUPP; + + if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) { + /* + * We need to start with the max_order that fits in nr_pages, + * not the other way around, hence ilog2() and not get_order. + */ + max_order = ilog2(nr_pages); + + /* + * PMU requests more than one contiguous chunks of memory + * for SW double buffering + */ + if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) && + !overwrite) { + if (!max_order) + return -EINVAL; + + max_order--; + } + } + + rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node); + if (!rb->aux_pages) + return -ENOMEM; + + rb->free_aux = event->pmu->free_aux; + for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) { + struct page *page; + int last, order; + + order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages)); + page = rb_alloc_aux_page(node, order); + if (!page) + goto out; + + for (last = rb->aux_nr_pages + (1 << page_private(page)); + last > rb->aux_nr_pages; rb->aux_nr_pages++) + rb->aux_pages[rb->aux_nr_pages] = page_address(page++); + } + + rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages, + overwrite); + if (!rb->aux_priv) + goto out; + + ret = 0; + + /* + * aux_pages (and pmu driver's private data, aux_priv) will be + * referenced in both producer's and consumer's contexts, thus + * we keep a refcount here to make sure either of the two can + * reference them safely. 
+ */ + atomic_set(&rb->aux_refcount, 1); + + rb->aux_overwrite = overwrite; + rb->aux_watermark = watermark; + + if (!rb->aux_watermark && !rb->aux_overwrite) + rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1); + +out: + if (!ret) + rb->aux_pgoff = pgoff; + else + rb_free_aux(rb); + + return ret; +} + +static void __rb_free_aux(struct ring_buffer *rb) +{ + int pg; + + if (rb->aux_priv) { + rb->free_aux(rb->aux_priv); + rb->free_aux = NULL; + rb->aux_priv = NULL; + } + + for (pg = 0; pg < rb->aux_nr_pages; pg++) + rb_free_aux_page(rb, pg); + + kfree(rb->aux_pages); + rb->aux_nr_pages = 0; +} + +void rb_free_aux(struct ring_buffer *rb) +{ + if (atomic_dec_and_test(&rb->aux_refcount)) + __rb_free_aux(rb); +} + #ifndef CONFIG_PERF_USE_VMALLOC /* * Back perf_mmap() with regular GFP_KERNEL-0 pages. */ -struct page * -perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +static struct page * +__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) { if (pgoff > rb->nr_pages) return NULL; @@ -340,8 +643,8 @@ static int data_page_nr(struct ring_buffer *rb) return rb->nr_pages << page_order(rb); } -struct page * -perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +static struct page * +__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) { /* The '>' counts in the user page. 
*/ if (pgoff > data_page_nr(rb)) @@ -416,3 +719,19 @@ fail: } #endif + +struct page * +perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +{ + if (rb->aux_nr_pages) { + /* at or above the end of AUX space: valid AUX pages are [aux_pgoff, aux_pgoff + aux_nr_pages) */ + if (pgoff >= rb->aux_pgoff + rb->aux_nr_pages) + return NULL; + + /* AUX space */ + if (pgoff >= rb->aux_pgoff) + return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]); + } + + return __perf_mmap_to_page(rb, pgoff); +} diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index fedbdd7d5d1e..3b9a48ae153a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -432,6 +432,14 @@ config UPROBE_EVENT This option is required if you plan to use perf-probe subcommand of perf tools on user space applications. +config BPF_EVENTS + depends on BPF_SYSCALL + depends on KPROBE_EVENT + bool + default y + help + This allows the user to attach BPF programs to kprobe events. + config PROBE_EVENTS def_bool n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 98f26588255e..9b1044e936a6 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o +obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_PM),y) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c new file mode 100644 index 000000000000..2d56ce501632 --- /dev/null +++ b/kernel/trace/bpf_trace.c @@ -0,0 +1,222 @@ +/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation.
+ */ +#include +#include +#include +#include +#include +#include +#include +#include "trace.h" + +static DEFINE_PER_CPU(int, bpf_prog_active); + +/** + * trace_call_bpf - invoke BPF program + * @prog: BPF program + * @ctx: opaque context pointer + * + * kprobe handlers execute BPF programs via this helper. + * Can be used from static tracepoints in the future. + * + * Return: BPF programs always return an integer which is interpreted by + * kprobe handler as: + * 0 - return from kprobe (event is filtered out) + * 1 - store kprobe event into ring buffer + * Other values are reserved and currently alias to 1 + */ +unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +{ + unsigned int ret; + + if (in_nmi()) /* not supported yet */ + return 1; + + preempt_disable(); + + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { + /* + * since some bpf program is already running on this cpu, + * don't call into another bpf program (same or different) + * and don't send kprobe event into ring-buffer, + * so return zero here + */ + ret = 0; + goto out; + } + + rcu_read_lock(); + ret = BPF_PROG_RUN(prog, ctx); + rcu_read_unlock(); + + out: + __this_cpu_dec(bpf_prog_active); + preempt_enable(); + + return ret; +} +EXPORT_SYMBOL_GPL(trace_call_bpf); + +static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + void *dst = (void *) (long) r1; + int size = (int) r2; + void *unsafe_ptr = (void *) (long) r3; + + return probe_kernel_read(dst, unsafe_ptr, size); +} + +static const struct bpf_func_proto bpf_probe_read_proto = { + .func = bpf_probe_read, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE, + .arg3_type = ARG_ANYTHING, +}; + +static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + /* NMI safe access to clock monotonic */ + return ktime_get_mono_fast_ns(); +} + +static const struct bpf_func_proto bpf_ktime_get_ns_proto = { + .func = bpf_ktime_get_ns, + .gpl_only = 
true, + .ret_type = RET_INTEGER, +}; + +/* + * limited trace_printk() + * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed + */ +static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) +{ + char *fmt = (char *) (long) r1; + int mod[3] = {}; + int fmt_cnt = 0; + int i; + + /* + * bpf_check()->check_func_arg()->check_stack_boundary() + * guarantees that fmt points to bpf program stack, + * fmt_size bytes of it were initialized and fmt_size > 0 + */ + if (fmt[--fmt_size] != 0) + return -EINVAL; + + /* check format string for allowed specifiers */ + for (i = 0; i < fmt_size; i++) { + if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) + return -EINVAL; + + if (fmt[i] != '%') + continue; + + if (fmt_cnt >= 3) + return -EINVAL; + + /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ + i++; + if (fmt[i] == 'l') { + mod[fmt_cnt]++; + i++; + } else if (fmt[i] == 'p') { + mod[fmt_cnt]++; + i++; + if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) + return -EINVAL; + fmt_cnt++; + continue; + } + + if (fmt[i] == 'l') { + mod[fmt_cnt]++; + i++; + } + + if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x') + return -EINVAL; + fmt_cnt++; + } + + return __trace_printk(1/* fake ip will not be printed */, fmt, + mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3, + mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4, + mod[2] == 2 ? r5 : mod[2] == 1 ? 
(long) r5 : (u32) r5); +} + +static const struct bpf_func_proto bpf_trace_printk_proto = { + .func = bpf_trace_printk, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE, +}; + +static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_probe_read: + return &bpf_probe_read_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; + + case BPF_FUNC_trace_printk: + /* + * this program might be calling bpf_trace_printk, + * so allocate per-cpu printk buffers + */ + trace_printk_init_buffers(); + + return &bpf_trace_printk_proto; + default: + return NULL; + } +} + +/* bpf+kprobe programs can access fields of 'struct pt_regs' */ +static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type) +{ + /* check bounds: the whole [off, off + size) access must fit in pt_regs */ + if (off < 0 || off >= sizeof(struct pt_regs) || off + size > sizeof(struct pt_regs)) + return false; + + /* only read is allowed */ + if (type != BPF_READ) + return false; + + /* disallow misaligned access */ + if (off % size != 0) + return false; + + return true; +} + +static struct bpf_verifier_ops kprobe_prog_ops = { + .get_func_proto = kprobe_prog_func_proto, + .is_valid_access = kprobe_prog_is_valid_access, +}; + +static struct bpf_prog_type_list kprobe_tl = { + .ops = &kprobe_prog_ops, + .type = BPF_PROG_TYPE_KPROBE, +}; + +static int __init register_kprobe_prog_ops(void) +{ + bpf_register_prog_type(&kprobe_tl); + return 0; +} +late_initcall(register_kprobe_prog_ops); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9ba3f43f580e..d0ce590f06e1 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1135,11 +1135,15 @@ static void kprobe_perf_func(struct trace_kprobe *tk,
struct pt_regs *regs) { struct ftrace_event_call *call = &tk->tp.call; + struct bpf_prog *prog = call->prog; struct kprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; + if (prog && !trace_call_bpf(prog, regs)) + return; + head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) return; @@ -1166,11 +1170,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { struct ftrace_event_call *call = &tk->tp.call; + struct bpf_prog *prog = call->prog; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; + if (prog && !trace_call_bpf(prog, regs)) + return; + head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) return; @@ -1287,7 +1295,7 @@ static int register_kprobe_event(struct trace_kprobe *tk) kfree(call->print_fmt); return -ENODEV; } - call->flags = 0; + call->flags = TRACE_EVENT_FL_KPROBE; call->class->reg = kprobe_register; call->data = tk; ret = trace_add_event_call(call); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 74865465e0b7..d60fe62ec4fa 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1006,7 +1006,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) return true; list_for_each_entry(event, &filter->perf_events, hw.tp_list) { - if (event->hw.tp_target->mm == mm) + if (event->hw.target->mm == mm) return true; } @@ -1016,7 +1016,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) static inline bool uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) { - return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); + return __uprobe_perf_filter(&tu->filter, event->hw.target->mm); } static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) @@ -1024,10 +1024,10 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) bool 
done; write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) { + if (event->hw.target) { list_del(&event->hw.tp_list); done = tu->filter.nr_systemwide || - (event->hw.tp_target->flags & PF_EXITING) || + (event->hw.target->flags & PF_EXITING) || uprobe_filter_event(tu, event); } else { tu->filter.nr_systemwide--; @@ -1047,7 +1047,7 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) int err; write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) { + if (event->hw.target) { /* * event->parent != NULL means copy_process(), we can avoid * uprobe_apply(). current->mm must be probed and we can rely diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 3174bf8e3538..9a056f5bc02c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -567,9 +567,37 @@ static void watchdog_nmi_disable(unsigned int cpu) cpu0_err = 0; } } + +void watchdog_nmi_enable_all(void) +{ + int cpu; + + if (!watchdog_user_enabled) + return; + + get_online_cpus(); + for_each_online_cpu(cpu) + watchdog_nmi_enable(cpu); + put_online_cpus(); +} + +void watchdog_nmi_disable_all(void) +{ + int cpu; + + if (!watchdog_running) + return; + + get_online_cpus(); + for_each_online_cpu(cpu) + watchdog_nmi_disable(cpu); + put_online_cpus(); +} #else static int watchdog_nmi_enable(unsigned int cpu) { return 0; } static void watchdog_nmi_disable(unsigned int cpu) { return; } +void watchdog_nmi_enable_all(void) {} +void watchdog_nmi_disable_all(void) {} #endif /* CONFIG_HARDLOCKUP_DETECTOR */ static struct smp_hotplug_thread watchdog_threads = { diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index b5b3600dcdf5..fe98fb226e6e 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -6,23 +6,39 @@ hostprogs-y := test_verifier test_maps hostprogs-y += sock_example hostprogs-y += sockex1 hostprogs-y += sockex2 +hostprogs-y += tracex1 +hostprogs-y += tracex2 +hostprogs-y += tracex3 +hostprogs-y += tracex4 test_verifier-objs := test_verifier.o libbpf.o 
test_maps-objs := test_maps.o libbpf.o sock_example-objs := sock_example.o libbpf.o sockex1-objs := bpf_load.o libbpf.o sockex1_user.o sockex2-objs := bpf_load.o libbpf.o sockex2_user.o +tracex1-objs := bpf_load.o libbpf.o tracex1_user.o +tracex2-objs := bpf_load.o libbpf.o tracex2_user.o +tracex3-objs := bpf_load.o libbpf.o tracex3_user.o +tracex4-objs := bpf_load.o libbpf.o tracex4_user.o # Tell kbuild to always build the programs always := $(hostprogs-y) always += sockex1_kern.o always += sockex2_kern.o +always += tracex1_kern.o +always += tracex2_kern.o +always += tracex3_kern.o +always += tracex4_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable HOSTLOADLIBES_sockex1 += -lelf HOSTLOADLIBES_sockex2 += -lelf +HOSTLOADLIBES_tracex1 += -lelf +HOSTLOADLIBES_tracex2 += -lelf +HOSTLOADLIBES_tracex3 += -lelf +HOSTLOADLIBES_tracex4 += -lelf -lrt # point this to your LLVM backend with bpf support LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h index ca0333146006..1c872bcf5a80 100644 --- a/samples/bpf/bpf_helpers.h +++ b/samples/bpf/bpf_helpers.h @@ -15,6 +15,12 @@ static int (*bpf_map_update_elem)(void *map, void *key, void *value, (void *) BPF_FUNC_map_update_elem; static int (*bpf_map_delete_elem)(void *map, void *key) = (void *) BPF_FUNC_map_delete_elem; +static int (*bpf_probe_read)(void *dst, int size, void *unsafe_ptr) = + (void *) BPF_FUNC_probe_read; +static unsigned long long (*bpf_ktime_get_ns)(void) = + (void *) BPF_FUNC_ktime_get_ns; +static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) 
= + (void *) BPF_FUNC_trace_printk; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 1831d236382b..38dac5a53b51 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -8,29 +8,70 @@ #include #include #include +#include #include #include +#include +#include +#include +#include +#include #include "libbpf.h" #include "bpf_helpers.h" #include "bpf_load.h" +#define DEBUGFS "/sys/kernel/debug/tracing/" + static char license[128]; +static int kern_version; static bool processed_sec[128]; int map_fd[MAX_MAPS]; int prog_fd[MAX_PROGS]; +int event_fd[MAX_PROGS]; int prog_cnt; static int load_and_attach(const char *event, struct bpf_insn *prog, int size) { - int fd; bool is_socket = strncmp(event, "socket", 6) == 0; + bool is_kprobe = strncmp(event, "kprobe/", 7) == 0; + bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0; + enum bpf_prog_type prog_type; + char buf[256]; + int fd, efd, err, id; + struct perf_event_attr attr = {}; - if (!is_socket) - /* tracing events tbd */ + attr.type = PERF_TYPE_TRACEPOINT; + attr.sample_type = PERF_SAMPLE_RAW; + attr.sample_period = 1; + attr.wakeup_events = 1; + + if (is_socket) { + prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + } else if (is_kprobe || is_kretprobe) { + prog_type = BPF_PROG_TYPE_KPROBE; + } else { + printf("Unknown event '%s'\n", event); return -1; + } - fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, - prog, size, license); + if (is_kprobe || is_kretprobe) { + if (is_kprobe) + event += 7; + else + event += 10; + + snprintf(buf, sizeof(buf), + "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events", + is_kprobe ? 
'p' : 'r', event, event); + err = system(buf); + if (err < 0) { + printf("failed to create kprobe '%s' error '%s'\n", + event, strerror(errno)); + return -1; + } + } + + fd = bpf_prog_load(prog_type, prog, size, license, kern_version); if (fd < 0) { printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf); @@ -39,6 +80,41 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_fd[prog_cnt++] = fd; + if (is_socket) + return 0; + + strcpy(buf, DEBUGFS); + strcat(buf, "events/kprobes/"); + strcat(buf, event); + strcat(buf, "/id"); + + efd = open(buf, O_RDONLY, 0); + if (efd < 0) { + printf("failed to open event %s\n", event); + return -1; + } + + err = read(efd, buf, sizeof(buf)); + if (err < 0 || err >= sizeof(buf)) { + printf("read from '%s' failed '%s'\n", event, strerror(errno)); + return -1; + } + + close(efd); + + buf[err] = 0; + id = atoi(buf); + attr.config = id; + + efd = perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0); + if (efd < 0) { + printf("event %d fd %d err %s\n", id, efd, strerror(errno)); + return -1; + } + event_fd[prog_cnt - 1] = efd; + ioctl(efd, PERF_EVENT_IOC_ENABLE, 0); + ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd); + return 0; } @@ -135,6 +211,9 @@ int load_bpf_file(char *path) if (gelf_getehdr(elf, &ehdr) != &ehdr) return 1; + /* clear all kprobes */ + i = system("echo \"\" > /sys/kernel/debug/tracing/kprobe_events"); + /* scan over all elf sections to get license and map info */ for (i = 1; i < ehdr.e_shnum; i++) { @@ -149,6 +228,14 @@ int load_bpf_file(char *path) if (strcmp(shname, "license") == 0) { processed_sec[i] = true; memcpy(license, data->d_buf, data->d_size); + } else if (strcmp(shname, "version") == 0) { + processed_sec[i] = true; + if (data->d_size != sizeof(int)) { + printf("invalid size of version section %zd\n", + data->d_size); + return 1; + } + memcpy(&kern_version, data->d_buf, sizeof(int)); } else if (strcmp(shname, "maps") == 0) { processed_sec[i] = true; if 
(load_maps(data->d_buf, data->d_size)) @@ -178,7 +265,8 @@ int load_bpf_file(char *path) if (parse_relo_and_apply(data, symbols, &shdr, insns)) continue; - if (memcmp(shname_prog, "events/", 7) == 0 || + if (memcmp(shname_prog, "kprobe/", 7) == 0 || + memcmp(shname_prog, "kretprobe/", 10) == 0 || memcmp(shname_prog, "socket", 6) == 0) load_and_attach(shname_prog, insns, data_prog->d_size); } @@ -193,7 +281,8 @@ int load_bpf_file(char *path) if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) continue; - if (memcmp(shname, "events/", 7) == 0 || + if (memcmp(shname, "kprobe/", 7) == 0 || + memcmp(shname, "kretprobe/", 10) == 0 || memcmp(shname, "socket", 6) == 0) load_and_attach(shname, data->d_buf, data->d_size); } @@ -201,3 +290,23 @@ int load_bpf_file(char *path) close(fd); return 0; } + +void read_trace_pipe(void) +{ + int trace_fd; + + trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0); + if (trace_fd < 0) + return; + + while (1) { + static char buf[4096]; + ssize_t sz; + + sz = read(trace_fd, buf, sizeof(buf) - 1); /* keep one byte for the terminating NUL */ + if (sz > 0) { + buf[sz] = 0; + puts(buf); + } + } +} diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h index 27789a34f5e6..cbd7c2b532b9 100644 --- a/samples/bpf/bpf_load.h +++ b/samples/bpf/bpf_load.h @@ -6,6 +6,7 @@ extern int map_fd[MAX_MAPS]; extern int prog_fd[MAX_PROGS]; +extern int event_fd[MAX_PROGS]; /* parses elf file compiled by llvm .c->.o * .
parses 'maps' section and creates maps via BPF syscall @@ -21,4 +22,6 @@ extern int prog_fd[MAX_PROGS]; */ int load_bpf_file(char *path); +void read_trace_pipe(void); + #endif diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c index 46d50b7ddf79..7e1efa7e2ed7 100644 --- a/samples/bpf/libbpf.c +++ b/samples/bpf/libbpf.c @@ -81,7 +81,7 @@ char bpf_log_buf[LOG_BUF_SIZE]; int bpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, int prog_len, - const char *license) + const char *license, int kern_version) { union bpf_attr attr = { .prog_type = prog_type, @@ -93,6 +93,11 @@ int bpf_prog_load(enum bpf_prog_type prog_type, .log_level = 1, }; + /* assign one field outside of struct init to make sure any + * padding is zero initialized + */ + attr.kern_version = kern_version; + bpf_log_buf[0] = 0; return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); @@ -121,3 +126,10 @@ int open_raw_sock(const char *name) return sock; } + +int perf_event_open(struct perf_event_attr *attr, int pid, int cpu, + int group_fd, unsigned long flags) +{ + return syscall(__NR_perf_event_open, attr, pid, cpu, + group_fd, flags); +} diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h index 58c5fe1bdba1..ac7b09672b46 100644 --- a/samples/bpf/libbpf.h +++ b/samples/bpf/libbpf.h @@ -13,7 +13,7 @@ int bpf_get_next_key(int fd, void *key, void *next_key); int bpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, int insn_len, - const char *license); + const char *license, int kern_version); #define LOG_BUF_SIZE 65536 extern char bpf_log_buf[LOG_BUF_SIZE]; @@ -182,4 +182,7 @@ extern char bpf_log_buf[LOG_BUF_SIZE]; /* create RAW socket and bind to interface 'name' */ int open_raw_sock(const char *name); +struct perf_event_attr; +int perf_event_open(struct perf_event_attr *attr, int pid, int cpu, + int group_fd, unsigned long flags); #endif diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c index c8ad0404416f..a0ce251c5390 100644 --- 
a/samples/bpf/sock_example.c +++ b/samples/bpf/sock_example.c @@ -56,7 +56,7 @@ static int test_sock(void) }; prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, sizeof(prog), - "GPL"); + "GPL", 0); if (prog_fd < 0) { printf("failed to load prog '%s'\n", strerror(errno)); goto cleanup; diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c index b96175e90363..740ce97cda5e 100644 --- a/samples/bpf/test_verifier.c +++ b/samples/bpf/test_verifier.c @@ -689,7 +689,7 @@ static int test(void) prog_fd = bpf_prog_load(BPF_PROG_TYPE_UNSPEC, prog, prog_len * sizeof(struct bpf_insn), - "GPL"); + "GPL", 0); if (tests[i].result == ACCEPT) { if (prog_fd < 0) { diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c new file mode 100644 index 000000000000..31620463701a --- /dev/null +++ b/samples/bpf/tracex1_kern.c @@ -0,0 +1,50 @@ +/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include "bpf_helpers.h" + +#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;}) + +/* kprobe is NOT a stable ABI + * kernel functions can be removed, renamed or completely change semantics. + * Number of arguments and their positions can change, etc. + * In such case this bpf+kprobe example will no longer be meaningful + */ +SEC("kprobe/__netif_receive_skb_core") +int bpf_prog1(struct pt_regs *ctx) +{ + /* attaches to kprobe __netif_receive_skb_core, + * looks for packets on loopback device and prints them + */ + char devname[IFNAMSIZ] = {}; + struct net_device *dev; + struct sk_buff *skb; + int len; + + /* non-portable!
works for the given kernel only */ + skb = (struct sk_buff *) ctx->di; + + dev = _(skb->dev); + + len = _(skb->len); + + bpf_probe_read(devname, sizeof(devname), dev->name); + + if (devname[0] == 'l' && devname[1] == 'o') { + char fmt[] = "skb %p len %d\n"; + /* using bpf_trace_printk() for DEBUG ONLY */ + bpf_trace_printk(fmt, sizeof(fmt), skb, len); + } + + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c new file mode 100644 index 000000000000..31a48183beea --- /dev/null +++ b/samples/bpf/tracex1_user.c @@ -0,0 +1,25 @@ +#include +#include +#include +#include "libbpf.h" +#include "bpf_load.h" + +int main(int ac, char **argv) +{ + FILE *f; + char filename[256]; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + f = popen("taskset 1 ping -c5 localhost", "r"); + (void) f; + + read_trace_pipe(); + + return 0; +} diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c new file mode 100644 index 000000000000..19ec1cfc45db --- /dev/null +++ b/samples/bpf/tracex2_kern.c @@ -0,0 +1,86 @@ +/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include "bpf_helpers.h" + +struct bpf_map_def SEC("maps") my_map = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(long), + .value_size = sizeof(long), + .max_entries = 1024, +}; + +/* kprobe is NOT a stable ABI. 
If kernel internals change this bpf+kprobe + * example will no longer be meaningful + */ +SEC("kprobe/kfree_skb") +int bpf_prog2(struct pt_regs *ctx) +{ + long loc = 0; + long init_val = 1; + long *value; + + /* x64 specific: read ip of kfree_skb caller. + * non-portable version of __builtin_return_address(0) + */ + bpf_probe_read(&loc, sizeof(loc), (void *)ctx->sp); + + value = bpf_map_lookup_elem(&my_map, &loc); + if (value) + *value += 1; + else + bpf_map_update_elem(&my_map, &loc, &init_val, BPF_ANY); + return 0; +} + +static unsigned int log2(unsigned int v) +{ + unsigned int r; + unsigned int shift; + + r = (v > 0xFFFF) << 4; v >>= r; + shift = (v > 0xFF) << 3; v >>= shift; r |= shift; + shift = (v > 0xF) << 2; v >>= shift; r |= shift; + shift = (v > 0x3) << 1; v >>= shift; r |= shift; + r |= (v >> 1); + return r; +} + +static unsigned int log2l(unsigned long v) +{ + unsigned int hi = v >> 32; + if (hi) + return log2(hi) + 32; + else + return log2(v); +} + +struct bpf_map_def SEC("maps") my_hist_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(long), + .max_entries = 64, +}; + +SEC("kprobe/sys_write") +int bpf_prog3(struct pt_regs *ctx) +{ + long write_size = ctx->dx; /* arg3 */ + long init_val = 1; + long *value; + u32 index = log2l(write_size); + + value = bpf_map_lookup_elem(&my_hist_map, &index); + if (value) + __sync_fetch_and_add(value, 1); + return 0; +} +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c new file mode 100644 index 000000000000..91b8d0896fbb --- /dev/null +++ b/samples/bpf/tracex2_user.c @@ -0,0 +1,95 @@ +#include +#include +#include +#include +#include +#include "libbpf.h" +#include "bpf_load.h" + +#define MAX_INDEX 64 +#define MAX_STARS 38 + +static void stars(char *str, long val, long max, int width) +{ + int i; + + for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++) + str[i] 
= '*'; + if (val > max) + str[i - 1] = '+'; + str[i] = '\0'; +} + +static void print_hist(int fd) +{ + int key; + long value; + long data[MAX_INDEX] = {}; + char starstr[MAX_STARS]; + int i; + int max_ind = -1; + long max_value = 0; + + for (key = 0; key < MAX_INDEX; key++) { + bpf_lookup_elem(fd, &key, &value); + data[key] = value; + if (value && key > max_ind) + max_ind = key; + if (value > max_value) + max_value = value; + } + + printf(" syscall write() stats\n"); + printf(" byte_size : count distribution\n"); + for (i = 1; i <= max_ind + 1; i++) { + stars(starstr, data[i - 1], max_value, MAX_STARS); + printf("%8ld -> %-8ld : %-8ld |%-*s|\n", + (1l << i) >> 1, (1l << i) - 1, data[i - 1], + MAX_STARS, starstr); + } +} +static void int_exit(int sig) +{ + print_hist(map_fd[1]); + exit(0); +} + +int main(int ac, char **argv) +{ + char filename[256]; + long key, next_key, value; + FILE *f; + int i; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + signal(SIGINT, int_exit); + + /* start 'ping' in the background to have some kfree_skb events */ + f = popen("ping -c5 localhost", "r"); + (void) f; + + /* start 'dd' in the background to have plenty of 'write' syscalls */ + f = popen("dd if=/dev/zero of=/dev/null count=5000000", "r"); + (void) f; + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + for (i = 0; i < 5; i++) { + key = 0; + while (bpf_get_next_key(map_fd[0], &key, &next_key) == 0) { + bpf_lookup_elem(map_fd[0], &next_key, &value); + printf("location 0x%lx count %ld\n", next_key, value); + key = next_key; + } + if (key) + printf("\n"); + sleep(1); + } + print_hist(map_fd[1]); + + return 0; +} diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c new file mode 100644 index 000000000000..255ff2792366 --- /dev/null +++ b/samples/bpf/tracex3_kern.c @@ -0,0 +1,89 @@ +/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify 
it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include "bpf_helpers.h" + +struct bpf_map_def SEC("maps") my_map = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(long), + .value_size = sizeof(u64), + .max_entries = 4096, +}; + +/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe + * example will no longer be meaningful + */ +SEC("kprobe/blk_mq_start_request") +int bpf_prog1(struct pt_regs *ctx) +{ + long rq = ctx->di; + u64 val = bpf_ktime_get_ns(); + + bpf_map_update_elem(&my_map, &rq, &val, BPF_ANY); + return 0; +} + +static unsigned int log2l(unsigned long long n) +{ +#define S(k) if (n >= (1ull << k)) { i += k; n >>= k; } + int i = -(n == 0); + S(32); S(16); S(8); S(4); S(2); S(1); + return i; +#undef S +} + +#define SLOTS 100 + +struct bpf_map_def SEC("maps") lat_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u64), + .max_entries = SLOTS, +}; + +SEC("kprobe/blk_update_request") +int bpf_prog2(struct pt_regs *ctx) +{ + long rq = ctx->di; + u64 *value, l, base; + u32 index; + + value = bpf_map_lookup_elem(&my_map, &rq); + if (!value) + return 0; + + u64 cur_time = bpf_ktime_get_ns(); + u64 delta = cur_time - *value; + + bpf_map_delete_elem(&my_map, &rq); + + /* the lines below are computing index = log10(delta)*10 + * using integer arithmetic + * index = 29 ~ 1 usec + * index = 59 ~ 1 msec + * index = 89 ~ 1 sec + * index = 99 ~ 10sec or more + * log10(x)*10 = log2(x)*10/log2(10) = log2(x)*3 + */ + l = log2l(delta); + base = 1ll << l; + index = (l * 64 + (delta - base) * 64 / base) * 3 / 64; + + if (index >= SLOTS) + index = SLOTS - 1; + + value = bpf_map_lookup_elem(&lat_map, &index); + if (value) + __sync_fetch_and_add((long *)value, 1); + + return 0; +} +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git 
a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c new file mode 100644 index 000000000000..0aaa933ab938 --- /dev/null +++ b/samples/bpf/tracex3_user.c @@ -0,0 +1,150 @@ +/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include "libbpf.h" +#include "bpf_load.h" + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) + +#define SLOTS 100 + +static void clear_stats(int fd) +{ + __u32 key; + __u64 value = 0; + + for (key = 0; key < SLOTS; key++) + bpf_update_elem(fd, &key, &value, BPF_ANY); +} + +const char *color[] = { + "\033[48;5;255m", + "\033[48;5;252m", + "\033[48;5;250m", + "\033[48;5;248m", + "\033[48;5;246m", + "\033[48;5;244m", + "\033[48;5;242m", + "\033[48;5;240m", + "\033[48;5;238m", + "\033[48;5;236m", + "\033[48;5;234m", + "\033[48;5;232m", +}; +const int num_colors = ARRAY_SIZE(color); + +const char nocolor[] = "\033[00m"; + +const char *sym[] = { + " ", + " ", + ".", + ".", + "*", + "*", + "o", + "o", + "O", + "O", + "#", + "#", +}; + +bool full_range = false; +bool text_only = false; + +static void print_banner(void) +{ + if (full_range) + printf("|1ns |10ns |100ns |1us |10us |100us" + " |1ms |10ms |100ms |1s |10s\n"); + else + printf("|1us |10us |100us |1ms |10ms " + "|100ms |1s |10s\n"); +} + +static void print_hist(int fd) +{ + __u32 key; + __u64 value; + __u64 cnt[SLOTS]; + __u64 max_cnt = 0; + __u64 total_events = 0; + + for (key = 0; key < SLOTS; key++) { + value = 0; + bpf_lookup_elem(fd, &key, &value); + cnt[key] = value; + total_events += value; + if (value > max_cnt) + max_cnt = value; + } + clear_stats(fd); + for (key = full_range ? 
0 : 29; key < SLOTS; key++) { + int c = num_colors * cnt[key] / (max_cnt + 1); + + if (text_only) + printf("%s", sym[c]); + else + printf("%s %s", color[c], nocolor); + } + printf(" # %lld\n", total_events); +} + +int main(int ac, char **argv) +{ + char filename[256]; + int i; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + for (i = 1; i < ac; i++) { + if (strcmp(argv[i], "-a") == 0) { + full_range = true; + } else if (strcmp(argv[i], "-t") == 0) { + text_only = true; + } else if (strcmp(argv[i], "-h") == 0) { + printf("Usage:\n" + " -a display wider latency range\n" + " -t text only\n"); + return 1; + } + } + + printf(" heatmap of IO latency\n"); + if (text_only) + printf(" %s", sym[num_colors - 1]); + else + printf(" %s %s", color[num_colors - 1], nocolor); + printf(" - many events with this latency\n"); + + if (text_only) + printf(" %s", sym[0]); + else + printf(" %s %s", color[0], nocolor); + printf(" - few events\n"); + + for (i = 0; ; i++) { + if (i % 20 == 0) + print_banner(); + print_hist(map_fd[1]); + sleep(2); + } + + return 0; +} diff --git a/samples/bpf/tracex4_kern.c b/samples/bpf/tracex4_kern.c new file mode 100644 index 000000000000..126b80512228 --- /dev/null +++ b/samples/bpf/tracex4_kern.c @@ -0,0 +1,54 @@ +/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include "bpf_helpers.h" + +struct pair { + u64 val; + u64 ip; +}; + +struct bpf_map_def SEC("maps") my_map = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(long), + .value_size = sizeof(struct pair), + .max_entries = 1000000, +}; + +/* kprobe is NOT a stable ABI. 
If kernel internals change this bpf+kprobe + * example will no longer be meaningful + */ +SEC("kprobe/kmem_cache_free") +int bpf_prog1(struct pt_regs *ctx) +{ + long ptr = ctx->si; + + bpf_map_delete_elem(&my_map, &ptr); + return 0; +} + +SEC("kretprobe/kmem_cache_alloc_node") +int bpf_prog2(struct pt_regs *ctx) +{ + long ptr = ctx->ax; + long ip = 0; + + /* get ip address of kmem_cache_alloc_node() caller */ + bpf_probe_read(&ip, sizeof(ip), (void *)(ctx->bp + sizeof(ip))); + + struct pair v = { + .val = bpf_ktime_get_ns(), + .ip = ip, + }; + + bpf_map_update_elem(&my_map, &ptr, &v, BPF_ANY); + return 0; +} +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c new file mode 100644 index 000000000000..bc4a3bdea6ed --- /dev/null +++ b/samples/bpf/tracex4_user.c @@ -0,0 +1,69 @@ +/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "libbpf.h" +#include "bpf_load.h" + +struct pair { + long long val; + __u64 ip; +}; + +static __u64 time_get_ns(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * 1000000000ull + ts.tv_nsec; +} + +static void print_old_objects(int fd) +{ + long long val = time_get_ns(); + __u64 key, next_key; + struct pair v; + + key = write(1, "\e[1;1H\e[2J", 10); /* clear screen; 10 == strlen of the escape sequence */ + + key = -1; + while (bpf_get_next_key(fd, &key, &next_key) == 0) { + bpf_lookup_elem(fd, &next_key, &v); + key = next_key; + if (val - v.val < 1000000000ll) + /* object is less than 1 sec old: skip it */ + continue; + printf("obj 0x%llx is %2lldsec old was allocated at ip %llx\n", + next_key, (val - v.val) / 1000000000ll, v.ip); + } +} + +int main(int ac, char **argv) +{ + char filename[256]; + int i; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + for (i = 0; ; i++) { + print_old_objects(map_fd[0]); + sleep(1); + } + + return 0; +} diff --git a/tools/build/Build.include b/tools/build/Build.include new file mode 100644 index 000000000000..4c8daaccb82a --- /dev/null +++ b/tools/build/Build.include @@ -0,0 +1,81 @@ +### +# build: Generic definitions +# +# Lots of this code have been borrowed or heavily inspired from parts +# of kbuild code, which is not credited, but mostly developed by: +# +# Copyright (C) Sam Ravnborg , 2015 +# Copyright (C) Linus Torvalds , 2015 +# + +### +# Convenient variables +comma := , +squote := ' + +### +# Name of target with a '.' as filename prefix.
foo/bar.o => foo/.bar.o +dot-target = $(dir $@).$(notdir $@) + +### +# filename of target with directory and extension stripped +basetarget = $(basename $(notdir $@)) + +### +# The temporary file to save gcc -MD generated dependencies must not +# contain a comma +depfile = $(subst $(comma),_,$(dot-target).d) + +### +# Check if both arguments has same arguments. Result is empty string if equal. +arg-check = $(strip $(filter-out $(cmd_$(1)), $(cmd_$@)) \ + $(filter-out $(cmd_$@), $(cmd_$(1))) ) + +### +# Escape single quote for use in echo statements +escsq = $(subst $(squote),'\$(squote)',$1) + +# Echo command +# Short version is used, if $(quiet) equals `quiet_', otherwise full one. +echo-cmd = $(if $($(quiet)cmd_$(1)),\ + echo ' $(call escsq,$($(quiet)cmd_$(1)))';) + +### +# Replace >$< with >$$< to preserve $ when reloading the .cmd file +# (needed for make) +# Replace >#< with >\#< to avoid starting a comment in the .cmd file +# (needed for make) +# Replace >'< with >'\''< to be able to enclose the whole string in '...' +# (needed for the shell) +make-cmd = $(call escsq,$(subst \#,\\\#,$(subst $$,$$$$,$(cmd_$(1))))) + +### +# Find any prerequisites that is newer than target or that does not exist. +# PHONY targets skipped in both cases. +any-prereq = $(filter-out $(PHONY),$?) 
$(filter-out $(PHONY) $(wildcard $^),$^) + +### +# if_changed_dep - execute command if any prerequisite is newer than +# target, or command line has changed and update +# dependencies in the cmd file +if_changed_dep = $(if $(strip $(any-prereq) $(arg-check)), \ + @set -e; \ + $(echo-cmd) $(cmd_$(1)); \ + cat $(depfile) > $(dot-target).cmd; \ + printf '%s\n' 'cmd_$@ := $(make-cmd)' >> $(dot-target).cmd) + +# if_changed - execute command if any prerequisite is newer than +# target, or command line has changed +if_changed = $(if $(strip $(any-prereq) $(arg-check)), \ + @set -e; \ + $(echo-cmd) $(cmd_$(1)); \ + printf '%s\n' 'cmd_$@ := $(make-cmd)' > $(dot-target).cmd) + +### +# C flags to be used in rule definitions, includes: +# - depfile generation +# - global $(CFLAGS) +# - per target C flags +# - per object C flags +# - BUILD_STR macro to allow '-D"$(variable)"' constructs +c_flags = -Wp,-MD,$(depfile),-MT,$@ $(CFLAGS) -D"BUILD_STR(s)=\#s" $(CFLAGS_$(basetarget).o) $(CFLAGS_$(obj)) diff --git a/tools/build/Documentation/Build.txt b/tools/build/Documentation/Build.txt new file mode 100644 index 000000000000..00ad2d608727 --- /dev/null +++ b/tools/build/Documentation/Build.txt @@ -0,0 +1,139 @@ +Build Framework +=============== + +The perf build framework was adopted from the kernel build system, hence the +idea and the way how objects are built is the same. + +Basically the user provides set of 'Build' files that list objects and +directories to nest for specific target to be build. + +Unlike the kernel we don't have a single build object 'obj-y' list that where +we setup source objects, but we support more. This allows one 'Build' file to +carry a sources list for multiple build objects. 
+ +a) Build framework makefiles +---------------------------- + +The build framework consists of 2 Makefiles: + + Build.include + Makefile.build + +While the 'Build.include' file contains just some generic definitions, the +'Makefile.build' file is the makefile used from the outside. Its +interface/usage is the following: + + $ make -f tools/build/Makefile.build srctree=$(KSRC) dir=$(DIR) obj=$(OBJECT) + +where: + + KSRC - is the path to kernel sources + DIR - is the path to the project to be built + OBJECT - is the name of the build object + +When successfully finished, the $(DIR) directory contains the final object file +called $(OBJECT)-in.o: + + $ ls $(DIR)/$(OBJECT)-in.o + +which includes all compiled sources described in 'Build' makefiles. + +a) Build makefiles +------------------ + +The user supplies 'Build' makefiles that contain an objects list, and connect +the build to nested directories. + +Assume we have the following project structure: + + ex/a.c + /b.c + /c.c + /d.c + /arch/e.c + /arch/f.c + +Out of which you build the 'ex' binary and the 'libex.a' library: + + 'ex' - consists of 'a.o', 'b.o' and libex.a + 'libex.a' - consists of 'c.o', 'd.o', 'e.o' and 'f.o' + +The build framework does not create the 'ex' and 'libex.a' binaries for you, it +only prepares proper objects to be compiled and grouped together. + +To follow the above example, the user provides the following 'Build' files: + + ex/Build: + ex-y += a.o + ex-y += b.o + + libex-y += c.o + libex-y += d.o + libex-y += arch/ + + ex/arch/Build: + libex-y += e.o + libex-y += f.o + +and runs: + + $ make -f tools/build/Makefile.build dir=. obj=ex + $ make -f tools/build/Makefile.build dir=. obj=libex + +which creates the following objects: + + ex/ex-in.o + ex/libex-in.o + +that contain the objects requested in the Build files.
+ +It's only a matter of 2 single commands to create the final binaries: + + $ ar rcs libex.a libex-in.o + $ gcc -o ex ex-in.o libex.a + +You can check the 'ex' example in 'tools/build/tests/ex' for more details. + +b) Rules +-------- + +The build framework provides standard compilation rules to handle .S and .c +compilation. + +It's possible to include a special rule if needed (like we do for flex or bison +code generation). + +c) CFLAGS +--------- + +It's possible to alter the standard object C flags in the following way: + + CFLAGS_perf.o += '...' - alters CFLAGS for perf.o object + CFLAGS_gtk += '...' - alters CFLAGS for gtk build object + +These C flags changes have the scope of the Build makefile they are defined in. + + +d) Dependencies +--------------- + +For each built object file 'a.o' the '.a.cmd' is created and holds: + + - Command line used to build that object + (for each object) + + - Dependency rules generated by 'gcc -Wp,-MD,...' + (for compiled object) + +All existing '.cmd' files are included in the Build process to properly follow +the dependencies and trigger a rebuild when necessary. + + +e) Single rules +--------------- + +It's possible to build a single object file by choice, like: + + $ make util/map.o # objects + $ make util/map.i # preprocessor + $ make util/map.s # assembly diff --git a/tools/build/Makefile.build b/tools/build/Makefile.build new file mode 100644 index 000000000000..10df57237a66 --- /dev/null +++ b/tools/build/Makefile.build @@ -0,0 +1,130 @@ +### +# Main build makefile.
+# +# Lots of this code have been borrowed or heavily inspired from parts +# of kbuild code, which is not credited, but mostly developed by: +# +# Copyright (C) Sam Ravnborg , 2015 +# Copyright (C) Linus Torvalds , 2015 +# + +PHONY := __build +__build: + +ifeq ($(V),1) + quiet = + Q = +else + quiet=quiet_ + Q=@ +endif + +build-dir := $(srctree)/tools/build + +# Generic definitions +include $(build-dir)/Build.include + +# do not force detected configuration +-include .config-detected + +# Init all relevant variables used in build files so +# 1) they have correct type +# 2) they do not inherit any value from the environment +subdir-y := +obj-y := +subdir-y := +subdir-obj-y := + +# Build definitions +build-file := $(dir)/Build +include $(build-file) + +quiet_cmd_flex = FLEX $@ +quiet_cmd_bison = BISON $@ + +# Create directory unless it exists +quiet_cmd_mkdir = MKDIR $(dir $@) + cmd_mkdir = mkdir -p $(dir $@) + rule_mkdir = $(if $(wildcard $(dir $@)),,@$(call echo-cmd,mkdir) $(cmd_mkdir)) + +# Compile command +quiet_cmd_cc_o_c = CC $@ + cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $< + +quiet_cmd_cc_i_c = CPP $@ + cmd_cc_i_c = $(CC) $(c_flags) -E -o $@ $< + +quiet_cmd_cc_s_c = AS $@ + cmd_cc_s_c = $(CC) $(c_flags) -S -o $@ $< + +# Link agregate command +# If there's nothing to link, create empty $@ object. 
+quiet_cmd_ld_multi = LD $@ + cmd_ld_multi = $(if $(strip $(obj-y)),\ + $(LD) -r -o $@ $(obj-y),rm -f $@; $(AR) rcs $@) + +# Build rules +$(OUTPUT)%.o: %.c FORCE + $(call rule_mkdir) + $(call if_changed_dep,cc_o_c) + +$(OUTPUT)%.o: %.S FORCE + $(call rule_mkdir) + $(call if_changed_dep,cc_o_c) + +$(OUTPUT)%.i: %.c FORCE + $(call rule_mkdir) + $(call if_changed_dep,cc_i_c) + +$(OUTPUT)%.i: %.S FORCE + $(call rule_mkdir) + $(call if_changed_dep,cc_i_c) + +$(OUTPUT)%.s: %.c FORCE + $(call rule_mkdir) + $(call if_changed_dep,cc_s_c) + +# Gather build data: +# obj-y - list of build objects +# subdir-y - list of directories to nest +# subdir-obj-y - list of directories objects 'dir/$(obj)-in.o' +obj-y := $($(obj)-y) +subdir-y := $(patsubst %/,%,$(filter %/, $(obj-y))) +obj-y := $(patsubst %/, %/$(obj)-in.o, $(obj-y)) +subdir-obj-y := $(filter %/$(obj)-in.o, $(obj-y)) + +# '$(OUTPUT)/dir' prefix to all objects +prefix := $(subst ./,,$(OUTPUT)$(dir)/) +obj-y := $(addprefix $(prefix),$(obj-y)) +subdir-obj-y := $(addprefix $(prefix),$(subdir-obj-y)) + +# Final '$(obj)-in.o' object +in-target := $(prefix)$(obj)-in.o + +PHONY += $(subdir-y) + +$(subdir-y): + $(Q)$(MAKE) -f $(build-dir)/Makefile.build dir=$(dir)/$@ obj=$(obj) + +$(sort $(subdir-obj-y)): $(subdir-y) ; + +$(in-target): $(obj-y) FORCE + $(call rule_mkdir) + $(call if_changed,ld_multi) + +__build: $(in-target) + @: + +PHONY += FORCE +FORCE: + +# Include all cmd files to get all the dependency rules +# for all objects included +targets := $(wildcard $(sort $(obj-y) $(in-target) $(MAKECMDGOALS))) +cmd_files := $(wildcard $(foreach f,$(targets),$(dir $(f)).$(notdir $(f)).cmd)) + +ifneq ($(cmd_files),) + include $(cmd_files) +endif + +.PHONY: $(PHONY) diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature new file mode 100644 index 000000000000..3a0b0ca2a28c --- /dev/null +++ b/tools/build/Makefile.feature @@ -0,0 +1,171 @@ +feature_dir := $(srctree)/tools/build/feature + +ifneq ($(OUTPUT),) + 
OUTPUT_FEATURES = $(OUTPUT)feature/ + $(shell mkdir -p $(OUTPUT_FEATURES)) +endif + +feature_check = $(eval $(feature_check_code)) +define feature_check_code + feature-$(1) := $(shell $(MAKE) OUTPUT=$(OUTPUT_FEATURES) CFLAGS="$(EXTRA_CFLAGS) $(FEATURE_CHECK_CFLAGS-$(1))" LDFLAGS="$(LDFLAGS) $(FEATURE_CHECK_LDFLAGS-$(1))" -C $(feature_dir) test-$1.bin >/dev/null 2>/dev/null && echo 1 || echo 0) +endef + +feature_set = $(eval $(feature_set_code)) +define feature_set_code + feature-$(1) := 1 +endef + +# +# Build the feature check binaries in parallel, ignore errors, ignore return value and suppress output: +# + +# +# Note that this is not a complete list of all feature tests, just +# those that are typically built on a fully configured system. +# +# [ Feature tests not mentioned here have to be built explicitly in +# the rule that uses them - an example for that is the 'bionic' +# feature check. ] +# +FEATURE_TESTS = \ + backtrace \ + dwarf \ + fortify-source \ + sync-compare-and-swap \ + glibc \ + gtk2 \ + gtk2-infobar \ + libaudit \ + libbfd \ + libelf \ + libelf-getphdrnum \ + libelf-mmap \ + libnuma \ + libperl \ + libpython \ + libpython-version \ + libslang \ + libunwind \ + pthread-attr-setaffinity-np \ + stackprotector-all \ + timerfd \ + libdw-dwarf-unwind \ + zlib \ + lzma + +FEATURE_DISPLAY = \ + dwarf \ + glibc \ + gtk2 \ + libaudit \ + libbfd \ + libelf \ + libnuma \ + libperl \ + libpython \ + libslang \ + libunwind \ + libdw-dwarf-unwind \ + zlib \ + lzma + +# Set FEATURE_CHECK_(C|LD)FLAGS-all for all FEATURE_TESTS features. +# If in the future we need per-feature checks/flags for features not +# mentioned in this list we need to refactor this ;-). 
+set_test_all_flags = $(eval $(set_test_all_flags_code)) +define set_test_all_flags_code + FEATURE_CHECK_CFLAGS-all += $(FEATURE_CHECK_CFLAGS-$(1)) + FEATURE_CHECK_LDFLAGS-all += $(FEATURE_CHECK_LDFLAGS-$(1)) +endef + +$(foreach feat,$(FEATURE_TESTS),$(call set_test_all_flags,$(feat))) + +# +# Special fast-path for the 'all features are available' case: +# +$(call feature_check,all,$(MSG)) + +# +# Just in case the build freshly failed, make sure we print the +# feature matrix: +# +ifeq ($(feature-all), 1) + # + # test-all.c passed - just set all the core feature flags to 1: + # + $(foreach feat,$(FEATURE_TESTS),$(call feature_set,$(feat))) +else + $(shell $(MAKE) OUTPUT=$(OUTPUT_FEATURES) CFLAGS="$(EXTRA_CFLAGS)" LDFLAGS=$(LDFLAGS) -i -j -C $(feature_dir) $(addsuffix .bin,$(FEATURE_TESTS)) >/dev/null 2>&1) + $(foreach feat,$(FEATURE_TESTS),$(call feature_check,$(feat))) +endif + +# +# Print the result of the feature test: +# +feature_print_status = $(eval $(feature_print_status_code)) $(info $(MSG)) + +define feature_print_status_code + ifeq ($(feature-$(1)), 1) + MSG = $(shell printf '...%30s: [ \033[32mon\033[m ]' $(1)) + else + MSG = $(shell printf '...%30s: [ \033[31mOFF\033[m ]' $(1)) + endif +endef + +feature_print_text = $(eval $(feature_print_text_code)) $(info $(MSG)) +define feature_print_text_code + MSG = $(shell printf '...%30s: %s' $(1) $(2)) +endef + +FEATURE_DUMP := $(foreach feat,$(FEATURE_DISPLAY),feature-$(feat)($(feature-$(feat)))) +FEATURE_DUMP_FILE := $(shell touch $(OUTPUT)FEATURE-DUMP; cat $(OUTPUT)FEATURE-DUMP) + +ifeq ($(dwarf-post-unwind),1) + FEATURE_DUMP += dwarf-post-unwind($(dwarf-post-unwind-text)) +endif + +# The $(feature_display) controls the default detection message +# output. 
It's set if: +# - detected features differ from stored features from +# last build (in FEATURE-DUMP file) +# - one of the $(FEATURE_DISPLAY) is not detected +# - VF is enabled + +ifneq ("$(FEATURE_DUMP)","$(FEATURE_DUMP_FILE)") + $(shell echo "$(FEATURE_DUMP)" > $(OUTPUT)FEATURE-DUMP) + feature_display := 1 +endif + +feature_display_check = $(eval $(feature_display_check_code)) +define feature_display_check_code + ifneq ($(feature-$(1)), 1) + feature_display := 1 + endif +endef + +$(foreach feat,$(FEATURE_DISPLAY),$(call feature_display_check,$(feat))) + +ifeq ($(VF),1) + feature_display := 1 + feature_verbose := 1 +endif + +ifeq ($(feature_display),1) + $(info ) + $(info Auto-detecting system features:) + $(foreach feat,$(FEATURE_DISPLAY),$(call feature_print_status,$(feat),)) + + ifeq ($(dwarf-post-unwind),1) + $(call feature_print_text,"DWARF post unwind library", $(dwarf-post-unwind-text)) + endif + + ifneq ($(feature_verbose),1) + $(info ) + endif +endif + +ifeq ($(feature_verbose),1) + TMP := $(filter-out $(FEATURE_DISPLAY),$(FEATURE_TESTS)) + $(foreach feat,$(TMP),$(call feature_print_status,$(feat),)) + $(info ) +endif diff --git a/tools/perf/config/feature-checks/.gitignore b/tools/build/feature/.gitignore similarity index 52% rename from tools/perf/config/feature-checks/.gitignore rename to tools/build/feature/.gitignore index 80f3da0c3515..09b335b98842 100644 --- a/tools/perf/config/feature-checks/.gitignore +++ b/tools/build/feature/.gitignore @@ -1,2 +1,3 @@ *.d *.bin +*.output diff --git a/tools/perf/config/feature-checks/Makefile b/tools/build/feature/Makefile similarity index 72% rename from tools/perf/config/feature-checks/Makefile rename to tools/build/feature/Makefile index b32ff3372514..463ed8f2a267 100644 --- a/tools/perf/config/feature-checks/Makefile +++ b/tools/build/feature/Makefile @@ -29,33 +29,36 @@ FILES= \ test-stackprotector-all.bin \ test-timerfd.bin \ test-libdw-dwarf-unwind.bin \ + test-libbabeltrace.bin \ test-compile-32.bin \
test-compile-x32.bin \ - test-zlib.bin + test-zlib.bin \ + test-lzma.bin CC := $(CROSS_COMPILE)gcc -MD PKG_CONFIG := $(CROSS_COMPILE)pkg-config all: $(FILES) -BUILD = $(CC) $(CFLAGS) -o $(OUTPUT)$@ $(patsubst %.bin,%.c,$@) $(LDFLAGS) +__BUILD = $(CC) $(CFLAGS) -Wall -Werror -o $(OUTPUT)$@ $(patsubst %.bin,%.c,$@) $(LDFLAGS) + BUILD = $(__BUILD) > $(OUTPUT)$(@:.bin=.make.output) 2>&1 ############################### test-all.bin: - $(BUILD) -Werror -fstack-protector-all -O2 -Werror -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -laudit -I/usr/include/slang -lslang $(shell $(PKG_CONFIG) --libs --cflags gtk+-2.0 2>/dev/null) $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl -lz + $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -laudit -I/usr/include/slang -lslang $(shell $(PKG_CONFIG) --libs --cflags gtk+-2.0 2>/dev/null) $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl -lz -llzma test-hello.bin: $(BUILD) test-pthread-attr-setaffinity-np.bin: - $(BUILD) -D_GNU_SOURCE -Werror -lpthread + $(BUILD) -D_GNU_SOURCE -lpthread test-stackprotector-all.bin: - $(BUILD) -Werror -fstack-protector-all + $(BUILD) -fstack-protector-all test-fortify-source.bin: - $(BUILD) -O2 -Werror -D_FORTIFY_SOURCE=2 + $(BUILD) -O2 -D_FORTIFY_SOURCE=2 test-bionic.bin: $(BUILD) @@ -118,10 +121,10 @@ test-libbfd.bin: $(BUILD) -DPACKAGE='"perf"' -lbfd -lz -liberty -ldl test-liberty.bin: - $(CC) -o $(OUTPUT)$@ test-libbfd.c -DPACKAGE='"perf"' -lbfd -ldl -liberty + $(CC) -Wall -Werror -o $(OUTPUT)$@ test-libbfd.c -DPACKAGE='"perf"' -lbfd -ldl -liberty test-liberty-z.bin: - $(CC) -o $(OUTPUT)$@ test-libbfd.c -DPACKAGE='"perf"' -lbfd -ldl -liberty -lz + $(CC) -Wall -Werror -o $(OUTPUT)$@ test-libbfd.c -DPACKAGE='"perf"' -lbfd -ldl -liberty -lz test-cplus-demangle.bin: $(BUILD) -liberty @@ -133,10 +136,13 @@ test-timerfd.bin: $(BUILD) test-libdw-dwarf-unwind.bin: - $(BUILD) + $(BUILD) # -ldw provided by 
$(FEATURE_CHECK_LDFLAGS-libdw-dwarf-unwind) + +test-libbabeltrace.bin: + $(BUILD) # -lbabeltrace provided by $(FEATURE_CHECK_LDFLAGS-libbabeltrace) test-sync-compare-and-swap.bin: - $(BUILD) -Werror + $(BUILD) test-compile-32.bin: $(CC) -m32 -o $(OUTPUT)$@ test-compile.c @@ -147,9 +153,12 @@ test-compile-x32.bin: test-zlib.bin: $(BUILD) -lz +test-lzma.bin: + $(BUILD) -llzma + -include *.d ############################### clean: - rm -f $(FILES) *.d + rm -f $(FILES) *.d $(FILES:.bin=.make.output) diff --git a/tools/perf/config/feature-checks/test-all.c b/tools/build/feature/test-all.c similarity index 87% rename from tools/perf/config/feature-checks/test-all.c rename to tools/build/feature/test-all.c index 6d4d09323922..84689a67814a 100644 --- a/tools/perf/config/feature-checks/test-all.c +++ b/tools/build/feature/test-all.c @@ -98,7 +98,23 @@ #undef main #define main main_test_pthread_attr_setaffinity_np -# include "test-pthread_attr_setaffinity_np.c" +# include "test-pthread-attr-setaffinity-np.c" +#undef main + +# if 0 +/* + * Disable libbabeltrace check for test-all, because the requested + * library version is not released yet in most distributions. Will + * reenable later. 
+ */ + +#define main main_test_libbabeltrace +# include "test-libbabeltrace.c" +#undef main +#endif + +#define main main_test_lzma +# include "test-lzma.c" #undef main int main(int argc, char *argv[]) @@ -126,6 +142,7 @@ int main(int argc, char *argv[]) main_test_sync_compare_and_swap(argc, argv); main_test_zlib(); main_test_pthread_attr_setaffinity_np(); + main_test_lzma(); return 0; } diff --git a/tools/perf/config/feature-checks/test-backtrace.c b/tools/build/feature/test-backtrace.c similarity index 100% rename from tools/perf/config/feature-checks/test-backtrace.c rename to tools/build/feature/test-backtrace.c diff --git a/tools/perf/config/feature-checks/test-bionic.c b/tools/build/feature/test-bionic.c similarity index 100% rename from tools/perf/config/feature-checks/test-bionic.c rename to tools/build/feature/test-bionic.c diff --git a/tools/perf/config/feature-checks/test-compile.c b/tools/build/feature/test-compile.c similarity index 100% rename from tools/perf/config/feature-checks/test-compile.c rename to tools/build/feature/test-compile.c diff --git a/tools/perf/config/feature-checks/test-cplus-demangle.c b/tools/build/feature/test-cplus-demangle.c similarity index 100% rename from tools/perf/config/feature-checks/test-cplus-demangle.c rename to tools/build/feature/test-cplus-demangle.c diff --git a/tools/perf/config/feature-checks/test-dwarf.c b/tools/build/feature/test-dwarf.c similarity index 100% rename from tools/perf/config/feature-checks/test-dwarf.c rename to tools/build/feature/test-dwarf.c diff --git a/tools/perf/config/feature-checks/test-fortify-source.c b/tools/build/feature/test-fortify-source.c similarity index 100% rename from tools/perf/config/feature-checks/test-fortify-source.c rename to tools/build/feature/test-fortify-source.c diff --git a/tools/perf/config/feature-checks/test-glibc.c b/tools/build/feature/test-glibc.c similarity index 100% rename from tools/perf/config/feature-checks/test-glibc.c rename to 
tools/build/feature/test-glibc.c diff --git a/tools/perf/config/feature-checks/test-gtk2-infobar.c b/tools/build/feature/test-gtk2-infobar.c similarity index 100% rename from tools/perf/config/feature-checks/test-gtk2-infobar.c rename to tools/build/feature/test-gtk2-infobar.c diff --git a/tools/perf/config/feature-checks/test-gtk2.c b/tools/build/feature/test-gtk2.c similarity index 100% rename from tools/perf/config/feature-checks/test-gtk2.c rename to tools/build/feature/test-gtk2.c diff --git a/tools/perf/config/feature-checks/test-hello.c b/tools/build/feature/test-hello.c similarity index 100% rename from tools/perf/config/feature-checks/test-hello.c rename to tools/build/feature/test-hello.c diff --git a/tools/perf/config/feature-checks/test-libaudit.c b/tools/build/feature/test-libaudit.c similarity index 100% rename from tools/perf/config/feature-checks/test-libaudit.c rename to tools/build/feature/test-libaudit.c diff --git a/tools/build/feature/test-libbabeltrace.c b/tools/build/feature/test-libbabeltrace.c new file mode 100644 index 000000000000..9cf802a04885 --- /dev/null +++ b/tools/build/feature/test-libbabeltrace.c @@ -0,0 +1,9 @@ + +#include +#include + +int main(void) +{ + bt_ctf_stream_class_get_packet_context_type((void *) 0); + return 0; +} diff --git a/tools/perf/config/feature-checks/test-libbfd.c b/tools/build/feature/test-libbfd.c similarity index 100% rename from tools/perf/config/feature-checks/test-libbfd.c rename to tools/build/feature/test-libbfd.c diff --git a/tools/perf/config/feature-checks/test-libdw-dwarf-unwind.c b/tools/build/feature/test-libdw-dwarf-unwind.c similarity index 100% rename from tools/perf/config/feature-checks/test-libdw-dwarf-unwind.c rename to tools/build/feature/test-libdw-dwarf-unwind.c diff --git a/tools/perf/config/feature-checks/test-libelf-getphdrnum.c b/tools/build/feature/test-libelf-getphdrnum.c similarity index 100% rename from tools/perf/config/feature-checks/test-libelf-getphdrnum.c rename to 
tools/build/feature/test-libelf-getphdrnum.c diff --git a/tools/perf/config/feature-checks/test-libelf-mmap.c b/tools/build/feature/test-libelf-mmap.c similarity index 100% rename from tools/perf/config/feature-checks/test-libelf-mmap.c rename to tools/build/feature/test-libelf-mmap.c diff --git a/tools/perf/config/feature-checks/test-libelf.c b/tools/build/feature/test-libelf.c similarity index 100% rename from tools/perf/config/feature-checks/test-libelf.c rename to tools/build/feature/test-libelf.c diff --git a/tools/perf/config/feature-checks/test-libnuma.c b/tools/build/feature/test-libnuma.c similarity index 100% rename from tools/perf/config/feature-checks/test-libnuma.c rename to tools/build/feature/test-libnuma.c diff --git a/tools/perf/config/feature-checks/test-libperl.c b/tools/build/feature/test-libperl.c similarity index 100% rename from tools/perf/config/feature-checks/test-libperl.c rename to tools/build/feature/test-libperl.c diff --git a/tools/perf/config/feature-checks/test-libpython-version.c b/tools/build/feature/test-libpython-version.c similarity index 100% rename from tools/perf/config/feature-checks/test-libpython-version.c rename to tools/build/feature/test-libpython-version.c diff --git a/tools/perf/config/feature-checks/test-libpython.c b/tools/build/feature/test-libpython.c similarity index 100% rename from tools/perf/config/feature-checks/test-libpython.c rename to tools/build/feature/test-libpython.c diff --git a/tools/perf/config/feature-checks/test-libslang.c b/tools/build/feature/test-libslang.c similarity index 100% rename from tools/perf/config/feature-checks/test-libslang.c rename to tools/build/feature/test-libslang.c diff --git a/tools/perf/config/feature-checks/test-libunwind-debug-frame.c b/tools/build/feature/test-libunwind-debug-frame.c similarity index 100% rename from tools/perf/config/feature-checks/test-libunwind-debug-frame.c rename to tools/build/feature/test-libunwind-debug-frame.c diff --git 
a/tools/perf/config/feature-checks/test-libunwind.c b/tools/build/feature/test-libunwind.c similarity index 100% rename from tools/perf/config/feature-checks/test-libunwind.c rename to tools/build/feature/test-libunwind.c diff --git a/tools/build/feature/test-lzma.c b/tools/build/feature/test-lzma.c new file mode 100644 index 000000000000..95adc8ced3dd --- /dev/null +++ b/tools/build/feature/test-lzma.c @@ -0,0 +1,10 @@ +#include + +int main(void) +{ + lzma_stream strm = LZMA_STREAM_INIT; + int ret; + + ret = lzma_stream_decoder(&strm, UINT64_MAX, LZMA_CONCATENATED); + return ret ? -1 : 0; +} diff --git a/tools/perf/config/feature-checks/test-pthread-attr-setaffinity-np.c b/tools/build/feature/test-pthread-attr-setaffinity-np.c similarity index 77% rename from tools/perf/config/feature-checks/test-pthread-attr-setaffinity-np.c rename to tools/build/feature/test-pthread-attr-setaffinity-np.c index 2b81b72eca23..fdada5e8d454 100644 --- a/tools/perf/config/feature-checks/test-pthread-attr-setaffinity-np.c +++ b/tools/build/feature/test-pthread-attr-setaffinity-np.c @@ -1,5 +1,6 @@ #include #include +#include int main(void) { @@ -8,7 +9,8 @@ int main(void) cpu_set_t cs; pthread_attr_init(&thread_attr); - /* don't care abt exact args, just the API itself in libpthread */ + CPU_ZERO(&cs); + ret = pthread_attr_setaffinity_np(&thread_attr, sizeof(cs), &cs); return ret; diff --git a/tools/perf/config/feature-checks/test-stackprotector-all.c b/tools/build/feature/test-stackprotector-all.c similarity index 100% rename from tools/perf/config/feature-checks/test-stackprotector-all.c rename to tools/build/feature/test-stackprotector-all.c diff --git a/tools/perf/config/feature-checks/test-sync-compare-and-swap.c b/tools/build/feature/test-sync-compare-and-swap.c similarity index 100% rename from tools/perf/config/feature-checks/test-sync-compare-and-swap.c rename to tools/build/feature/test-sync-compare-and-swap.c diff --git a/tools/perf/config/feature-checks/test-timerfd.c 
b/tools/build/feature/test-timerfd.c similarity index 100% rename from tools/perf/config/feature-checks/test-timerfd.c rename to tools/build/feature/test-timerfd.c diff --git a/tools/perf/config/feature-checks/test-zlib.c b/tools/build/feature/test-zlib.c similarity index 100% rename from tools/perf/config/feature-checks/test-zlib.c rename to tools/build/feature/test-zlib.c diff --git a/tools/build/tests/ex/Build b/tools/build/tests/ex/Build new file mode 100644 index 000000000000..0e6c3e6767e6 --- /dev/null +++ b/tools/build/tests/ex/Build @@ -0,0 +1,8 @@ +ex-y += ex.o +ex-y += a.o +ex-y += b.o +ex-y += empty/ + +libex-y += c.o +libex-y += d.o +libex-y += arch/ diff --git a/tools/build/tests/ex/Makefile b/tools/build/tests/ex/Makefile new file mode 100644 index 000000000000..52d2476073a3 --- /dev/null +++ b/tools/build/tests/ex/Makefile @@ -0,0 +1,23 @@ +export srctree := ../../../.. +export CC := gcc +export LD := ld +export AR := ar + +build := -f $(srctree)/tools/build/Makefile.build dir=. obj +ex: ex-in.o libex-in.o + gcc -o $@ $^ + +ex.%: FORCE + make -f $(srctree)/tools/build/Makefile.build dir=. $@ + +ex-in.o: FORCE + make $(build)=ex + +libex-in.o: FORCE + make $(build)=libex + +clean: + find . 
-name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete + rm -f ex ex.i ex.s + +.PHONY: FORCE diff --git a/tools/build/tests/ex/a.c b/tools/build/tests/ex/a.c new file mode 100644 index 000000000000..851762798c83 --- /dev/null +++ b/tools/build/tests/ex/a.c @@ -0,0 +1,5 @@ + +int a(void) +{ + return 0; +} diff --git a/tools/build/tests/ex/arch/Build b/tools/build/tests/ex/arch/Build new file mode 100644 index 000000000000..55506189efae --- /dev/null +++ b/tools/build/tests/ex/arch/Build @@ -0,0 +1,2 @@ +libex-y += e.o +libex-y += f.o diff --git a/tools/build/tests/ex/arch/e.c b/tools/build/tests/ex/arch/e.c new file mode 100644 index 000000000000..beaa4a1d7ba8 --- /dev/null +++ b/tools/build/tests/ex/arch/e.c @@ -0,0 +1,5 @@ + +int e(void) +{ + return 0; +} diff --git a/tools/build/tests/ex/arch/f.c b/tools/build/tests/ex/arch/f.c new file mode 100644 index 000000000000..7c3e9e9da5b7 --- /dev/null +++ b/tools/build/tests/ex/arch/f.c @@ -0,0 +1,5 @@ + +int f(void) +{ + return 0; +} diff --git a/tools/build/tests/ex/b.c b/tools/build/tests/ex/b.c new file mode 100644 index 000000000000..c24ff9ca9a97 --- /dev/null +++ b/tools/build/tests/ex/b.c @@ -0,0 +1,5 @@ + +int b(void) +{ + return 0; +} diff --git a/tools/build/tests/ex/c.c b/tools/build/tests/ex/c.c new file mode 100644 index 000000000000..e216d0217499 --- /dev/null +++ b/tools/build/tests/ex/c.c @@ -0,0 +1,5 @@ + +int c(void) +{ + return 0; +} diff --git a/tools/build/tests/ex/d.c b/tools/build/tests/ex/d.c new file mode 100644 index 000000000000..80dc0f06151b --- /dev/null +++ b/tools/build/tests/ex/d.c @@ -0,0 +1,5 @@ + +int d(void) +{ + return 0; +} diff --git a/tools/build/tests/ex/empty/Build b/tools/build/tests/ex/empty/Build new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tools/build/tests/ex/ex.c b/tools/build/tests/ex/ex.c new file mode 100644 index 000000000000..dc42eb2e1a67 --- /dev/null +++ b/tools/build/tests/ex/ex.c @@ -0,0 +1,19 @@ + +int a(void); +int b(void); 
+int c(void); +int d(void); +int e(void); +int f(void); + +int main(void) +{ + a(); + b(); + c(); + d(); + e(); + f(); + + return 0; +} diff --git a/tools/build/tests/run.sh b/tools/build/tests/run.sh new file mode 100755 index 000000000000..5494f8ea7567 --- /dev/null +++ b/tools/build/tests/run.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +function test_ex { + make -C ex V=1 clean > ex.out 2>&1 + make -C ex V=1 >> ex.out 2>&1 + + if [ ! -x ./ex/ex ]; then + echo FAILED + exit -1 + fi + + make -C ex V=1 clean > /dev/null 2>&1 + rm -f ex.out +} + +function test_ex_suffix { + make -C ex V=1 clean > ex.out 2>&1 + + # use -rR to disable make's builtin rules + make -rR -C ex V=1 ex.o >> ex.out 2>&1 + make -rR -C ex V=1 ex.i >> ex.out 2>&1 + make -rR -C ex V=1 ex.s >> ex.out 2>&1 + + if [ -x ./ex/ex ]; then + echo FAILED + exit -1 + fi + + if [ ! -f ./ex/ex.o -o ! -f ./ex/ex.i -o ! -f ./ex/ex.s ]; then + echo FAILED + exit -1 + fi + + make -C ex V=1 clean > /dev/null 2>&1 + rm -f ex.out +} +echo -n Testing.. + +test_ex +test_ex_suffix + +echo OK diff --git a/tools/lib/api/Build b/tools/lib/api/Build new file mode 100644 index 000000000000..3653965cf481 --- /dev/null +++ b/tools/lib/api/Build @@ -0,0 +1,2 @@ +libapi-y += fd/ +libapi-y += fs/ diff --git a/tools/lib/api/Makefile b/tools/lib/api/Makefile index 36c08b1f4afb..d8fe29fc19a4 100644 --- a/tools/lib/api/Makefile +++ b/tools/lib/api/Makefile @@ -1,49 +1,43 @@ include ../../scripts/Makefile.include include ../../perf/config/utilities.mak # QUIET_CLEAN +ifeq ($(srctree),) +srctree := $(patsubst %/,%,$(dir $(shell pwd))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +#$(info Determined 'srctree' to be $(srctree)) +endif + CC = $(CROSS_COMPILE)gcc AR = $(CROSS_COMPILE)ar -# guard against environment variables -LIB_H= -LIB_OBJS= +MAKEFLAGS += --no-print-directory -LIB_H += fs/debugfs.h -LIB_H += fs/fs.h -# See comment below about piggybacking... 
-LIB_H += fd/array.h +LIBFILE = $(OUTPUT)libapi.a -LIB_OBJS += $(OUTPUT)fs/debugfs.o -LIB_OBJS += $(OUTPUT)fs/fs.o -# XXX piggybacking here, need to introduce libapikfd, or rename this -# to plain libapik.a and make it have it all api goodies -LIB_OBJS += $(OUTPUT)fd/array.o - -LIBFILE = libapikfs.a - -CFLAGS = -ggdb3 -Wall -Wextra -std=gnu99 -Werror -O6 -D_FORTIFY_SOURCE=2 $(EXTRA_WARNINGS) $(EXTRA_CFLAGS) -fPIC -EXTLIBS = -lelf -lpthread -lrt -lm -ALL_CFLAGS = $(CFLAGS) $(BASIC_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -ALL_LDFLAGS = $(LDFLAGS) +CFLAGS := $(EXTRA_WARNINGS) $(EXTRA_CFLAGS) +CFLAGS += -ggdb3 -Wall -Wextra -std=gnu99 -Werror -O6 -D_FORTIFY_SOURCE=2 -fPIC +CFLAGS += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 RM = rm -f -$(LIBFILE): $(LIB_OBJS) - $(QUIET_AR)$(RM) $@ && $(AR) rcs $(OUTPUT)$@ $(LIB_OBJS) +build := -f $(srctree)/tools/build/Makefile.build dir=. obj +API_IN := $(OUTPUT)libapi-in.o -$(LIB_OBJS): $(LIB_H) +export srctree OUTPUT CC LD CFLAGS V -libapi_dirs: - $(QUIET_MKDIR)mkdir -p $(OUTPUT)fd $(OUTPUT)fs +all: $(LIBFILE) -$(OUTPUT)%.o: %.c libapi_dirs - $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) $< -$(OUTPUT)%.s: %.c libapi_dirs - $(QUIET_CC)$(CC) -S $(ALL_CFLAGS) $< -$(OUTPUT)%.o: %.S libapi_dirs - $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) $< +$(API_IN): FORCE + @$(MAKE) $(build)=libapi + +$(LIBFILE): $(API_IN) + $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(API_IN) clean: - $(call QUIET_CLEAN, libapi) $(RM) $(LIB_OBJS) $(LIBFILE) + $(call QUIET_CLEAN, libapi) $(RM) $(LIBFILE); \ + find $(if $(OUTPUT),$(OUTPUT),.) 
-name \*.o | xargs $(RM) -.PHONY: clean +FORCE: + +.PHONY: clean FORCE diff --git a/tools/lib/api/fd/Build b/tools/lib/api/fd/Build new file mode 100644 index 000000000000..605d99f6d71a --- /dev/null +++ b/tools/lib/api/fd/Build @@ -0,0 +1 @@ +libapi-y += array.o diff --git a/tools/lib/api/fs/Build b/tools/lib/api/fs/Build new file mode 100644 index 000000000000..6de5a4f0b501 --- /dev/null +++ b/tools/lib/api/fs/Build @@ -0,0 +1,4 @@ +libapi-y += fs.o +libapi-y += debugfs.o +libapi-y += findfs.o +libapi-y += tracefs.o diff --git a/tools/lib/api/fs/debugfs.c b/tools/lib/api/fs/debugfs.c index d2b18e887071..8305b3e9d48e 100644 --- a/tools/lib/api/fs/debugfs.c +++ b/tools/lib/api/fs/debugfs.c @@ -3,75 +3,50 @@ #include #include #include +#include #include #include +#include +#include #include #include #include "debugfs.h" -char debugfs_mountpoint[PATH_MAX + 1] = "/sys/kernel/debug"; +#ifndef DEBUGFS_DEFAULT_PATH +#define DEBUGFS_DEFAULT_PATH "/sys/kernel/debug" +#endif + +char debugfs_mountpoint[PATH_MAX + 1] = DEBUGFS_DEFAULT_PATH; static const char * const debugfs_known_mountpoints[] = { - "/sys/kernel/debug", + DEBUGFS_DEFAULT_PATH, "/debug", 0, }; static bool debugfs_found; +bool debugfs_configured(void) +{ + return debugfs_find_mountpoint() != NULL; +} + /* find the path to the mounted debugfs */ const char *debugfs_find_mountpoint(void) { - const char * const *ptr; - char type[100]; - FILE *fp; + const char *ret; if (debugfs_found) return (const char *)debugfs_mountpoint; - ptr = debugfs_known_mountpoints; - while (*ptr) { - if (debugfs_valid_mountpoint(*ptr) == 0) { - debugfs_found = true; - strcpy(debugfs_mountpoint, *ptr); - return debugfs_mountpoint; - } - ptr++; - } + ret = find_mountpoint("debugfs", (long) DEBUGFS_MAGIC, + debugfs_mountpoint, PATH_MAX + 1, + debugfs_known_mountpoints); + if (ret) + debugfs_found = true; - /* give up and parse /proc/mounts */ - fp = fopen("/proc/mounts", "r"); - if (fp == NULL) - return NULL; - - while (fscanf(fp, "%*s %" 
STR(PATH_MAX) "s %99s %*s %*d %*d\n", - debugfs_mountpoint, type) == 2) { - if (strcmp(type, "debugfs") == 0) - break; - } - fclose(fp); - - if (strcmp(type, "debugfs") != 0) - return NULL; - - debugfs_found = true; - - return debugfs_mountpoint; -} - -/* verify that a mountpoint is actually a debugfs instance */ - -int debugfs_valid_mountpoint(const char *debugfs) -{ - struct statfs st_fs; - - if (statfs(debugfs, &st_fs) < 0) - return -ENOENT; - else if ((long)st_fs.f_type != (long)DEBUGFS_MAGIC) - return -ENOENT; - - return 0; + return ret; } /* mount the debugfs somewhere if it's not mounted */ @@ -87,7 +62,7 @@ char *debugfs_mount(const char *mountpoint) mountpoint = getenv(PERF_DEBUGFS_ENVIRONMENT); /* if no environment variable, use default */ if (mountpoint == NULL) - mountpoint = "/sys/kernel/debug"; + mountpoint = DEBUGFS_DEFAULT_PATH; } if (mount(NULL, mountpoint, "debugfs", 0, NULL) < 0) diff --git a/tools/lib/api/fs/debugfs.h b/tools/lib/api/fs/debugfs.h index 0739881a9897..455023698d2b 100644 --- a/tools/lib/api/fs/debugfs.h +++ b/tools/lib/api/fs/debugfs.h @@ -1,16 +1,7 @@ #ifndef __API_DEBUGFS_H__ #define __API_DEBUGFS_H__ -#define _STR(x) #x -#define STR(x) _STR(x) - -/* - * On most systems would have given us this, but not on some systems - * (e.g. GNU/Hurd). 
- */ -#ifndef PATH_MAX -#define PATH_MAX 4096 -#endif +#include "findfs.h" #ifndef DEBUGFS_MAGIC #define DEBUGFS_MAGIC 0x64626720 @@ -20,8 +11,8 @@ #define PERF_DEBUGFS_ENVIRONMENT "PERF_DEBUGFS_DIR" #endif +bool debugfs_configured(void); const char *debugfs_find_mountpoint(void); -int debugfs_valid_mountpoint(const char *debugfs); char *debugfs_mount(const char *mountpoint); extern char debugfs_mountpoint[]; diff --git a/tools/lib/api/fs/findfs.c b/tools/lib/api/fs/findfs.c new file mode 100644 index 000000000000..49946cb6d7af --- /dev/null +++ b/tools/lib/api/fs/findfs.c @@ -0,0 +1,64 @@ +#include +#include +#include +#include +#include +#include + +#include "findfs.h" + +/* verify that a mountpoint is actually the type we want */ + +int valid_mountpoint(const char *mount, long magic) +{ + struct statfs st_fs; + + if (statfs(mount, &st_fs) < 0) + return -ENOENT; + else if ((long)st_fs.f_type != magic) + return -ENOENT; + + return 0; +} + +/* find the path to a mounted file system */ +const char *find_mountpoint(const char *fstype, long magic, + char *mountpoint, int len, + const char * const *known_mountpoints) +{ + const char * const *ptr; + char format[128]; + char type[100] = ""; + FILE *fp; + + if (known_mountpoints) { + ptr = known_mountpoints; + while (*ptr) { + if (valid_mountpoint(*ptr, magic) == 0) { + strncpy(mountpoint, *ptr, len - 1); + mountpoint[len-1] = 0; + return mountpoint; + } + ptr++; + } + } + + /* give up and parse /proc/mounts */ + fp = fopen("/proc/mounts", "r"); + if (fp == NULL) + return NULL; + + /* %Ns stores up to N characters plus a NUL, so the width must be len - 1 */ + snprintf(format, 128, "%%*s %%%ds %%99s %%*s %%*d %%*d\n", len - 1); + + while (fscanf(fp, format, mountpoint, type) == 2) { + if (strcmp(type, fstype) == 0) + break; + } + fclose(fp); + + if (strcmp(type, fstype) != 0) + return NULL; + + return mountpoint; +} diff --git a/tools/lib/api/fs/findfs.h b/tools/lib/api/fs/findfs.h new file mode 100644 index 000000000000..b6f5d05acc42 --- /dev/null +++ b/tools/lib/api/fs/findfs.h @@ -0,0 +1,23 @@ +#ifndef 
__API_FINDFS_H__ +#define __API_FINDFS_H__ + +#include + +#define _STR(x) #x +#define STR(x) _STR(x) + +/* + * On most systems would have given us this, but not on some systems + * (e.g. GNU/Hurd). + */ +#ifndef PATH_MAX +#define PATH_MAX 4096 +#endif + +const char *find_mountpoint(const char *fstype, long magic, + char *mountpoint, int len, + const char * const *known_mountpoints); + +int valid_mountpoint(const char *mount, long magic); + +#endif /* __API_FINDFS_H__ */ diff --git a/tools/lib/api/fs/tracefs.c b/tools/lib/api/fs/tracefs.c new file mode 100644 index 000000000000..e4aa9688b71e --- /dev/null +++ b/tools/lib/api/fs/tracefs.c @@ -0,0 +1,78 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tracefs.h" + +#ifndef TRACEFS_DEFAULT_PATH +#define TRACEFS_DEFAULT_PATH "/sys/kernel/tracing" +#endif + +char tracefs_mountpoint[PATH_MAX + 1] = TRACEFS_DEFAULT_PATH; + +static const char * const tracefs_known_mountpoints[] = { + TRACEFS_DEFAULT_PATH, + "/sys/kernel/debug/tracing", + "/tracing", + "/trace", + 0, +}; + +static bool tracefs_found; + +bool tracefs_configured(void) +{ + return tracefs_find_mountpoint() != NULL; +} + +/* find the path to the mounted tracefs */ +const char *tracefs_find_mountpoint(void) +{ + const char *ret; + + if (tracefs_found) + return (const char *)tracefs_mountpoint; + + ret = find_mountpoint("tracefs", (long) TRACEFS_MAGIC, + tracefs_mountpoint, PATH_MAX + 1, + tracefs_known_mountpoints); + + if (ret) + tracefs_found = true; + + return ret; +} + +/* mount the tracefs somewhere if it's not mounted */ +char *tracefs_mount(const char *mountpoint) +{ + /* see if it's already mounted */ + if (tracefs_find_mountpoint()) + goto out; + + /* if not mounted and no argument */ + if (mountpoint == NULL) { + /* see if environment variable set */ + mountpoint = getenv(PERF_TRACEFS_ENVIRONMENT); + /* if no environment variable, use default */ + if (mountpoint == NULL) + 
mountpoint = TRACEFS_DEFAULT_PATH; + } + + if (mount(NULL, mountpoint, "tracefs", 0, NULL) < 0) + return NULL; + + /* save the mountpoint */ + tracefs_found = true; + strncpy(tracefs_mountpoint, mountpoint, sizeof(tracefs_mountpoint)); +out: + return tracefs_mountpoint; +} diff --git a/tools/lib/api/fs/tracefs.h b/tools/lib/api/fs/tracefs.h new file mode 100644 index 000000000000..da780ac49acb --- /dev/null +++ b/tools/lib/api/fs/tracefs.h @@ -0,0 +1,21 @@ +#ifndef __API_TRACEFS_H__ +#define __API_TRACEFS_H__ + +#include "findfs.h" + +#ifndef TRACEFS_MAGIC +#define TRACEFS_MAGIC 0x74726163 +#endif + +#ifndef PERF_TRACEFS_ENVIRONMENT +#define PERF_TRACEFS_ENVIRONMENT "PERF_TRACEFS_DIR" +#endif + +bool tracefs_configured(void); +const char *tracefs_find_mountpoint(void); +int tracefs_valid_mountpoint(const char *debugfs); +char *tracefs_mount(const char *mountpoint); + +extern char tracefs_mountpoint[]; + +#endif /* __API_TRACEFS_H__ */ diff --git a/tools/lib/lockdep/Build b/tools/lib/lockdep/Build new file mode 100644 index 000000000000..6f667355b068 --- /dev/null +++ b/tools/lib/lockdep/Build @@ -0,0 +1 @@ +liblockdep-y += common.o lockdep.o preload.o rbtree.o diff --git a/tools/lib/lockdep/Makefile b/tools/lib/lockdep/Makefile index 4b866c54f624..0c356fb65022 100644 --- a/tools/lib/lockdep/Makefile +++ b/tools/lib/lockdep/Makefile @@ -35,6 +35,10 @@ bindir = $(prefix)/$(bindir_relative) export DESTDIR DESTDIR_SQ INSTALL +MAKEFLAGS += --no-print-directory + +include ../../scripts/Makefile.include + # copy a bit from Linux kbuild ifeq ("$(origin V)", "command line") @@ -44,56 +48,21 @@ ifndef VERBOSE VERBOSE = 0 endif -ifeq ("$(origin O)", "command line") - BUILD_OUTPUT := $(O) +ifeq ($(srctree),) +srctree := $(patsubst %/,%,$(dir $(shell pwd))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +#$(info Determined 'srctree' to be $(srctree)) endif -ifeq ($(BUILD_SRC),) -ifneq ($(BUILD_OUTPUT),) - -define build_output - 
$(if $(VERBOSE:1=),@)$(MAKE) -C $(BUILD_OUTPUT) \ - BUILD_SRC=$(CURDIR) -f $(CURDIR)/Makefile $1 -endef - -saved-output := $(BUILD_OUTPUT) -BUILD_OUTPUT := $(shell cd $(BUILD_OUTPUT) && /bin/pwd) -$(if $(BUILD_OUTPUT),, \ - $(error output directory "$(saved-output)" does not exist)) - -all: sub-make - -gui: force - $(call build_output, all_cmd) - -$(filter-out gui,$(MAKECMDGOALS)): sub-make - -sub-make: force - $(call build_output, $(MAKECMDGOALS)) - - -# Leave processing to above invocation of make -skip-makefile := 1 - -endif # BUILD_OUTPUT -endif # BUILD_SRC - -# We process the rest of the Makefile if this is the final invocation of make -ifeq ($(skip-makefile),) - -srctree := $(realpath $(if $(BUILD_SRC),$(BUILD_SRC),$(CURDIR))) -objtree := $(realpath $(CURDIR)) -src := $(srctree) -obj := $(objtree) - -export prefix libdir bindir src obj - # Shell quotes libdir_SQ = $(subst ','\'',$(libdir)) bindir_SQ = $(subst ','\'',$(bindir)) -LIB_FILE = liblockdep.a liblockdep.so.$(LIBLOCKDEP_VERSION) +LIB_IN := $(OUTPUT)liblockdep-in.o + BIN_FILE = lockdep +LIB_FILE = $(OUTPUT)liblockdep.a $(OUTPUT)liblockdep.so.$(LIBLOCKDEP_VERSION) CONFIG_INCLUDES = CONFIG_LIBS = @@ -108,33 +77,23 @@ INCLUDES = -I. 
-I./uinclude -I./include -I../../include $(CONFIG_INCLUDES) # Set compile option CFLAGS if not set elsewhere CFLAGS ?= -g -DCONFIG_LOCKDEP -DCONFIG_STACKTRACE -DCONFIG_PROVE_LOCKING -DBITS_PER_LONG=__WORDSIZE -DLIBLOCKDEP_VERSION='"$(LIBLOCKDEP_VERSION)"' -rdynamic -O0 -g +CFLAGS += -fPIC override CFLAGS += $(CONFIG_FLAGS) $(INCLUDES) $(PLUGIN_DIR_SQ) ifeq ($(VERBOSE),1) Q = - print_compile = - print_app_build = - print_fpic_compile = print_shared_lib_compile = print_install = else Q = @ - print_compile = echo ' CC '$(OBJ); - print_app_build = echo ' BUILD '$(OBJ); - print_fpic_compile = echo ' CC FPIC '$(OBJ); - print_shared_lib_compile = echo ' BUILD SHARED LIB '$(OBJ); - print_static_lib_build = echo ' BUILD STATIC LIB '$(OBJ); - print_install = echo ' INSTALL '$1' to $(DESTDIR_SQ)$2'; + print_shared_lib_compile = echo ' LD '$(OBJ); + print_static_lib_build = echo ' LD '$(OBJ); + print_install = echo ' INSTALL '$1' to $(DESTDIR_SQ)$2'; endif -do_fpic_compile = \ - ($(print_fpic_compile) \ - $(CC) -c $(CFLAGS) $(EXT) -fPIC $< -o $@) - -do_app_build = \ - ($(print_app_build) \ - $(CC) $^ -rdynamic -o $@ $(CONFIG_LIBS) $(LIBS)) +export srctree OUTPUT CC LD CFLAGS V +build := -f $(srctree)/tools/build/Makefile.build dir=. 
obj do_compile_shared_library = \ ($(print_shared_lib_compile) \ @@ -144,22 +103,6 @@ do_build_static_lib = \ ($(print_static_lib_build) \ $(RM) $@; $(AR) rcs $@ $^) - -define do_compile - $(print_compile) \ - $(CC) -c $(CFLAGS) $(EXT) $< -o $(obj)/$@; -endef - -$(obj)/%.o: $(src)/%.c - $(Q)$(call do_compile) - -%.o: $(src)/%.c - $(Q)$(call do_compile) - -PEVENT_LIB_OBJS = common.o lockdep.o preload.o rbtree.o - -ALL_OBJS = $(PEVENT_LIB_OBJS) - CMD_TARGETS = $(LIB_FILE) TARGETS = $(CMD_TARGETS) @@ -169,42 +112,15 @@ all: all_cmd all_cmd: $(CMD_TARGETS) -liblockdep.so.$(LIBLOCKDEP_VERSION): $(PEVENT_LIB_OBJS) +$(LIB_IN): force + $(Q)$(MAKE) $(build)=liblockdep + +liblockdep.so.$(LIBLOCKDEP_VERSION): $(LIB_IN) $(Q)$(do_compile_shared_library) -liblockdep.a: $(PEVENT_LIB_OBJS) +liblockdep.a: $(LIB_IN) $(Q)$(do_build_static_lib) -$(PEVENT_LIB_OBJS): %.o: $(src)/%.c - $(Q)$(do_fpic_compile) - -## make deps - -all_objs := $(sort $(ALL_OBJS)) -all_deps := $(all_objs:%.o=.%.d) - -# let .d file also depends on the source and header files -define check_deps - @set -e; $(RM) $@; \ - $(CC) -MM $(CFLAGS) $< > $@.$$$$; \ - sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \ - $(RM) $@.$$$$ -endef - -$(all_deps): .%.d: $(src)/%.c - $(Q)$(call check_deps) - -$(all_objs) : %.o : .%.d - -dep_includes := $(wildcard $(all_deps)) - -ifneq ($(dep_includes),) - include $(dep_includes) -endif - -### Detect environment changes -TRACK_CFLAGS = $(subst ','\'',$(CFLAGS)):$(ARCH):$(CROSS_COMPILE) - tags: force $(RM) tags find . 
-name '*.[ch]' | xargs ctags --extra=+f --c-kinds=+px \ @@ -233,8 +149,6 @@ clean: $(RM) *.o *~ $(TARGETS) *.a *liblockdep*.so* $(VERSION_FILES) .*.d $(RM) tags TAGS -endif # skip-makefile - PHONY += force force: diff --git a/tools/lib/traceevent/Build b/tools/lib/traceevent/Build new file mode 100644 index 000000000000..c681d0575d16 --- /dev/null +++ b/tools/lib/traceevent/Build @@ -0,0 +1,17 @@ +libtraceevent-y += event-parse.o +libtraceevent-y += event-plugin.o +libtraceevent-y += trace-seq.o +libtraceevent-y += parse-filter.o +libtraceevent-y += parse-utils.o +libtraceevent-y += kbuffer-parse.o + +plugin_jbd2-y += plugin_jbd2.o +plugin_hrtimer-y += plugin_hrtimer.o +plugin_kmem-y += plugin_kmem.o +plugin_kvm-y += plugin_kvm.o +plugin_mac80211-y += plugin_mac80211.o +plugin_sched_switch-y += plugin_sched_switch.o +plugin_function-y += plugin_function.o +plugin_xen-y += plugin_xen.o +plugin_scsi-y += plugin_scsi.o +plugin_cfg80211-y += plugin_cfg80211.o diff --git a/tools/lib/traceevent/Makefile b/tools/lib/traceevent/Makefile index 005c9cc06935..d410da335e3d 100644 --- a/tools/lib/traceevent/Makefile +++ b/tools/lib/traceevent/Makefile @@ -67,7 +67,7 @@ PLUGIN_DIR = -DPLUGIN_DIR="$(plugin_dir)" PLUGIN_DIR_SQ = '$(subst ','\'',$(PLUGIN_DIR))' endif -include $(if $(BUILD_SRC),$(BUILD_SRC)/)../../scripts/Makefile.include +include ../../scripts/Makefile.include # copy a bit from Linux kbuild @@ -78,40 +78,13 @@ ifndef VERBOSE VERBOSE = 0 endif -ifeq ("$(origin O)", "command line") - BUILD_OUTPUT := $(O) +ifeq ($(srctree),) +srctree := $(patsubst %/,%,$(dir $(shell pwd))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +#$(info Determined 'srctree' to be $(srctree)) endif -ifeq ($(BUILD_SRC),) -ifneq ($(OUTPUT),) - -define build_output - $(if $(VERBOSE:1=),@)+$(MAKE) -C $(OUTPUT) \ - BUILD_SRC=$(CURDIR)/ -f $(CURDIR)/Makefile $1 -endef - -all: sub-make - -$(MAKECMDGOALS): sub-make - -sub-make: force - $(call build_output, 
$(MAKECMDGOALS)) - - -# Leave processing to above invocation of make -skip-makefile := 1 - -endif # OUTPUT -endif # BUILD_SRC - -# We process the rest of the Makefile if this is the final invocation of make -ifeq ($(skip-makefile),) - -srctree := $(if $(BUILD_SRC),$(BUILD_SRC),$(CURDIR)) -objtree := $(CURDIR) -src := $(srctree) -obj := $(objtree) - export prefix bindir src obj # Shell quotes @@ -132,16 +105,19 @@ EXTRAVERSION = $(EP_EXTRAVERSION) OBJ = $@ N = -export Q VERBOSE - EVENT_PARSE_VERSION = $(EP_VERSION).$(EP_PATCHLEVEL).$(EP_EXTRAVERSION) -INCLUDES = -I. -I $(srctree)/../../include $(CONFIG_INCLUDES) +INCLUDES = -I. -I $(srctree)/tools/include $(CONFIG_INCLUDES) -# Set compile option CFLAGS if not set elsewhere -CFLAGS ?= -g -Wall +# Set compile option CFLAGS +ifdef EXTRA_CFLAGS + CFLAGS := $(EXTRA_CFLAGS) +else + CFLAGS := -g -Wall +endif # Append required CFLAGS +override CFLAGS += -fPIC override CFLAGS += $(CONFIG_FLAGS) $(INCLUDES) $(PLUGIN_DIR_SQ) override CFLAGS += $(udis86-flags) -D_GNU_SOURCE @@ -151,74 +127,58 @@ else Q = @ endif -do_compile_shared_library = \ - ($(print_shared_lib_compile) \ - $(CC) --shared $^ -o $@) +# Disable command line variables (CFLAGS) override from top +# level Makefile (perf), otherwise build Makefile will get +# the same command line setup. +MAKEOVERRIDES= -do_plugin_build = \ - ($(print_plugin_build) \ - $(CC) $(CFLAGS) -shared -nostartfiles -o $@ $<) +export srctree OUTPUT CC LD CFLAGS V +build := -f $(srctree)/tools/build/Makefile.build dir=. 
obj -do_build_static_lib = \ - ($(print_static_lib_build) \ - $(RM) $@; $(AR) rcs $@ $^) +PLUGINS = plugin_jbd2.so +PLUGINS += plugin_hrtimer.so +PLUGINS += plugin_kmem.so +PLUGINS += plugin_kvm.so +PLUGINS += plugin_mac80211.so +PLUGINS += plugin_sched_switch.so +PLUGINS += plugin_function.so +PLUGINS += plugin_xen.so +PLUGINS += plugin_scsi.so +PLUGINS += plugin_cfg80211.so +PLUGINS := $(addprefix $(OUTPUT),$(PLUGINS)) +PLUGINS_IN := $(PLUGINS:.so=-in.o) -do_compile = $(QUIET_CC)$(CC) -c $(CFLAGS) $(EXT) $< -o $(obj)/$@; - -$(obj)/%.o: $(src)/%.c - $(call do_compile) - -%.o: $(src)/%.c - $(call do_compile) - -PEVENT_LIB_OBJS = event-parse.o -PEVENT_LIB_OBJS += event-plugin.o -PEVENT_LIB_OBJS += trace-seq.o -PEVENT_LIB_OBJS += parse-filter.o -PEVENT_LIB_OBJS += parse-utils.o -PEVENT_LIB_OBJS += kbuffer-parse.o - -PLUGIN_OBJS = plugin_jbd2.o -PLUGIN_OBJS += plugin_hrtimer.o -PLUGIN_OBJS += plugin_kmem.o -PLUGIN_OBJS += plugin_kvm.o -PLUGIN_OBJS += plugin_mac80211.o -PLUGIN_OBJS += plugin_sched_switch.o -PLUGIN_OBJS += plugin_function.o -PLUGIN_OBJS += plugin_xen.o -PLUGIN_OBJS += plugin_scsi.o -PLUGIN_OBJS += plugin_cfg80211.o - -PLUGINS := $(PLUGIN_OBJS:.o=.so) - -ALL_OBJS = $(PEVENT_LIB_OBJS) $(PLUGIN_OBJS) +TE_IN := $(OUTPUT)libtraceevent-in.o +LIB_FILE := $(addprefix $(OUTPUT),$(LIB_FILE)) CMD_TARGETS = $(LIB_FILE) $(PLUGINS) TARGETS = $(CMD_TARGETS) - all: all_cmd all_cmd: $(CMD_TARGETS) -libtraceevent.so: $(PEVENT_LIB_OBJS) +$(TE_IN): force + $(Q)$(MAKE) $(build)=libtraceevent + +$(OUTPUT)libtraceevent.so: $(TE_IN) $(QUIET_LINK)$(CC) --shared $^ -o $@ -libtraceevent.a: $(PEVENT_LIB_OBJS) +$(OUTPUT)libtraceevent.a: $(TE_IN) $(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^ plugins: $(PLUGINS) -$(PEVENT_LIB_OBJS): %.o: $(src)/%.c TRACEEVENT-CFLAGS - $(QUIET_CC_FPIC)$(CC) -c $(CFLAGS) $(EXT) -fPIC $< -o $@ +__plugin_obj = $(notdir $@) + plugin_obj = $(__plugin_obj:-in.o=) -$(PLUGIN_OBJS): %.o : $(src)/%.c - $(QUIET_CC_FPIC)$(CC) -c $(CFLAGS) -fPIC -o $@ $< +$(PLUGINS_IN): 
force + $(Q)$(MAKE) $(build)=$(plugin_obj) -$(PLUGINS): %.so: %.o - $(QUIET_LINK)$(CC) $(CFLAGS) -shared -nostartfiles -o $@ $< +$(OUTPUT)%.so: $(OUTPUT)%-in.o + $(QUIET_LINK)$(CC) $(CFLAGS) -shared -nostartfiles -o $@ $^ define make_version.h (echo '/* This file is automatically generated. Do not modify. */'; \ @@ -255,40 +215,6 @@ define update_dir fi); endef -## make deps - -all_objs := $(sort $(ALL_OBJS)) -all_deps := $(all_objs:%.o=.%.d) - -# let .d file also depends on the source and header files -define check_deps - @set -e; $(RM) $@; \ - $(CC) -MM $(CFLAGS) $< > $@.$$$$; \ - sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \ - $(RM) $@.$$$$ -endef - -$(all_deps): .%.d: $(src)/%.c - $(Q)$(call check_deps) - -$(all_objs) : %.o : .%.d - -dep_includes := $(wildcard $(all_deps)) - -ifneq ($(dep_includes),) - include $(dep_includes) -endif - -### Detect environment changes -TRACK_CFLAGS = $(subst ','\'',$(CFLAGS)):$(ARCH):$(CROSS_COMPILE) - -TRACEEVENT-CFLAGS: force - @FLAGS='$(TRACK_CFLAGS)'; \ - if test x"$$FLAGS" != x"`cat TRACEEVENT-CFLAGS 2>/dev/null`" ; then \ - echo 1>&2 " FLAGS: * new build flags or cross compiler"; \ - echo "$$FLAGS" >TRACEEVENT-CFLAGS; \ - fi - tags: force $(RM) tags find . -name '*.[ch]' | xargs ctags --extra=+f --c-kinds=+px \ @@ -327,14 +253,9 @@ clean: $(RM) *.o *~ $(TARGETS) *.a *.so $(VERSION_FILES) .*.d \ $(RM) TRACEEVENT-CFLAGS tags TAGS -endif # skip-makefile - PHONY += force plugins force: -plugins: - @echo > /dev/null - # Declare the contents of the .PHONY variable as phony. We keep that # information in a variable so we can use it in if_changed and friends. 
.PHONY: $(PHONY) diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index 2c0bd8f2aad0..e0917c0f5d9f 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -304,7 +304,10 @@ int pevent_register_comm(struct pevent *pevent, const char *comm, int pid) if (!item) return -1; - item->comm = strdup(comm); + if (comm) + item->comm = strdup(comm); + else + item->comm = strdup("<...>"); if (!item->comm) { free(item); return -1; @@ -318,9 +321,14 @@ int pevent_register_comm(struct pevent *pevent, const char *comm, int pid) return 0; } -void pevent_register_trace_clock(struct pevent *pevent, char *trace_clock) +int pevent_register_trace_clock(struct pevent *pevent, const char *trace_clock) { - pevent->trace_clock = trace_clock; + pevent->trace_clock = strdup(trace_clock); + if (!pevent->trace_clock) { + errno = ENOMEM; + return -1; + } + return 0; } struct func_map { @@ -758,6 +766,11 @@ static void free_arg(struct print_arg *arg) free_arg(arg->hex.field); free_arg(arg->hex.size); break; + case PRINT_INT_ARRAY: + free_arg(arg->int_array.field); + free_arg(arg->int_array.count); + free_arg(arg->int_array.el_size); + break; case PRINT_TYPE: free(arg->typecast.type); free_arg(arg->typecast.item); @@ -1926,7 +1939,22 @@ process_op(struct event_format *event, struct print_arg *arg, char **tok) goto out_warn_free; type = process_arg_token(event, right, tok, type); - arg->op.right = right; + + if (right->type == PRINT_OP && + get_op_prio(arg->op.op) < get_op_prio(right->op.op)) { + struct print_arg tmp; + + /* rotate ops according to the priority */ + arg->op.right = right->op.left; + + tmp = *arg; + *arg = *right; + *right = tmp; + + arg->op.left = right; + } else { + arg->op.right = right; + } } else if (strcmp(token, "[") == 0) { @@ -2014,6 +2042,38 @@ process_entry(struct event_format *event __maybe_unused, struct print_arg *arg, return EVENT_ERROR; } +static int alloc_and_process_delim(struct event_format 
*event, char *next_token, + struct print_arg **print_arg) +{ + struct print_arg *field; + enum event_type type; + char *token; + int ret = 0; + + field = alloc_arg(); + if (!field) { + do_warning_event(event, "%s: not enough memory!", __func__); + errno = ENOMEM; + return -1; + } + + type = process_arg(event, field, &token); + + if (test_type_token(type, token, EVENT_DELIM, next_token)) { + errno = EINVAL; + ret = -1; + free_arg(field); + goto out_free_token; + } + + *print_arg = field; + +out_free_token: + free_token(token); + + return ret; +} + static char *arg_eval (struct print_arg *arg); static unsigned long long @@ -2486,49 +2546,46 @@ out_free: static enum event_type process_hex(struct event_format *event, struct print_arg *arg, char **tok) { - struct print_arg *field; - enum event_type type; - char *token = NULL; - memset(arg, 0, sizeof(*arg)); arg->type = PRINT_HEX; - field = alloc_arg(); - if (!field) { - do_warning_event(event, "%s: not enough memory!", __func__); - goto out_free; - } + if (alloc_and_process_delim(event, ",", &arg->hex.field)) + goto out; - type = process_arg(event, field, &token); + if (alloc_and_process_delim(event, ")", &arg->hex.size)) + goto free_field; - if (test_type_token(type, token, EVENT_DELIM, ",")) - goto out_free; + return read_token_item(tok); - arg->hex.field = field; +free_field: + free_arg(arg->hex.field); +out: + *tok = NULL; + return EVENT_ERROR; +} - free_token(token); +static enum event_type +process_int_array(struct event_format *event, struct print_arg *arg, char **tok) +{ + memset(arg, 0, sizeof(*arg)); + arg->type = PRINT_INT_ARRAY; - field = alloc_arg(); - if (!field) { - do_warning_event(event, "%s: not enough memory!", __func__); - *tok = NULL; - return EVENT_ERROR; - } + if (alloc_and_process_delim(event, ",", &arg->int_array.field)) + goto out; - type = process_arg(event, field, &token); + if (alloc_and_process_delim(event, ",", &arg->int_array.count)) + goto free_field; - if (test_type_token(type, token, 
EVENT_DELIM, ")")) - goto out_free; + if (alloc_and_process_delim(event, ")", &arg->int_array.el_size)) + goto free_size; - arg->hex.size = field; + return read_token_item(tok); - free_token(token); - type = read_token_item(tok); - return type; - - out_free: - free_arg(field); - free_token(token); +free_size: + free_arg(arg->int_array.count); +free_field: + free_arg(arg->int_array.field); +out: *tok = NULL; return EVENT_ERROR; } @@ -2828,6 +2885,10 @@ process_function(struct event_format *event, struct print_arg *arg, free_token(token); return process_hex(event, arg, tok); } + if (strcmp(token, "__print_array") == 0) { + free_token(token); + return process_int_array(event, arg, tok); + } if (strcmp(token, "__get_str") == 0) { free_token(token); return process_str(event, arg, tok); @@ -3356,6 +3417,7 @@ eval_num_arg(void *data, int size, struct event_format *event, struct print_arg break; case PRINT_FLAGS: case PRINT_SYMBOL: + case PRINT_INT_ARRAY: case PRINT_HEX: break; case PRINT_TYPE: @@ -3568,7 +3630,7 @@ static const struct flag flags[] = { { "HRTIMER_RESTART", 1 }, }; -static unsigned long long eval_flag(const char *flag) +static long long eval_flag(const char *flag) { int i; @@ -3584,7 +3646,7 @@ static unsigned long long eval_flag(const char *flag) if (strcmp(flags[i].name, flag) == 0) return flags[i].value; - return 0; + return -1LL; } static void print_str_to_seq(struct trace_seq *s, const char *format, @@ -3658,7 +3720,7 @@ static void print_str_arg(struct trace_seq *s, void *data, int size, struct print_flag_sym *flag; struct format_field *field; struct printk_map *printk; - unsigned long long val, fval; + long long val, fval; unsigned long addr; char *str; unsigned char *hex; @@ -3717,11 +3779,11 @@ static void print_str_arg(struct trace_seq *s, void *data, int size, print = 0; for (flag = arg->flags.flags; flag; flag = flag->next) { fval = eval_flag(flag->value); - if (!val && !fval) { + if (!val && fval < 0) { print_str_to_seq(s, format, len_arg, 
flag->str); break; } - if (fval && (val & fval) == fval) { + if (fval > 0 && (val & fval) == fval) { if (print && arg->flags.delim) trace_seq_puts(s, arg->flags.delim); print_str_to_seq(s, format, len_arg, flag->str); @@ -3766,6 +3828,54 @@ static void print_str_arg(struct trace_seq *s, void *data, int size, } break; + case PRINT_INT_ARRAY: { + void *num; + int el_size; + + if (arg->int_array.field->type == PRINT_DYNAMIC_ARRAY) { + unsigned long offset; + struct format_field *field = + arg->int_array.field->dynarray.field; + offset = pevent_read_number(pevent, + data + field->offset, + field->size); + num = data + (offset & 0xffff); + } else { + field = arg->int_array.field->field.field; + if (!field) { + str = arg->int_array.field->field.name; + field = pevent_find_any_field(event, str); + if (!field) + goto out_warning_field; + arg->int_array.field->field.field = field; + } + num = data + field->offset; + } + len = eval_num_arg(data, size, event, arg->int_array.count); + el_size = eval_num_arg(data, size, event, + arg->int_array.el_size); + for (i = 0; i < len; i++) { + if (i) + trace_seq_putc(s, ' '); + + if (el_size == 1) { + trace_seq_printf(s, "%u", *(uint8_t *)num); + } else if (el_size == 2) { + trace_seq_printf(s, "%u", *(uint16_t *)num); + } else if (el_size == 4) { + trace_seq_printf(s, "%u", *(uint32_t *)num); + } else if (el_size == 8) { + trace_seq_printf(s, "%lu", *(uint64_t *)num); + } else { + trace_seq_printf(s, "BAD SIZE:%d 0x%x", + el_size, *(uint8_t *)num); + el_size = 1; + } + + num += el_size; + } + break; + } case PRINT_TYPE: break; case PRINT_STRING: { @@ -3997,6 +4107,10 @@ static struct print_arg *make_bprint_args(char *fmt, void *data, int size, struc goto process_again; case '.': goto process_again; + case 'z': + case 'Z': + ls = 1; + goto process_again; case 'p': ls = 1; /* fall through */ @@ -4939,6 +5053,96 @@ const char *pevent_data_comm_from_pid(struct pevent *pevent, int pid) return comm; } +static struct cmdline * 
+pid_from_cmdlist(struct pevent *pevent, const char *comm, struct cmdline *next) +{ + struct cmdline_list *cmdlist = (struct cmdline_list *)next; + + if (cmdlist) + cmdlist = cmdlist->next; + else + cmdlist = pevent->cmdlist; + + while (cmdlist && strcmp(cmdlist->comm, comm) != 0) + cmdlist = cmdlist->next; + + return (struct cmdline *)cmdlist; +} + +/** + * pevent_data_pid_from_comm - return the pid from a given comm + * @pevent: a handle to the pevent + * @comm: the cmdline to find the pid from + * @next: the cmdline structure to find the next comm + * + * This returns the cmdline structure that holds a pid for a given + * comm, or NULL if none found. As there may be more than one pid for + * a given comm, the result of this call can be passed back into + * a recurring call in the @next paramater, and then it will find the + * next pid. + * Also, it does a linear seach, so it may be slow. + */ +struct cmdline *pevent_data_pid_from_comm(struct pevent *pevent, const char *comm, + struct cmdline *next) +{ + struct cmdline *cmdline; + + /* + * If the cmdlines have not been converted yet, then use + * the list. + */ + if (!pevent->cmdlines) + return pid_from_cmdlist(pevent, comm, next); + + if (next) { + /* + * The next pointer could have been still from + * a previous call before cmdlines were created + */ + if (next < pevent->cmdlines || + next >= pevent->cmdlines + pevent->cmdline_count) + next = NULL; + else + cmdline = next++; + } + + if (!next) + cmdline = pevent->cmdlines; + + while (cmdline < pevent->cmdlines + pevent->cmdline_count) { + if (strcmp(cmdline->comm, comm) == 0) + return cmdline; + cmdline++; + } + return NULL; +} + +/** + * pevent_cmdline_pid - return the pid associated to a given cmdline + * @cmdline: The cmdline structure to get the pid from + * + * Returns the pid for a give cmdline. If @cmdline is NULL, then + * -1 is returned. 
+ */ +int pevent_cmdline_pid(struct pevent *pevent, struct cmdline *cmdline) +{ + struct cmdline_list *cmdlist = (struct cmdline_list *)cmdline; + + if (!cmdline) + return -1; + + /* + * If cmdlines have not been created yet, or cmdline is + * not part of the array, then treat it as a cmdlist instead. + */ + if (!pevent->cmdlines || + cmdline < pevent->cmdlines || + cmdline >= pevent->cmdlines + pevent->cmdline_count) + return cmdlist->pid; + + return cmdline->pid; +} + /** * pevent_data_comm_from_pid - parse the data into the print format * @s: the trace_seq to write to @@ -5256,6 +5460,15 @@ static void print_args(struct print_arg *args) print_args(args->hex.size); printf(")"); break; + case PRINT_INT_ARRAY: + printf("__print_array("); + print_args(args->int_array.field); + printf(", "); + print_args(args->int_array.count); + printf(", "); + print_args(args->int_array.el_size); + printf(")"); + break; case PRINT_STRING: case PRINT_BSTRING: printf("__get_str(%s)", args->string.string); @@ -6228,15 +6441,20 @@ void pevent_ref(struct pevent *pevent) pevent->ref_count++; } +void pevent_free_format_field(struct format_field *field) +{ + free(field->type); + free(field->name); + free(field); +} + static void free_format_fields(struct format_field *field) { struct format_field *next; while (field) { next = field->next; - free(field->type); - free(field->name); - free(field); + pevent_free_format_field(field); field = next; } } @@ -6341,6 +6559,7 @@ void pevent_free(struct pevent *pevent) free_handler(handle); } + free(pevent->trace_clock); free(pevent->events); free(pevent->sort_events); diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h index 7a3873ff9a4f..86a5839fb048 100644 --- a/tools/lib/traceevent/event-parse.h +++ b/tools/lib/traceevent/event-parse.h @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -91,6 +92,7 @@ extern int trace_seq_putc(struct trace_seq *s, unsigned char c); extern void 
trace_seq_terminate(struct trace_seq *s); +extern int trace_seq_do_fprintf(struct trace_seq *s, FILE *fp); extern int trace_seq_do_printf(struct trace_seq *s); @@ -114,7 +116,7 @@ struct pevent_plugin_option { char *name; char *plugin_alias; char *description; - char *value; + const char *value; void *priv; int set; }; @@ -152,6 +154,10 @@ struct pevent_plugin_option { * .plugin_alias is used to give a shorter name to access * the vairable. Useful if a plugin handles more than one event. * + * If .value is not set, then it is considered a boolean and only + * .set will be processed. If .value is defined, then it is considered + * a string option and .set will be ignored. + * * PEVENT_PLUGIN_ALIAS: (optional) * The name to use for finding options (uses filename if not defined) */ @@ -245,6 +251,12 @@ struct print_arg_hex { struct print_arg *size; }; +struct print_arg_int_array { + struct print_arg *field; + struct print_arg *count; + struct print_arg *el_size; +}; + struct print_arg_dynarray { struct format_field *field; struct print_arg *index; @@ -273,6 +285,7 @@ enum print_arg_type { PRINT_FLAGS, PRINT_SYMBOL, PRINT_HEX, + PRINT_INT_ARRAY, PRINT_TYPE, PRINT_STRING, PRINT_BSTRING, @@ -292,6 +305,7 @@ struct print_arg { struct print_arg_flags flags; struct print_arg_symbol symbol; struct print_arg_hex hex; + struct print_arg_int_array int_array; struct print_arg_func func; struct print_arg_string string; struct print_arg_bitmask bitmask; @@ -597,7 +611,7 @@ enum trace_flag_type { }; int pevent_register_comm(struct pevent *pevent, const char *comm, int pid); -void pevent_register_trace_clock(struct pevent *pevent, char *trace_clock); +int pevent_register_trace_clock(struct pevent *pevent, const char *trace_clock); int pevent_register_function(struct pevent *pevent, char *name, unsigned long long addr, char *mod); int pevent_register_print_string(struct pevent *pevent, const char *fmt, @@ -617,6 +631,7 @@ enum pevent_errno pevent_parse_format(struct pevent *pevent, 
const char *buf, unsigned long size, const char *sys); void pevent_free_format(struct event_format *event); +void pevent_free_format_field(struct format_field *field); void *pevent_get_field_raw(struct trace_seq *s, struct event_format *event, const char *name, struct pevent_record *record, @@ -675,6 +690,11 @@ int pevent_data_type(struct pevent *pevent, struct pevent_record *rec); struct event_format *pevent_data_event_from_type(struct pevent *pevent, int type); int pevent_data_pid(struct pevent *pevent, struct pevent_record *rec); const char *pevent_data_comm_from_pid(struct pevent *pevent, int pid); +struct cmdline; +struct cmdline *pevent_data_pid_from_comm(struct pevent *pevent, const char *comm, + struct cmdline *next); +int pevent_cmdline_pid(struct pevent *pevent, struct cmdline *cmdline); + void pevent_event_info(struct trace_seq *s, struct event_format *event, struct pevent_record *record); int pevent_strerror(struct pevent *pevent, enum pevent_errno errnum, diff --git a/tools/lib/traceevent/event-plugin.c b/tools/lib/traceevent/event-plugin.c index 136162c03af1..a16756ae3526 100644 --- a/tools/lib/traceevent/event-plugin.c +++ b/tools/lib/traceevent/event-plugin.c @@ -18,6 +18,7 @@ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +#include #include #include #include @@ -49,6 +50,52 @@ struct plugin_list { void *handle; }; +static void lower_case(char *str) +{ + if (!str) + return; + for (; *str; str++) + *str = tolower(*str); +} + +static int update_option_value(struct pevent_plugin_option *op, const char *val) +{ + char *op_val; + + if (!val) { + /* toggle, only if option is boolean */ + if (op->value) + /* Warn? */ + return 0; + op->set ^= 1; + return 0; + } + + /* + * If the option has a value then it takes a string + * otherwise the option is a boolean. 
+ */ + if (op->value) { + op->value = val; + return 0; + } + + /* Option is boolean, must be either "1", "0", "true" or "false" */ + + op_val = strdup(val); + if (!op_val) + return -1; + lower_case(op_val); + + if (strcmp(val, "1") == 0 || strcmp(val, "true") == 0) + op->set = 1; + else if (strcmp(val, "0") == 0 || strcmp(val, "false") == 0) + op->set = 0; + free(op_val); + + return 0; +} + /** * traceevent_plugin_list_options - get list of plugin options * @@ -120,6 +167,7 @@ update_option(const char *file, struct pevent_plugin_option *option) { struct trace_plugin_options *op; char *plugin; + int ret = 0; if (option->plugin_alias) { plugin = strdup(option->plugin_alias); @@ -144,9 +192,10 @@ update_option(const char *file, struct pevent_plugin_option *option) if (strcmp(op->option, option->name) != 0) continue; - option->value = op->value; - option->set ^= 1; - goto out; + ret = update_option_value(option, op->value); + if (ret) + goto out; + break; } /* first look for unnamed options */ @@ -156,14 +205,13 @@ update_option(const char *file, struct pevent_plugin_option *option) if (strcmp(op->option, option->name) != 0) continue; - option->value = op->value; - option->set ^= 1; + ret = update_option_value(option, op->value); break; } out: free(plugin); - return 0; + return ret; } /** diff --git a/tools/lib/traceevent/kbuffer-parse.c b/tools/lib/traceevent/kbuffer-parse.c index dcc665228c71..3bcada3ae05a 100644 --- a/tools/lib/traceevent/kbuffer-parse.c +++ b/tools/lib/traceevent/kbuffer-parse.c @@ -372,7 +372,6 @@ translate_data(struct kbuffer *kbuf, void *data, void **rptr, switch (type_len) { case KBUFFER_TYPE_PADDING: *length = read_4(kbuf, data); - data += *length; break; case KBUFFER_TYPE_TIME_EXTEND: @@ -730,3 +729,14 @@ void kbuffer_set_old_format(struct kbuffer *kbuf) kbuf->next_event = __old_next_event; } + +/** + * kbuffer_start_of_data - return offset of where data starts on subbuffer + * @kbuf: The kbuffer + * + * Returns the location on the subbuffer 
where the data starts. + */ +int kbuffer_start_of_data(struct kbuffer *kbuf) +{ + return kbuf->start; +} diff --git a/tools/lib/traceevent/kbuffer.h b/tools/lib/traceevent/kbuffer.h index c831f64b17a0..03dce757553f 100644 --- a/tools/lib/traceevent/kbuffer.h +++ b/tools/lib/traceevent/kbuffer.h @@ -63,5 +63,6 @@ int kbuffer_missed_events(struct kbuffer *kbuf); int kbuffer_subbuffer_size(struct kbuffer *kbuf); void kbuffer_set_old_format(struct kbuffer *kbuf); +int kbuffer_start_of_data(struct kbuffer *kbuf); #endif /* _K_BUFFER_H */ diff --git a/tools/lib/traceevent/parse-filter.c b/tools/lib/traceevent/parse-filter.c index b50234402fc2..0144b3d1bb77 100644 --- a/tools/lib/traceevent/parse-filter.c +++ b/tools/lib/traceevent/parse-filter.c @@ -1058,6 +1058,7 @@ process_filter(struct event_format *event, struct filter_arg **parg, *parg = current_op; else *parg = current_exp; + free(token); return PEVENT_ERRNO__UNBALANCED_PAREN; } break; @@ -1168,6 +1169,7 @@ process_filter(struct event_format *event, struct filter_arg **parg, *parg = current_op; + free(token); return 0; fail_alloc: diff --git a/tools/lib/traceevent/trace-seq.c b/tools/lib/traceevent/trace-seq.c index ec3bd16a5488..292dc9f1d233 100644 --- a/tools/lib/traceevent/trace-seq.c +++ b/tools/lib/traceevent/trace-seq.c @@ -231,19 +231,24 @@ void trace_seq_terminate(struct trace_seq *s) s->buffer[s->len] = 0; } -int trace_seq_do_printf(struct trace_seq *s) +int trace_seq_do_fprintf(struct trace_seq *s, FILE *fp) { TRACE_SEQ_CHECK(s); switch (s->state) { case TRACE_SEQ__GOOD: - return printf("%.*s", s->len, s->buffer); + return fprintf(fp, "%.*s", s->len, s->buffer); case TRACE_SEQ__BUFFER_POISONED: - puts("Usage of trace_seq after it was destroyed"); + fprintf(fp, "%s\n", "Usage of trace_seq after it was destroyed"); break; case TRACE_SEQ__MEM_ALLOC_FAILED: - puts("Can't allocate trace_seq buffer memory"); + fprintf(fp, "%s\n", "Can't allocate trace_seq buffer memory"); break; } return -1; } + +int 
trace_seq_do_printf(struct trace_seq *s) +{ + return trace_seq_do_fprintf(s, stdout); +} diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore index 40399c3d97d6..812f904193e8 100644 --- a/tools/perf/.gitignore +++ b/tools/perf/.gitignore @@ -1,6 +1,7 @@ PERF-CFLAGS PERF-GUI-VARS PERF-VERSION-FILE +FEATURE-DUMP perf perf-read-vdso32 perf-read-vdsox32 diff --git a/tools/perf/Build b/tools/perf/Build new file mode 100644 index 000000000000..b77370ef7005 --- /dev/null +++ b/tools/perf/Build @@ -0,0 +1,44 @@ +perf-y += builtin-bench.o +perf-y += builtin-annotate.o +perf-y += builtin-diff.o +perf-y += builtin-evlist.o +perf-y += builtin-help.o +perf-y += builtin-sched.o +perf-y += builtin-buildid-list.o +perf-y += builtin-buildid-cache.o +perf-y += builtin-list.o +perf-y += builtin-record.o +perf-y += builtin-report.o +perf-y += builtin-stat.o +perf-y += builtin-timechart.o +perf-y += builtin-top.o +perf-y += builtin-script.o +perf-y += builtin-kmem.o +perf-y += builtin-lock.o +perf-y += builtin-kvm.o +perf-y += builtin-inject.o +perf-y += builtin-mem.o +perf-y += builtin-data.o + +perf-$(CONFIG_AUDIT) += builtin-trace.o +perf-$(CONFIG_LIBELF) += builtin-probe.o + +perf-y += bench/ +perf-y += tests/ + +perf-y += perf.o + +paths += -DPERF_HTML_PATH="BUILD_STR($(htmldir_SQ))" +paths += -DPERF_INFO_PATH="BUILD_STR($(infodir_SQ))" +paths += -DPERF_MAN_PATH="BUILD_STR($(mandir_SQ))" + +CFLAGS_builtin-help.o += $(paths) +CFLAGS_builtin-timechart.o += $(paths) +CFLAGS_perf.o += -DPERF_HTML_PATH="BUILD_STR($(htmldir_SQ))" -include $(OUTPUT)PERF-VERSION-FILE + +libperf-y += util/ +libperf-y += arch/ +libperf-y += ui/ +libperf-y += scripts/ + +gtk-y += ui/gtk/ diff --git a/tools/perf/Documentation/Build.txt b/tools/perf/Documentation/Build.txt new file mode 100644 index 000000000000..f6fc6507ba55 --- /dev/null +++ b/tools/perf/Documentation/Build.txt @@ -0,0 +1,49 @@ + +1) perf build +============= +The perf build process consists of several separated building blocks, +which 
are linked together to form the perf binary: + - libperf library (static) + - perf builtin commands + - traceevent library (static) + - GTK ui library + +Several makefiles govern the perf build: + + - Makefile + top level Makefile working as a wrapper that calls the main + Makefile.perf with a -j option to do parallel builds. + + - Makefile.perf + main makefile that triggers build of all perf objects including + installation and documentation processing. + + - tools/build/Makefile.build + main makefile of the build framework + + - tools/build/Build.include + build framework generic definitions + + - Build makefiles + makefiles that defines build objects + +Please refer to tools/build/Documentation/Build.txt for more +information about build framework. + + +2) perf build +============= +The Makefile.perf triggers the build framework for build objects: + perf, libperf, gtk + +resulting in following objects: + $ ls *-in.o + gtk-in.o libperf-in.o perf-in.o + +Those objects are then used in final linking: + libperf-gtk.so <- gtk-in.o libperf-in.o + perf <- perf-in.o libperf-in.o + + +NOTE this description is omitting other libraries involved, only + focusing on build framework outcomes diff --git a/tools/perf/Documentation/perf-buildid-cache.txt b/tools/perf/Documentation/perf-buildid-cache.txt index 0294c57b1f5e..dd07b55f58d8 100644 --- a/tools/perf/Documentation/perf-buildid-cache.txt +++ b/tools/perf/Documentation/perf-buildid-cache.txt @@ -12,9 +12,9 @@ SYNOPSIS DESCRIPTION ----------- -This command manages the build-id cache. It can add and remove files to/from -the cache. In the future it should as well purge older entries, set upper -limits for the space used by the cache, etc. +This command manages the build-id cache. It can add, remove, update and purge +files to/from the cache. In the future it should as well set upper limits for +the space used by the cache, etc. OPTIONS ------- @@ -36,14 +36,24 @@ OPTIONS actually made. 
-r:: --remove=:: - Remove specified file from the cache. + Remove a cached binary which has same build-id of specified file + from the cache. +-p:: +--purge=:: + Purge all cached binaries including older caches which have specified + path from the cache. -M:: --missing=:: List missing build ids in the cache for the specified file. -u:: ---update:: - Update specified file of the cache. It can be used to update kallsyms - kernel dso to vmlinux in order to support annotation. +--update=:: + Update specified file of the cache. Note that this doesn't remove + older entries since those may be still needed for annotating old + (or remote) perf.data. Only if there is already a cache which has + exactly same build-id, that is replaced by new one. It can be used + to update kallsyms and kernel dso to vmlinux in order to support + annotation. + -v:: --verbose:: Be more verbose. diff --git a/tools/perf/Documentation/perf-data.txt b/tools/perf/Documentation/perf-data.txt new file mode 100644 index 000000000000..be8fa1a0a97e --- /dev/null +++ b/tools/perf/Documentation/perf-data.txt @@ -0,0 +1,40 @@ +perf-data(1) +============== + +NAME +---- +perf-data - Data file related processing + +SYNOPSIS +-------- +[verse] +'perf data' [] [] + +DESCRIPTION +----------- +Data file related processing. + +COMMANDS +-------- +convert:: + Converts perf data file into another format (only CTF [1] format is supported for now). + It's possible to set data-convert debug variable to get debug messages from conversion, + like: + perf --debug data-convert data convert ... + +OPTIONS for 'convert' +--------------------- +--to-ctf:: + Triggers the CTF conversion, specify the path of CTF data directory. + +-i:: + Specify input perf data file path. + +-v:: +--verbose:: + Be more verbose (show counter open errors, etc). 
+ +SEE ALSO +-------- +linkperf:perf[1] +[1] Common Trace Format - http://www.efficios.com/ctf diff --git a/tools/perf/Documentation/perf-diff.txt b/tools/perf/Documentation/perf-diff.txt index e463caa3eb49..d1deb573877f 100644 --- a/tools/perf/Documentation/perf-diff.txt +++ b/tools/perf/Documentation/perf-diff.txt @@ -20,12 +20,20 @@ If no parameters are passed it will assume perf.data.old and perf.data. The differential profile is displayed only for events matching both specified perf.data files. +If no parameters are passed the samples will be sorted by dso and symbol. +As the perf.data files could come from different binaries, the symbols addresses +could vary. So perf diff is based on the comparison of the files and +symbols name. + OPTIONS ------- -D:: --dump-raw-trace:: Dump raw trace in ASCII. +--kallsyms=:: + kallsyms pathname + -m:: --modules:: Load module symbols. WARNING: use only with -k and LIVE kernel diff --git a/tools/perf/Documentation/perf-kmem.txt b/tools/perf/Documentation/perf-kmem.txt index 7c8fbbf3f61c..150253cc3c97 100644 --- a/tools/perf/Documentation/perf-kmem.txt +++ b/tools/perf/Documentation/perf-kmem.txt @@ -25,6 +25,10 @@ OPTIONS --input=:: Select the input file (default: perf.data unless stdin is a fifo) +-v:: +--verbose:: + Be more verbose. (show symbol address, etc) + --caller:: Show per-callsite statistics diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt index 3e2aec94f806..bada8933fdd4 100644 --- a/tools/perf/Documentation/perf-list.txt +++ b/tools/perf/Documentation/perf-list.txt @@ -26,6 +26,7 @@ counted. The following modifiers exist: u - user-space counting k - kernel counting h - hypervisor counting + I - non idle counting G - guest counting (in KVM guests) H - host counting (not in KVM guests) p - precise level @@ -127,6 +128,12 @@ To limit the list use: One or more types can be used at the same time, listing the events for the types specified. +Support raw format: + +. 
'--raw-dump', shows the raw-dump of all the events. +. '--raw-dump [hw|sw|cache|tracepoint|pmu|event_glob]', shows the raw-dump of + a certain kind of events. + SEE ALSO -------- linkperf:perf-stat[1], linkperf:perf-top[1], diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt index aaa869be3dc1..239609c09f83 100644 --- a/tools/perf/Documentation/perf-probe.txt +++ b/tools/perf/Documentation/perf-probe.txt @@ -47,6 +47,12 @@ OPTIONS -v:: --verbose:: Be more verbose (show parsed arguments, etc). + Can not use with -q. + +-q:: +--quiet:: + Be quiet (do not show any messages including errors). + Can not use with -v. -a:: --add=:: @@ -96,7 +102,7 @@ OPTIONS Dry run. With this option, --add and --del doesn't execute actual adding and removal operations. ---max-probes:: +--max-probes=NUM:: Set the maximum number of probe points for an event. Default is 128. -x:: @@ -104,8 +110,13 @@ OPTIONS Specify path to the executable or shared library file for user space tracing. Can also be used with --funcs option. +--demangle:: + Demangle application symbols. --no-demangle is also available + for disabling demangling. + --demangle-kernel:: - Demangle kernel symbols. + Demangle kernel symbols. --no-demangle-kernel is also available + for disabling kernel demangling. In absence of -m/-x options, perf probe checks if the first argument after the options is an absolute path name. If its an absolute path, perf probe @@ -137,6 +148,7 @@ Each probe argument follows below syntax. [NAME=]LOCALVAR|$retval|%REG|@SYMBOL[:TYPE] 'NAME' specifies the name of this argument (optional). You can use the name of local variable, local data structure member (e.g. var->field, var.field2), local array with fixed index (e.g. array[1], var->array[0], var->pointer[2]), or kprobe-tracer argument format (e.g. $retval, %ax, etc). Note that the name of this argument will be set as the last member name if you specify a local data structure member (e.g. 
field2 for 'var->field1.field2'.) +'$vars' special argument is also available for NAME; it is expanded to the local variables which can be accessed at the given probe point. 'TYPE' casts the type of this argument (optional). If omitted, perf probe automatically set the type based on debuginfo. You can specify 'string' type only for the local variable or structure member which is an array of or a pointer to 'char' or 'unsigned char' type. On x86 systems %REG is always the short form of the register: for example %AX. %RAX or %EAX is not valid. diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 31e977459c51..4847a793de65 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -55,6 +55,11 @@ OPTIONS If you want to profile write accesses in [0x1000~1008), just set 'mem:0x1000/8:w'. + - a group of events surrounded by a pair of braces ("{event1,event2,...}"). + Each event is separated by commas and the group should be quoted to + prevent the shell interpretation. You also need to use --group on + "perf report" to view group events together. + --filter=:: Event filter. @@ -62,9 +67,6 @@ OPTIONS --all-cpus:: System-wide collection from all CPUs. --l:: - Scale counter values. - -p:: --pid=:: Record events on existing process ID (comma separated list). @@ -107,6 +109,10 @@ OPTIONS specification with appended unit character - B/K/M/G. The size is rounded up to have nearest pages power of two value. +--group:: + Put all events in a single event group. This precedes the --event + option and remains only for backward compatibility. See --event. + -g:: Enables call-graph (stack chain/backtrace) recording. @@ -115,13 +121,19 @@ OPTIONS implies -g. 
Allows specifying "fp" (frame pointer) or "dwarf" - (DWARF's CFI - Call Frame Information) as the method to collect + (DWARF's CFI - Call Frame Information) or "lbr" + (Hardware Last Branch Record facility) as the method to collect the information used to show the call graphs. In some systems, where binaries are build with gcc --fomit-frame-pointer, using the "fp" method will produce bogus call graphs, using "dwarf", if available (perf tools linked to the libunwind library) should be used instead. + Using the "lbr" method doesn't require any compiler options. It + will produce call graphs from the hardware LBR registers. The + main limitation is that it is only available on new Intel + platforms, such as Haswell. It can only get user call chain. It + doesn't work with branch stack sampling at the same time. -q:: --quiet:: @@ -235,6 +247,16 @@ Capture machine state (registers) at interrupt, i.e., on counter overflows for each sample. List of captured registers depends on the architecture. This option is off by default. +--running-time:: +Record running and enabled time for read events (:S) + +-k:: +--clockid:: +Sets the clock id to use for the various time fields in the perf_event_type +records. See clock_gettime(). In particular CLOCK_MONOTONIC and +CLOCK_MONOTONIC_RAW are supported, some events might also allow +CLOCK_BOOTTIME, CLOCK_REALTIME and CLOCK_TAI. + SEE ALSO -------- linkperf:perf-stat[1], linkperf:perf-list[1] diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index dd7cccdde498..4879cf638824 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -40,6 +40,11 @@ OPTIONS Only consider symbols in these comms. CSV that understands file://filename entries. This option will affect the percentage of the overhead column. See --percentage for more info. +--pid=:: + Only show events for given process ID (comma separated list). 
+ +--tid=:: + Only show events for given thread ID (comma separated list). -d:: --dsos=:: Only consider symbols in these dsos. CSV that understands diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index a21eec05bc42..79445750fcb3 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -193,6 +193,12 @@ OPTIONS Only display events for these comms. CSV that understands file://filename entries. +--pid=:: + Only show events for given process ID (comma separated list). + +--tid=:: + Only show events for given thread ID (comma separated list). + -I:: --show-info:: Display extended information about the perf.data file. This adds diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt index 7e1b1f2bb83c..ba03fd5d1a54 100644 --- a/tools/perf/Documentation/perf-trace.txt +++ b/tools/perf/Documentation/perf-trace.txt @@ -55,6 +55,9 @@ OPTIONS --uid=:: Record events in threads owned by uid. Name or number. +--filter-pids=:: + Filter out events for these pids and for 'trace' itself (comma separated list). + -v:: --verbose=:: Verbosity level. @@ -115,6 +118,9 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs. --syscalls:: Trace system calls. This options is enabled by default. +--event:: + Trace other events, see 'perf list' for a complete list. + PAGEFAULTS ---------- diff --git a/tools/perf/Documentation/perf.txt b/tools/perf/Documentation/perf.txt index 1e8e400b4493..2b131776363e 100644 --- a/tools/perf/Documentation/perf.txt +++ b/tools/perf/Documentation/perf.txt @@ -13,11 +13,16 @@ SYNOPSIS OPTIONS ------- --debug:: - Setup debug variable (just verbose for now) in value + Setup debug variable (see list below) in value range (0, 10). 
Use like: --debug verbose # sets verbose = 1 --debug verbose=2 # sets verbose = 2 + List of debug variables allowed to set: + verbose - general debug messages + ordered-events - ordered events object debug messages + data-convert - data convert command debug messages + --buildid-dir:: Setup buildid cache directory. It has higher priority than buildid.dir config file option. diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index fbbfdc39271d..11ccbb22ea2b 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -1,5 +1,6 @@ tools/perf tools/scripts +tools/build tools/lib/traceevent tools/lib/api tools/lib/symbol/kallsyms.c diff --git a/tools/perf/Makefile b/tools/perf/Makefile index cb2e5868c8e8..c699dc35eef9 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -24,8 +24,8 @@ unexport MAKEFLAGS # (To override it, run 'make JOBS=1' and similar.) # ifeq ($(JOBS),) - JOBS := $(shell grep -c ^processor /proc/cpuinfo 2>/dev/null) - ifeq ($(JOBS),) + JOBS := $(shell egrep -c '^processor|^CPU' /proc/cpuinfo 2>/dev/null) + ifeq ($(JOBS),0) JOBS := 1 endif endif diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index aa6a50447c32..c43a20517591 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -68,7 +68,11 @@ include config/utilities.mak # for reading the x32 mode 32-bit compatibility VDSO in 64-bit mode # # Define NO_ZLIB if you do not want to support compressed kernel modules - +# +# Define LIBBABELTRACE if you DO want libbabeltrace support +# for CTF data format. +# +# Define NO_LZMA if you do not want to support compressed (xz) kernel modules ifeq ($(srctree),) srctree := $(patsubst %/,%,$(dir $(shell pwd))) @@ -82,13 +86,29 @@ endif ifneq ($(OUTPUT),) #$(info Determined 'OUTPUT' to be $(OUTPUT)) +# Adding $(OUTPUT) as a directory to look for source files, +# because use generated output files as sources dependency +# for flex/bison parsers. 
+VPATH += $(OUTPUT) +export VPATH endif +ifeq ($(V),1) + Q = +else + Q = @ +endif + +# Do not use make's built-in rules +# (this improves performance and avoids hard-to-debug behaviour); +MAKEFLAGS += -r + $(OUTPUT)PERF-VERSION-FILE: ../../.git/HEAD - @$(SHELL_PATH) util/PERF-VERSION-GEN $(OUTPUT) - @touch $(OUTPUT)PERF-VERSION-FILE + $(Q)$(SHELL_PATH) util/PERF-VERSION-GEN $(OUTPUT) + $(Q)touch $(OUTPUT)PERF-VERSION-FILE CC = $(CROSS_COMPILE)gcc +LD = $(CROSS_COMPILE)ld AR = $(CROSS_COMPILE)ar PKG_CONFIG = $(CROSS_COMPILE)pkg-config @@ -127,10 +147,6 @@ export prefix bindir sharedir sysconfdir DESTDIR SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__ # Guard against environment variables -BUILTIN_OBJS = -LIB_H = -LIB_OBJS = -GTK_OBJS = PYRF_OBJS = SCRIPT_SH = @@ -155,8 +171,8 @@ endif LIBTRACEEVENT = $(TE_PATH)libtraceevent.a export LIBTRACEEVENT -LIBAPIKFS = $(LIB_PATH)libapikfs.a -export LIBAPIKFS +LIBAPI = $(LIB_PATH)libapi.a +export LIBAPI # python extension build directories PYTHON_EXTBUILD := $(OUTPUT)python_ext_build/ @@ -167,7 +183,7 @@ export PYTHON_EXTBUILD_LIB PYTHON_EXTBUILD_TMP python-clean := $(call QUIET_CLEAN, python) $(RM) -r $(PYTHON_EXTBUILD) $(OUTPUT)python/perf.so PYTHON_EXT_SRCS := $(shell grep -v ^\# util/python-ext-sources) -PYTHON_EXT_DEPS := util/python-ext-sources util/setup.py $(LIBTRACEEVENT) $(LIBAPIKFS) +PYTHON_EXT_DEPS := util/python-ext-sources util/setup.py $(LIBTRACEEVENT) $(LIBAPI) $(OUTPUT)python/perf.so: $(PYTHON_EXT_SRCS) $(PYTHON_EXT_DEPS) $(QUIET_GEN)CFLAGS='$(CFLAGS)' $(PYTHON_WORD) util/setup.py \ @@ -206,297 +222,9 @@ endif export PERL_PATH -$(OUTPUT)util/parse-events-flex.c: util/parse-events.l $(OUTPUT)util/parse-events-bison.c - $(QUIET_FLEX)$(FLEX) -o $@ --header-file=$(OUTPUT)util/parse-events-flex.h $(PARSER_DEBUG_FLEX) util/parse-events.l - -$(OUTPUT)util/parse-events-bison.c: util/parse-events.y - $(QUIET_BISON)$(BISON) -v util/parse-events.y -d $(PARSER_DEBUG_BISON) -o $(OUTPUT)util/parse-events-bison.c -p 
parse_events_ - -$(OUTPUT)util/pmu-flex.c: util/pmu.l $(OUTPUT)util/pmu-bison.c - $(QUIET_FLEX)$(FLEX) -o $@ --header-file=$(OUTPUT)util/pmu-flex.h util/pmu.l - -$(OUTPUT)util/pmu-bison.c: util/pmu.y - $(QUIET_BISON)$(BISON) -v util/pmu.y -d -o $(OUTPUT)util/pmu-bison.c -p perf_pmu_ - -$(OUTPUT)util/parse-events.o: $(OUTPUT)util/parse-events-flex.c $(OUTPUT)util/parse-events-bison.c -$(OUTPUT)util/pmu.o: $(OUTPUT)util/pmu-flex.c $(OUTPUT)util/pmu-bison.c - LIB_FILE=$(OUTPUT)libperf.a -LIB_H += ../lib/symbol/kallsyms.h -LIB_H += ../../include/uapi/linux/perf_event.h -LIB_H += ../../include/linux/rbtree.h -LIB_H += ../../include/linux/list.h -LIB_H += ../../include/uapi/linux/const.h -LIB_H += ../include/linux/hash.h -LIB_H += ../../include/linux/stringify.h -LIB_H += util/include/linux/bitmap.h -LIB_H += ../include/linux/bitops.h -LIB_H += ../include/asm-generic/bitops/arch_hweight.h -LIB_H += ../include/asm-generic/bitops/atomic.h -LIB_H += ../include/asm-generic/bitops/const_hweight.h -LIB_H += ../include/asm-generic/bitops/find.h -LIB_H += ../include/asm-generic/bitops/fls64.h -LIB_H += ../include/asm-generic/bitops/fls.h -LIB_H += ../include/asm-generic/bitops/__ffs.h -LIB_H += ../include/asm-generic/bitops/__fls.h -LIB_H += ../include/asm-generic/bitops/hweight.h -LIB_H += ../include/asm-generic/bitops.h -LIB_H += ../include/linux/compiler.h -LIB_H += ../include/linux/log2.h -LIB_H += util/include/linux/const.h -LIB_H += util/include/linux/ctype.h -LIB_H += util/include/linux/kernel.h -LIB_H += util/include/linux/list.h -LIB_H += ../include/linux/export.h -LIB_H += util/include/linux/poison.h -LIB_H += util/include/linux/rbtree.h -LIB_H += util/include/linux/rbtree_augmented.h -LIB_H += util/include/linux/string.h -LIB_H += ../include/linux/types.h -LIB_H += util/include/linux/linkage.h -LIB_H += util/include/asm/asm-offsets.h -LIB_H += ../include/asm/bug.h -LIB_H += util/include/asm/byteorder.h -LIB_H += util/include/asm/swab.h -LIB_H += 
util/include/asm/system.h -LIB_H += util/include/asm/uaccess.h -LIB_H += util/include/dwarf-regs.h -LIB_H += util/include/asm/dwarf2.h -LIB_H += util/include/asm/cpufeature.h -LIB_H += util/include/asm/unistd_32.h -LIB_H += util/include/asm/unistd_64.h -LIB_H += perf.h -LIB_H += util/annotate.h -LIB_H += util/cache.h -LIB_H += util/callchain.h -LIB_H += util/build-id.h -LIB_H += util/db-export.h -LIB_H += util/debug.h -LIB_H += util/pmu.h -LIB_H += util/event.h -LIB_H += util/evsel.h -LIB_H += util/evlist.h -LIB_H += util/exec_cmd.h -LIB_H += util/find-vdso-map.c -LIB_H += util/levenshtein.h -LIB_H += util/machine.h -LIB_H += util/map.h -LIB_H += util/parse-options.h -LIB_H += util/parse-events.h -LIB_H += util/quote.h -LIB_H += util/util.h -LIB_H += util/xyarray.h -LIB_H += util/header.h -LIB_H += util/help.h -LIB_H += util/session.h -LIB_H += util/ordered-events.h -LIB_H += util/strbuf.h -LIB_H += util/strlist.h -LIB_H += util/strfilter.h -LIB_H += util/svghelper.h -LIB_H += util/tool.h -LIB_H += util/run-command.h -LIB_H += util/sigchain.h -LIB_H += util/dso.h -LIB_H += util/symbol.h -LIB_H += util/color.h -LIB_H += util/values.h -LIB_H += util/sort.h -LIB_H += util/hist.h -LIB_H += util/comm.h -LIB_H += util/thread.h -LIB_H += util/thread_map.h -LIB_H += util/trace-event.h -LIB_H += util/probe-finder.h -LIB_H += util/dwarf-aux.h -LIB_H += util/probe-event.h -LIB_H += util/pstack.h -LIB_H += util/cpumap.h -LIB_H += util/top.h -LIB_H += $(ARCH_INCLUDE) -LIB_H += util/cgroup.h -LIB_H += $(LIB_INCLUDE)traceevent/event-parse.h -LIB_H += util/target.h -LIB_H += util/rblist.h -LIB_H += util/intlist.h -LIB_H += util/perf_regs.h -LIB_H += util/unwind.h -LIB_H += util/vdso.h -LIB_H += util/tsc.h -LIB_H += ui/helpline.h -LIB_H += ui/progress.h -LIB_H += ui/util.h -LIB_H += ui/ui.h -LIB_H += util/data.h -LIB_H += util/kvm-stat.h -LIB_H += util/thread-stack.h - -LIB_OBJS += $(OUTPUT)util/abspath.o -LIB_OBJS += $(OUTPUT)util/alias.o -LIB_OBJS += $(OUTPUT)util/annotate.o 
-LIB_OBJS += $(OUTPUT)util/build-id.o -LIB_OBJS += $(OUTPUT)util/config.o -LIB_OBJS += $(OUTPUT)util/ctype.o -LIB_OBJS += $(OUTPUT)util/db-export.o -LIB_OBJS += $(OUTPUT)util/pmu.o -LIB_OBJS += $(OUTPUT)util/environment.o -LIB_OBJS += $(OUTPUT)util/event.o -LIB_OBJS += $(OUTPUT)util/evlist.o -LIB_OBJS += $(OUTPUT)util/evsel.o -LIB_OBJS += $(OUTPUT)util/exec_cmd.o -LIB_OBJS += $(OUTPUT)util/find_next_bit.o -LIB_OBJS += $(OUTPUT)util/help.o -LIB_OBJS += $(OUTPUT)util/kallsyms.o -LIB_OBJS += $(OUTPUT)util/levenshtein.o -LIB_OBJS += $(OUTPUT)util/parse-options.o -LIB_OBJS += $(OUTPUT)util/parse-events.o -LIB_OBJS += $(OUTPUT)util/path.o -LIB_OBJS += $(OUTPUT)util/rbtree.o -LIB_OBJS += $(OUTPUT)util/bitmap.o -LIB_OBJS += $(OUTPUT)util/hweight.o -LIB_OBJS += $(OUTPUT)util/run-command.o -LIB_OBJS += $(OUTPUT)util/quote.o -LIB_OBJS += $(OUTPUT)util/strbuf.o -LIB_OBJS += $(OUTPUT)util/string.o -LIB_OBJS += $(OUTPUT)util/strlist.o -LIB_OBJS += $(OUTPUT)util/strfilter.o -LIB_OBJS += $(OUTPUT)util/top.o -LIB_OBJS += $(OUTPUT)util/usage.o -LIB_OBJS += $(OUTPUT)util/wrapper.o -LIB_OBJS += $(OUTPUT)util/sigchain.o -LIB_OBJS += $(OUTPUT)util/dso.o -LIB_OBJS += $(OUTPUT)util/symbol.o -LIB_OBJS += $(OUTPUT)util/symbol-elf.o -LIB_OBJS += $(OUTPUT)util/color.o -LIB_OBJS += $(OUTPUT)util/pager.o -LIB_OBJS += $(OUTPUT)util/header.o -LIB_OBJS += $(OUTPUT)util/callchain.o -LIB_OBJS += $(OUTPUT)util/values.o -LIB_OBJS += $(OUTPUT)util/debug.o -LIB_OBJS += $(OUTPUT)util/machine.o -LIB_OBJS += $(OUTPUT)util/map.o -LIB_OBJS += $(OUTPUT)util/pstack.o -LIB_OBJS += $(OUTPUT)util/session.o -LIB_OBJS += $(OUTPUT)util/ordered-events.o -LIB_OBJS += $(OUTPUT)util/comm.o -LIB_OBJS += $(OUTPUT)util/thread.o -LIB_OBJS += $(OUTPUT)util/thread_map.o -LIB_OBJS += $(OUTPUT)util/trace-event-parse.o -LIB_OBJS += $(OUTPUT)util/parse-events-flex.o -LIB_OBJS += $(OUTPUT)util/parse-events-bison.o -LIB_OBJS += $(OUTPUT)util/pmu-flex.o -LIB_OBJS += $(OUTPUT)util/pmu-bison.o -LIB_OBJS += 
$(OUTPUT)util/trace-event-read.o -LIB_OBJS += $(OUTPUT)util/trace-event-info.o -LIB_OBJS += $(OUTPUT)util/trace-event-scripting.o -LIB_OBJS += $(OUTPUT)util/trace-event.o -LIB_OBJS += $(OUTPUT)util/svghelper.o -LIB_OBJS += $(OUTPUT)util/sort.o -LIB_OBJS += $(OUTPUT)util/hist.o -LIB_OBJS += $(OUTPUT)util/probe-event.o -LIB_OBJS += $(OUTPUT)util/util.o -LIB_OBJS += $(OUTPUT)util/xyarray.o -LIB_OBJS += $(OUTPUT)util/cpumap.o -LIB_OBJS += $(OUTPUT)util/cgroup.o -LIB_OBJS += $(OUTPUT)util/target.o -LIB_OBJS += $(OUTPUT)util/rblist.o -LIB_OBJS += $(OUTPUT)util/intlist.o -LIB_OBJS += $(OUTPUT)util/vdso.o -LIB_OBJS += $(OUTPUT)util/stat.o -LIB_OBJS += $(OUTPUT)util/record.o -LIB_OBJS += $(OUTPUT)util/srcline.o -LIB_OBJS += $(OUTPUT)util/data.o -LIB_OBJS += $(OUTPUT)util/tsc.o -LIB_OBJS += $(OUTPUT)util/cloexec.o -LIB_OBJS += $(OUTPUT)util/thread-stack.o - -LIB_OBJS += $(OUTPUT)ui/setup.o -LIB_OBJS += $(OUTPUT)ui/helpline.o -LIB_OBJS += $(OUTPUT)ui/progress.o -LIB_OBJS += $(OUTPUT)ui/util.o -LIB_OBJS += $(OUTPUT)ui/hist.o -LIB_OBJS += $(OUTPUT)ui/stdio/hist.o - -LIB_OBJS += $(OUTPUT)arch/common.o - -LIB_OBJS += $(OUTPUT)tests/parse-events.o -LIB_OBJS += $(OUTPUT)tests/dso-data.o -LIB_OBJS += $(OUTPUT)tests/attr.o -LIB_OBJS += $(OUTPUT)tests/vmlinux-kallsyms.o -LIB_OBJS += $(OUTPUT)tests/open-syscall.o -LIB_OBJS += $(OUTPUT)tests/open-syscall-all-cpus.o -LIB_OBJS += $(OUTPUT)tests/open-syscall-tp-fields.o -LIB_OBJS += $(OUTPUT)tests/mmap-basic.o -LIB_OBJS += $(OUTPUT)tests/perf-record.o -LIB_OBJS += $(OUTPUT)tests/rdpmc.o -LIB_OBJS += $(OUTPUT)tests/evsel-roundtrip-name.o -LIB_OBJS += $(OUTPUT)tests/evsel-tp-sched.o -LIB_OBJS += $(OUTPUT)tests/fdarray.o -LIB_OBJS += $(OUTPUT)tests/pmu.o -LIB_OBJS += $(OUTPUT)tests/hists_common.o -LIB_OBJS += $(OUTPUT)tests/hists_link.o -LIB_OBJS += $(OUTPUT)tests/hists_filter.o -LIB_OBJS += $(OUTPUT)tests/hists_output.o -LIB_OBJS += $(OUTPUT)tests/hists_cumulate.o -LIB_OBJS += $(OUTPUT)tests/python-use.o -LIB_OBJS += 
$(OUTPUT)tests/bp_signal.o -LIB_OBJS += $(OUTPUT)tests/bp_signal_overflow.o -LIB_OBJS += $(OUTPUT)tests/task-exit.o -LIB_OBJS += $(OUTPUT)tests/sw-clock.o -ifeq ($(ARCH),x86) -LIB_OBJS += $(OUTPUT)tests/perf-time-to-tsc.o -endif -LIB_OBJS += $(OUTPUT)tests/code-reading.o -LIB_OBJS += $(OUTPUT)tests/sample-parsing.o -LIB_OBJS += $(OUTPUT)tests/parse-no-sample-id-all.o -ifndef NO_DWARF_UNWIND -ifeq ($(ARCH),$(filter $(ARCH),x86 arm)) -LIB_OBJS += $(OUTPUT)tests/dwarf-unwind.o -endif -endif -LIB_OBJS += $(OUTPUT)tests/mmap-thread-lookup.o -LIB_OBJS += $(OUTPUT)tests/thread-mg-share.o -LIB_OBJS += $(OUTPUT)tests/switch-tracking.o - -BUILTIN_OBJS += $(OUTPUT)builtin-annotate.o -BUILTIN_OBJS += $(OUTPUT)builtin-bench.o -# Benchmark modules -BUILTIN_OBJS += $(OUTPUT)bench/sched-messaging.o -BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o -ifeq ($(ARCH), x86) -ifeq ($(IS_64_BIT), 1) -BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy-x86-64-asm.o -BUILTIN_OBJS += $(OUTPUT)bench/mem-memset-x86-64-asm.o -endif -endif -BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy.o -BUILTIN_OBJS += $(OUTPUT)bench/futex-hash.o -BUILTIN_OBJS += $(OUTPUT)bench/futex-wake.o -BUILTIN_OBJS += $(OUTPUT)bench/futex-requeue.o - -BUILTIN_OBJS += $(OUTPUT)builtin-diff.o -BUILTIN_OBJS += $(OUTPUT)builtin-evlist.o -BUILTIN_OBJS += $(OUTPUT)builtin-help.o -BUILTIN_OBJS += $(OUTPUT)builtin-sched.o -BUILTIN_OBJS += $(OUTPUT)builtin-buildid-list.o -BUILTIN_OBJS += $(OUTPUT)builtin-buildid-cache.o -BUILTIN_OBJS += $(OUTPUT)builtin-list.o -BUILTIN_OBJS += $(OUTPUT)builtin-record.o -BUILTIN_OBJS += $(OUTPUT)builtin-report.o -BUILTIN_OBJS += $(OUTPUT)builtin-stat.o -BUILTIN_OBJS += $(OUTPUT)builtin-timechart.o -BUILTIN_OBJS += $(OUTPUT)builtin-top.o -BUILTIN_OBJS += $(OUTPUT)builtin-script.o -BUILTIN_OBJS += $(OUTPUT)builtin-probe.o -BUILTIN_OBJS += $(OUTPUT)builtin-kmem.o -BUILTIN_OBJS += $(OUTPUT)builtin-lock.o -BUILTIN_OBJS += $(OUTPUT)builtin-kvm.o -BUILTIN_OBJS += $(OUTPUT)builtin-inject.o -BUILTIN_OBJS += 
$(OUTPUT)tests/builtin-test.o -BUILTIN_OBJS += $(OUTPUT)builtin-mem.o - -PERFLIBS = $(LIB_FILE) $(LIBAPIKFS) $(LIBTRACEEVENT) +PERFLIBS = $(LIB_FILE) $(LIBAPI) $(LIBTRACEEVENT) # We choose to avoid "if .. else if .. else .. endif endif" # because maintaining the nesting to match is a pain. If @@ -508,67 +236,9 @@ ifneq ($(OUTPUT),) CFLAGS += -I$(OUTPUT) endif -ifdef NO_LIBELF -# Remove ELF/DWARF dependent codes -LIB_OBJS := $(filter-out $(OUTPUT)util/symbol-elf.o,$(LIB_OBJS)) -LIB_OBJS := $(filter-out $(OUTPUT)util/dwarf-aux.o,$(LIB_OBJS)) -LIB_OBJS := $(filter-out $(OUTPUT)util/probe-event.o,$(LIB_OBJS)) -LIB_OBJS := $(filter-out $(OUTPUT)util/probe-finder.o,$(LIB_OBJS)) - -BUILTIN_OBJS := $(filter-out $(OUTPUT)builtin-probe.o,$(BUILTIN_OBJS)) - -# Use minimal symbol handling -LIB_OBJS += $(OUTPUT)util/symbol-minimal.o - -else # NO_LIBELF -ifndef NO_DWARF - LIB_OBJS += $(OUTPUT)util/probe-finder.o - LIB_OBJS += $(OUTPUT)util/dwarf-aux.o -endif # NO_DWARF -endif # NO_LIBELF - -ifndef NO_LIBDW_DWARF_UNWIND - LIB_OBJS += $(OUTPUT)util/unwind-libdw.o - LIB_H += util/unwind-libdw.h -endif - -ifndef NO_LIBUNWIND - LIB_OBJS += $(OUTPUT)util/unwind-libunwind.o -endif -LIB_OBJS += $(OUTPUT)tests/keep-tracking.o - -ifndef NO_LIBAUDIT - BUILTIN_OBJS += $(OUTPUT)builtin-trace.o -endif - -ifndef NO_SLANG - LIB_OBJS += $(OUTPUT)ui/browser.o - LIB_OBJS += $(OUTPUT)ui/browsers/annotate.o - LIB_OBJS += $(OUTPUT)ui/browsers/hists.o - LIB_OBJS += $(OUTPUT)ui/browsers/map.o - LIB_OBJS += $(OUTPUT)ui/browsers/scripts.o - LIB_OBJS += $(OUTPUT)ui/browsers/header.o - LIB_OBJS += $(OUTPUT)ui/tui/setup.o - LIB_OBJS += $(OUTPUT)ui/tui/util.o - LIB_OBJS += $(OUTPUT)ui/tui/helpline.o - LIB_OBJS += $(OUTPUT)ui/tui/progress.o - LIB_H += ui/tui/tui.h - LIB_H += ui/browser.h - LIB_H += ui/browsers/map.h - LIB_H += ui/keysyms.h - LIB_H += ui/libslang.h -endif - ifndef NO_GTK2 ALL_PROGRAMS += $(OUTPUT)libperf-gtk.so - - GTK_OBJS += $(OUTPUT)ui/gtk/browser.o - GTK_OBJS += $(OUTPUT)ui/gtk/hists.o - 
GTK_OBJS += $(OUTPUT)ui/gtk/setup.o - GTK_OBJS += $(OUTPUT)ui/gtk/util.o - GTK_OBJS += $(OUTPUT)ui/gtk/helpline.o - GTK_OBJS += $(OUTPUT)ui/gtk/progress.o - GTK_OBJS += $(OUTPUT)ui/gtk/annotate.o + GTK_IN := $(OUTPUT)gtk-in.o install-gtk: $(OUTPUT)libperf-gtk.so $(call QUIET_INSTALL, 'GTK UI') \ @@ -576,31 +246,6 @@ install-gtk: $(OUTPUT)libperf-gtk.so $(INSTALL) $(OUTPUT)libperf-gtk.so '$(DESTDIR_SQ)$(libdir_SQ)' endif -ifndef NO_LIBPERL - LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-perl.o - LIB_OBJS += $(OUTPUT)scripts/perl/Perf-Trace-Util/Context.o -endif - -ifndef NO_LIBPYTHON - LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-python.o - LIB_OBJS += $(OUTPUT)scripts/python/Perf-Trace-Util/Context.o -endif - -ifeq ($(NO_PERF_REGS),0) - ifeq ($(ARCH),x86) - LIB_H += arch/x86/include/perf_regs.h - endif - LIB_OBJS += $(OUTPUT)util/perf_regs.o -endif - -ifndef NO_LIBNUMA - BUILTIN_OBJS += $(OUTPUT)bench/numa.o -endif - -ifndef NO_ZLIB - LIB_OBJS += $(OUTPUT)util/zlib.o -endif - ifdef ASCIIDOC8 export ASCIIDOC8 endif @@ -616,40 +261,30 @@ SHELL = $(SHELL_PATH) all: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(OTHER_PROGRAMS) please_set_SHELL_PATH_to_a_more_modern_shell: - @$$(:) + $(Q)$$(:) shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell strip: $(PROGRAMS) $(OUTPUT)perf $(STRIP) $(STRIP_OPTS) $(PROGRAMS) $(OUTPUT)perf -$(OUTPUT)perf.o: perf.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -include $(OUTPUT)PERF-VERSION-FILE \ - '-DPERF_HTML_PATH="$(htmldir_SQ)"' \ - $(CFLAGS) -c $(filter %.c,$^) -o $@ +PERF_IN := $(OUTPUT)perf-in.o -$(OUTPUT)perf: $(OUTPUT)perf.o $(BUILTIN_OBJS) $(PERFLIBS) - $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $(OUTPUT)perf.o \ - $(BUILTIN_OBJS) $(LIBS) -o $@ +export srctree OUTPUT RM CC LD AR CFLAGS V BISON FLEX +build := -f $(srctree)/tools/build/Makefile.build dir=. 
obj -$(GTK_OBJS): $(OUTPUT)%.o: %.c $(LIB_H) - $(QUIET_CC)$(CC) -o $@ -c -fPIC $(CFLAGS) $(GTK_CFLAGS) $< +$(PERF_IN): $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h FORCE + $(Q)$(MAKE) $(build)=perf -$(OUTPUT)libperf-gtk.so: $(GTK_OBJS) $(PERFLIBS) +$(OUTPUT)perf: $(PERFLIBS) $(PERF_IN) + $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $(PERF_IN) $(LIBS) -o $@ + +$(GTK_IN): FORCE + $(Q)$(MAKE) $(build)=gtk + +$(OUTPUT)libperf-gtk.so: $(GTK_IN) $(PERFLIBS) $(QUIET_LINK)$(CC) -o $@ -shared $(LDFLAGS) $(filter %.o,$^) $(GTK_LIBS) -$(OUTPUT)builtin-help.o: builtin-help.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \ - '-DPERF_HTML_PATH="$(htmldir_SQ)"' \ - '-DPERF_MAN_PATH="$(mandir_SQ)"' \ - '-DPERF_INFO_PATH="$(infodir_SQ)"' $< - -$(OUTPUT)builtin-timechart.o: builtin-timechart.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \ - '-DPERF_HTML_PATH="$(htmldir_SQ)"' \ - '-DPERF_MAN_PATH="$(mandir_SQ)"' \ - '-DPERF_INFO_PATH="$(infodir_SQ)"' $< - $(OUTPUT)common-cmds.h: util/generate-cmdlist.sh command-list.txt $(OUTPUT)common-cmds.h: $(wildcard Documentation/perf-*.txt) @@ -659,8 +294,7 @@ $(SCRIPTS) : % : %.sh $(QUIET_GEN)$(INSTALL) '$@.sh' '$(OUTPUT)$@' # These can record PERF_VERSION -$(OUTPUT)perf.o perf.spec \ - $(SCRIPTS) \ +perf.spec $(SCRIPTS) \ : $(OUTPUT)PERF-VERSION-FILE .SUFFIXES: @@ -683,90 +317,33 @@ endif # These two need to be here so that when O= is not used they take precedence # over the general rule for .o -$(OUTPUT)util/%-flex.o: $(OUTPUT)util/%-flex.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c -Iutil/ $(CFLAGS) -w $< +# get relative building directory (to $(OUTPUT)) +# and '.' if it's $(OUTPUT) itself +__build-dir = $(subst $(OUTPUT),,$(dir $@)) +build-dir = $(if $(__build-dir),$(__build-dir),.) 
-$(OUTPUT)util/%-bison.o: $(OUTPUT)util/%-bison.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c -Iutil/ $(CFLAGS) -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w $< +single_dep: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h -$(OUTPUT)%.o: %.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $< -$(OUTPUT)%.i: %.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -E $(CFLAGS) $< -$(OUTPUT)%.s: %.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -S $(CFLAGS) $< -$(OUTPUT)%.o: %.S - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $< -$(OUTPUT)%.s: %.S - $(QUIET_CC)$(CC) -o $@ -E $(CFLAGS) $< +$(OUTPUT)%.o: %.c single_dep FORCE + $(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=$(build-dir) $@ -$(OUTPUT)util/exec_cmd.o: util/exec_cmd.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \ - '-DPERF_EXEC_PATH="$(perfexecdir_SQ)"' \ - '-DPREFIX="$(prefix_SQ)"' \ - $< +$(OUTPUT)%.i: %.c single_dep FORCE + $(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=$(build-dir) $@ -$(OUTPUT)tests/attr.o: tests/attr.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \ - '-DBINDIR="$(bindir_SQ)"' -DPYTHON='"$(PYTHON_WORD)"' \ - $< +$(OUTPUT)%.s: %.c single_dep FORCE + $(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=$(build-dir) $@ -$(OUTPUT)tests/python-use.o: tests/python-use.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \ - -DPYTHONPATH='"$(OUTPUT)python"' \ - -DPYTHON='"$(PYTHON_WORD)"' \ - $< +$(OUTPUT)%-bison.o: %.c single_dep FORCE + $(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=$(build-dir) $@ -$(OUTPUT)tests/dwarf-unwind.o: tests/dwarf-unwind.c - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -fno-optimize-sibling-calls $< +$(OUTPUT)%-flex.o: %.c single_dep FORCE + $(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=$(build-dir) $@ -$(OUTPUT)util/config.o: util/config.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $< +$(OUTPUT)%.o: %.S single_dep FORCE + $(Q)$(MAKE) -f 
$(srctree)/tools/build/Makefile.build dir=$(build-dir) $@ -$(OUTPUT)ui/setup.o: ui/setup.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DLIBDIR='"$(libdir_SQ)"' $< - -$(OUTPUT)ui/browser.o: ui/browser.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $< - -$(OUTPUT)ui/browsers/annotate.o: ui/browsers/annotate.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $< - -$(OUTPUT)ui/browsers/hists.o: ui/browsers/hists.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $< - -$(OUTPUT)ui/browsers/map.o: ui/browsers/map.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $< - -$(OUTPUT)ui/browsers/scripts.o: ui/browsers/scripts.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $< - -$(OUTPUT)util/kallsyms.o: ../lib/symbol/kallsyms.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $< - -$(OUTPUT)util/rbtree.o: ../../lib/rbtree.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -Wno-unused-parameter -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $< - -$(OUTPUT)util/hweight.o: ../../lib/hweight.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -Wno-unused-parameter -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $< - -$(OUTPUT)util/find_next_bit.o: ../lib/util/find_next_bit.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -Wno-unused-parameter -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $< - -$(OUTPUT)util/parse-events.o: util/parse-events.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -Wno-redundant-decls $< - -$(OUTPUT)util/scripting-engines/trace-event-perl.o: util/scripting-engines/trace-event-perl.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow -Wno-undef -Wno-switch-default $< - -$(OUTPUT)scripts/perl/Perf-Trace-Util/Context.o: 
scripts/perl/Perf-Trace-Util/Context.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs -Wno-undef -Wno-switch-default $< - -$(OUTPUT)util/scripting-engines/trace-event-python.o: util/scripting-engines/trace-event-python.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow $< - -$(OUTPUT)scripts/python/Perf-Trace-Util/Context.o: scripts/python/Perf-Trace-Util/Context.c $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs $< +$(OUTPUT)%.i: %.S single_dep FORCE + $(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=$(build-dir) $@ $(OUTPUT)perf-%: %.o $(PERFLIBS) $(QUIET_LINK)$(CC) $(CFLAGS) -o $@ $(LDFLAGS) $(filter %.o,$^) $(LIBS) @@ -781,58 +358,34 @@ $(OUTPUT)perf-read-vdsox32: perf-read-vdso.c util/find-vdso-map.c $(QUIET_CC)$(CC) -mx32 $(filter -static,$(LDFLAGS)) -Wall -Werror -o $@ perf-read-vdso.c endif -$(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H) -$(patsubst perf-%,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h) +$(patsubst perf-%,%.o,$(PROGRAMS)): $(wildcard */*.h) -# we compile into subdirectories. if the target directory is not the source directory, they might not exists. So -# we depend the various files onto their directories. 
-DIRECTORY_DEPS = $(LIB_OBJS) $(BUILTIN_OBJS) $(GTK_OBJS) -DIRECTORY_DEPS += $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h -# no need to add flex objects, because they depend on bison ones -DIRECTORY_DEPS += $(OUTPUT)util/parse-events-bison.c -DIRECTORY_DEPS += $(OUTPUT)util/pmu-bison.c +LIBPERF_IN := $(OUTPUT)libperf-in.o -OUTPUT_DIRECTORIES := $(sort $(dir $(DIRECTORY_DEPS))) +$(LIBPERF_IN): FORCE + $(Q)$(MAKE) $(build)=libperf -$(DIRECTORY_DEPS): | $(OUTPUT_DIRECTORIES) -# In the second step, we make a rule to actually create these directories -$(OUTPUT_DIRECTORIES): - $(QUIET_MKDIR)$(MKDIR) -p $@ 2>/dev/null +$(LIB_FILE): $(LIBPERF_IN) + $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIBPERF_IN) $(LIB_OBJS) -$(LIB_FILE): $(LIB_OBJS) - $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS) - -# libtraceevent.a -TE_SOURCES = $(wildcard $(TRACE_EVENT_DIR)*.[ch]) - -LIBTRACEEVENT_FLAGS = $(QUIET_SUBDIR1) O=$(OUTPUT) -LIBTRACEEVENT_FLAGS += CFLAGS="-g -Wall $(EXTRA_CFLAGS)" LIBTRACEEVENT_FLAGS += plugin_dir=$(plugindir_SQ) -$(LIBTRACEEVENT): $(TE_SOURCES) $(OUTPUT)PERF-CFLAGS - $(QUIET_SUBDIR0)$(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) libtraceevent.a plugins +$(LIBTRACEEVENT): FORCE + $(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) $(OUTPUT)libtraceevent.a plugins $(LIBTRACEEVENT)-clean: $(call QUIET_CLEAN, libtraceevent) - @$(MAKE) -C $(TRACE_EVENT_DIR) O=$(OUTPUT) clean >/dev/null + $(Q)$(MAKE) -C $(TRACE_EVENT_DIR) O=$(OUTPUT) clean >/dev/null install-traceevent-plugins: $(LIBTRACEEVENT) - $(QUIET_SUBDIR0)$(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) install_plugins + $(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) install_plugins -LIBAPIKFS_SOURCES = $(wildcard $(LIB_PATH)fs/*.[ch] $(LIB_PATH)fd/*.[ch]) +$(LIBAPI): FORCE + $(Q)$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) $(OUTPUT)libapi.a -# if subdir is set, we've been called from above so target has been built -# already -$(LIBAPIKFS): $(LIBAPIKFS_SOURCES) -ifeq ($(subdir),) - 
$(QUIET_SUBDIR0)$(LIB_DIR) $(QUIET_SUBDIR1) O=$(OUTPUT) libapikfs.a -endif - -$(LIBAPIKFS)-clean: -ifeq ($(subdir),) - $(call QUIET_CLEAN, libapikfs) - @$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) clean >/dev/null -endif +$(LIBAPI)-clean: + $(call QUIET_CLEAN, libapi) + $(Q)$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) clean >/dev/null help: @echo 'Perf make targets:' @@ -888,17 +441,6 @@ cscope: $(QUIET_GEN)$(RM) cscope*; \ $(FIND) $(TAG_FOLDERS) -name '*.[hcS]' -print | xargs cscope -b $(TAG_FILES) -### Detect prefix changes -TRACK_CFLAGS = $(subst ','\'',$(CFLAGS)):\ - $(bindir_SQ):$(perfexecdir_SQ):$(template_dir_SQ):$(prefix_SQ):$(plugindir_SQ) - -$(OUTPUT)PERF-CFLAGS: .FORCE-PERF-CFLAGS - @FLAGS='$(TRACK_CFLAGS)'; \ - if test x"$$FLAGS" != x"`cat $(OUTPUT)PERF-CFLAGS 2>/dev/null`" ; then \ - echo 1>&2 " FLAGS: * new build flags or prefix"; \ - echo "$$FLAGS" >$(OUTPUT)PERF-CFLAGS; \ - fi - ### Testing rules # GNU make supports exporting all variables by "export" without parameters. @@ -981,12 +523,14 @@ $(INSTALL_DOC_TARGETS): # config-clean: $(call QUIET_CLEAN, config) - @$(MAKE) -C config/feature-checks clean >/dev/null + $(Q)$(MAKE) -C $(srctree)/tools/build/feature/ clean >/dev/null -clean: $(LIBTRACEEVENT)-clean $(LIBAPIKFS)-clean config-clean - $(call QUIET_CLEAN, core-objs) $(RM) $(LIB_OBJS) $(BUILTIN_OBJS) $(LIB_FILE) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(OUTPUT)perf.o $(LANG_BINDINGS) $(GTK_OBJS) +clean: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean config-clean + $(call QUIET_CLEAN, core-objs) $(RM) $(LIB_FILE) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(LANG_BINDINGS) + $(Q)find . 
-name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete + $(Q)$(RM) .config-detected $(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 perf-read-vdsox32 - $(call QUIET_CLEAN, core-gen) $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)PERF-CFLAGS $(OUTPUT)PERF-FEATURES $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex* + $(call QUIET_CLEAN, core-gen) $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)FEATURE-DUMP $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex* $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) clean $(python-clean) @@ -1000,7 +544,9 @@ else GIT-HEAD-PHONY = endif +FORCE: + .PHONY: all install clean config-clean strip install-gtk .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell -.PHONY: $(GIT-HEAD-PHONY) TAGS tags cscope .FORCE-PERF-CFLAGS +.PHONY: $(GIT-HEAD-PHONY) TAGS tags cscope FORCE single_dep diff --git a/tools/perf/arch/Build b/tools/perf/arch/Build new file mode 100644 index 000000000000..109eb75cf7de --- /dev/null +++ b/tools/perf/arch/Build @@ -0,0 +1,2 @@ +libperf-y += common.o +libperf-y += $(ARCH)/ diff --git a/tools/perf/arch/arm/Build b/tools/perf/arch/arm/Build new file mode 100644 index 000000000000..41bf61da476a --- /dev/null +++ b/tools/perf/arch/arm/Build @@ -0,0 +1,2 @@ +libperf-y += util/ +libperf-$(CONFIG_DWARF_UNWIND) += tests/ diff --git a/tools/perf/arch/arm/Makefile b/tools/perf/arch/arm/Makefile index 09d62153d384..7fbca175099e 100644 --- a/tools/perf/arch/arm/Makefile +++ b/tools/perf/arch/arm/Makefile @@ -1,14 +1,3 @@ ifndef NO_DWARF PERF_HAVE_DWARF_REGS := 1 -LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o -endif -ifndef NO_LIBUNWIND -LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind-libunwind.o -endif -ifndef NO_LIBDW_DWARF_UNWIND -LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind-libdw.o -endif -ifndef NO_DWARF_UNWIND -LIB_OBJS += 
$(OUTPUT)arch/$(ARCH)/tests/regs_load.o -LIB_OBJS += $(OUTPUT)arch/$(ARCH)/tests/dwarf-unwind.o endif diff --git a/tools/perf/arch/arm/tests/Build b/tools/perf/arch/arm/tests/Build new file mode 100644 index 000000000000..b30eff9bcc83 --- /dev/null +++ b/tools/perf/arch/arm/tests/Build @@ -0,0 +1,2 @@ +libperf-y += regs_load.o +libperf-y += dwarf-unwind.o diff --git a/tools/perf/arch/arm/util/Build b/tools/perf/arch/arm/util/Build new file mode 100644 index 000000000000..d22e3d07de3d --- /dev/null +++ b/tools/perf/arch/arm/util/Build @@ -0,0 +1,4 @@ +libperf-$(CONFIG_DWARF) += dwarf-regs.o + +libperf-$(CONFIG_LIBUNWIND) += unwind-libunwind.o +libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o diff --git a/tools/perf/arch/arm64/Build b/tools/perf/arch/arm64/Build new file mode 100644 index 000000000000..54afe4a467e7 --- /dev/null +++ b/tools/perf/arch/arm64/Build @@ -0,0 +1 @@ +libperf-y += util/ diff --git a/tools/perf/arch/arm64/Makefile b/tools/perf/arch/arm64/Makefile index 67e9b3d38e89..7fbca175099e 100644 --- a/tools/perf/arch/arm64/Makefile +++ b/tools/perf/arch/arm64/Makefile @@ -1,7 +1,3 @@ ifndef NO_DWARF PERF_HAVE_DWARF_REGS := 1 -LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o -endif -ifndef NO_LIBUNWIND -LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind-libunwind.o endif diff --git a/tools/perf/arch/arm64/util/Build b/tools/perf/arch/arm64/util/Build new file mode 100644 index 000000000000..e58123a8912b --- /dev/null +++ b/tools/perf/arch/arm64/util/Build @@ -0,0 +1,2 @@ +libperf-$(CONFIG_DWARF) += dwarf-regs.o +libperf-$(CONFIG_LIBUNWIND) += unwind-libunwind.o diff --git a/tools/perf/arch/powerpc/Build b/tools/perf/arch/powerpc/Build new file mode 100644 index 000000000000..54afe4a467e7 --- /dev/null +++ b/tools/perf/arch/powerpc/Build @@ -0,0 +1 @@ +libperf-y += util/ diff --git a/tools/perf/arch/powerpc/Makefile b/tools/perf/arch/powerpc/Makefile index 6f7782bea5dd..7fbca175099e 100644 --- a/tools/perf/arch/powerpc/Makefile +++ 
b/tools/perf/arch/powerpc/Makefile @@ -1,6 +1,3 @@ ifndef NO_DWARF PERF_HAVE_DWARF_REGS := 1 -LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o -LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/skip-callchain-idx.o endif -LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/header.o diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build new file mode 100644 index 000000000000..0af6e9b3f728 --- /dev/null +++ b/tools/perf/arch/powerpc/util/Build @@ -0,0 +1,4 @@ +libperf-y += header.o + +libperf-$(CONFIG_DWARF) += dwarf-regs.o +libperf-$(CONFIG_DWARF) += skip-callchain-idx.o diff --git a/tools/perf/arch/s390/Build b/tools/perf/arch/s390/Build new file mode 100644 index 000000000000..54afe4a467e7 --- /dev/null +++ b/tools/perf/arch/s390/Build @@ -0,0 +1 @@ +libperf-y += util/ diff --git a/tools/perf/arch/s390/Makefile b/tools/perf/arch/s390/Makefile index 798ac7379c5f..21322e0385b8 100644 --- a/tools/perf/arch/s390/Makefile +++ b/tools/perf/arch/s390/Makefile @@ -1,7 +1,4 @@ ifndef NO_DWARF PERF_HAVE_DWARF_REGS := 1 -LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o endif -LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/header.o HAVE_KVM_STAT_SUPPORT := 1 -LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/kvm-stat.o diff --git a/tools/perf/arch/s390/util/Build b/tools/perf/arch/s390/util/Build new file mode 100644 index 000000000000..8a61372bb47a --- /dev/null +++ b/tools/perf/arch/s390/util/Build @@ -0,0 +1,4 @@ +libperf-y += header.o +libperf-y += kvm-stat.o + +libperf-$(CONFIG_DWARF) += dwarf-regs.o diff --git a/tools/perf/arch/sh/Build b/tools/perf/arch/sh/Build new file mode 100644 index 000000000000..54afe4a467e7 --- /dev/null +++ b/tools/perf/arch/sh/Build @@ -0,0 +1 @@ +libperf-y += util/ diff --git a/tools/perf/arch/sh/Makefile b/tools/perf/arch/sh/Makefile index 15130b50dfe3..7fbca175099e 100644 --- a/tools/perf/arch/sh/Makefile +++ b/tools/perf/arch/sh/Makefile @@ -1,4 +1,3 @@ ifndef NO_DWARF PERF_HAVE_DWARF_REGS := 1 -LIB_OBJS += 
$(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o endif diff --git a/tools/perf/arch/sh/util/Build b/tools/perf/arch/sh/util/Build new file mode 100644 index 000000000000..954e287bbb89 --- /dev/null +++ b/tools/perf/arch/sh/util/Build @@ -0,0 +1 @@ +libperf-$(CONFIG_DWARF) += dwarf-regs.o diff --git a/tools/perf/arch/sparc/Build b/tools/perf/arch/sparc/Build new file mode 100644 index 000000000000..54afe4a467e7 --- /dev/null +++ b/tools/perf/arch/sparc/Build @@ -0,0 +1 @@ +libperf-y += util/ diff --git a/tools/perf/arch/sparc/Makefile b/tools/perf/arch/sparc/Makefile index 15130b50dfe3..7fbca175099e 100644 --- a/tools/perf/arch/sparc/Makefile +++ b/tools/perf/arch/sparc/Makefile @@ -1,4 +1,3 @@ ifndef NO_DWARF PERF_HAVE_DWARF_REGS := 1 -LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o endif diff --git a/tools/perf/arch/sparc/util/Build b/tools/perf/arch/sparc/util/Build new file mode 100644 index 000000000000..954e287bbb89 --- /dev/null +++ b/tools/perf/arch/sparc/util/Build @@ -0,0 +1 @@ +libperf-$(CONFIG_DWARF) += dwarf-regs.o diff --git a/tools/perf/arch/x86/Build b/tools/perf/arch/x86/Build new file mode 100644 index 000000000000..41bf61da476a --- /dev/null +++ b/tools/perf/arch/x86/Build @@ -0,0 +1,2 @@ +libperf-y += util/ +libperf-$(CONFIG_DWARF_UNWIND) += tests/ diff --git a/tools/perf/arch/x86/Makefile b/tools/perf/arch/x86/Makefile index 9b21881db52f..21322e0385b8 100644 --- a/tools/perf/arch/x86/Makefile +++ b/tools/perf/arch/x86/Makefile @@ -1,19 +1,4 @@ ifndef NO_DWARF PERF_HAVE_DWARF_REGS := 1 -LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o endif -ifndef NO_LIBUNWIND -LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind-libunwind.o -endif -ifndef NO_LIBDW_DWARF_UNWIND -LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind-libdw.o -endif -ifndef NO_DWARF_UNWIND -LIB_OBJS += $(OUTPUT)arch/$(ARCH)/tests/regs_load.o -LIB_OBJS += $(OUTPUT)arch/$(ARCH)/tests/dwarf-unwind.o -endif -LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/header.o -LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/tsc.o 
-LIB_H += arch/$(ARCH)/util/tsc.h HAVE_KVM_STAT_SUPPORT := 1 -LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/kvm-stat.o diff --git a/tools/perf/arch/x86/tests/Build b/tools/perf/arch/x86/tests/Build new file mode 100644 index 000000000000..b30eff9bcc83 --- /dev/null +++ b/tools/perf/arch/x86/tests/Build @@ -0,0 +1,2 @@ +libperf-y += regs_load.o +libperf-y += dwarf-unwind.o diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build new file mode 100644 index 000000000000..cfbccc4e3187 --- /dev/null +++ b/tools/perf/arch/x86/util/Build @@ -0,0 +1,8 @@ +libperf-y += header.o +libperf-y += tsc.o +libperf-y += kvm-stat.o + +libperf-$(CONFIG_DWARF) += dwarf-regs.o + +libperf-$(CONFIG_LIBUNWIND) += unwind-libunwind.o +libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build new file mode 100644 index 000000000000..5ce98023d518 --- /dev/null +++ b/tools/perf/bench/Build @@ -0,0 +1,11 @@ +perf-y += sched-messaging.o +perf-y += sched-pipe.o +perf-y += mem-memcpy.o +perf-y += futex-hash.o +perf-y += futex-wake.o +perf-y += futex-requeue.o + +perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o +perf-$(CONFIG_X86_64) += mem-memset-x86-64-asm.o + +perf-$(CONFIG_NUMA) += numa.o diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 747f86103599..71bf7451c0ca 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -208,7 +208,7 @@ static int __cmd_annotate(struct perf_annotate *ann) goto out; } - ret = perf_session__process_events(session, &ann->tool); + ret = perf_session__process_events(session); if (ret) goto out; diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c index 50e6b66aea1f..d47a0cdc71c9 100644 --- a/tools/perf/builtin-buildid-cache.c +++ b/tools/perf/builtin-buildid-cache.c @@ -125,8 +125,7 @@ static int build_id_cache__kcore_existing(const char *from_dir, char *to_dir, return ret; } -static int 
build_id_cache__add_kcore(const char *filename, const char *debugdir, - bool force) +static int build_id_cache__add_kcore(const char *filename, bool force) { char dir[32], sbuildid[BUILD_ID_SIZE * 2 + 1]; char from_dir[PATH_MAX], to_dir[PATH_MAX]; @@ -143,7 +142,7 @@ static int build_id_cache__add_kcore(const char *filename, const char *debugdir, return -1; scnprintf(to_dir, sizeof(to_dir), "%s/[kernel.kcore]/%s", - debugdir, sbuildid); + buildid_dir, sbuildid); if (!force && !build_id_cache__kcore_existing(from_dir, to_dir, sizeof(to_dir))) { @@ -155,7 +154,7 @@ static int build_id_cache__add_kcore(const char *filename, const char *debugdir, return -1; scnprintf(to_dir, sizeof(to_dir), "%s/[kernel.kcore]/%s/%s", - debugdir, sbuildid, dir); + buildid_dir, sbuildid, dir); if (mkdir_p(to_dir, 0755)) return -1; @@ -183,7 +182,7 @@ static int build_id_cache__add_kcore(const char *filename, const char *debugdir, return 0; } -static int build_id_cache__add_file(const char *filename, const char *debugdir) +static int build_id_cache__add_file(const char *filename) { char sbuild_id[BUILD_ID_SIZE * 2 + 1]; u8 build_id[BUILD_ID_SIZE]; @@ -195,16 +194,14 @@ static int build_id_cache__add_file(const char *filename, const char *debugdir) } build_id__sprintf(build_id, sizeof(build_id), sbuild_id); - err = build_id_cache__add_s(sbuild_id, debugdir, filename, + err = build_id_cache__add_s(sbuild_id, filename, false, false); - if (verbose) - pr_info("Adding %s %s: %s\n", sbuild_id, filename, - err ? "FAIL" : "Ok"); + pr_debug("Adding %s %s: %s\n", sbuild_id, filename, + err ? 
"FAIL" : "Ok"); return err; } -static int build_id_cache__remove_file(const char *filename, - const char *debugdir) +static int build_id_cache__remove_file(const char *filename) { u8 build_id[BUILD_ID_SIZE]; char sbuild_id[BUILD_ID_SIZE * 2 + 1]; @@ -217,10 +214,34 @@ static int build_id_cache__remove_file(const char *filename, } build_id__sprintf(build_id, sizeof(build_id), sbuild_id); - err = build_id_cache__remove_s(sbuild_id, debugdir); - if (verbose) - pr_info("Removing %s %s: %s\n", sbuild_id, filename, - err ? "FAIL" : "Ok"); + err = build_id_cache__remove_s(sbuild_id); + pr_debug("Removing %s %s: %s\n", sbuild_id, filename, + err ? "FAIL" : "Ok"); + + return err; +} + +static int build_id_cache__purge_path(const char *pathname) +{ + struct strlist *list; + struct str_node *pos; + int err; + + err = build_id_cache__list_build_ids(pathname, &list); + if (err) + goto out; + + strlist__for_each(pos, list) { + err = build_id_cache__remove_s(pos->s); + pr_debug("Removing %s %s: %s\n", pos->s, pathname, + err ? "FAIL" : "Ok"); + if (err) + break; + } + strlist__delete(list); + +out: + pr_debug("Purging %s: %s\n", pathname, err ? 
"FAIL" : "Ok"); return err; } @@ -252,13 +273,12 @@ static int build_id_cache__fprintf_missing(struct perf_session *session, FILE *f return 0; } -static int build_id_cache__update_file(const char *filename, - const char *debugdir) +static int build_id_cache__update_file(const char *filename) { u8 build_id[BUILD_ID_SIZE]; char sbuild_id[BUILD_ID_SIZE * 2 + 1]; - int err; + int err = 0; if (filename__read_build_id(filename, &build_id, sizeof(build_id)) < 0) { pr_debug("Couldn't read a build-id in %s\n", filename); @@ -266,14 +286,14 @@ static int build_id_cache__update_file(const char *filename, } build_id__sprintf(build_id, sizeof(build_id), sbuild_id); - err = build_id_cache__remove_s(sbuild_id, debugdir); - if (!err) { - err = build_id_cache__add_s(sbuild_id, debugdir, filename, - false, false); - } - if (verbose) - pr_info("Updating %s %s: %s\n", sbuild_id, filename, - err ? "FAIL" : "Ok"); + if (build_id_cache__cached(sbuild_id)) + err = build_id_cache__remove_s(sbuild_id); + + if (!err) + err = build_id_cache__add_s(sbuild_id, filename, false, false); + + pr_debug("Updating %s %s: %s\n", sbuild_id, filename, + err ? 
"FAIL" : "Ok"); return err; } @@ -287,6 +307,7 @@ int cmd_buildid_cache(int argc, const char **argv, bool force = false; char const *add_name_list_str = NULL, *remove_name_list_str = NULL, + *purge_name_list_str = NULL, *missing_filename = NULL, *update_name_list_str = NULL, *kcore_filename = NULL; @@ -304,6 +325,8 @@ int cmd_buildid_cache(int argc, const char **argv, "file", "kcore file to add"), OPT_STRING('r', "remove", &remove_name_list_str, "file list", "file(s) to remove"), + OPT_STRING('p', "purge", &purge_name_list_str, "path list", + "path(s) to remove (remove old caches too)"), OPT_STRING('M', "missing", &missing_filename, "file", "to find missing build ids in the cache"), OPT_BOOLEAN('f', "force", &force, "don't complain, do it"), @@ -320,6 +343,11 @@ int cmd_buildid_cache(int argc, const char **argv, argc = parse_options(argc, argv, buildid_cache_options, buildid_cache_usage, 0); + if (argc || (!add_name_list_str && !kcore_filename && + !remove_name_list_str && !purge_name_list_str && + !missing_filename && !update_name_list_str)) + usage_with_options(buildid_cache_usage, buildid_cache_options); + if (missing_filename) { file.path = missing_filename; file.force = force; @@ -338,7 +366,7 @@ int cmd_buildid_cache(int argc, const char **argv, list = strlist__new(true, add_name_list_str); if (list) { strlist__for_each(pos, list) - if (build_id_cache__add_file(pos->s, buildid_dir)) { + if (build_id_cache__add_file(pos->s)) { if (errno == EEXIST) { pr_debug("%s already in the cache\n", pos->s); @@ -356,7 +384,25 @@ int cmd_buildid_cache(int argc, const char **argv, list = strlist__new(true, remove_name_list_str); if (list) { strlist__for_each(pos, list) - if (build_id_cache__remove_file(pos->s, buildid_dir)) { + if (build_id_cache__remove_file(pos->s)) { + if (errno == ENOENT) { + pr_debug("%s wasn't in the cache\n", + pos->s); + continue; + } + pr_warning("Couldn't remove %s: %s\n", + pos->s, strerror_r(errno, sbuf, sizeof(sbuf))); + } + + 
strlist__delete(list); + } + } + + if (purge_name_list_str) { + list = strlist__new(true, purge_name_list_str); + if (list) { + strlist__for_each(pos, list) + if (build_id_cache__purge_path(pos->s)) { if (errno == ENOENT) { pr_debug("%s wasn't in the cache\n", pos->s); @@ -377,7 +423,7 @@ int cmd_buildid_cache(int argc, const char **argv, list = strlist__new(true, update_name_list_str); if (list) { strlist__for_each(pos, list) - if (build_id_cache__update_file(pos->s, buildid_dir)) { + if (build_id_cache__update_file(pos->s)) { if (errno == ENOENT) { pr_debug("%s wasn't in the cache\n", pos->s); @@ -391,8 +437,7 @@ int cmd_buildid_cache(int argc, const char **argv, } } - if (kcore_filename && - build_id_cache__add_kcore(kcore_filename, buildid_dir, force)) + if (kcore_filename && build_id_cache__add_kcore(kcore_filename, force)) pr_warning("Couldn't add %s\n", kcore_filename); out: diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c index ed3873b3e238..feb420f74c2d 100644 --- a/tools/perf/builtin-buildid-list.c +++ b/tools/perf/builtin-buildid-list.c @@ -74,7 +74,7 @@ static int perf_session__list_build_ids(bool force, bool with_hits) * the record stream. 
Buildids are stored as RECORD_HEADER_BUILD_ID */ if (with_hits || perf_data_file__is_pipe(&file)) - perf_session__process_events(session, &build_id__mark_dso_hit_ops); + perf_session__process_events(session); perf_session__fprintf_dsos_buildid(session, stdout, dso__skip_buildid, with_hits); perf_session__delete(session); diff --git a/tools/perf/builtin-data.c b/tools/perf/builtin-data.c new file mode 100644 index 000000000000..d6525bc54d13 --- /dev/null +++ b/tools/perf/builtin-data.c @@ -0,0 +1,123 @@ +#include +#include "builtin.h" +#include "perf.h" +#include "debug.h" +#include "parse-options.h" +#include "data-convert-bt.h" + +typedef int (*data_cmd_fn_t)(int argc, const char **argv, const char *prefix); + +struct data_cmd { + const char *name; + const char *summary; + data_cmd_fn_t fn; +}; + +static struct data_cmd data_cmds[]; + +#define for_each_cmd(cmd) \ + for (cmd = data_cmds; cmd && cmd->name; cmd++) + +static const struct option data_options[] = { + OPT_END() +}; + +static const char * const data_subcommands[] = { "convert", NULL }; + +static const char *data_usage[] = { + "perf data [] []", + NULL +}; + +static void print_usage(void) +{ + struct data_cmd *cmd; + + printf("Usage:\n"); + printf("\t%s\n\n", data_usage[0]); + printf("\tAvailable commands:\n"); + + for_each_cmd(cmd) { + printf("\t %s\t- %s\n", cmd->name, cmd->summary); + } + + printf("\n"); +} + +static const char * const data_convert_usage[] = { + "perf data convert []", + NULL +}; + +static int cmd_data_convert(int argc, const char **argv, + const char *prefix __maybe_unused) +{ + const char *to_ctf = NULL; + bool force = false; + const struct option options[] = { + OPT_INCR('v', "verbose", &verbose, "be more verbose"), + OPT_STRING('i', "input", &input_name, "file", "input file name"), +#ifdef HAVE_LIBBABELTRACE_SUPPORT + OPT_STRING(0, "to-ctf", &to_ctf, NULL, "Convert to CTF format"), +#endif + OPT_BOOLEAN('f', "force", &force, "don't complain, do it"), + OPT_END() + }; + +#ifndef 
HAVE_LIBBABELTRACE_SUPPORT + pr_err("No conversion support compiled in.\n"); + return -1; +#endif + + argc = parse_options(argc, argv, options, + data_convert_usage, 0); + if (argc) { + usage_with_options(data_convert_usage, options); + return -1; + } + + if (to_ctf) { +#ifdef HAVE_LIBBABELTRACE_SUPPORT + return bt_convert__perf2ctf(input_name, to_ctf, force); +#else + pr_err("The libbabeltrace support is not compiled in.\n"); + return -1; +#endif + } + + return 0; +} + +static struct data_cmd data_cmds[] = { + { "convert", "converts data file between formats", cmd_data_convert }, + { .name = NULL, }, +}; + +int cmd_data(int argc, const char **argv, const char *prefix) +{ + struct data_cmd *cmd; + const char *cmdstr; + + /* No command specified. */ + if (argc < 2) + goto usage; + + argc = parse_options_subcommand(argc, argv, data_options, data_subcommands, data_usage, + PARSE_OPT_STOP_AT_NON_OPTION); + if (argc < 1) + goto usage; + + cmdstr = argv[0]; + + for_each_cmd(cmd) { + if (strcmp(cmd->name, cmdstr)) + continue; + + return cmd->fn(argc, argv, prefix); + } + + pr_err("Unknown command: %s\n", cmdstr); +usage: + print_usage(); + return -1; +} diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 74aada554b12..df6307b4050a 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -747,7 +747,7 @@ static int __cmd_diff(void) goto out_delete; } - ret = perf_session__process_events(d->session, &tool); + ret = perf_session__process_events(d->session); if (ret) { pr_err("Failed to process %s\n", d->file.path); goto out_delete; @@ -791,6 +791,8 @@ static const struct option options[] = { OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"), OPT_BOOLEAN('f', "force", &force, "don't complain, do it"), + OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name, + "file", "kallsyms pathname"), OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules, "load module symbols - WARNING: use only with -k and LIVE kernel"), 
OPT_STRING('d', "dsos", &symbol_conf.dso_list_str, "dso[,dso...]", @@ -802,7 +804,7 @@ static const struct option options[] = { OPT_STRING('s', "sort", &sort_order, "key[,key2...]", "sort by key(s): pid, comm, dso, symbol, parent, cpu, srcline, ..." " Please refer the man page for the complete list."), - OPT_STRING('t', "field-separator", &symbol_conf.field_sep, "separator", + OPT_STRING_NOEMPTY('t', "field-separator", &symbol_conf.field_sep, "separator", "separator for columns, no spaces will be added between " "columns '.' is reserved."), OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory", diff --git a/tools/perf/builtin-evlist.c b/tools/perf/builtin-evlist.c index 0f93f859b782..695ec5a50cf2 100644 --- a/tools/perf/builtin-evlist.c +++ b/tools/perf/builtin-evlist.c @@ -24,6 +24,7 @@ static int __cmd_evlist(const char *file_name, struct perf_attr_details *details struct perf_data_file file = { .path = file_name, .mode = PERF_DATA_MODE_READ, + .force = details->force, }; session = perf_session__new(&file, 0, NULL); @@ -47,6 +48,7 @@ int cmd_evlist(int argc, const char **argv, const char *prefix __maybe_unused) "Show all event attr details"), OPT_BOOLEAN('g', "group", &details.event_group, "Show event group information"), + OPT_BOOLEAN('f', "force", &details.force, "don't complain, do it"), OPT_END() }; const char * const evlist_usage[] = { diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c index 25d20628212e..36486eade1ef 100644 --- a/tools/perf/builtin-help.c +++ b/tools/perf/builtin-help.c @@ -437,7 +437,18 @@ int cmd_help(int argc, const char **argv, const char *prefix __maybe_unused) HELP_FORMAT_INFO), OPT_END(), }; - const char * const builtin_help_usage[] = { + const char * const builtin_help_subcommands[] = { + "buildid-cache", "buildid-list", "diff", "evlist", "help", "list", + "record", "report", "bench", "stat", "timechart", "top", "annotate", + "script", "sched", "kmem", "lock", "kvm", "test", "inject", "mem", "data", +#ifdef 
HAVE_LIBELF_SUPPORT + "probe", +#endif +#ifdef HAVE_LIBAUDIT_SUPPORT + "trace", +#endif + NULL }; + const char *builtin_help_usage[] = { "perf help [--all] [--man|--web|--info] [command]", NULL }; @@ -448,8 +459,8 @@ int cmd_help(int argc, const char **argv, const char *prefix __maybe_unused) perf_config(perf_help_config, &help_format); - argc = parse_options(argc, argv, builtin_help_options, - builtin_help_usage, 0); + argc = parse_options_subcommand(argc, argv, builtin_help_options, + builtin_help_subcommands, builtin_help_usage, 0); if (show_all) { printf("\n usage: %s\n\n", perf_usage_string); diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index a13641e066f5..40a33d7334cc 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -53,6 +53,13 @@ static int perf_event__repipe_synth(struct perf_tool *tool, return 0; } +static int perf_event__repipe_oe_synth(struct perf_tool *tool, + union perf_event *event, + struct ordered_events *oe __maybe_unused) +{ + return perf_event__repipe_synth(tool, event); +} + static int perf_event__repipe_op2_synth(struct perf_tool *tool, union perf_event *event, struct perf_session *session @@ -359,8 +366,6 @@ static int __cmd_inject(struct perf_inject *inject) } else if (inject->sched_stat) { struct perf_evsel *evsel; - inject->tool.ordered_events = true; - evlist__for_each(session->evlist, evsel) { const char *name = perf_evsel__name(evsel); @@ -379,7 +384,7 @@ static int __cmd_inject(struct perf_inject *inject) if (!file_out->is_pipe) lseek(fd, session->header.data_offset, SEEK_SET); - ret = perf_session__process_events(session, &inject->tool); + ret = perf_session__process_events(session); if (!file_out->is_pipe) { if (inject->build_ids) @@ -408,7 +413,7 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused) .unthrottle = perf_event__repipe, .attr = perf_event__repipe_attr, .tracing_data = perf_event__repipe_op2_synth, - .finished_round = 
perf_event__repipe_op2_synth, + .finished_round = perf_event__repipe_oe_synth, .build_id = perf_event__repipe_op2_synth, .id_index = perf_event__repipe_op2_synth, }, @@ -438,6 +443,7 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused) "be more verbose (show build ids, etc)"), OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name, "file", "kallsyms pathname"), + OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"), OPT_END() }; const char * const inject_usage[] = { @@ -458,6 +464,8 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused) return -1; } + inject.tool.ordered_events = inject.sched_stat; + file.path = inject.input_name; inject.session = perf_session__new(&file, true, &inject.tool); if (inject.session == NULL) diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index f295141025bc..4ebf65c79434 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -20,6 +20,7 @@ #include #include +#include struct alloc_stat; typedef int (*sort_fn_t)(struct alloc_stat *, struct alloc_stat *); @@ -275,10 +276,10 @@ static void __print_result(struct rb_root *root, struct perf_session *session, struct rb_node *next; struct machine *machine = &session->machines.host; - printf("%.102s\n", graph_dotted_line); + printf("%.105s\n", graph_dotted_line); printf(" %-34s |", is_caller ? 
"Callsite": "Alloc Ptr"); printf(" Total_alloc/Per | Total_req/Per | Hit | Ping-pong | Frag\n"); - printf("%.102s\n", graph_dotted_line); + printf("%.105s\n", graph_dotted_line); next = rb_first(root); @@ -304,7 +305,7 @@ static void __print_result(struct rb_root *root, struct perf_session *session, snprintf(buf, sizeof(buf), "%#" PRIx64 "", addr); printf(" %-34s |", buf); - printf(" %9llu/%-5lu | %9llu/%-5lu | %8lu | %8lu | %6.3f%%\n", + printf(" %9llu/%-5lu | %9llu/%-5lu | %8lu | %9lu | %6.3f%%\n", (unsigned long long)data->bytes_alloc, (unsigned long)data->bytes_alloc / data->hit, (unsigned long long)data->bytes_req, @@ -317,21 +318,21 @@ static void __print_result(struct rb_root *root, struct perf_session *session, } if (n_lines == -1) - printf(" ... | ... | ... | ... | ... | ... \n"); + printf(" ... | ... | ... | ... | ... | ... \n"); - printf("%.102s\n", graph_dotted_line); + printf("%.105s\n", graph_dotted_line); } static void print_summary(void) { printf("\nSUMMARY\n=======\n"); - printf("Total bytes requested: %lu\n", total_requested); - printf("Total bytes allocated: %lu\n", total_allocated); - printf("Total bytes wasted on internal fragmentation: %lu\n", + printf("Total bytes requested: %'lu\n", total_requested); + printf("Total bytes allocated: %'lu\n", total_allocated); + printf("Total bytes wasted on internal fragmentation: %'lu\n", total_allocated - total_requested); printf("Internal fragmentation: %f%%\n", fragmentation(total_requested, total_allocated)); - printf("Cross CPU allocations: %lu/%lu\n", nr_cross_allocs, nr_allocs); + printf("Cross CPU allocations: %'lu/%'lu\n", nr_cross_allocs, nr_allocs); } static void print_result(struct perf_session *session) @@ -426,7 +427,7 @@ static int __cmd_kmem(struct perf_session *session) } setup_pager(); - err = perf_session__process_events(session, &perf_kmem); + err = perf_session__process_events(session); if (err != 0) goto out; sort_result(); @@ -559,6 +560,7 @@ static int setup_sorting(struct list_head 
*sort_list, const char *arg) { char *tok; char *str = strdup(arg); + char *pos = str; if (!str) { pr_err("%s: strdup failed\n", __func__); @@ -566,7 +568,7 @@ static int setup_sorting(struct list_head *sort_list, const char *arg) } while (true) { - tok = strsep(&str, ","); + tok = strsep(&pos, ","); if (!tok) break; if (sort_dimension__add(tok, sort_list) < 0) { @@ -660,8 +662,13 @@ static int __cmd_record(int argc, const char **argv) int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused) { const char * const default_sort_order = "frag,hit,bytes"; + struct perf_data_file file = { + .mode = PERF_DATA_MODE_READ, + }; const struct option kmem_options[] = { OPT_STRING('i', "input", &input_name, "file", "input file name"), + OPT_INCR('v', "verbose", &verbose, + "be more verbose (show symbol address, etc)"), OPT_CALLBACK_NOOPT(0, "caller", NULL, NULL, "show per-callsite statistics", parse_caller_opt), OPT_CALLBACK_NOOPT(0, "alloc", NULL, NULL, @@ -671,6 +678,7 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused) parse_sort_opt), OPT_CALLBACK('l', "line", NULL, "num", "show n lines", parse_line_opt), OPT_BOOLEAN(0, "raw-ip", &raw_ip, "show raw ip instead of symbol"), + OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"), OPT_END() }; const char *const kmem_subcommands[] = { "record", "stat", NULL }; @@ -679,10 +687,6 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused) NULL }; struct perf_session *session; - struct perf_data_file file = { - .path = input_name, - .mode = PERF_DATA_MODE_READ, - }; int ret = -1; argc = parse_options_subcommand(argc, argv, kmem_options, @@ -696,6 +700,8 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused) return __cmd_record(argc, argv); } + file.path = input_name; + session = perf_session__new(&file, false, &perf_kmem); if (session == NULL) return -1; @@ -703,6 +709,8 @@ int cmd_kmem(int argc, const char **argv, const char 
*prefix __maybe_unused) symbol__init(&session->header.env); if (!strcmp(argv[0], "stat")) { + setlocale(LC_ALL, ""); + if (cpu__setup_cpunode_map()) goto out_delete; diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 0894a817f67e..1f9338f6109c 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -18,6 +18,7 @@ #include "util/stat.h" #include "util/top.h" #include "util/data.h" +#include "util/ordered-events.h" #include #ifdef HAVE_TIMERFD_SUPPORT @@ -730,9 +731,9 @@ static s64 perf_kvm__mmap_read_idx(struct perf_kvm_stat *kvm, int idx, return -1; } - err = perf_session_queue_event(kvm->session, event, &kvm->tool, &sample, 0); + err = perf_session__queue_event(kvm->session, event, &sample, 0); /* - * FIXME: Here we can't consume the event, as perf_session_queue_event will + * FIXME: Here we can't consume the event, as perf_session__queue_event will * point to it, and it'll get possibly overwritten by the kernel. */ perf_evlist__mmap_consume(kvm->evlist, idx); @@ -783,8 +784,10 @@ static int perf_kvm__mmap_read(struct perf_kvm_stat *kvm) /* flush queue after each round in which we processed events */ if (ntotal) { - kvm->session->ordered_events.next_flush = flush_time; - err = kvm->tool.finished_round(&kvm->tool, NULL, kvm->session); + struct ordered_events *oe = &kvm->session->ordered_events; + + oe->next_flush = flush_time; + err = ordered_events__flush(oe, OE_FLUSH__ROUND); if (err) { if (kvm->lost_events) pr_info("\nLost events: %" PRIu64 "\n\n", @@ -1044,6 +1047,7 @@ static int read_events(struct perf_kvm_stat *kvm) struct perf_data_file file = { .path = kvm->file_name, .mode = PERF_DATA_MODE_READ, + .force = kvm->force, }; kvm->tool = eops; @@ -1066,7 +1070,7 @@ static int read_events(struct perf_kvm_stat *kvm) if (ret < 0) return ret; - return perf_session__process_events(kvm->session, &kvm->tool); + return perf_session__process_events(kvm->session); } static int parse_target_str(struct perf_kvm_stat *kvm) @@ -1201,6 
+1205,7 @@ kvm_events_report(struct perf_kvm_stat *kvm, int argc, const char **argv) " time (sort by avg time)"), OPT_STRING('p', "pid", &kvm->opts.target.pid, "pid", "analyze events only for given process id(s)"), + OPT_BOOLEAN('f', "force", &kvm->force, "don't complain, do it"), OPT_END() }; diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c index 198f3c3aff95..af5bd0514108 100644 --- a/tools/perf/builtin-list.c +++ b/tools/perf/builtin-list.c @@ -36,38 +36,36 @@ int cmd_list(int argc, const char **argv, const char *prefix __maybe_unused) setup_pager(); - if (raw_dump) { - print_events(NULL, true); - return 0; - } + if (!raw_dump) + printf("\nList of pre-defined events (to be used in -e):\n\n"); if (argc == 0) { - print_events(NULL, false); + print_events(NULL, raw_dump); return 0; } for (i = 0; i < argc; ++i) { - if (i) - putchar('\n'); - if (strncmp(argv[i], "tracepoint", 10) == 0) - print_tracepoint_events(NULL, NULL, false); + if (strcmp(argv[i], "tracepoint") == 0) + print_tracepoint_events(NULL, NULL, raw_dump); else if (strcmp(argv[i], "hw") == 0 || strcmp(argv[i], "hardware") == 0) - print_events_type(PERF_TYPE_HARDWARE); + print_symbol_events(NULL, PERF_TYPE_HARDWARE, + event_symbols_hw, PERF_COUNT_HW_MAX, raw_dump); else if (strcmp(argv[i], "sw") == 0 || strcmp(argv[i], "software") == 0) - print_events_type(PERF_TYPE_SOFTWARE); + print_symbol_events(NULL, PERF_TYPE_SOFTWARE, + event_symbols_sw, PERF_COUNT_SW_MAX, raw_dump); else if (strcmp(argv[i], "cache") == 0 || strcmp(argv[i], "hwcache") == 0) - print_hwcache_events(NULL, false); + print_hwcache_events(NULL, raw_dump); else if (strcmp(argv[i], "pmu") == 0) - print_pmu_events(NULL, false); + print_pmu_events(NULL, raw_dump); else { char *sep = strchr(argv[i], ':'), *s; int sep_idx; if (sep == NULL) { - print_events(argv[i], false); + print_events(argv[i], raw_dump); continue; } sep_idx = sep - argv[i]; @@ -76,7 +74,7 @@ int cmd_list(int argc, const char **argv, const char *prefix 
__maybe_unused) return -1; s[sep_idx] = '\0'; - print_tracepoint_events(s, s + sep_idx + 1, false); + print_tracepoint_events(s, s + sep_idx + 1, raw_dump); free(s); } } diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index e7ec71589da6..d49c2ab85fc2 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -846,6 +846,8 @@ static const struct perf_evsel_str_handler lock_tracepoints[] = { { "lock:lock_release", perf_evsel__process_lock_release, }, /* CONFIG_LOCKDEP */ }; +static bool force; + static int __cmd_report(bool display_info) { int err = -EINVAL; @@ -857,6 +859,7 @@ static int __cmd_report(bool display_info) struct perf_data_file file = { .path = input_name, .mode = PERF_DATA_MODE_READ, + .force = force, }; session = perf_session__new(&file, false, &eops); @@ -878,7 +881,7 @@ static int __cmd_report(bool display_info) if (select_key()) goto out_delete; - err = perf_session__process_events(session, &eops); + err = perf_session__process_events(session); if (err) goto out_delete; @@ -945,6 +948,7 @@ int cmd_lock(int argc, const char **argv, const char *prefix __maybe_unused) "dump thread list in perf.data"), OPT_BOOLEAN('m', "map", &info_map, "map of lock instances (address:name table)"), + OPT_BOOLEAN('f', "force", &force, "don't complain, do it"), OPT_END() }; const struct option lock_options[] = { @@ -956,6 +960,7 @@ int cmd_lock(int argc, const char **argv, const char *prefix __maybe_unused) const struct option report_options[] = { OPT_STRING('k', "key", &sort_key, "acquired", "key for sorting (acquired / contended / avg_wait / wait_total / wait_max / wait_min)"), + OPT_BOOLEAN('f', "force", &force, "don't complain, do it"), /* TODO: type */ OPT_END() }; diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 9b5663950a4d..675216e08bfc 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -15,6 +15,7 @@ struct perf_mem { char const *input_name; bool hide_unresolved; bool dump_raw; + bool 
force; int operation; const char *cpu_list; DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); @@ -120,6 +121,7 @@ static int report_raw_events(struct perf_mem *mem) struct perf_data_file file = { .path = input_name, .mode = PERF_DATA_MODE_READ, + .force = mem->force, }; int err = -EINVAL; int ret; @@ -141,7 +143,7 @@ static int report_raw_events(struct perf_mem *mem) printf("# PID, TID, IP, ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n"); - err = perf_session__process_events(session, &mem->tool); + err = perf_session__process_events(session); if (err) return err; @@ -286,10 +288,11 @@ int cmd_mem(int argc, const char **argv, const char *prefix __maybe_unused) "input file name"), OPT_STRING('C', "cpu", &mem.cpu_list, "cpu", "list of cpus to profile"), - OPT_STRING('x', "field-separator", &symbol_conf.field_sep, + OPT_STRING_NOEMPTY('x', "field-separator", &symbol_conf.field_sep, "separator", "separator for columns, no spaces will be added" " between columns '.' is reserved."), + OPT_BOOLEAN('f', "force", &mem.force, "don't complain, do it"), OPT_END() }; const char *const mem_subcommands[] = { "record", "report", NULL }; diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c index 921bb6942503..f7b1af67e9f6 100644 --- a/tools/perf/builtin-probe.c +++ b/tools/perf/builtin-probe.c @@ -56,6 +56,7 @@ static struct { bool mod_events; bool uprobes; bool quiet; + bool target_used; int nevents; struct perf_probe_event events[MAX_PROBES]; struct strlist *dellist; @@ -78,6 +79,12 @@ static int parse_probe_event(const char *str) } pev->uprobes = params.uprobes; + if (params.target) { + pev->target = strdup(params.target); + if (!pev->target) + return -ENOMEM; + params.target_used = true; + } /* Parse a perf-probe command into event */ ret = parse_perf_probe_command(str, pev); @@ -102,6 +109,7 @@ static int set_target(const char *ptr) params.target = strdup(ptr); if (!params.target) return -ENOMEM; + params.target_used = false; found = 1; buf = ptr + (strlen(ptr) - 3); @@ -178,7 
+186,7 @@ static int opt_set_target(const struct option *opt, const char *str, int ret = -ENOENT; char *tmp; - if (str && !params.target) { + if (str) { if (!strcmp(opt->long_name, "exec")) params.uprobes = true; #ifdef HAVE_DWARF_SUPPORT @@ -200,7 +208,9 @@ static int opt_set_target(const struct option *opt, const char *str, if (!tmp) return -ENOMEM; } + free(params.target); params.target = tmp; + params.target_used = false; ret = 0; } @@ -485,9 +495,14 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused) } if (params.nevents) { + /* Ensure the last given target is used */ + if (params.target && !params.target_used) { + pr_warning(" Error: -x/-m must follow the probe definitions.\n"); + usage_with_options(probe_usage, options); + } + ret = add_perf_probe_events(params.events, params.nevents, params.max_probe_points, - params.target, params.force_add); if (ret < 0) { pr_err_with_code(" Error: Failed to add events.", ret); diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 404ab3434052..c3efdfb630b5 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -70,8 +70,8 @@ static int process_synthesized_event(struct perf_tool *tool, static int record__mmap_read(struct record *rec, int idx) { struct perf_mmap *md = &rec->evlist->mmap[idx]; - unsigned int head = perf_mmap__read_head(md); - unsigned int old = md->prev; + u64 head = perf_mmap__read_head(md); + u64 old = md->prev; unsigned char *data = md->base + page_size; unsigned long size; void *buf; @@ -161,8 +161,9 @@ try_again: } } - if (perf_evlist__apply_filters(evlist)) { - error("failed to set filter with %d (%s)\n", errno, + if (perf_evlist__apply_filters(evlist, &pos)) { + error("failed to set filter \"%s\" on event %s with %d (%s)\n", + pos->filter, perf_evsel__name(pos), errno, strerror_r(errno, msg, sizeof(msg))); rc = -1; goto out; @@ -225,7 +226,7 @@ static int process_buildids(struct record *rec) */ symbol_conf.ignore_vmlinux_buildid 
= true; - return perf_session__process_events(session, &rec->tool); + return perf_session__process_events(session); } static void perf_event__synthesize_guest_os(struct machine *machine, void *data) @@ -343,7 +344,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) signal(SIGINT, sig_handler); signal(SIGTERM, sig_handler); - session = perf_session__new(file, false, NULL); + session = perf_session__new(file, false, tool); if (session == NULL) { pr_err("Perf session creation failed.\n"); return -1; @@ -658,7 +659,7 @@ error: static void callchain_debug(void) { - static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF" }; + static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" }; pr_debug("callchain: type %s\n", str[callchain_param.record_mode]); @@ -710,6 +711,90 @@ static int perf_record_config(const char *var, const char *value, void *cb) return perf_default_config(var, value, cb); } +struct clockid_map { + const char *name; + int clockid; +}; + +#define CLOCKID_MAP(n, c) \ + { .name = n, .clockid = (c), } + +#define CLOCKID_END { .name = NULL, } + + +/* + * Add the missing ones, we need to build on many distros... 
+ */ +#ifndef CLOCK_MONOTONIC_RAW +#define CLOCK_MONOTONIC_RAW 4 +#endif +#ifndef CLOCK_BOOTTIME +#define CLOCK_BOOTTIME 7 +#endif +#ifndef CLOCK_TAI +#define CLOCK_TAI 11 +#endif + +static const struct clockid_map clockids[] = { + /* available for all events, NMI safe */ + CLOCKID_MAP("monotonic", CLOCK_MONOTONIC), + CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW), + + /* available for some events */ + CLOCKID_MAP("realtime", CLOCK_REALTIME), + CLOCKID_MAP("boottime", CLOCK_BOOTTIME), + CLOCKID_MAP("tai", CLOCK_TAI), + + /* available for the lazy */ + CLOCKID_MAP("mono", CLOCK_MONOTONIC), + CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW), + CLOCKID_MAP("real", CLOCK_REALTIME), + CLOCKID_MAP("boot", CLOCK_BOOTTIME), + + CLOCKID_END, +}; + +static int parse_clockid(const struct option *opt, const char *str, int unset) +{ + struct record_opts *opts = (struct record_opts *)opt->value; + const struct clockid_map *cm; + const char *ostr = str; + + if (unset) { + opts->use_clockid = 0; + return 0; + } + + /* no arg passed */ + if (!str) + return 0; + + /* no setting it twice */ + if (opts->use_clockid) + return -1; + + opts->use_clockid = true; + + /* if its a number, we're done */ + if (sscanf(str, "%d", &opts->clockid) == 1) + return 0; + + /* allow a "CLOCK_" prefix to the name */ + if (!strncasecmp(str, "CLOCK_", 6)) + str += 6; + + for (cm = clockids; cm->name; cm++) { + if (!strcasecmp(str, cm->name)) { + opts->clockid = cm->clockid; + return 0; + } + } + + opts->use_clockid = false; + ui__warning("unknown clockid %s, check man page\n", ostr); + return -1; +} + static const char * const __record_usage[] = { "perf record [] []", "perf record [] -- []", @@ -751,9 +836,9 @@ static struct record record = { #define CALLCHAIN_HELP "setup and enables call-graph (stack chain/backtrace) recording: " #ifdef HAVE_DWARF_UNWIND_SUPPORT -const char record_callchain_help[] = CALLCHAIN_HELP "fp dwarf"; +const char record_callchain_help[] = CALLCHAIN_HELP "fp dwarf lbr"; #else -const 
char record_callchain_help[] = CALLCHAIN_HELP "fp"; +const char record_callchain_help[] = CALLCHAIN_HELP "fp lbr"; #endif /* @@ -839,6 +924,11 @@ struct option __record_options[] = { "use per-thread mmaps"), OPT_BOOLEAN('I', "intr-regs", &record.opts.sample_intr_regs, "Sample machine registers on interrupt"), + OPT_BOOLEAN(0, "running-time", &record.opts.running_time, + "Record running/enabled time of read (:S) events"), + OPT_CALLBACK('k', "clockid", &record.opts, + "clockid", "clockid to use for events, see clock_gettime()", + parse_clockid), OPT_END() }; diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 2f91094e228b..476cdf7afcca 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -249,6 +249,8 @@ static int report__setup_sample_type(struct report *rep) if ((sample_type & PERF_SAMPLE_REGS_USER) && (sample_type & PERF_SAMPLE_STACK_USER)) callchain_param.record_mode = CALLCHAIN_DWARF; + else if (sample_type & PERF_SAMPLE_BRANCH_STACK) + callchain_param.record_mode = CALLCHAIN_LBR; else callchain_param.record_mode = CALLCHAIN_FP; } @@ -302,7 +304,7 @@ static size_t hists__fprintf_nr_sample_events(struct hists *hists, struct report if (rep->mem_mode) { ret += fprintf(fp, "\n# Total weight : %" PRIu64, nr_events); - ret += fprintf(fp, "\n# Sort order : %s", sort_order); + ret += fprintf(fp, "\n# Sort order : %s", sort_order ? : default_mem_sort_order); } else ret += fprintf(fp, "\n# Event count (approx.): %" PRIu64, nr_events); return ret + fprintf(fp, "\n#\n"); @@ -345,7 +347,7 @@ static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist, static void report__warn_kptr_restrict(const struct report *rep) { struct map *kernel_map = rep->session->machines.host.vmlinux_maps[MAP__FUNCTION]; - struct kmap *kernel_kmap = map__kmap(kernel_map); + struct kmap *kernel_kmap = kernel_map ? 
map__kmap(kernel_map) : NULL; if (kernel_map == NULL || (kernel_map->dso->hit && @@ -480,7 +482,7 @@ static int __cmd_report(struct report *rep) if (ret) return ret; - ret = perf_session__process_events(session, &rep->tool); + ret = perf_session__process_events(session); if (ret) return ret; @@ -667,6 +669,10 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) "only consider symbols in these dsos"), OPT_STRING('c', "comms", &symbol_conf.comm_list_str, "comm[,comm...]", "only consider symbols in these comms"), + OPT_STRING(0, "pid", &symbol_conf.pid_list_str, "pid[,pid...]", + "only consider symbols in these pids"), + OPT_STRING(0, "tid", &symbol_conf.tid_list_str, "tid[,tid...]", + "only consider symbols in these tids"), OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]", "only consider these symbols"), OPT_STRING(0, "symbol-filter", &report.symbol_filter_str, "filter", @@ -674,7 +680,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) OPT_STRING('w', "column-widths", &symbol_conf.col_width_list_str, "width[,width...]", "don't try to adjust column width, use these fixed values"), - OPT_STRING('t', "field-separator", &symbol_conf.field_sep, "separator", + OPT_STRING_NOEMPTY('t', "field-separator", &symbol_conf.field_sep, "separator", "separator for columns, no spaces will be added between " "columns '.' is reserved."), OPT_BOOLEAN('U', "hide-unresolved", &report.hide_unresolved, @@ -766,7 +772,7 @@ repeat: * 0/1 means the user chose a mode. 
*/ if (((branch_mode == -1 && has_br_stack) || branch_mode == 1) && - branch_call_mode == -1) { + !branch_call_mode) { sort__mode = SORT_MODE__BRANCH; symbol_conf.cumulate_callchain = false; } diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 891c3930080e..5275bab70313 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -23,12 +23,13 @@ #include #include #include +#include #define PR_SET_NAME 15 /* Set process name */ #define MAX_CPUS 4096 #define COMM_LEN 20 #define SYM_LEN 129 -#define MAX_PID 65536 +#define MAX_PID 1024000 struct sched_atom; @@ -124,7 +125,7 @@ struct perf_sched { struct perf_tool tool; const char *sort_order; unsigned long nr_tasks; - struct task_desc *pid_to_task[MAX_PID]; + struct task_desc **pid_to_task; struct task_desc **tasks; const struct trace_sched_handler *tp_handler; pthread_mutex_t start_work_mutex; @@ -169,6 +170,7 @@ struct perf_sched { u64 cpu_last_switched[MAX_CPUS]; struct rb_root atom_root, sorted_atom_root; struct list_head sort_list, cmp_pid; + bool force; }; static u64 get_nsecs(void) @@ -326,8 +328,19 @@ static struct task_desc *register_pid(struct perf_sched *sched, unsigned long pid, const char *comm) { struct task_desc *task; + static int pid_max; - BUG_ON(pid >= MAX_PID); + if (sched->pid_to_task == NULL) { + if (sysctl__read_int("kernel/pid_max", &pid_max) < 0) + pid_max = MAX_PID; + BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL); + } + if (pid >= (unsigned long)pid_max) { + BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + 1) * + sizeof(struct task_desc *))) == NULL); + while (pid >= (unsigned long)pid_max) + sched->pid_to_task[pid_max++] = NULL; + } task = sched->pid_to_task[pid]; @@ -346,7 +359,7 @@ static struct task_desc *register_pid(struct perf_sched *sched, sched->pid_to_task[pid] = task; sched->nr_tasks++; - sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_task *)); + sched->tasks = 
realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *)); BUG_ON(!sched->tasks); sched->tasks[task->nr] = task; @@ -425,24 +438,45 @@ static u64 get_cpu_usage_nsec_parent(void) return sum; } -static int self_open_counters(void) +static int self_open_counters(struct perf_sched *sched, unsigned long cur_task) { struct perf_event_attr attr; - char sbuf[STRERR_BUFSIZE]; + char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE]; int fd; + struct rlimit limit; + bool need_privilege = false; memset(&attr, 0, sizeof(attr)); attr.type = PERF_TYPE_SOFTWARE; attr.config = PERF_COUNT_SW_TASK_CLOCK; +force_again: fd = sys_perf_event_open(&attr, 0, -1, -1, perf_event_open_cloexec_flag()); - if (fd < 0) + if (fd < 0) { + if (errno == EMFILE) { + if (sched->force) { + BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1); + limit.rlim_cur += sched->nr_tasks - cur_task; + if (limit.rlim_cur > limit.rlim_max) { + limit.rlim_max = limit.rlim_cur; + need_privilege = true; + } + if (setrlimit(RLIMIT_NOFILE, &limit) == -1) { + if (need_privilege && errno == EPERM) + strcpy(info, "Need privilege\n"); + } else + goto force_again; + } else + strcpy(info, "Have a try with -f option\n"); + } pr_err("Error: sys_perf_event_open() syscall returned " - "with %d (%s)\n", fd, - strerror_r(errno, sbuf, sizeof(sbuf))); + "with %d (%s)\n%s", fd, + strerror_r(errno, sbuf, sizeof(sbuf)), info); + exit(EXIT_FAILURE); + } return fd; } @@ -460,6 +494,7 @@ static u64 get_cpu_usage_nsec_self(int fd) struct sched_thread_parms { struct task_desc *task; struct perf_sched *sched; + int fd; }; static void *thread_func(void *ctx) @@ -470,13 +505,12 @@ static void *thread_func(void *ctx) u64 cpu_usage_0, cpu_usage_1; unsigned long i, ret; char comm2[22]; - int fd; + int fd = parms->fd; zfree(&parms); sprintf(comm2, ":%s", this_task->comm); prctl(PR_SET_NAME, comm2); - fd = self_open_counters(); if (fd < 0) return NULL; again: @@ -528,6 +562,7 @@ static void create_tasks(struct perf_sched *sched) BUG_ON(parms == NULL); 
parms->task = task = sched->tasks[i]; parms->sched = sched; + parms->fd = self_open_counters(sched, i); sem_init(&task->sleep_sem, 0, 0); sem_init(&task->ready_for_work, 0, 0); sem_init(&task->work_done_sem, 0, 0); @@ -572,13 +607,13 @@ static void wait_for_tasks(struct perf_sched *sched) cpu_usage_1 = get_cpu_usage_nsec_parent(); if (!sched->runavg_cpu_usage) sched->runavg_cpu_usage = sched->cpu_usage; - sched->runavg_cpu_usage = (sched->runavg_cpu_usage * 9 + sched->cpu_usage) / 10; + sched->runavg_cpu_usage = (sched->runavg_cpu_usage * (sched->replay_repeat - 1) + sched->cpu_usage) / sched->replay_repeat; sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0; if (!sched->runavg_parent_cpu_usage) sched->runavg_parent_cpu_usage = sched->parent_cpu_usage; - sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * 9 + - sched->parent_cpu_usage)/10; + sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) + + sched->parent_cpu_usage)/sched->replay_repeat; ret = pthread_mutex_lock(&sched->start_work_mutex); BUG_ON(ret); @@ -610,7 +645,7 @@ static void run_one_test(struct perf_sched *sched) sched->sum_fluct += fluct; if (!sched->run_avg) sched->run_avg = delta; - sched->run_avg = (sched->run_avg * 9 + delta) / 10; + sched->run_avg = (sched->run_avg * (sched->replay_repeat - 1) + delta) / sched->replay_repeat; printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / 1000000.0); @@ -831,7 +866,7 @@ static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread) return -1; } - atoms->thread = thread; + atoms->thread = thread__get(thread); INIT_LIST_HEAD(&atoms->work_list); __thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid); return 0; @@ -1439,8 +1474,7 @@ static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_ return err; } -static int perf_sched__read_events(struct perf_sched *sched, - struct perf_session **psession) +static int perf_sched__read_events(struct 
perf_sched *sched) { const struct perf_evsel_str_handler handlers[] = { { "sched:sched_switch", process_sched_switch_event, }, @@ -1453,7 +1487,9 @@ static int perf_sched__read_events(struct perf_sched *sched, struct perf_data_file file = { .path = input_name, .mode = PERF_DATA_MODE_READ, + .force = sched->force, }; + int rc = -1; session = perf_session__new(&file, false, &sched->tool); if (session == NULL) { @@ -1467,27 +1503,21 @@ static int perf_sched__read_events(struct perf_sched *sched, goto out_delete; if (perf_session__has_traces(session, "record -R")) { - int err = perf_session__process_events(session, &sched->tool); + int err = perf_session__process_events(session); if (err) { pr_err("Failed to process events, error %d", err); goto out_delete; } - sched->nr_events = session->stats.nr_events[0]; - sched->nr_lost_events = session->stats.total_lost; - sched->nr_lost_chunks = session->stats.nr_events[PERF_RECORD_LOST]; + sched->nr_events = session->evlist->stats.nr_events[0]; + sched->nr_lost_events = session->evlist->stats.total_lost; + sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST]; } - if (psession) - *psession = session; - else - perf_session__delete(session); - - return 0; - + rc = 0; out_delete: perf_session__delete(session); - return -1; + return rc; } static void print_bad_events(struct perf_sched *sched) @@ -1515,12 +1545,10 @@ static void print_bad_events(struct perf_sched *sched) static int perf_sched__lat(struct perf_sched *sched) { struct rb_node *next; - struct perf_session *session; setup_pager(); - /* save session -- references to threads are held in work_list */ - if (perf_sched__read_events(sched, &session)) + if (perf_sched__read_events(sched)) return -1; perf_sched__sort_lat(sched); @@ -1537,6 +1565,7 @@ static int perf_sched__lat(struct perf_sched *sched) work_list = rb_entry(next, struct work_atoms, node); output_lat_thread(sched, work_list); next = rb_next(next); + thread__zput(work_list->thread); } printf(" 
-----------------------------------------------------------------------------------------------------------------\n"); @@ -1548,7 +1577,6 @@ static int perf_sched__lat(struct perf_sched *sched) print_bad_events(sched); printf("\n"); - perf_session__delete(session); return 0; } @@ -1557,7 +1585,7 @@ static int perf_sched__map(struct perf_sched *sched) sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF); setup_pager(); - if (perf_sched__read_events(sched, NULL)) + if (perf_sched__read_events(sched)) return -1; print_bad_events(sched); return 0; @@ -1572,7 +1600,7 @@ static int perf_sched__replay(struct perf_sched *sched) test_calibrations(sched); - if (perf_sched__read_events(sched, NULL)) + if (perf_sched__read_events(sched)) return -1; printf("nr_run_events: %ld\n", sched->nr_run_events); @@ -1693,6 +1721,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused) "be more verbose (show symbol address, etc)"), OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"), + OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"), OPT_END() }; const struct option sched_options[] = { diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index ce304dfd962a..58f10b8e6ff2 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -446,9 +446,9 @@ static void print_sample_bts(union perf_event *event, } static void process_event(union perf_event *event, struct perf_sample *sample, - struct perf_evsel *evsel, struct thread *thread, - struct addr_location *al) + struct perf_evsel *evsel, struct addr_location *al) { + struct thread *thread = al->thread; struct perf_event_attr *attr = &evsel->attr; if (output[attr->type].fields == 0) @@ -549,14 +549,6 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused, struct machine *machine) { struct addr_location al; - struct thread *thread = machine__findnew_thread(machine, sample->pid, - sample->tid); - - if (thread == NULL) { - 
pr_debug("problem processing %d event, skipping it.\n", - event->header.type); - return -1; - } if (debug_mode) { if (sample->time < last_timestamp) { @@ -581,7 +573,7 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused, if (cpu_list && !test_bit(sample->cpu, cpu_bitmap)) return 0; - scripting_ops->process_event(event, sample, evsel, thread, &al); + scripting_ops->process_event(event, sample, evsel, &al); return 0; } @@ -800,7 +792,7 @@ static int __cmd_script(struct perf_script *script) script->tool.mmap2 = process_mmap2_event; } - ret = perf_session__process_events(script->session, &script->tool); + ret = perf_session__process_events(script->session); if (debug_mode) pr_err("Misordered timestamps: %" PRIu64 "\n", nr_unordered); @@ -1523,6 +1515,9 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) .ordering_requires_timestamps = true, }, }; + struct perf_data_file file = { + .mode = PERF_DATA_MODE_READ, + }; const struct option options[] = { OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"), @@ -1550,7 +1545,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) "When printing symbols do not display call chain"), OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory", "Look for files with symbols relative to this directory"), - OPT_CALLBACK('f', "fields", NULL, "str", + OPT_CALLBACK('F', "fields", NULL, "str", "comma separated output fields prepend with 'type:'. " "Valid types: hw,sw,trace,raw. 
" "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso," @@ -1562,6 +1557,10 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"), OPT_STRING('c', "comms", &symbol_conf.comm_list_str, "comm[,comm...]", "only display events for these comms"), + OPT_STRING(0, "pid", &symbol_conf.pid_list_str, "pid[,pid...]", + "only consider symbols in these pids"), + OPT_STRING(0, "tid", &symbol_conf.tid_list_str, "tid[,tid...]", + "only consider symbols in these tids"), OPT_BOOLEAN('I', "show-info", &show_full_info, "display extended information from perf.data file"), OPT_BOOLEAN('\0', "show-kernel-path", &symbol_conf.show_kernel_path, @@ -1570,9 +1569,11 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) "Show the fork/comm/exit events"), OPT_BOOLEAN('\0', "show-mmap-events", &script.show_mmap_events, "Show the mmap events"), + OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"), OPT_END() }; - const char * const script_usage[] = { + const char * const script_subcommands[] = { "record", "report", NULL }; + const char *script_usage[] = { "perf script []", "perf script [] record