baaf553d3b
This patch enables support for DYNAMIC_FTRACE_WITH_CALL_OPS on arm64. This allows each ftrace callsite to provide an ftrace_ops to the common ftrace trampoline, allowing each callsite to invoke distinct tracer functions without the need to fall back to list processing or to allocate custom trampolines for each callsite. This significantly speeds up cases where multiple distinct trace functions are used and callsites are mostly traced by a single tracer. The main idea is to place a pointer to the ftrace_ops as a literal at a fixed offset from the function entry point, which can be recovered by the common ftrace trampoline. Using a 64-bit literal avoids branch range limitations, and permits the ops to be swapped atomically without special considerations that apply to code-patching. In future this will also allow for the implementation of DYNAMIC_FTRACE_WITH_DIRECT_CALLS without branch range limitations by using additional fields in struct ftrace_ops. As noted in the core patch adding support for DYNAMIC_FTRACE_WITH_CALL_OPS, this approach allows for directly invoking ftrace_ops::func even for ftrace_ops which are dynamically-allocated (or part of a module), without going via ftrace_ops_list_func. Currently, this approach is not compatible with CLANG_CFI, as the presence/absence of pre-function NOPs changes the offset of the pre-function type hash, and there's no existing mechanism to ensure a consistent offset for instrumented and uninstrumented functions. When CLANG_CFI is enabled, the existing scheme with a global ops->func pointer is used, and there should be no functional change. I am currently working with others to allow the two to work together in future (though this will liekly require updated compiler support). I've benchamrked this with the ftrace_ops sample module [1], which is not currently upstream, but available at: https://lore.kernel.org/lkml/20230103124912.2948963-1-mark.rutland@arm.com git://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git ftrace-ops-sample-20230109 Using that module I measured the total time taken for 100,000 calls to a trivial instrumented function, with a number of tracers enabled with relevant filters (which would apply to the instrumented function) and a number of tracers enabled with irrelevant filters (which would not apply to the instrumented function). I tested on an M1 MacBook Pro, running under a HVF-accelerated QEMU VM (i.e. on real hardware). Before this patch: Number of tracers || Total time | Per-call average time (ns) Relevant | Irrelevant || (ns) | Total | Overhead =========+============++=============+==============+============ 0 | 0 || 94,583 | 0.95 | - 0 | 1 || 93,709 | 0.94 | - 0 | 2 || 93,666 | 0.94 | - 0 | 10 || 93,709 | 0.94 | - 0 | 100 || 93,792 | 0.94 | - ---------+------------++-------------+--------------+------------ 1 | 1 || 6,467,833 | 64.68 | 63.73 1 | 2 || 7,509,708 | 75.10 | 74.15 1 | 10 || 23,786,792 | 237.87 | 236.92 1 | 100 || 106,432,500 | 1,064.43 | 1063.38 ---------+------------++-------------+--------------+------------ 1 | 0 || 1,431,875 | 14.32 | 13.37 2 | 0 || 6,456,334 | 64.56 | 63.62 10 | 0 || 22,717,000 | 227.17 | 226.22 100 | 0 || 103,293,667 | 1032.94 | 1031.99 ---------+------------++-------------+--------------+-------------- Note: per-call overhead is estimated relative to the baseline case with 0 relevant tracers and 0 irrelevant tracers. After this patch Number of tracers || Total time | Per-call average time (ns) Relevant | Irrelevant || (ns) | Total | Overhead =========+============++=============+==============+============ 0 | 0 || 94,541 | 0.95 | - 0 | 1 || 93,666 | 0.94 | - 0 | 2 || 93,709 | 0.94 | - 0 | 10 || 93,667 | 0.94 | - 0 | 100 || 93,792 | 0.94 | - ---------+------------++-------------+--------------+------------ 1 | 1 || 281,000 | 2.81 | 1.86 1 | 2 || 281,042 | 2.81 | 1.87 1 | 10 || 280,958 | 2.81 | 1.86 1 | 100 || 281,250 | 2.81 | 1.87 ---------+------------++-------------+--------------+------------ 1 | 0 || 280,959 | 2.81 | 1.86 2 | 0 || 6,502,708 | 65.03 | 64.08 10 | 0 || 18,681,209 | 186.81 | 185.87 100 | 0 || 103,550,458 | 1,035.50 | 1034.56 ---------+------------++-------------+--------------+------------ Note: per-call overhead is estimated relative to the baseline case with 0 relevant tracers and 0 irrelevant tracers. As can be seen from the above: a) Whenever there is a single relevant tracer function associated with a tracee, the overhead of invoking the tracer is constant, and does not scale with the number of tracers which are *not* associated with that tracee. b) The overhead for a single relevant tracer has dropped to ~1/7 of the overhead prior to this series (from 13.37ns to 1.86ns). This is largely due to permitting calls to dynamically-allocated ftrace_ops without going through ftrace_ops_list_func. I've run the ftrace selftests from v6.2-rc3, which reports: | # of passed: 110 | # of failed: 0 | # of unresolved: 3 | # of untested: 0 | # of unsupported: 0 | # of xfailed: 1 | # of undefined(test bug): 0 ... where the unresolved entries were the tests for DIRECT functions (which are not supported), and the checkbashisms selftest (which is irrelevant here): | [8] Test ftrace direct functions against tracers [UNRESOLVED] | [9] Test ftrace direct functions against kprobes [UNRESOLVED] | [62] Meta-selftest: Checkbashisms [UNRESOLVED] ... with all other tests passing (or failing as expected). Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Florent Revest <revest@chromium.org> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Will Deacon <will@kernel.org> Link: https://lore.kernel.org/r/20230123134603.1064407-9-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
203 lines
9.4 KiB
C
203 lines
9.4 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* Based on arch/arm/kernel/asm-offsets.c
|
|
*
|
|
* Copyright (C) 1995-2003 Russell King
|
|
* 2001-2002 Keith Owens
|
|
* Copyright (C) 2012 ARM Ltd.
|
|
*/
|
|
|
|
#include <linux/arm_sdei.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/ftrace.h>
|
|
#include <linux/kexec.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/dma-mapping.h>
|
|
#include <linux/kvm_host.h>
|
|
#include <linux/preempt.h>
|
|
#include <linux/suspend.h>
|
|
#include <asm/cpufeature.h>
|
|
#include <asm/fixmap.h>
|
|
#include <asm/thread_info.h>
|
|
#include <asm/memory.h>
|
|
#include <asm/signal32.h>
|
|
#include <asm/smp_plat.h>
|
|
#include <asm/suspend.h>
|
|
#include <linux/kbuild.h>
|
|
#include <linux/arm-smccc.h>
|
|
|
|
int main(void)
|
|
{
|
|
DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm));
|
|
BLANK();
|
|
DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu));
|
|
DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags));
|
|
DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count));
|
|
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
|
|
DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0));
|
|
#endif
|
|
#ifdef CONFIG_SHADOW_CALL_STACK
|
|
DEFINE(TSK_TI_SCS_BASE, offsetof(struct task_struct, thread_info.scs_base));
|
|
DEFINE(TSK_TI_SCS_SP, offsetof(struct task_struct, thread_info.scs_sp));
|
|
#endif
|
|
DEFINE(TSK_STACK, offsetof(struct task_struct, stack));
|
|
#ifdef CONFIG_STACKPROTECTOR
|
|
DEFINE(TSK_STACK_CANARY, offsetof(struct task_struct, stack_canary));
|
|
#endif
|
|
BLANK();
|
|
DEFINE(THREAD_CPU_CONTEXT, offsetof(struct task_struct, thread.cpu_context));
|
|
DEFINE(THREAD_SCTLR_USER, offsetof(struct task_struct, thread.sctlr_user));
|
|
#ifdef CONFIG_ARM64_PTR_AUTH
|
|
DEFINE(THREAD_KEYS_USER, offsetof(struct task_struct, thread.keys_user));
|
|
#endif
|
|
#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL
|
|
DEFINE(THREAD_KEYS_KERNEL, offsetof(struct task_struct, thread.keys_kernel));
|
|
#endif
|
|
#ifdef CONFIG_ARM64_MTE
|
|
DEFINE(THREAD_MTE_CTRL, offsetof(struct task_struct, thread.mte_ctrl));
|
|
#endif
|
|
BLANK();
|
|
DEFINE(S_X0, offsetof(struct pt_regs, regs[0]));
|
|
DEFINE(S_X2, offsetof(struct pt_regs, regs[2]));
|
|
DEFINE(S_X4, offsetof(struct pt_regs, regs[4]));
|
|
DEFINE(S_X6, offsetof(struct pt_regs, regs[6]));
|
|
DEFINE(S_X8, offsetof(struct pt_regs, regs[8]));
|
|
DEFINE(S_X10, offsetof(struct pt_regs, regs[10]));
|
|
DEFINE(S_X12, offsetof(struct pt_regs, regs[12]));
|
|
DEFINE(S_X14, offsetof(struct pt_regs, regs[14]));
|
|
DEFINE(S_X16, offsetof(struct pt_regs, regs[16]));
|
|
DEFINE(S_X18, offsetof(struct pt_regs, regs[18]));
|
|
DEFINE(S_X20, offsetof(struct pt_regs, regs[20]));
|
|
DEFINE(S_X22, offsetof(struct pt_regs, regs[22]));
|
|
DEFINE(S_X24, offsetof(struct pt_regs, regs[24]));
|
|
DEFINE(S_X26, offsetof(struct pt_regs, regs[26]));
|
|
DEFINE(S_X28, offsetof(struct pt_regs, regs[28]));
|
|
DEFINE(S_FP, offsetof(struct pt_regs, regs[29]));
|
|
DEFINE(S_LR, offsetof(struct pt_regs, regs[30]));
|
|
DEFINE(S_SP, offsetof(struct pt_regs, sp));
|
|
DEFINE(S_PSTATE, offsetof(struct pt_regs, pstate));
|
|
DEFINE(S_PC, offsetof(struct pt_regs, pc));
|
|
DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno));
|
|
DEFINE(S_SDEI_TTBR1, offsetof(struct pt_regs, sdei_ttbr1));
|
|
DEFINE(S_PMR_SAVE, offsetof(struct pt_regs, pmr_save));
|
|
DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe));
|
|
DEFINE(PT_REGS_SIZE, sizeof(struct pt_regs));
|
|
BLANK();
|
|
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
|
|
DEFINE(FREGS_X0, offsetof(struct ftrace_regs, regs[0]));
|
|
DEFINE(FREGS_X2, offsetof(struct ftrace_regs, regs[2]));
|
|
DEFINE(FREGS_X4, offsetof(struct ftrace_regs, regs[4]));
|
|
DEFINE(FREGS_X6, offsetof(struct ftrace_regs, regs[6]));
|
|
DEFINE(FREGS_X8, offsetof(struct ftrace_regs, regs[8]));
|
|
DEFINE(FREGS_FP, offsetof(struct ftrace_regs, fp));
|
|
DEFINE(FREGS_LR, offsetof(struct ftrace_regs, lr));
|
|
DEFINE(FREGS_SP, offsetof(struct ftrace_regs, sp));
|
|
DEFINE(FREGS_PC, offsetof(struct ftrace_regs, pc));
|
|
DEFINE(FREGS_SIZE, sizeof(struct ftrace_regs));
|
|
BLANK();
|
|
#endif
|
|
#ifdef CONFIG_COMPAT
|
|
DEFINE(COMPAT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_sigframe, uc.uc_mcontext.arm_r0));
|
|
DEFINE(COMPAT_RT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_rt_sigframe, sig.uc.uc_mcontext.arm_r0));
|
|
BLANK();
|
|
#endif
|
|
DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter));
|
|
BLANK();
|
|
DEFINE(VMA_VM_MM, offsetof(struct vm_area_struct, vm_mm));
|
|
DEFINE(VMA_VM_FLAGS, offsetof(struct vm_area_struct, vm_flags));
|
|
BLANK();
|
|
DEFINE(VM_EXEC, VM_EXEC);
|
|
BLANK();
|
|
DEFINE(PAGE_SZ, PAGE_SIZE);
|
|
BLANK();
|
|
DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE);
|
|
DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE);
|
|
BLANK();
|
|
DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET);
|
|
DEFINE(SOFTIRQ_SHIFT, SOFTIRQ_SHIFT);
|
|
DEFINE(IRQ_CPUSTAT_SOFTIRQ_PENDING, offsetof(irq_cpustat_t, __softirq_pending));
|
|
BLANK();
|
|
DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task));
|
|
BLANK();
|
|
DEFINE(FTR_OVR_VAL_OFFSET, offsetof(struct arm64_ftr_override, val));
|
|
DEFINE(FTR_OVR_MASK_OFFSET, offsetof(struct arm64_ftr_override, mask));
|
|
BLANK();
|
|
#ifdef CONFIG_KVM
|
|
DEFINE(VCPU_CONTEXT, offsetof(struct kvm_vcpu, arch.ctxt));
|
|
DEFINE(VCPU_FAULT_DISR, offsetof(struct kvm_vcpu, arch.fault.disr_el1));
|
|
DEFINE(VCPU_HCR_EL2, offsetof(struct kvm_vcpu, arch.hcr_el2));
|
|
DEFINE(CPU_USER_PT_REGS, offsetof(struct kvm_cpu_context, regs));
|
|
DEFINE(CPU_RGSR_EL1, offsetof(struct kvm_cpu_context, sys_regs[RGSR_EL1]));
|
|
DEFINE(CPU_GCR_EL1, offsetof(struct kvm_cpu_context, sys_regs[GCR_EL1]));
|
|
DEFINE(CPU_APIAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APIAKEYLO_EL1]));
|
|
DEFINE(CPU_APIBKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APIBKEYLO_EL1]));
|
|
DEFINE(CPU_APDAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APDAKEYLO_EL1]));
|
|
DEFINE(CPU_APDBKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APDBKEYLO_EL1]));
|
|
DEFINE(CPU_APGAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APGAKEYLO_EL1]));
|
|
DEFINE(HOST_CONTEXT_VCPU, offsetof(struct kvm_cpu_context, __hyp_running_vcpu));
|
|
DEFINE(HOST_DATA_CONTEXT, offsetof(struct kvm_host_data, host_ctxt));
|
|
DEFINE(NVHE_INIT_MAIR_EL2, offsetof(struct kvm_nvhe_init_params, mair_el2));
|
|
DEFINE(NVHE_INIT_TCR_EL2, offsetof(struct kvm_nvhe_init_params, tcr_el2));
|
|
DEFINE(NVHE_INIT_TPIDR_EL2, offsetof(struct kvm_nvhe_init_params, tpidr_el2));
|
|
DEFINE(NVHE_INIT_STACK_HYP_VA, offsetof(struct kvm_nvhe_init_params, stack_hyp_va));
|
|
DEFINE(NVHE_INIT_PGD_PA, offsetof(struct kvm_nvhe_init_params, pgd_pa));
|
|
DEFINE(NVHE_INIT_HCR_EL2, offsetof(struct kvm_nvhe_init_params, hcr_el2));
|
|
DEFINE(NVHE_INIT_VTTBR, offsetof(struct kvm_nvhe_init_params, vttbr));
|
|
DEFINE(NVHE_INIT_VTCR, offsetof(struct kvm_nvhe_init_params, vtcr));
|
|
#endif
|
|
#ifdef CONFIG_CPU_PM
|
|
DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp));
|
|
DEFINE(MPIDR_HASH_MASK, offsetof(struct mpidr_hash, mask));
|
|
DEFINE(MPIDR_HASH_SHIFTS, offsetof(struct mpidr_hash, shift_aff));
|
|
DEFINE(SLEEP_STACK_DATA_SYSTEM_REGS, offsetof(struct sleep_stack_data, system_regs));
|
|
DEFINE(SLEEP_STACK_DATA_CALLEE_REGS, offsetof(struct sleep_stack_data, callee_saved_regs));
|
|
#endif
|
|
DEFINE(ARM_SMCCC_RES_X0_OFFS, offsetof(struct arm_smccc_res, a0));
|
|
DEFINE(ARM_SMCCC_RES_X2_OFFS, offsetof(struct arm_smccc_res, a2));
|
|
DEFINE(ARM_SMCCC_QUIRK_ID_OFFS, offsetof(struct arm_smccc_quirk, id));
|
|
DEFINE(ARM_SMCCC_QUIRK_STATE_OFFS, offsetof(struct arm_smccc_quirk, state));
|
|
DEFINE(ARM_SMCCC_1_2_REGS_X0_OFFS, offsetof(struct arm_smccc_1_2_regs, a0));
|
|
DEFINE(ARM_SMCCC_1_2_REGS_X2_OFFS, offsetof(struct arm_smccc_1_2_regs, a2));
|
|
DEFINE(ARM_SMCCC_1_2_REGS_X4_OFFS, offsetof(struct arm_smccc_1_2_regs, a4));
|
|
DEFINE(ARM_SMCCC_1_2_REGS_X6_OFFS, offsetof(struct arm_smccc_1_2_regs, a6));
|
|
DEFINE(ARM_SMCCC_1_2_REGS_X8_OFFS, offsetof(struct arm_smccc_1_2_regs, a8));
|
|
DEFINE(ARM_SMCCC_1_2_REGS_X10_OFFS, offsetof(struct arm_smccc_1_2_regs, a10));
|
|
DEFINE(ARM_SMCCC_1_2_REGS_X12_OFFS, offsetof(struct arm_smccc_1_2_regs, a12));
|
|
DEFINE(ARM_SMCCC_1_2_REGS_X14_OFFS, offsetof(struct arm_smccc_1_2_regs, a14));
|
|
DEFINE(ARM_SMCCC_1_2_REGS_X16_OFFS, offsetof(struct arm_smccc_1_2_regs, a16));
|
|
BLANK();
|
|
DEFINE(HIBERN_PBE_ORIG, offsetof(struct pbe, orig_address));
|
|
DEFINE(HIBERN_PBE_ADDR, offsetof(struct pbe, address));
|
|
DEFINE(HIBERN_PBE_NEXT, offsetof(struct pbe, next));
|
|
DEFINE(ARM64_FTR_SYSVAL, offsetof(struct arm64_ftr_reg, sys_val));
|
|
BLANK();
|
|
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
|
|
DEFINE(TRAMP_VALIAS, TRAMP_VALIAS);
|
|
#endif
|
|
#ifdef CONFIG_ARM_SDE_INTERFACE
|
|
DEFINE(SDEI_EVENT_INTREGS, offsetof(struct sdei_registered_event, interrupted_regs));
|
|
DEFINE(SDEI_EVENT_PRIORITY, offsetof(struct sdei_registered_event, priority));
|
|
#endif
|
|
#ifdef CONFIG_ARM64_PTR_AUTH
|
|
DEFINE(PTRAUTH_USER_KEY_APIA, offsetof(struct ptrauth_keys_user, apia));
|
|
#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL
|
|
DEFINE(PTRAUTH_KERNEL_KEY_APIA, offsetof(struct ptrauth_keys_kernel, apia));
|
|
#endif
|
|
BLANK();
|
|
#endif
|
|
#ifdef CONFIG_KEXEC_CORE
|
|
DEFINE(KIMAGE_ARCH_DTB_MEM, offsetof(struct kimage, arch.dtb_mem));
|
|
DEFINE(KIMAGE_ARCH_EL2_VECTORS, offsetof(struct kimage, arch.el2_vectors));
|
|
DEFINE(KIMAGE_ARCH_ZERO_PAGE, offsetof(struct kimage, arch.zero_page));
|
|
DEFINE(KIMAGE_ARCH_PHYS_OFFSET, offsetof(struct kimage, arch.phys_offset));
|
|
DEFINE(KIMAGE_ARCH_TTBR1, offsetof(struct kimage, arch.ttbr1));
|
|
DEFINE(KIMAGE_HEAD, offsetof(struct kimage, head));
|
|
DEFINE(KIMAGE_START, offsetof(struct kimage, start));
|
|
BLANK();
|
|
#endif
|
|
#ifdef CONFIG_FUNCTION_TRACER
|
|
DEFINE(FTRACE_OPS_FUNC, offsetof(struct ftrace_ops, func));
|
|
#endif
|
|
return 0;
|
|
}
|