c6d3cd32fd
When CONFIG_FUNCTION_GRAPH_TRACER is selected and the function graph tracer is in use, unwind_frame() may erroneously associate a traced function with an incorrect return address. This can happen when starting an unwind from a pt_regs, or when unwinding across an exception boundary. This can be seen when recording with perf while the function graph tracer is in use. For example: | # echo function_graph > /sys/kernel/debug/tracing/current_tracer | # perf record -g -e raw_syscalls:sys_enter:k /bin/true | # perf report ... reports the callchain erroneously as: | el0t_64_sync | el0t_64_sync_handler | el0_svc_common.constprop.0 | perf_callchain | get_perf_callchain | syscall_trace_enter | syscall_trace_enter ... whereas when the function graph tracer is not in use, it reports: | el0t_64_sync | el0t_64_sync_handler | el0_svc | do_el0_svc | el0_svc_common.constprop.0 | syscall_trace_enter | syscall_trace_enter The underlying problem is that ftrace_graph_get_ret_stack() takes an index offset from the most recent entry added to the fgraph return stack. We start an unwind at offset 0, and increment the offset each time we encounter a rewritten return address (i.e. when we see `return_to_handler`). This is broken in two cases: 1) Between creating a pt_regs and starting the unwind, function calls may place entries on the stack, leaving an arbitrary offset which we can only determine by performing a full unwind from the caller of the unwind code (and relying on none of the unwind code being instrumented). This can result in erroneous entries being reported in a backtrace recorded by perf or kfence when the function graph tracer is in use. Currently show_regs() is unaffected as dump_backtrace() performs an initial unwind. 2) When unwinding across an exception boundary (whether continuing an unwind or starting a new unwind from regs), we currently always skip the LR of the interrupted context. Where this was live and contained a rewritten address, we won't consume the corresponding fgraph ret stack entry, leaving subsequent entries off-by-one. This can result in erroneous entries being reported in a backtrace performed by any in-kernel unwinder when that backtrace crosses an exception boundary, with entries after the boundary being reported incorrectly. This includes perf, kfence, show_regs(), panic(), etc. To fix this, we need to be able to uniquely identify each rewritten return address such that we can map this back to the original return address. We can use HAVE_FUNCTION_GRAPH_RET_ADDR_PTR to associate each rewritten return address with a unique location on the stack. As the return address is passed in the LR (and so is not guaranteed a unique location in memory), we use the FP upon entry to the function (i.e. the address of the caller's frame record) as the return address pointer. Any nested call will have a different FP value as the caller must create its own frame record and update FP to point to this. Since ftrace_graph_ret_addr() requires the return address with the PAC stripped, the stripping of the PAC is moved before the fixup of the rewritten address. As we would unconditionally strip the PAC, moving this earlier is not harmful, and we can avoid a redundant strip in the return address fixup code. I've tested this with the perf case above, the ftrace selftests, and a number of ad-hoc unwinder tests. The tests all pass, and I have seen no unexpected behaviour as a result of this change. I've tested with pointer authentication under QEMU TCG where magic-sysrq+l correctly recovers the original return addresses. Note that this doesn't fix the issue of skipping a live LR at an exception boundary, which is a more general problem and requires more substantial rework. Were we to consume the LR in all cases this would result in warnings where the interrupted context's LR contains `return_to_handler`, but the FP has been altered, e.g. | func: | <--- ftrace entry ---> // logs FP & LR, rewrites LR | STP FP, LR, [SP, #-16]! | MOV FP, SP | <--- INTERRUPT ---> ... as ftrace_graph_get_ret_stack() fill not find a matching entry, triggering the WARN_ON_ONCE() in unwind_frame(). Link: https://lore.kernel.org/r/20211025164925.GB2001@C02TD0UTHF1T.local Link: https://lore.kernel.org/r/20211027132529.30027-1-mark.rutland@arm.com Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Madhavan T. Venkataraman <madvenka@linux.microsoft.com> Cc: Mark Brown <broonie@kernel.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Will Deacon <will@kernel.org> Reviewed-by: Mark Brown <broonie@kernel.org> Link: https://lore.kernel.org/r/20211029162245.39761-1-mark.rutland@arm.com Signed-off-by: Will Deacon <will@kernel.org>
246 lines
6.0 KiB
C
246 lines
6.0 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* Stack tracing support
|
|
*
|
|
* Copyright (C) 2012 ARM Ltd.
|
|
*/
|
|
#include <linux/kernel.h>
|
|
#include <linux/export.h>
|
|
#include <linux/ftrace.h>
|
|
#include <linux/kprobes.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/sched/debug.h>
|
|
#include <linux/sched/task_stack.h>
|
|
#include <linux/stacktrace.h>
|
|
|
|
#include <asm/irq.h>
|
|
#include <asm/pointer_auth.h>
|
|
#include <asm/stack_pointer.h>
|
|
#include <asm/stacktrace.h>
|
|
|
|
/*
|
|
* AArch64 PCS assigns the frame pointer to x29.
|
|
*
|
|
* A simple function prologue looks like this:
|
|
* sub sp, sp, #0x10
|
|
* stp x29, x30, [sp]
|
|
* mov x29, sp
|
|
*
|
|
* A simple function epilogue looks like this:
|
|
* mov sp, x29
|
|
* ldp x29, x30, [sp]
|
|
* add sp, sp, #0x10
|
|
*/
|
|
|
|
|
|
void start_backtrace(struct stackframe *frame, unsigned long fp,
|
|
unsigned long pc)
|
|
{
|
|
frame->fp = fp;
|
|
frame->pc = pc;
|
|
#ifdef CONFIG_KRETPROBES
|
|
frame->kr_cur = NULL;
|
|
#endif
|
|
|
|
/*
|
|
* Prime the first unwind.
|
|
*
|
|
* In unwind_frame() we'll check that the FP points to a valid stack,
|
|
* which can't be STACK_TYPE_UNKNOWN, and the first unwind will be
|
|
* treated as a transition to whichever stack that happens to be. The
|
|
* prev_fp value won't be used, but we set it to 0 such that it is
|
|
* definitely not an accessible stack address.
|
|
*/
|
|
bitmap_zero(frame->stacks_done, __NR_STACK_TYPES);
|
|
frame->prev_fp = 0;
|
|
frame->prev_type = STACK_TYPE_UNKNOWN;
|
|
}
|
|
|
|
/*
|
|
* Unwind from one frame record (A) to the next frame record (B).
|
|
*
|
|
* We terminate early if the location of B indicates a malformed chain of frame
|
|
* records (e.g. a cycle), determined based on the location and fp value of A
|
|
* and the location (but not the fp value) of B.
|
|
*/
|
|
int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame)
|
|
{
|
|
unsigned long fp = frame->fp;
|
|
struct stack_info info;
|
|
|
|
if (!tsk)
|
|
tsk = current;
|
|
|
|
/* Final frame; nothing to unwind */
|
|
if (fp == (unsigned long)task_pt_regs(tsk)->stackframe)
|
|
return -ENOENT;
|
|
|
|
if (fp & 0x7)
|
|
return -EINVAL;
|
|
|
|
if (!on_accessible_stack(tsk, fp, 16, &info))
|
|
return -EINVAL;
|
|
|
|
if (test_bit(info.type, frame->stacks_done))
|
|
return -EINVAL;
|
|
|
|
/*
|
|
* As stacks grow downward, any valid record on the same stack must be
|
|
* at a strictly higher address than the prior record.
|
|
*
|
|
* Stacks can nest in several valid orders, e.g.
|
|
*
|
|
* TASK -> IRQ -> OVERFLOW -> SDEI_NORMAL
|
|
* TASK -> SDEI_NORMAL -> SDEI_CRITICAL -> OVERFLOW
|
|
*
|
|
* ... but the nesting itself is strict. Once we transition from one
|
|
* stack to another, it's never valid to unwind back to that first
|
|
* stack.
|
|
*/
|
|
if (info.type == frame->prev_type) {
|
|
if (fp <= frame->prev_fp)
|
|
return -EINVAL;
|
|
} else {
|
|
set_bit(frame->prev_type, frame->stacks_done);
|
|
}
|
|
|
|
/*
|
|
* Record this frame record's values and location. The prev_fp and
|
|
* prev_type are only meaningful to the next unwind_frame() invocation.
|
|
*/
|
|
frame->fp = READ_ONCE_NOCHECK(*(unsigned long *)(fp));
|
|
frame->pc = READ_ONCE_NOCHECK(*(unsigned long *)(fp + 8));
|
|
frame->prev_fp = fp;
|
|
frame->prev_type = info.type;
|
|
|
|
frame->pc = ptrauth_strip_insn_pac(frame->pc);
|
|
|
|
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
|
|
if (tsk->ret_stack &&
|
|
(frame->pc == (unsigned long)return_to_handler)) {
|
|
unsigned long orig_pc;
|
|
/*
|
|
* This is a case where function graph tracer has
|
|
* modified a return address (LR) in a stack frame
|
|
* to hook a function return.
|
|
* So replace it to an original value.
|
|
*/
|
|
orig_pc = ftrace_graph_ret_addr(tsk, NULL, frame->pc,
|
|
(void *)frame->fp);
|
|
if (WARN_ON_ONCE(frame->pc == orig_pc))
|
|
return -EINVAL;
|
|
frame->pc = orig_pc;
|
|
}
|
|
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
|
|
#ifdef CONFIG_KRETPROBES
|
|
if (is_kretprobe_trampoline(frame->pc))
|
|
frame->pc = kretprobe_find_ret_addr(tsk, (void *)frame->fp, &frame->kr_cur);
|
|
#endif
|
|
|
|
return 0;
|
|
}
|
|
NOKPROBE_SYMBOL(unwind_frame);
|
|
|
|
void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
|
|
bool (*fn)(void *, unsigned long), void *data)
|
|
{
|
|
while (1) {
|
|
int ret;
|
|
|
|
if (!fn(data, frame->pc))
|
|
break;
|
|
ret = unwind_frame(tsk, frame);
|
|
if (ret < 0)
|
|
break;
|
|
}
|
|
}
|
|
NOKPROBE_SYMBOL(walk_stackframe);
|
|
|
|
static void dump_backtrace_entry(unsigned long where, const char *loglvl)
|
|
{
|
|
printk("%s %pSb\n", loglvl, (void *)where);
|
|
}
|
|
|
|
void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk,
|
|
const char *loglvl)
|
|
{
|
|
struct stackframe frame;
|
|
int skip = 0;
|
|
|
|
pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk);
|
|
|
|
if (regs) {
|
|
if (user_mode(regs))
|
|
return;
|
|
skip = 1;
|
|
}
|
|
|
|
if (!tsk)
|
|
tsk = current;
|
|
|
|
if (!try_get_task_stack(tsk))
|
|
return;
|
|
|
|
if (tsk == current) {
|
|
start_backtrace(&frame,
|
|
(unsigned long)__builtin_frame_address(0),
|
|
(unsigned long)dump_backtrace);
|
|
} else {
|
|
/*
|
|
* task blocked in __switch_to
|
|
*/
|
|
start_backtrace(&frame,
|
|
thread_saved_fp(tsk),
|
|
thread_saved_pc(tsk));
|
|
}
|
|
|
|
printk("%sCall trace:\n", loglvl);
|
|
do {
|
|
/* skip until specified stack frame */
|
|
if (!skip) {
|
|
dump_backtrace_entry(frame.pc, loglvl);
|
|
} else if (frame.fp == regs->regs[29]) {
|
|
skip = 0;
|
|
/*
|
|
* Mostly, this is the case where this function is
|
|
* called in panic/abort. As exception handler's
|
|
* stack frame does not contain the corresponding pc
|
|
* at which an exception has taken place, use regs->pc
|
|
* instead.
|
|
*/
|
|
dump_backtrace_entry(regs->pc, loglvl);
|
|
}
|
|
} while (!unwind_frame(tsk, &frame));
|
|
|
|
put_task_stack(tsk);
|
|
}
|
|
|
|
void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl)
|
|
{
|
|
dump_backtrace(NULL, tsk, loglvl);
|
|
barrier();
|
|
}
|
|
|
|
#ifdef CONFIG_STACKTRACE
|
|
|
|
noinline notrace void arch_stack_walk(stack_trace_consume_fn consume_entry,
|
|
void *cookie, struct task_struct *task,
|
|
struct pt_regs *regs)
|
|
{
|
|
struct stackframe frame;
|
|
|
|
if (regs)
|
|
start_backtrace(&frame, regs->regs[29], regs->pc);
|
|
else if (task == current)
|
|
start_backtrace(&frame,
|
|
(unsigned long)__builtin_frame_address(1),
|
|
(unsigned long)__builtin_return_address(0));
|
|
else
|
|
start_backtrace(&frame, thread_saved_fp(task),
|
|
thread_saved_pc(task));
|
|
|
|
walk_stackframe(task, &frame, consume_entry, cookie);
|
|
}
|
|
|
|
#endif
|