5d8213864a
To address the Intel SKL RSB underflow issue in software it's required to do call depth tracking. Provide a return thunk for call depth tracking on Intel SKL CPUs. The tracking does not use a counter. It uses arithmetic shift right on call entry and logical shift left on return. The depth tracking variable is initialized to 0x8000.... when the call depth is zero. The arithmetic shift right sign-extends the MSB and saturates after the 12th call. The shift count is 5 so the tracking covers 12 nested calls. On return the variable is shifted left logically so it becomes zero again.

         CALL                  RET
     0:  0x8000000000000000    0x0000000000000000
     1:  0xfc00000000000000    0xf000000000000000
    ...
    11:  0xfffffffffffffff8    0xfffffffffffffc00
    12:  0xffffffffffffffff    0xffffffffffffffe0

After a return buffer fill the depth is credited 12 calls before the next stuffing has to take place. There is an inaccuracy for situations like this: 10 calls 5 returns 3 calls 4 returns 3 calls .... The shift count might cause this to be off by one in either direction, but there is still a cushion vs. the RSB depth. The algorithm does not claim to be perfect, but it should obfuscate the problem enough to make exploitation extremely difficult. The theory behind this is: RSB is a stack with depth 16 which is filled on every call. On the return path speculation "pops" entries to speculate down the call chain. Once the speculative RSB is empty it switches to other predictors, e.g. the Branch History Buffer, which can be mistrained by user space and misguide the speculation path to a gadget. Call depth tracking is designed to break this speculation path by stuffing speculation trap calls into the RSB which are never getting a corresponding return executed. This stalls the prediction path until it gets resteered. The assumption is that stuffing at the 12th return is sufficient to break the speculation before it hits the underflow and the fallback to the other predictors. Testing confirms that it works.
Johannes, one of the retbleed researchers, tried to attack this approach but failed. There is obviously no scientific proof that this will withstand future research progress, but all we can do right now is to speculate about it. The SAR/SHL usage was suggested by Andi Kleen. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20220915111147.890071690@infradead.org
49 lines
916 B
C
49 lines
916 B
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _ASM_X86_CURRENT_H
|
|
#define _ASM_X86_CURRENT_H
|
|
|
|
#include <linux/compiler.h>
|
|
|
|
#ifndef __ASSEMBLY__
|
|
|
|
#include <linux/cache.h>
|
|
#include <asm/percpu.h>
|
|
|
|
/* Forward declaration: this header only uses pointers to struct task_struct. */
struct task_struct;
|
|
|
|
struct pcpu_hot {
|
|
union {
|
|
struct {
|
|
struct task_struct *current_task;
|
|
int preempt_count;
|
|
int cpu_number;
|
|
#ifdef CONFIG_CALL_DEPTH_TRACKING
|
|
u64 call_depth;
|
|
#endif
|
|
unsigned long top_of_stack;
|
|
void *hardirq_stack_ptr;
|
|
u16 softirq_pending;
|
|
#ifdef CONFIG_X86_64
|
|
bool hardirq_stack_inuse;
|
|
#else
|
|
void *softirq_stack_ptr;
|
|
#endif
|
|
};
|
|
u8 pad[64];
|
|
};
|
|
};
|
|
/* Build-time check: struct pcpu_hot must occupy exactly 64 bytes. */
static_assert(sizeof(struct pcpu_hot) == 64);
|
|
|
|
/*
 * Declare the per-CPU variable `pcpu_hot` of type struct pcpu_hot.
 * NOTE(review): the _ALIGNED variant presumably requests cache-line
 * alignment - confirm against the macro's definition.
 */
DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot);
|
|
|
|
static __always_inline struct task_struct *get_current(void)
|
|
{
|
|
return this_cpu_read_stable(pcpu_hot.current_task);
|
|
}
|
|
|
|
/* Shorthand: `current` expands to a call to get_current(). */
#define current get_current()
|
|
|
|
#endif /* __ASSEMBLY__ */
|
|
|
|
#endif /* _ASM_X86_CURRENT_H */
|