94a855111e
been long in the making. It is a lighterweight software-only fix for Skylake-based cores where enabling IBRS is a big hammer and causes a significant performance impact. What it basically does is, it aligns all kernel functions to 16 bytes boundary and adds a 16-byte padding before the function, objtool collects all functions' locations and when the mitigation gets applied, it patches a call accounting thunk which is used to track the call depth of the stack at any time. When that call depth reaches a magical, microarchitecture-specific value for the Return Stack Buffer, the code stuffs that RSB and avoids its underflow which could otherwise lead to the Intel variant of Retbleed. This software-only solution brings a lot of the lost performance back, as benchmarks suggest: https://lore.kernel.org/all/20220915111039.092790446@infradead.org/ That page above also contains a lot more detailed explanation of the whole mechanism - Implement a new control flow integrity scheme called FineIBT which is based on the software kCFI implementation and uses hardware IBT support where present to annotate and track indirect branches using a hash to validate them - Other misc fixes and cleanups -----BEGIN PGP SIGNATURE----- iQIzBAABCgAdFiEEzv7L6UO9uDPlPSfHEsHwGGHeVUoFAmOZp5EACgkQEsHwGGHe VUrZFxAAvi/+8L0IYSK4mKJvixGbTFjxN/Swo2JVOfs34LqGUT6JaBc+VUMwZxdb VMTFIZ3ttkKEodjhxGI7oGev6V8UfhI37SmO2lYKXpQVjXXnMlv/M+Vw3teE38CN gopi+xtGnT1IeWQ3tc/Tv18pleJ0mh5HKWiW+9KoqgXj0wgF9x4eRYDz1TDCDA/A iaBzs56j8m/FSykZHnrWZ/MvjKNPdGlfJASUCPeTM2dcrXQGJ93+X2hJctzDte0y Nuiw6Y0htfFBE7xoJn+sqm5Okr+McoUM18/CCprbgSKYk18iMYm3ZtAi6FUQZS1A ua4wQCf49loGp15PO61AS5d3OBf5D3q/WihQRbCaJvTVgPp9sWYnWwtcVUuhMllh ZQtBU9REcVJ/22bH09Q9CjBW0VpKpXHveqQdqRDViLJ6v/iI6EFGmD24SW/VxyRd 73k9MBGrL/dOf1SbEzdsnvcSB3LGzp0Om8o/KzJWOomrVKjBCJy16bwTEsCZEJmP i406m92GPXeaN1GhTko7vmF0GnkEdJs1GVCZPluCAxxbhHukyxHnrjlQjI4vC80n Ylc0B3Kvitw7LGJsPqu+/jfNHADC/zhx1qz/30wb5cFmFbN1aRdp3pm8JYUkn+l/ zri2Y6+O89gvE/9/xUhMohzHsWUO7xITiBavewKeTP9GSWybWUs= =cRy1 
-----END PGP SIGNATURE----- Merge tag 'x86_core_for_v6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip Pull x86 core updates from Borislav Petkov: - Add the call depth tracking mitigation for Retbleed which has been long in the making. It is a lighterweight software-only fix for Skylake-based cores where enabling IBRS is a big hammer and causes a significant performance impact. What it basically does is, it aligns all kernel functions to 16 bytes boundary and adds a 16-byte padding before the function, objtool collects all functions' locations and when the mitigation gets applied, it patches a call accounting thunk which is used to track the call depth of the stack at any time. When that call depth reaches a magical, microarchitecture-specific value for the Return Stack Buffer, the code stuffs that RSB and avoids its underflow which could otherwise lead to the Intel variant of Retbleed. This software-only solution brings a lot of the lost performance back, as benchmarks suggest: https://lore.kernel.org/all/20220915111039.092790446@infradead.org/ That page above also contains a lot more detailed explanation of the whole mechanism - Implement a new control flow integrity scheme called FineIBT which is based on the software kCFI implementation and uses hardware IBT support where present to annotate and track indirect branches using a hash to validate them - Other misc fixes and cleanups * tag 'x86_core_for_v6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (80 commits) x86/paravirt: Use common macro for creating simple asm paravirt functions x86/paravirt: Remove clobber bitmask from .parainstructions x86/debug: Include percpu.h in debugreg.h to get DECLARE_PER_CPU() et al x86/cpufeatures: Move X86_FEATURE_CALL_DEPTH from bit 18 to bit 19 of word 11, to leave space for WIP X86_FEATURE_SGX_EDECCSSA bit x86/Kconfig: Enable kernel IBT by default x86,pm: Force out-of-line memcpy() objtool: Fix weak hole vs prefix symbol objtool: Optimize 
elf_dirty_reloc_sym() x86/cfi: Add boot time hash randomization x86/cfi: Boot time selection of CFI scheme x86/ibt: Implement FineIBT objtool: Add --cfi to generate the .cfi_sites section x86: Add prefix symbols for function padding objtool: Add option to generate prefix symbols objtool: Avoid O(bloody terrible) behaviour -- an ode to libelf objtool: Slice up elf_create_section_symbol() kallsyms: Revert "Take callthunks into account" x86: Unconfuse CONFIG_ and X86_FEATURE_ namespaces x86/retpoline: Fix crash printing warning x86/paravirt: Fix a !PARAVIRT build warning ...
360 lines
8.8 KiB
C
360 lines
8.8 KiB
C
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
/* Kernel module help for x86.
|
|
Copyright (C) 2001 Rusty Russell.
|
|
|
|
*/
|
|
|
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
|
|
|
#include <linux/moduleloader.h>
|
|
#include <linux/elf.h>
|
|
#include <linux/vmalloc.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/string.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/kasan.h>
|
|
#include <linux/bug.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/gfp.h>
|
|
#include <linux/jump_label.h>
|
|
#include <linux/random.h>
|
|
#include <linux/memory.h>
|
|
|
|
#include <asm/text-patching.h>
|
|
#include <asm/page.h>
|
|
#include <asm/setup.h>
|
|
#include <asm/unwind.h>
|
|
|
|
/*
 * DEBUGP() - tracing helper used by the relocation code in this file.
 *
 * Change the "#if 0" below to "#if 1" to have the messages passed to
 * printk() at KERN_DEBUG level.  In the default form the printk() call
 * sits behind a constant-false condition: it is still seen by the
 * compiler, but it is never executed.
 */
#if 0
#define DEBUGP(fmt, ...) printk(KERN_DEBUG fmt, ##__VA_ARGS__)
#else
#define DEBUGP(fmt, ...)					\
do {								\
	if (0)							\
		printk(KERN_DEBUG fmt, ##__VA_ARGS__);		\
} while (0)
#endif
|
|
|
|
#ifdef CONFIG_RANDOMIZE_BASE
/* Offset added to MODULES_VADDR by module_alloc(); 0 means "not chosen yet". */
static unsigned long module_load_offset;

/* Serializes the one-time selection of module_load_offset. */
static DEFINE_MUTEX(module_kaslr_mutex);

/*
 * Return the offset, in bytes, at which module allocations start.
 *
 * When kaslr_enabled() reports true, the offset is picked on the first
 * call as a random number of pages in the range [1, 1024] and is then
 * reused unchanged by every later call.  When it reports false, the
 * stored value is returned without being modified.
 */
static unsigned long int get_module_load_offset(void)
{
	if (!kaslr_enabled())
		return module_load_offset;

	mutex_lock(&module_kaslr_mutex);
	/* Only the first caller to get here picks the random offset. */
	if (!module_load_offset)
		module_load_offset =
			get_random_u32_inclusive(1, 1024) * PAGE_SIZE;
	mutex_unlock(&module_kaslr_mutex);

	return module_load_offset;
}
#else
/* Without CONFIG_RANDOMIZE_BASE the module area is never shifted. */
static unsigned long int get_module_load_offset(void)
{
	return 0;
}
#endif
|
|
|
|
void *module_alloc(unsigned long size)
|
|
{
|
|
gfp_t gfp_mask = GFP_KERNEL;
|
|
void *p;
|
|
|
|
if (PAGE_ALIGN(size) > MODULES_LEN)
|
|
return NULL;
|
|
|
|
p = __vmalloc_node_range(size, MODULE_ALIGN,
|
|
MODULES_VADDR + get_module_load_offset(),
|
|
MODULES_END, gfp_mask, PAGE_KERNEL,
|
|
VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK,
|
|
NUMA_NO_NODE, __builtin_return_address(0));
|
|
|
|
if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
|
|
vfree(p);
|
|
return NULL;
|
|
}
|
|
|
|
return p;
|
|
}
|
|
|
|
#ifdef CONFIG_X86_32
|
|
/*
 * Apply the Elf32_Rel entries of section @relsec to the section selected
 * by that section's sh_info field.
 *
 * Handles R_386_32 (add the symbol value) and R_386_PC32 / R_386_PLT32
 * (add the symbol value minus the address being patched).
 *
 * Returns 0 on success, or -ENOEXEC on the first entry with any other
 * relocation type.
 */
int apply_relocate(Elf32_Shdr *sechdrs,
		   const char *strtab,
		   unsigned int symindex,
		   unsigned int relsec,
		   struct module *me)
{
	const Elf32_Shdr *relhdr = &sechdrs[relsec];
	Elf32_Rel *relocs = (void *)relhdr->sh_addr;
	unsigned int i;

	DEBUGP("Applying relocate section %u to %u\n",
	       relsec, relhdr->sh_info);

	for (i = 0; i < relhdr->sh_size / sizeof(*relocs); i++) {
		const Elf32_Rel *r = &relocs[i];
		/* Word inside the target section that gets patched. */
		uint32_t *location =
			(void *)sechdrs[relhdr->sh_info].sh_addr + r->r_offset;
		/*
		 * Symbol the entry refers to.  Note that all undefined
		 * symbols have been resolved.
		 */
		Elf32_Sym *sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
				 + ELF32_R_SYM(r->r_info);

		switch (ELF32_R_TYPE(r->r_info)) {
		case R_386_32:
			/* Absolute: add the symbol value to the stored word. */
			*location += sym->st_value;
			break;
		case R_386_PC32:
		case R_386_PLT32:
			/* Relative: symbol value minus the patched address. */
			*location += sym->st_value - (uint32_t)location;
			break;
		default:
			pr_err("%s: Unknown relocation: %u\n",
			       me->name, ELF32_R_TYPE(r->r_info));
			return -ENOEXEC;
		}
	}

	return 0;
}
|
|
#else /*X86_64*/
|
|
static int __apply_relocate_add(Elf64_Shdr *sechdrs,
|
|
const char *strtab,
|
|
unsigned int symindex,
|
|
unsigned int relsec,
|
|
struct module *me,
|
|
void *(*write)(void *dest, const void *src, size_t len))
|
|
{
|
|
unsigned int i;
|
|
Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
|
|
Elf64_Sym *sym;
|
|
void *loc;
|
|
u64 val;
|
|
|
|
DEBUGP("Applying relocate section %u to %u\n",
|
|
relsec, sechdrs[relsec].sh_info);
|
|
for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
|
|
/* This is where to make the change */
|
|
loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
|
|
+ rel[i].r_offset;
|
|
|
|
/* This is the symbol it is referring to. Note that all
|
|
undefined symbols have been resolved. */
|
|
sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
|
|
+ ELF64_R_SYM(rel[i].r_info);
|
|
|
|
DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
|
|
(int)ELF64_R_TYPE(rel[i].r_info),
|
|
sym->st_value, rel[i].r_addend, (u64)loc);
|
|
|
|
val = sym->st_value + rel[i].r_addend;
|
|
|
|
switch (ELF64_R_TYPE(rel[i].r_info)) {
|
|
case R_X86_64_NONE:
|
|
break;
|
|
case R_X86_64_64:
|
|
if (*(u64 *)loc != 0)
|
|
goto invalid_relocation;
|
|
write(loc, &val, 8);
|
|
break;
|
|
case R_X86_64_32:
|
|
if (*(u32 *)loc != 0)
|
|
goto invalid_relocation;
|
|
write(loc, &val, 4);
|
|
if (val != *(u32 *)loc)
|
|
goto overflow;
|
|
break;
|
|
case R_X86_64_32S:
|
|
if (*(s32 *)loc != 0)
|
|
goto invalid_relocation;
|
|
write(loc, &val, 4);
|
|
if ((s64)val != *(s32 *)loc)
|
|
goto overflow;
|
|
break;
|
|
case R_X86_64_PC32:
|
|
case R_X86_64_PLT32:
|
|
if (*(u32 *)loc != 0)
|
|
goto invalid_relocation;
|
|
val -= (u64)loc;
|
|
write(loc, &val, 4);
|
|
#if 0
|
|
if ((s64)val != *(s32 *)loc)
|
|
goto overflow;
|
|
#endif
|
|
break;
|
|
case R_X86_64_PC64:
|
|
if (*(u64 *)loc != 0)
|
|
goto invalid_relocation;
|
|
val -= (u64)loc;
|
|
write(loc, &val, 8);
|
|
break;
|
|
default:
|
|
pr_err("%s: Unknown rela relocation: %llu\n",
|
|
me->name, ELF64_R_TYPE(rel[i].r_info));
|
|
return -ENOEXEC;
|
|
}
|
|
}
|
|
return 0;
|
|
|
|
invalid_relocation:
|
|
pr_err("x86/modules: Skipping invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n",
|
|
(int)ELF64_R_TYPE(rel[i].r_info), loc, val);
|
|
return -ENOEXEC;
|
|
|
|
overflow:
|
|
pr_err("overflow in relocation type %d val %Lx\n",
|
|
(int)ELF64_R_TYPE(rel[i].r_info), val);
|
|
pr_err("`%s' likely not compiled with -mcmodel=kernel\n",
|
|
me->name);
|
|
return -ENOEXEC;
|
|
}
|
|
|
|
/*
 * Apply the RELA section @relsec of module @me.
 *
 * While the module is still in MODULE_STATE_UNFORMED the relocations are
 * stored with plain memcpy().  In any other state they are stored with
 * text_poke() while text_mutex is held, and text_poke_sync() is called
 * before the mutex is dropped.
 *
 * Returns whatever __apply_relocate_add() returns.
 */
int apply_relocate_add(Elf64_Shdr *sechdrs,
		   const char *strtab,
		   unsigned int symindex,
		   unsigned int relsec,
		   struct module *me)
{
	bool early = me->state == MODULE_STATE_UNFORMED;
	int ret;

	if (early)
		return __apply_relocate_add(sechdrs, strtab, symindex, relsec,
					    me, memcpy);

	mutex_lock(&text_mutex);
	ret = __apply_relocate_add(sechdrs, strtab, symindex, relsec, me,
				   text_poke);
	text_poke_sync();
	mutex_unlock(&text_mutex);

	return ret;
}
|
|
|
|
#endif
|
|
|
|
int module_finalize(const Elf_Ehdr *hdr,
|
|
const Elf_Shdr *sechdrs,
|
|
struct module *me)
|
|
{
|
|
const Elf_Shdr *s, *alt = NULL, *locks = NULL,
|
|
*para = NULL, *orc = NULL, *orc_ip = NULL,
|
|
*retpolines = NULL, *returns = NULL, *ibt_endbr = NULL,
|
|
*calls = NULL, *cfi = NULL;
|
|
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
|
|
|
|
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
|
|
if (!strcmp(".altinstructions", secstrings + s->sh_name))
|
|
alt = s;
|
|
if (!strcmp(".smp_locks", secstrings + s->sh_name))
|
|
locks = s;
|
|
if (!strcmp(".parainstructions", secstrings + s->sh_name))
|
|
para = s;
|
|
if (!strcmp(".orc_unwind", secstrings + s->sh_name))
|
|
orc = s;
|
|
if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name))
|
|
orc_ip = s;
|
|
if (!strcmp(".retpoline_sites", secstrings + s->sh_name))
|
|
retpolines = s;
|
|
if (!strcmp(".return_sites", secstrings + s->sh_name))
|
|
returns = s;
|
|
if (!strcmp(".call_sites", secstrings + s->sh_name))
|
|
calls = s;
|
|
if (!strcmp(".cfi_sites", secstrings + s->sh_name))
|
|
cfi = s;
|
|
if (!strcmp(".ibt_endbr_seal", secstrings + s->sh_name))
|
|
ibt_endbr = s;
|
|
}
|
|
|
|
/*
|
|
* See alternative_instructions() for the ordering rules between the
|
|
* various patching types.
|
|
*/
|
|
if (para) {
|
|
void *pseg = (void *)para->sh_addr;
|
|
apply_paravirt(pseg, pseg + para->sh_size);
|
|
}
|
|
if (retpolines || cfi) {
|
|
void *rseg = NULL, *cseg = NULL;
|
|
unsigned int rsize = 0, csize = 0;
|
|
|
|
if (retpolines) {
|
|
rseg = (void *)retpolines->sh_addr;
|
|
rsize = retpolines->sh_size;
|
|
}
|
|
|
|
if (cfi) {
|
|
cseg = (void *)cfi->sh_addr;
|
|
csize = cfi->sh_size;
|
|
}
|
|
|
|
apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize);
|
|
}
|
|
if (retpolines) {
|
|
void *rseg = (void *)retpolines->sh_addr;
|
|
apply_retpolines(rseg, rseg + retpolines->sh_size);
|
|
}
|
|
if (returns) {
|
|
void *rseg = (void *)returns->sh_addr;
|
|
apply_returns(rseg, rseg + returns->sh_size);
|
|
}
|
|
if (alt) {
|
|
/* patch .altinstructions */
|
|
void *aseg = (void *)alt->sh_addr;
|
|
apply_alternatives(aseg, aseg + alt->sh_size);
|
|
}
|
|
if (calls || para) {
|
|
struct callthunk_sites cs = {};
|
|
|
|
if (calls) {
|
|
cs.call_start = (void *)calls->sh_addr;
|
|
cs.call_end = (void *)calls->sh_addr + calls->sh_size;
|
|
}
|
|
|
|
if (para) {
|
|
cs.pv_start = (void *)para->sh_addr;
|
|
cs.pv_end = (void *)para->sh_addr + para->sh_size;
|
|
}
|
|
|
|
callthunks_patch_module_calls(&cs, me);
|
|
}
|
|
if (ibt_endbr) {
|
|
void *iseg = (void *)ibt_endbr->sh_addr;
|
|
apply_ibt_endbr(iseg, iseg + ibt_endbr->sh_size);
|
|
}
|
|
if (locks) {
|
|
void *lseg = (void *)locks->sh_addr;
|
|
void *text = me->core_layout.base;
|
|
void *text_end = text + me->core_layout.text_size;
|
|
alternatives_smp_module_add(me, me->name,
|
|
lseg, lseg + locks->sh_size,
|
|
text, text_end);
|
|
}
|
|
|
|
if (orc && orc_ip)
|
|
unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size,
|
|
(void *)orc->sh_addr, orc->sh_size);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Arch hook for module teardown: passes @mod to
 * alternatives_smp_module_del().
 */
void module_arch_cleanup(struct module *mod)
{
	alternatives_smp_module_del(mod);
}
|