From 15675706241887ed7fdad9e91f4bf977b9896d0f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 29 Jan 2024 19:05:06 +0100 Subject: [PATCH 01/22] x86/startup_64: Drop long return to initial_code pointer Since 866b556efa12 ("x86/head/64: Install startup GDT") the primary startup sequence sets the code segment register (CS) to __KERNEL_CS before calling into the startup code shared between primary and secondary boot. This means a simple indirect call is sufficient here. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240129180502.4069817-24-ardb+git@google.com --- arch/x86/kernel/head_64.S | 35 +++-------------------------------- 1 file changed, 3 insertions(+), 32 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d4918d03efb4..bfbac50dbe69 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -428,39 +428,10 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) movq %r15, %rdi .Ljump_to_C_code: - /* - * Jump to run C code and to be on a real kernel address. - * Since we are running on identity-mapped space we have to jump - * to the full 64bit address, this is only possible as indirect - * jump. In addition we need to ensure %cs is set so we make this - * a far return. - * - * Note: do not change to far jump indirect with 64bit offset. - * - * AMD does not support far jump indirect with 64bit offset. - * AMD64 Architecture Programmer's Manual, Volume 3: states only - * JMP FAR mem16:16 FF /5 Far jump indirect, - * with the target specified by a far pointer in memory. - * JMP FAR mem16:32 FF /5 Far jump indirect, - * with the target specified by a far pointer in memory. - * - * Intel64 does support 64bit offset. - * Software Developer Manual Vol 2: states: - * FF /5 JMP m16:16 Jump far, absolute indirect, - * address given in m16:16 - * FF /5 JMP m16:32 Jump far, absolute indirect, - * address given in m16:32. - * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect, - * address given in m16:64. - */ - pushq $.Lafter_lret # put return address on stack for unwinder xorl %ebp, %ebp # clear frame pointer - movq initial_code(%rip), %rax - pushq $__KERNEL_CS # set correct cs - pushq %rax # target address in negative space - lretq -.Lafter_lret: - ANNOTATE_NOENDBR + ANNOTATE_RETPOLINE_SAFE + callq *initial_code(%rip) + ud2 SYM_CODE_END(secondary_startup_64) #include "verify_cpu.S" From 9ba8ec8ee67a00eb5631364e4b716f35559724d4 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 2 Feb 2024 03:51:43 +0000 Subject: [PATCH 02/22] x86/boot: Add error_putdec() helper Add a helper to print decimal numbers to early console. Suggested-by: Borislav Petkov (AMD) Signed-off-by: H. 
Peter Anvin (Intel) Signed-off-by: Jun'ichi Nomura Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/lkml/20240123112624.GBZa-iYP1l9SSYtr-V@fat_crate.local/ Link: https://lore.kernel.org/r/20240202035052.17963-1-junichi.nomura@nec.com --- arch/x86/boot/compressed/misc.c | 37 ++++++++++++++++++++++----------- arch/x86/boot/compressed/misc.h | 2 ++ 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index b99e08e6815b..bf2aac4f195e 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -164,21 +164,34 @@ void __putstr(const char *s) outb(0xff & (pos >> 1), vidport+1); } +static noinline void __putnum(unsigned long value, unsigned int base, + int mindig) +{ + char buf[8*sizeof(value)+1]; + char *p; + + p = buf + sizeof(buf); + *--p = '\0'; + + while (mindig-- > 0 || value) { + unsigned char digit = value % base; + digit += (digit >= 10) ? ('a'-10) : '0'; + *--p = digit; + + value /= base; + } + + __putstr(p); +} + void __puthex(unsigned long value) { - char alpha[2] = "0"; - int bits; + __putnum(value, 16, sizeof(value)*2); +} - for (bits = sizeof(value) * 8 - 4; bits >= 0; bits -= 4) { - unsigned long digit = (value >> bits) & 0xf; - - if (digit < 0xA) - alpha[0] = '0' + digit; - else - alpha[0] = 'a' + (digit - 0xA); - - __putstr(alpha); - } +void __putdec(unsigned long value) +{ + __putnum(value, 10, 1); } #ifdef CONFIG_X86_NEED_RELOCS diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index bc2f0f17fb90..6502bc69d1b8 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -63,8 +63,10 @@ void *malloc(int size); void free(void *where); void __putstr(const char *s); void __puthex(unsigned long value); +void __putdec(unsigned long value); #define error_putstr(__x) __putstr(__x) #define error_puthex(__x) __puthex(__x) +#define error_putdec(__x) __putdec(__x) #ifdef CONFIG_X86_VERBOSE_BOOTUP From ac456ca0af4fe9630cf84e7efd20b7f7bf596aab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?NOMURA=20JUNICHI=28=E9=87=8E=E6=9D=91=E3=80=80=E6=B7=B3?= =?UTF-8?q?=E4=B8=80=29?= Date: Fri, 2 Feb 2024 03:51:58 +0000 Subject: [PATCH 03/22] x86/boot: Add a message about ignored early NMIs Commit 78a509fba9c9 ("x86/boot: Ignore NMIs during very early boot") added an empty handler in early boot stage to avoid boot failure due to spurious NMIs. Add a diagnostic message to show that early NMIs have occurred. [ bp: Touchups. ] [ Committer note: tested by stopping the guest really early and injecting NMIs through qemu's monitor. Result: early console in setup code Spurious early NMIs ignored: 13 ... ] Suggested-by: Borislav Petkov Suggested-by: H. Peter Anvin Signed-off-by: Jun'ichi Nomura Signed-off-by: Borislav Petkov (AMD) Acked-by: Kirill A. 
Shutemov Link: https://lore.kernel.org/lkml/20231130103339.GCZWhlA196uRklTMNF@fat_crate.local --- arch/x86/boot/compressed/ident_map_64.c | 2 +- arch/x86/boot/compressed/misc.c | 7 +++++++ arch/x86/boot/compressed/misc.h | 1 + 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index d040080d7edb..4a029baa5147 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -389,5 +389,5 @@ void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code) void do_boot_nmi_trap(struct pt_regs *regs, unsigned long error_code) { - /* Empty handler to ignore NMI during early boot */ + spurious_nmi_count++; } diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index bf2aac4f195e..bd6857a9f15a 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -52,6 +52,7 @@ struct port_io_ops pio_ops; memptr free_mem_ptr; memptr free_mem_end_ptr; +int spurious_nmi_count; static char *vidmem; static int vidport; @@ -506,6 +507,12 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output) /* Disable exception handling before booting the kernel */ cleanup_exception_handling(); + if (spurious_nmi_count) { + error_putstr("Spurious early NMIs ignored: "); + error_putdec(spurious_nmi_count); + error_putstr("\n"); + } + return output + entry_offset; } diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 6502bc69d1b8..b353a7be380c 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -59,6 +59,7 @@ extern char _head[], _end[]; /* misc.c */ extern memptr free_mem_ptr; extern memptr free_mem_end_ptr; +extern int spurious_nmi_count; void *malloc(int size); void free(void *where); void __putstr(const char *s); From 43b1d3e68ee7f41c494ee5558d8def3d3d0b7f1b Mon Sep 17 00:00:00 2001 From: Chris Koch Date: Fri, 15 Dec 2023 11:05:21 -0800 Subject: [PATCH 04/22] kexec: Allocate kernel above bzImage's pref_address A relocatable kernel will relocate itself to pref_address if it is loaded below pref_address. This means a booted kernel may relocate itself to an area with reserved memory on modern systems, potentially clobbering arbitrary data that may be important to the system. This is often the case, as the default value of PHYSICAL_START is 0x1000000 and kernels are typically loaded at 0x100000 or above by bootloaders like iPXE or kexec. GRUB behaves like the approach implemented here. Also fixes the documentation around pref_address and PHYSICAL_START to be accurate. [ dhansen: changelog tweak ] Co-developed-by: Cloud Hsu Signed-off-by: Cloud Hsu Signed-off-by: Chris Koch Signed-off-by: Dave Hansen Reviewed-by: H. Peter Anvin (Intel) Link: https://lore.kernel.org/all/20231215190521.3796022-1-chrisko%40google.com --- Documentation/arch/x86/boot.rst | 3 ++- arch/x86/Kconfig | 10 +++++----- arch/x86/kernel/kexec-bzimage64.c | 5 ++++- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/Documentation/arch/x86/boot.rst b/Documentation/arch/x86/boot.rst index c513855a54bb..4fd492cb4970 100644 --- a/Documentation/arch/x86/boot.rst +++ b/Documentation/arch/x86/boot.rst @@ -878,7 +878,8 @@ Protocol: 2.10+ address if possible. A non-relocatable kernel will unconditionally move itself and to run - at this address. + at this address. A relocatable kernel will move itself to this address if it is + loaded below this address. 
============ ======= Field name: init_size diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5edec175b9bf..1a33575f98af 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2114,11 +2114,11 @@ config PHYSICAL_START help This gives the physical address where the kernel is loaded. - If kernel is a not relocatable (CONFIG_RELOCATABLE=n) then - bzImage will decompress itself to above physical address and - run from there. Otherwise, bzImage will run from the address where - it has been loaded by the boot loader and will ignore above physical - address. + If the kernel is not relocatable (CONFIG_RELOCATABLE=n) then bzImage + will decompress itself to above physical address and run from there. + Otherwise, bzImage will run from the address where it has been loaded + by the boot loader. The only exception is if it is loaded below the + above physical address, in which case it will relocate itself there. In normal kdump cases one does not have to set/change this option as now bzImage can be compiled as a completely relocatable image diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 2a422e00ed4b..cde167b0ea92 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -503,7 +503,10 @@ static void *bzImage64_load(struct kimage *image, char *kernel, kbuf.bufsz = kernel_len - kern16_size; kbuf.memsz = PAGE_ALIGN(header->init_size); kbuf.buf_align = header->kernel_alignment; - kbuf.buf_min = MIN_KERNEL_LOAD_ADDR; + if (header->pref_address < MIN_KERNEL_LOAD_ADDR) + kbuf.buf_min = MIN_KERNEL_LOAD_ADDR; + else + kbuf.buf_min = header->pref_address; kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; ret = kexec_add_buffer(&kbuf); if (ret) From c2cfc23f79676a9857a5a48911011bd56e23fd46 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 26 Jan 2024 12:01:01 +0200 Subject: [PATCH 05/22] x86/trampoline: Bypass compat mode in trampoline_start64() if not needed The trampoline_start64() vector is used when a secondary CPU starts in 64-bit mode. The current implementation directly enters compatibility mode. It is necessary to disable paging and re-enable it in the correct paging mode: either 4- or 5-level, depending on the configuration. The X86S[1] ISA does not support compatibility mode in ring 0, and paging cannot be disabled. Rework the trampoline_start64() function to only enter compatibility mode if it is necessary to change the paging mode. If the CPU is already in the desired paging mode, proceed in long mode. This allows a secondary CPU to boot on an X86S machine as long as the CPU is already in the correct paging mode. In the future, there will be a mechanism to switch between paging modes without disabling paging. [1] https://www.intel.com/content/www/us/en/developer/articles/technical/envisioning-future-simplified-architecture.html [ dhansen: changelog tweaks ] Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Kai Huang Link: https://lore.kernel.org/all/20240126100101.689090-1-kirill.shutemov%40linux.intel.com --- arch/x86/realmode/rm/trampoline_64.S | 33 +++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index c9f76fae902e..14d9c7daf90f 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -37,13 +37,15 @@ .text .code16 -.macro LOCK_AND_LOAD_REALMODE_ESP lock_pa=0 +.macro LOCK_AND_LOAD_REALMODE_ESP lock_pa=0 lock_rip=0 /* * Make sure only one CPU fiddles with the realmode stack */ .Llock_rm\@: .if \lock_pa lock btsl $0, pa_tr_lock + .elseif \lock_rip + lock btsl $0, tr_lock(%rip) .else lock btsl $0, tr_lock .endif @@ -220,6 +222,35 @@ SYM_CODE_START(trampoline_start64) lidt tr_idt(%rip) lgdt tr_gdt64(%rip) + /* Check if paging mode has to be changed */ + movq %cr4, %rax + xorl tr_cr4(%rip), %eax + testl $X86_CR4_LA57, %eax + jnz .L_switch_paging + + /* Paging mode is correct proceed in 64-bit mode */ + + LOCK_AND_LOAD_REALMODE_ESP lock_rip=1 + + movw $__KERNEL_DS, %dx + movl %edx, %ss + addl $pa_real_mode_base, %esp + movl %edx, %ds + movl %edx, %es + movl %edx, %fs + movl %edx, %gs + + movl $pa_trampoline_pgd, %eax + movq %rax, %cr3 + + pushq $__KERNEL_CS + pushq tr_start(%rip) + lretq +.L_switch_paging: + /* + * To switch between 4- and 5-level paging modes, it is necessary + * to disable paging. This must be done in the compatibility mode. + */ ljmpl *tr_compat(%rip) SYM_CODE_END(trampoline_start64) From 5da793671957e8e99fa74423fab2737bf8c772a8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 21 Feb 2024 12:35:08 +0100 Subject: [PATCH 06/22] x86/boot/64: Simplify global variable accesses in GDT/IDT programming There are two code paths in the startup code to program an IDT: one that runs from the 1:1 mapping and one that runs from the virtual kernel mapping. Currently, these are strictly separate because fixup_pointer() is used on the 1:1 path, which will produce the wrong value when used while executing from the virtual kernel mapping. Switch to RIP_REL_REF() so that the two code paths can be merged. Also, move the GDT and IDT descriptors to the stack so that they can be referenced directly, rather than via RIP_REL_REF(). Rename startup_64_setup_env() to startup_64_setup_gdt_idt() while at it, to make the call from assembler self-documenting. 
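As an illustration of the difference (using a hypothetical variable, not part of this patch): a plain C access to a global may be lowered by the compiler to an absolute address, which is only valid via the kernel virtual mapping, while RIP_REL_REF() forces a RIP-relative access that is correct from both the 1:1 mapping and the kernel virtual mapping:

    extern unsigned int some_flag;

    some_flag = 1;                  /* may compile to an absolute address */
    RIP_REL_REF(some_flag) = 1;     /* always a RIP-relative access */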
Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Ard Biesheuvel Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240221113506.2565718-19-ardb+git@google.com --- arch/x86/include/asm/setup.h | 2 +- arch/x86/kernel/head64.c | 75 +++++++++++++++--------------------- arch/x86/kernel/head_64.S | 4 +- 3 files changed, 32 insertions(+), 49 deletions(-) diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 5c83729c8e71..e61e68d71cba 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -48,7 +48,7 @@ extern unsigned long saved_video_mode; extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp); -extern void startup_64_setup_env(unsigned long physbase); +extern void startup_64_setup_gdt_idt(void); extern void early_setup_idt(void); extern void __init do_early_exception(struct pt_regs *regs, int trapnr); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index dc0956067944..1d6865eafe6a 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -76,15 +77,6 @@ static struct desc_struct startup_gdt[GDT_ENTRIES] __initdata = { [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64, 0, 0xfffff), }; -/* - * Address needs to be set at runtime because it references the startup_gdt - * while the kernel still uses a direct mapping. - */ -static struct desc_ptr startup_gdt_descr __initdata = { - .size = sizeof(startup_gdt)-1, - .address = 0, -}; - static void __head *fixup_pointer(void *ptr, unsigned long physaddr) { return ptr - (void *)_text + (void *)physaddr; @@ -569,62 +561,52 @@ void __init __noreturn x86_64_start_reservations(char *real_mode_data) */ static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data; -static struct desc_ptr bringup_idt_descr = { - .size = (NUM_EXCEPTION_VECTORS * sizeof(gate_desc)) - 1, - .address = 0, /* Set at runtime */ -}; - -static void set_bringup_idt_handler(gate_desc *idt, int n, void *handler) +/* This may run while still in the direct mapping */ +static void __head startup_64_load_idt(void *vc_handler) { -#ifdef CONFIG_AMD_MEM_ENCRYPT + struct desc_ptr desc = { + .address = (unsigned long)&RIP_REL_REF(bringup_idt_table), + .size = sizeof(bringup_idt_table) - 1, + }; struct idt_data data; - gate_desc desc; + gate_desc idt_desc; - init_idt_data(&data, n, handler); - idt_init_desc(&desc, &data); - native_write_idt_entry(idt, n, &desc); -#endif -} - -/* This runs while still in the direct mapping */ -static void __head startup_64_load_idt(unsigned long physbase) -{ - struct desc_ptr *desc = fixup_pointer(&bringup_idt_descr, physbase); - gate_desc *idt = fixup_pointer(bringup_idt_table, physbase); - - - if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { - void *handler; - - /* VMM Communication Exception */ - handler = fixup_pointer(vc_no_ghcb, physbase); - set_bringup_idt_handler(idt, X86_TRAP_VC, handler); + /* @vc_handler is set only for a VMM Communication Exception */ + if (vc_handler) { + init_idt_data(&data, X86_TRAP_VC, vc_handler); + idt_init_desc(&idt_desc, &data); + native_write_idt_entry((gate_desc *)desc.address, X86_TRAP_VC, &idt_desc); } - desc->address = (unsigned long)idt; - native_load_idt(desc); + native_load_idt(&desc); } /* This is used when running on kernel addresses */ void early_setup_idt(void) { - /* VMM Communication Exception */ + 
void *handler = NULL; + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { setup_ghcb(); - set_bringup_idt_handler(bringup_idt_table, X86_TRAP_VC, vc_boot_ghcb); + handler = vc_boot_ghcb; } - bringup_idt_descr.address = (unsigned long)bringup_idt_table; - native_load_idt(&bringup_idt_descr); + startup_64_load_idt(handler); } /* * Setup boot CPU state needed before kernel switches to virtual addresses. */ -void __head startup_64_setup_env(unsigned long physbase) +void __head startup_64_setup_gdt_idt(void) { + void *handler = NULL; + + struct desc_ptr startup_gdt_descr = { + .address = (unsigned long)&RIP_REL_REF(startup_gdt), + .size = sizeof(startup_gdt) - 1, + }; + /* Load GDT */ - startup_gdt_descr.address = (unsigned long)fixup_pointer(startup_gdt, physbase); native_load_gdt(&startup_gdt_descr); /* New GDT is live - reload data segment registers */ @@ -632,5 +614,8 @@ void __head startup_64_setup_env(unsigned long physbase) "movl %%eax, %%ss\n" "movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory"); - startup_64_load_idt(physbase); + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) + handler = &RIP_REL_REF(vc_no_ghcb); + + startup_64_load_idt(handler); } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index bfbac50dbe69..e09cf0b09141 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -68,8 +68,6 @@ SYM_CODE_START_NOALIGN(startup_64) /* Set up the stack for verify_cpu() */ leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp - leaq _text(%rip), %rdi - /* Setup GSBASE to allow stack canary access for C code */ movl $MSR_GS_BASE, %ecx leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx @@ -77,7 +75,7 @@ SYM_CODE_START_NOALIGN(startup_64) shrq $32, %rdx wrmsr - call startup_64_setup_env + call startup_64_setup_gdt_idt /* Now switch to __KERNEL_CS so IRET works reliably */ pushq $__KERNEL_CS From d9ec1158056bedb6da8f4e02de30d300914b31f8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 21 Feb 2024 12:35:09 +0100 Subject: [PATCH 07/22] x86/boot/64: Use RIP_REL_REF() to assign 'phys_base' 'phys_base' is assigned from code that executes from a 1:1 mapping so it cannot use a plain access from C. Replace the use of fixup_pointer() with RIP_REL_REF(), which is better and simpler. While at it, move the assignment to before the addition of the SME mask so there is no need to subtract it again, and drop the unnecessary addition ('phys_base' is statically initialized to 0x0) Signed-off-by: Ard Biesheuvel Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240221113506.2565718-20-ardb+git@google.com --- arch/x86/kernel/head64.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 1d6865eafe6a..f98f5b6a06b5 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -192,6 +192,7 @@ unsigned long __head __startup_64(unsigned long physaddr, * and the address I am actually running at. */ load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map); + RIP_REL_REF(phys_base) = load_delta; /* Is the address not 2M aligned? */ if (load_delta & ~PMD_MASK) @@ -301,12 +302,6 @@ unsigned long __head __startup_64(unsigned long physaddr, for (; i < PTRS_PER_PMD; i++) pmd[i] &= ~_PAGE_PRESENT; - /* - * Fixup phys_base - remove the memory encryption mask to obtain - * the true physical address. 
- */ - *fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask(); - return sme_postprocess_startup(bp, pmd); } From b0fe5fb6095be0f68b570c4cf4cd86b7e70c2e38 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 21 Feb 2024 12:35:10 +0100 Subject: [PATCH 08/22] x86/boot/64: Use RIP_REL_REF() to access early_dynamic_pgts[] early_dynamic_pgts[] and next_early_pgt are accessed from code that executes from a 1:1 mapping so it cannot use a plain access from C. Replace the use of fixup_pointer() with RIP_REL_REF(), which is better and simpler. Signed-off-by: Ard Biesheuvel Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240221113506.2565718-21-ardb+git@google.com --- arch/x86/kernel/head64.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index f98f5b6a06b5..2ac904110f6a 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -170,6 +170,7 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv unsigned long __head __startup_64(unsigned long physaddr, struct boot_params *bp) { + pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts); unsigned long load_delta, *p; unsigned long pgtable_flags; pgdval_t *pgd; @@ -179,7 +180,6 @@ unsigned long __head __startup_64(unsigned long physaddr, pteval_t *mask_ptr; bool la57; int i; - unsigned int *next_pgt_ptr; la57 = check_la57_support(physaddr); @@ -231,15 +231,14 @@ unsigned long __head __startup_64(unsigned long physaddr, * it avoids problems around wraparound. */ - next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr); - pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); - pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); + pud = &early_pgts[0]->pmd; + pmd = &early_pgts[1]->pmd; + RIP_REL_REF(next_early_pgt) = 2; pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); if (la57) { - p4d = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], - physaddr); + p4d = &early_pgts[RIP_REL_REF(next_early_pgt)++]->pmd; i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; From 4f8b6cf25f5c119b117cb2a4bacb604a6cd00ff1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 21 Feb 2024 12:35:11 +0100 Subject: [PATCH 09/22] x86/boot/64: Use RIP_REL_REF() to access '__supported_pte_mask' '__supported_pte_mask' is accessed from code that executes from a 1:1 mapping so it cannot use a plain access from C. Replace the use of fixup_pointer() with RIP_REL_REF(), which is better and simpler. 
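For context, the fixup_pointer() helper that this series removes performs the same translation by hand; quoting its definition from head64.c as it stands before these changes:

    static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
    {
        return ptr - (void *)_text + (void *)physaddr;
    }

RIP_REL_REF() makes the manual translation unnecessary: a RIP-relative access yields the correct address no matter which of the two mappings the code is executing from.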
Signed-off-by: Ard Biesheuvel Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240221113506.2565718-22-ardb+git@google.com --- arch/x86/kernel/head64.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 2ac904110f6a..e2573ddae32f 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -177,7 +177,6 @@ unsigned long __head __startup_64(unsigned long physaddr, p4dval_t *p4d; pudval_t *pud; pmdval_t *pmd, pmd_entry; - pteval_t *mask_ptr; bool la57; int i; @@ -259,8 +258,7 @@ unsigned long __head __startup_64(unsigned long physaddr, pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; /* Filter out unsupported __PAGE_KERNEL_* bits: */ - mask_ptr = fixup_pointer(&__supported_pte_mask, physaddr); - pmd_entry &= *mask_ptr; + pmd_entry &= RIP_REL_REF(__supported_pte_mask); pmd_entry += sme_get_me_mask(); pmd_entry += physaddr; From eb54c2ae4a4825c42a6a2b4022926bda7448f735 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 21 Feb 2024 12:35:12 +0100 Subject: [PATCH 10/22] x86/boot/64: Use RIP_REL_REF() to access early page tables The early statically allocated page tables are populated from code that executes from a 1:1 mapping so it cannot use plain accesses from C. Replace the use of fixup_pointer() with RIP_REL_REF(), which is better and simpler. Signed-off-by: Ard Biesheuvel Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240221113506.2565718-23-ardb+git@google.com --- arch/x86/kernel/head64.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index e2573ddae32f..7e2c9b581d58 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -215,13 +215,11 @@ unsigned long __head __startup_64(unsigned long physaddr, p4d[511] += load_delta; } - pud = fixup_pointer(level3_kernel_pgt, physaddr); - pud[510] += load_delta; - pud[511] += load_delta; + RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 2].pud += load_delta; + RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 1].pud += load_delta; - pmd = fixup_pointer(level2_fixmap_pgt, physaddr); for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--) - pmd[i] += load_delta; + RIP_REL_REF(level2_fixmap_pgt)[i].pmd += load_delta; /* * Set up the identity mapping for the switchover. These @@ -284,7 +282,7 @@ unsigned long __head __startup_64(unsigned long physaddr, * error, causing the BIOS to halt the system. */ - pmd = fixup_pointer(level2_kernel_pgt, physaddr); + pmd = &RIP_REL_REF(level2_kernel_pgt)->pmd; /* invalidate pages before the kernel image */ for (i = 0; i < pmd_index((unsigned long)_text); i++) From 533568e06b157b175912a960efe5ebce8710b4f9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 21 Feb 2024 12:35:13 +0100 Subject: [PATCH 11/22] x86/boot/64: Use RIP_REL_REF() to access early_top_pgt[] early_top_pgt[] is assigned from code that executes from a 1:1 mapping so it cannot use a plain access from C. Replace the use of fixup_pointer() with RIP_REL_REF(), which is better and simpler. For legibility and to align with the code that populates the lower page table levels, statically initialize the root level page table with an entry pointing to level3_kernel_pgt[], and overwrite it when needed to enable 5-level paging. 
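As a worked example with illustrative numbers: if the kernel is linked to run from physical address 0x1000000 but is actually loaded at 0x3000000, load_delta is 0x2000000. Adding it to the statically initialized root entry, which holds the link-time physical address of level3_kernel_pgt[] plus _PAGE_TABLE_NOENC, produces the correct runtime entry with no conditionals on the 4-level path; only the 5-level case still has to rewrite the entry to point to level4_kernel_pgt[].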
Signed-off-by: Ard Biesheuvel Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240221113506.2565718-24-ardb+git@google.com --- arch/x86/kernel/head64.c | 21 +++++++++------------ arch/x86/kernel/head_64.S | 3 ++- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 7e2c9b581d58..72351c3121a6 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -77,6 +77,7 @@ static struct desc_struct startup_gdt[GDT_ENTRIES] __initdata = { [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64, 0, 0xfffff), }; +#ifdef CONFIG_X86_5LEVEL static void __head *fixup_pointer(void *ptr, unsigned long physaddr) { return ptr - (void *)_text + (void *)physaddr; @@ -87,7 +88,6 @@ static unsigned long __head *fixup_long(void *ptr, unsigned long physaddr) return fixup_pointer(ptr, physaddr); } -#ifdef CONFIG_X86_5LEVEL static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr) { return fixup_pointer(ptr, physaddr); @@ -165,14 +165,14 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv * doesn't have to generate PC-relative relocations when accessing globals from * that function. Clang actually does not generate them, which leads to * boot-time crashes. To work around this problem, every global pointer must - * be adjusted using fixup_pointer(). + * be accessed using RIP_REL_REF(). */ unsigned long __head __startup_64(unsigned long physaddr, struct boot_params *bp) { pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts); - unsigned long load_delta, *p; unsigned long pgtable_flags; + unsigned long load_delta; pgdval_t *pgd; p4dval_t *p4d; pudval_t *pud; @@ -202,17 +202,14 @@ unsigned long __head __startup_64(unsigned long physaddr, /* Fixup the physical addresses in the page table */ - pgd = fixup_pointer(early_top_pgt, physaddr); - p = pgd + pgd_index(__START_KERNEL_map); - if (la57) - *p = (unsigned long)level4_kernel_pgt; - else - *p = (unsigned long)level3_kernel_pgt; - *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta; + pgd = &RIP_REL_REF(early_top_pgt)->pgd; + pgd[pgd_index(__START_KERNEL_map)] += load_delta; if (la57) { - p4d = fixup_pointer(level4_kernel_pgt, physaddr); - p4d[511] += load_delta; + p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt); + p4d[MAX_PTRS_PER_P4D - 1] += load_delta; + + pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE_NOENC; } RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 2].pud += load_delta; diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index e09cf0b09141..d295bf68bf94 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -624,7 +624,8 @@ SYM_CODE_END(vc_no_ghcb) .balign 4 SYM_DATA_START_PTI_ALIGNED(early_top_pgt) - .fill 512,8,0 + .fill 511,8,0 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC .fill PTI_USER_PGD_FILL,8,0 SYM_DATA_END(early_top_pgt) From 11e36b0f7c2150a6453872b79555767b43c846d0 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 26 Feb 2024 17:05:44 -0500 Subject: [PATCH 12/22] x86/boot/64: Load the final kernel GDT during early boot directly, remove startup_gdt[] Instead of loading a duplicate GDT just for early boot, load the kernel GDT from its physical address. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Acked-by: Ard Biesheuvel Cc: Kees Cook Cc: Andy Lutomirski Cc: Linus Torvalds Cc: "H. 
Peter Anvin" Cc: Josh Poimboeuf Link: https://lore.kernel.org/r/20240226220544.70769-1-brgerst@gmail.com --- arch/x86/include/asm/desc.h | 1 + arch/x86/kernel/head64.c | 13 ++----------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index ab97b22ac04a..52c015017990 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -46,6 +46,7 @@ struct gdt_page { } __attribute__((aligned(PAGE_SIZE))); DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); +DECLARE_INIT_PER_CPU(gdt_page); /* Provide the original GDT */ static inline struct desc_struct *get_cpu_gdt_rw(unsigned int cpu) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 72351c3121a6..fd77a266f29d 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -68,15 +68,6 @@ unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4; EXPORT_SYMBOL(vmemmap_base); #endif -/* - * GDT used on the boot CPU before switching to virtual addresses. - */ -static struct desc_struct startup_gdt[GDT_ENTRIES] __initdata = { - [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff), - [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64, 0, 0xfffff), - [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64, 0, 0xfffff), -}; - #ifdef CONFIG_X86_5LEVEL static void __head *fixup_pointer(void *ptr, unsigned long physaddr) { @@ -589,8 +580,8 @@ void __head startup_64_setup_gdt_idt(void) void *handler = NULL; struct desc_ptr startup_gdt_descr = { - .address = (unsigned long)&RIP_REL_REF(startup_gdt), - .size = sizeof(startup_gdt) - 1, + .address = (unsigned long)&RIP_REL_REF(init_per_cpu_var(gdt_page.gdt)), + .size = GDT_SIZE - 1, }; /* Load GDT */ From 891f8890a4a3663da7056542757022870b499bc1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Jan 2024 09:53:48 +0100 Subject: [PATCH 13/22] efi/x86: Set the PE/COFF header's NX compat flag unconditionally Now that the proper section and file alignment is used, and the EFI memory attributes protocol to manage executable permissions where needed is invoked, set the NX compat flag unconditionally. [ bp: Remove the "we"s. ] Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240116085347.2193966-2-ardb+git@google.com --- arch/x86/boot/header.S | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index a1bbedd989e4..b5c79f43359b 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -111,11 +111,7 @@ extra_header_fields: .long salign # SizeOfHeaders .long 0 # CheckSum .word IMAGE_SUBSYSTEM_EFI_APPLICATION # Subsystem (EFI application) -#ifdef CONFIG_EFI_DXE_MEM_ATTRIBUTES .word IMAGE_DLL_CHARACTERISTICS_NX_COMPAT # DllCharacteristics -#else - .word 0 # DllCharacteristics -#endif #ifdef CONFIG_X86_32 .long 0 # SizeOfStackReserve .long 0 # SizeOfStackCommit From 721f791ce1cddfa5f2bf524ac14741bfa0f72697 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 24 Jan 2024 11:38:59 +0100 Subject: [PATCH 14/22] x86/boot: Use 32-bit XOR to clear registers x86_64 zero extends 32-bit operations, so for 64-bit operands, XORL r32,r32 is functionally equal to XORQ r64,r64, but avoids a REX prefix byte when legacy registers are used. Slightly smaller code generated, no change in functionality. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Josh Poimboeuf Cc: Ard Biesheuvel Link: https://lore.kernel.org/r/20240124103859.611372-1-ubizjak@gmail.com --- arch/x86/kernel/head_64.S | 6 +++--- arch/x86/kernel/sev_verify_cbit.S | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d295bf68bf94..86136a78778e 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -169,7 +169,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) ANNOTATE_NOENDBR /* Clear %R15 which holds the boot_params pointer on the boot CPU */ - xorq %r15, %r15 + xorl %r15d, %r15d /* * Retrieve the modifier (SME encryption mask if SME is active) to be @@ -178,7 +178,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) #ifdef CONFIG_AMD_MEM_ENCRYPT movq sme_me_mask, %rax #else - xorq %rax, %rax + xorl %eax, %eax #endif /* Form the CR3 value being sure to include the CR3 modifier */ @@ -295,7 +295,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) .Llookup_AP: /* EAX contains the APIC ID of the current CPU */ - xorq %rcx, %rcx + xorl %ecx, %ecx leaq cpuid_to_apicid(%rip), %rbx .Lfind_cpunr: diff --git a/arch/x86/kernel/sev_verify_cbit.S b/arch/x86/kernel/sev_verify_cbit.S index 3355e27c69eb..1ab65f6c6ae7 100644 --- a/arch/x86/kernel/sev_verify_cbit.S +++ b/arch/x86/kernel/sev_verify_cbit.S @@ -77,7 +77,7 @@ SYM_FUNC_START(sev_verify_cbit) * The check failed, prevent any forward progress to prevent ROP * attacks, invalidate the stack and go into a hlt loop. */ - xorq %rsp, %rsp + xorl %esp, %esp subq $0x1000, %rsp 2: hlt jmp 2b From dada8587068c820ba5e5d09b9c32d8bc28c4dbe6 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 27 Feb 2024 16:19:09 +0100 Subject: [PATCH 15/22] x86/startup_64: Simplify CR4 handling in startup code When paging is enabled, the CR4.PAE and CR4.LA57 control bits cannot be changed, and so they can simply be preserved rather than reason about whether or not they need to be set. CR4.MCE should be preserved unless the kernel was built without CONFIG_X86_MCE, in which case it must be cleared. CR4.PSE should be set explicitly, regardless of whether or not it was set before. CR4.PGE is set explicitly, and then cleared and set again after programming CR3 in order to flush TLB entries based on global translations. This makes the first assignment redundant, and can therefore be omitted. So clear PGE by omitting it from the preserve mask, and set it again explicitly after switching to the new page tables. [ bp: Document the exact operation of CR4.PGE ] Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Tested-by: Tom Lendacky Link: https://lore.kernel.org/r/20240227151907.387873-12-ardb+git@google.com --- arch/x86/kernel/head_64.S | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 86136a78778e..54207e7e5d33 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -185,6 +185,16 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) addq $(init_top_pgt - __START_KERNEL_map), %rax 1: + /* + * Create a mask of CR4 bits to preserve. Omit PGE in order to flush + * global 1:1 translations from the TLBs. + * + * From the SDM: + * "If CR4.PGE is changing from 0 to 1, there were no global TLB + * entries before the execution; if CR4.PGE is changing from 1 to 0, + * there will be no global TLB entries after the execution." 
+ */ + movl $(X86_CR4_PAE | X86_CR4_LA57), %edx #ifdef CONFIG_X86_MCE /* * Preserve CR4.MCE if the kernel will enable #MC support. @@ -193,20 +203,13 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) * configured will crash the system regardless of the CR4.MCE value set * here. */ + orl $X86_CR4_MCE, %edx +#endif movq %cr4, %rcx - andl $X86_CR4_MCE, %ecx -#else - movl $0, %ecx -#endif + andl %edx, %ecx - /* Enable PAE mode, PSE, PGE and LA57 */ - orl $(X86_CR4_PAE | X86_CR4_PSE | X86_CR4_PGE), %ecx -#ifdef CONFIG_X86_5LEVEL - testb $1, __pgtable_l5_enabled(%rip) - jz 1f - orl $X86_CR4_LA57, %ecx -1: -#endif + /* Even if ignored in long mode, set PSE uniformly on all logical CPUs. */ + btsl $X86_CR4_PSE_BIT, %ecx movq %rcx, %cr4 /* Setup early boot stage 4-/5-level pagetables. */ @@ -223,14 +226,10 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) movq %rax, %cr3 /* - * Do a global TLB flush after the CR3 switch to make sure the TLB - * entries from the identity mapping are flushed. + * Set CR4.PGE to re-enable global translations. */ - movq %cr4, %rcx - movq %rcx, %rax - xorq $X86_CR4_PGE, %rcx + btsl $X86_CR4_PGE_BIT, %ecx movq %rcx, %cr4 - movq %rax, %cr4 /* Ensure I am executing from virtual addresses */ movq $1f, %rax From 63bed96604205fa0b23c91d268df5f1f1b26faf6 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 27 Feb 2024 16:19:10 +0100 Subject: [PATCH 16/22] x86/startup_64: Defer assignment of 5-level paging global variables Assigning the 5-level paging related global variables from the earliest C code using explicit references that use the 1:1 translation of memory is unnecessary, as the startup code itself does not rely on them to create the initial page tables, and this is all it should be doing. So defer these assignments to the primary C entry code that executes via the ordinary kernel virtual mapping. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Tested-by: Tom Lendacky Link: https://lore.kernel.org/r/20240227151907.387873-13-ardb+git@google.com --- arch/x86/include/asm/pgtable_64_types.h | 2 +- arch/x86/kernel/head64.c | 44 ++++++++----------------- 2 files changed, 15 insertions(+), 31 deletions(-) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 38b54b992f32..9053dfe9fa03 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -21,9 +21,9 @@ typedef unsigned long pgprotval_t; typedef struct { pteval_t pte; } pte_t; typedef struct { pmdval_t pmd; } pmd_t; -#ifdef CONFIG_X86_5LEVEL extern unsigned int __pgtable_l5_enabled; +#ifdef CONFIG_X86_5LEVEL #ifdef USE_EARLY_PGTABLE_L5 /* * cpu_feature_enabled() is not available in early boot code. 
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index fd77a266f29d..212e8e06aeba 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -68,24 +69,11 @@ unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4; EXPORT_SYMBOL(vmemmap_base); #endif -#ifdef CONFIG_X86_5LEVEL -static void __head *fixup_pointer(void *ptr, unsigned long physaddr) +static inline bool check_la57_support(void) { - return ptr - (void *)_text + (void *)physaddr; -} + if (!IS_ENABLED(CONFIG_X86_5LEVEL)) + return false; -static unsigned long __head *fixup_long(void *ptr, unsigned long physaddr) -{ - return fixup_pointer(ptr, physaddr); -} - -static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr) -{ - return fixup_pointer(ptr, physaddr); -} - -static bool __head check_la57_support(unsigned long physaddr) -{ /* * 5-level paging is detected and enabled at kernel decompression * stage. Only check if it has been enabled there. @@ -93,21 +81,8 @@ static bool __head check_la57_support(unsigned long physaddr) if (!(native_read_cr4() & X86_CR4_LA57)) return false; - *fixup_int(&__pgtable_l5_enabled, physaddr) = 1; - *fixup_int(&pgdir_shift, physaddr) = 48; - *fixup_int(&ptrs_per_p4d, physaddr) = 512; - *fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5; - *fixup_long(&vmalloc_base, physaddr) = __VMALLOC_BASE_L5; - *fixup_long(&vmemmap_base, physaddr) = __VMEMMAP_BASE_L5; - return true; } -#else -static bool __head check_la57_support(unsigned long physaddr) -{ - return false; -} -#endif static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd) { @@ -171,7 +146,7 @@ unsigned long __head __startup_64(unsigned long physaddr, bool la57; int i; - la57 = check_la57_support(physaddr); + la57 = check_la57_support(); /* Is the address too large? */ if (physaddr >> MAX_PHYSMEM_BITS) @@ -456,6 +431,15 @@ asmlinkage __visible void __init __noreturn x86_64_start_kernel(char * real_mode (__START_KERNEL & PGDIR_MASK))); BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); + if (check_la57_support()) { + __pgtable_l5_enabled = 1; + pgdir_shift = 48; + ptrs_per_p4d = 512; + page_offset_base = __PAGE_OFFSET_BASE_L5; + vmalloc_base = __VMALLOC_BASE_L5; + vmemmap_base = __VMEMMAP_BASE_L5; + } + cr4_init_shadow(); /* Kill off the identity-map trampoline */ From d6a41f184dcea0814260af2780e147022c11dca8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 27 Feb 2024 16:19:11 +0100 Subject: [PATCH 17/22] x86/startup_64: Simplify calculation of initial page table address Determining the address of the initial page table to program into CR3 involves: - taking the physical address - adding the SME encryption mask On the primary entry path, the code is mapped using a 1:1 virtual to physical translation, so the physical address can be taken directly using a RIP-relative LEA instruction. On the secondary entry path, the address can be obtained by taking the offset from the virtual kernel base (__START_KERNEL_map) and adding the physical kernel base. This is implemented in a slightly confusing way, so clean this up. 
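Schematically, the two derivations reduce to the following (a pseudocode sketch, not the actual instructions; rip_relative() stands in for a leaq symbol(%rip) instruction):

    /* Primary path: running from the 1:1 mapping, so a RIP-relative
     * LEA of early_top_pgt directly yields its physical address. */
    cr3 = rip_relative(early_top_pgt);

    /* Secondary path: link-time offset plus physical load address. */
    cr3 = (init_top_pgt - __START_KERNEL_map) + phys_base;

    /* Both paths: add the CR3 modifier (the SME mask), if any. */
    cr3 += sme_me_mask;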
Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Tested-by: Tom Lendacky Link: https://lore.kernel.org/r/20240227151907.387873-14-ardb+git@google.com --- arch/x86/kernel/head_64.S | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 54207e7e5d33..b8b7118b6976 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -111,13 +111,11 @@ SYM_CODE_START_NOALIGN(startup_64) call __startup_64 /* Form the CR3 value being sure to include the CR3 modifier */ - addq $(early_top_pgt - __START_KERNEL_map), %rax + leaq early_top_pgt(%rip), %rcx + addq %rcx, %rax #ifdef CONFIG_AMD_MEM_ENCRYPT mov %rax, %rdi - mov %rax, %r14 - - addq phys_base(%rip), %rdi /* * For SEV guests: Verify that the C-bit is correct. A malicious @@ -126,12 +124,6 @@ SYM_CODE_START_NOALIGN(startup_64) * the next RET instruction. */ call sev_verify_cbit - - /* - * Restore CR3 value without the phys_base which will be added - * below, before writing %cr3. - */ - mov %r14, %rax #endif jmp 1f @@ -171,18 +163,18 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) /* Clear %R15 which holds the boot_params pointer on the boot CPU */ xorl %r15d, %r15d + /* Derive the runtime physical address of init_top_pgt[] */ + movq phys_base(%rip), %rax + addq $(init_top_pgt - __START_KERNEL_map), %rax + /* * Retrieve the modifier (SME encryption mask if SME is active) to be * added to the initial pgdir entry that will be programmed into CR3. */ #ifdef CONFIG_AMD_MEM_ENCRYPT - movq sme_me_mask, %rax -#else - xorl %eax, %eax + addq sme_me_mask(%rip), %rax #endif - /* Form the CR3 value being sure to include the CR3 modifier */ - addq $(init_top_pgt - __START_KERNEL_map), %rax 1: /* @@ -212,9 +204,6 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) btsl $X86_CR4_PSE_BIT, %ecx movq %rcx, %cr4 - /* Setup early boot stage 4-/5-level pagetables. */ - addq phys_base(%rip), %rax - /* * Switch to new page-table * From 828263957611c210da00c1820db73fac217135b6 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 27 Feb 2024 16:19:12 +0100 Subject: [PATCH 18/22] x86/startup_64: Simplify virtual switch on primary boot The secondary startup code is used on the primary boot path as well, but in this case, the initial part runs from a 1:1 mapping, until an explicit cross-jump is made to the kernel virtual mapping of the same code. On the secondary boot path, this jump is pointless as the code already executes from the mapping targeted by the jump. So combine this cross-jump with the jump from startup_64() into the common boot path. This simplifies the execution flow, and clearly separates code that runs from a 1:1 mapping from code that runs from the kernel virtual mapping. Note that this requires a page table switch, so hoist the CR3 assignment into startup_64() as well. And since absolute symbol references will no longer be permitted in .head.text once we enable the associated build time checks, a RIP-relative memory operand is used in the JMP instruction, referring to an absolute constant in the .init.rodata section. Given that the secondary startup code does not require a special placement inside the executable, move it to the .text section. 
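For clarity, the resulting hand-off as it appears in the patch: the 1:1-mapped code ends with

    ANNOTATE_RETPOLINE_SAFE
    jmp *0f(%rip)

where 0: is a .quad in .init.rodata holding the absolute virtual address of common_startup_64. The absolute value thus lives in initialized data rather than in the instruction stream, keeping .head.text itself free of absolute symbol references.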
Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Tested-by: Tom Lendacky Link: https://lore.kernel.org/r/20240227151907.387873-15-ardb+git@google.com --- arch/x86/kernel/head_64.S | 42 +++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b8b7118b6976..79f7c342e3da 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -39,7 +39,6 @@ L4_START_KERNEL = l4_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map) - .text __HEAD .code64 SYM_CODE_START_NOALIGN(startup_64) @@ -126,9 +125,21 @@ SYM_CODE_START_NOALIGN(startup_64) call sev_verify_cbit #endif - jmp 1f + /* + * Switch to early_top_pgt which still has the identity mappings + * present. + */ + movq %rax, %cr3 + + /* Branch to the common startup code at its kernel virtual address */ + ANNOTATE_RETPOLINE_SAFE + jmp *0f(%rip) SYM_CODE_END(startup_64) + __INITRODATA +0: .quad common_startup_64 + + .text SYM_CODE_START(secondary_startup_64) UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR @@ -174,8 +185,15 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) #ifdef CONFIG_AMD_MEM_ENCRYPT addq sme_me_mask(%rip), %rax #endif + /* + * Switch to the init_top_pgt here, away from the trampoline_pgd and + * unmap the identity mapped ranges. + */ + movq %rax, %cr3 -1: +SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL) + UNWIND_HINT_END_OF_STACK + ANNOTATE_NOENDBR /* * Create a mask of CR4 bits to preserve. Omit PGE in order to flush @@ -204,30 +222,12 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) btsl $X86_CR4_PSE_BIT, %ecx movq %rcx, %cr4 - /* - * Switch to new page-table - * - * For the boot CPU this switches to early_top_pgt which still has the - * identity mappings present. The secondary CPUs will switch to the - * init_top_pgt here, away from the trampoline_pgd and unmap the - * identity mapped ranges. - */ - movq %rax, %cr3 - /* * Set CR4.PGE to re-enable global translations. */ btsl $X86_CR4_PGE_BIT, %ecx movq %rcx, %cr4 - /* Ensure I am executing from virtual addresses */ - movq $1f, %rax - ANNOTATE_RETPOLINE_SAFE - jmp *%rax -1: - UNWIND_HINT_END_OF_STACK - ANNOTATE_NOENDBR // above - #ifdef CONFIG_SMP /* * For parallel boot, the APIC ID is read from the APIC, and then From 7205f06e847422b66c1506eee01b9998ffc75d76 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 27 Feb 2024 16:19:13 +0100 Subject: [PATCH 19/22] efi/libstub: Add generic support for parsing mem_encrypt= Parse the mem_encrypt= command line parameter from the EFI stub if CONFIG_ARCH_HAS_MEM_ENCRYPT=y, so that it can be passed to the early boot code by the arch code in the stub. This avoids the need for the core kernel to do any string parsing very early in the boot. 
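The resulting flag is a tri-state: 0 when mem_encrypt= is absent, 1 for "on" and -1 for "off", so an explicit "off" remains distinguishable from the option not being passed at all. A consumer then needs only a sign check, as in the x86 stub change later in this series:

    if (efi_mem_encrypt > 0)
        hdr->xloadflags |= XLF_MEM_ENCRYPTION;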
Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Tested-by: Tom Lendacky Link: https://lore.kernel.org/r/20240227151907.387873-16-ardb+git@google.com --- drivers/firmware/efi/libstub/efi-stub-helper.c | 8 ++++++++ drivers/firmware/efi/libstub/efistub.h | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index bfa30625f5d0..3dc2f9aaf08d 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -24,6 +24,8 @@ static bool efi_noinitrd; static bool efi_nosoftreserve; static bool efi_disable_pci_dma = IS_ENABLED(CONFIG_EFI_DISABLE_PCI_DMA); +int efi_mem_encrypt; + bool __pure __efi_soft_reserve_enabled(void) { return !efi_nosoftreserve; @@ -75,6 +77,12 @@ efi_status_t efi_parse_options(char const *cmdline) efi_noinitrd = true; } else if (IS_ENABLED(CONFIG_X86_64) && !strcmp(param, "no5lvl")) { efi_no5lvl = true; + } else if (IS_ENABLED(CONFIG_ARCH_HAS_MEM_ENCRYPT) && + !strcmp(param, "mem_encrypt") && val) { + if (parse_option_str(val, "on")) + efi_mem_encrypt = 1; + else if (parse_option_str(val, "off")) + efi_mem_encrypt = -1; } else if (!strcmp(param, "efi") && val) { efi_nochunk = parse_option_str(val, "nochunk"); efi_novamap |= parse_option_str(val, "novamap"); diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index c04b82ea40f2..fc18fd649ed7 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -37,8 +37,8 @@ extern bool efi_no5lvl; extern bool efi_nochunk; extern bool efi_nokaslr; extern int efi_loglevel; +extern int efi_mem_encrypt; extern bool efi_novamap; - extern const efi_system_table_t *efi_system_table; typedef union efi_dxe_services_table efi_dxe_services_table_t; From cd0d9d92c8bb46e77de62efd7df13069ddd61e7d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 27 Feb 2024 16:19:14 +0100 Subject: [PATCH 20/22] x86/boot: Move mem_encrypt= parsing to the decompressor The early SME/SEV code parses the command line very early, in order to decide whether or not memory encryption should be enabled, which needs to occur even before the initial page tables are created. This is problematic for a number of reasons: - this early code runs from the 1:1 mapping provided by the decompressor or firmware, which uses a different translation than the one assumed by the linker, and so the code needs to be built in a special way; - parsing external input while the entire kernel image is still mapped writable is a bad idea in general, and really does not belong in security minded code; - the current code ignores the built-in command line entirely (although this appears to be the case for the entire decompressor) Given that the decompressor/EFI stub is an intrinsic part of the x86 bootable kernel image, move the command line parsing there and out of the core kernel. This removes the need to build lib/cmdline.o in a special way, or to use RIP-relative LEA instructions in inline asm blocks. This involves a new xloadflag in the setup header to indicate that mem_encrypt=on appeared on the kernel command line. 
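(Note: cmdline_find_option_bool() returns the position of the option within the command line, or 0 when it is absent, so the "on > off" comparison in parse_mem_encrypt() makes the last occurrence win when both are specified.)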
Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Tested-by: Tom Lendacky Link: https://lore.kernel.org/r/20240227151907.387873-17-ardb+git@google.com --- arch/x86/boot/compressed/misc.c | 15 ++++++++++++ arch/x86/include/uapi/asm/bootparam.h | 1 + arch/x86/lib/Makefile | 13 ---------- arch/x86/mm/mem_encrypt_identity.c | 32 +++---------------------- drivers/firmware/efi/libstub/x86-stub.c | 3 +++ 5 files changed, 22 insertions(+), 42 deletions(-) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index bd6857a9f15a..408507e305be 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -371,6 +371,19 @@ unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr, return entry; } +/* + * Set the memory encryption xloadflag based on the mem_encrypt= command line + * parameter, if provided. + */ +static void parse_mem_encrypt(struct setup_header *hdr) +{ + int on = cmdline_find_option_bool("mem_encrypt=on"); + int off = cmdline_find_option_bool("mem_encrypt=off"); + + if (on > off) + hdr->xloadflags |= XLF_MEM_ENCRYPTION; +} + /* * The compressed kernel image (ZO), has been moved so that its position * is against the end of the buffer used to hold the uncompressed kernel @@ -401,6 +414,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output) /* Clear flags intended for solely in-kernel use. */ boot_params_ptr->hdr.loadflags &= ~KASLR_FLAG; + parse_mem_encrypt(&boot_params_ptr->hdr); + sanitize_boot_params(boot_params_ptr); if (boot_params_ptr->screen_info.orig_video_mode == 7) { diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 01d19fc22346..eeea058cf602 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -38,6 +38,7 @@ #define XLF_EFI_KEXEC (1<<4) #define XLF_5LEVEL (1<<5) #define XLF_5LEVEL_ENABLED (1<<6) +#define XLF_MEM_ENCRYPTION (1<<7) #ifndef __ASSEMBLY__ diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index ea3a28e7b613..f0dae4fb6d07 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -14,19 +14,6 @@ ifdef CONFIG_KCSAN CFLAGS_REMOVE_delay.o = $(CC_FLAGS_FTRACE) endif -# Early boot use of cmdline; don't instrument it -ifdef CONFIG_AMD_MEM_ENCRYPT -KCOV_INSTRUMENT_cmdline.o := n -KASAN_SANITIZE_cmdline.o := n -KCSAN_SANITIZE_cmdline.o := n - -ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_cmdline.o = -pg -endif - -CFLAGS_cmdline.o := -fno-stack-protector -fno-jump-tables -endif - inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt quiet_cmd_inat_tables = GEN $@ diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index 0166ab1780cc..d210c7fc8fa2 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -43,7 +43,6 @@ #include #include -#include #include #include @@ -95,9 +94,6 @@ struct sme_populate_pgd_data { */ static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch"); -static char sme_cmdline_arg[] __initdata = "mem_encrypt"; -static char sme_cmdline_on[] __initdata = "on"; - static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) { unsigned long pgd_start, pgd_end, pgd_size; @@ -504,11 +500,9 @@ void __init sme_encrypt_kernel(struct boot_params *bp) void __init sme_enable(struct boot_params *bp) { - const char *cmdline_ptr, *cmdline_arg, *cmdline_on; unsigned int eax, ebx, ecx, edx; unsigned 
long feature_mask; unsigned long me_mask; - char buffer[16]; bool snp; u64 msr; @@ -551,6 +545,9 @@ void __init sme_enable(struct boot_params *bp) /* Check if memory encryption is enabled */ if (feature_mask == AMD_SME_BIT) { + if (!(bp->hdr.xloadflags & XLF_MEM_ENCRYPTION)) + return; + /* * No SME if Hypervisor bit is set. This check is here to * prevent a guest from trying to enable SME. For running as a @@ -570,31 +567,8 @@ void __init sme_enable(struct boot_params *bp) msr = __rdmsr(MSR_AMD64_SYSCFG); if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) return; - } else { - /* SEV state cannot be controlled by a command line option */ - goto out; } - /* - * Fixups have not been applied to phys_base yet and we're running - * identity mapped, so we must obtain the address to the SME command - * line argument data using rip-relative addressing. - */ - asm ("lea sme_cmdline_arg(%%rip), %0" - : "=r" (cmdline_arg) - : "p" (sme_cmdline_arg)); - asm ("lea sme_cmdline_on(%%rip), %0" - : "=r" (cmdline_on) - : "p" (sme_cmdline_on)); - - cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr | - ((u64)bp->ext_cmd_line_ptr << 32)); - - if (cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)) < 0 || - strncmp(buffer, cmdline_on, sizeof(buffer))) - return; - -out: RIP_REL_REF(sme_me_mask) = me_mask; physical_mask &= ~me_mask; cc_vendor = CC_VENDOR_AMD; diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index 99429bc4b0c7..0336ed175e67 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -884,6 +884,9 @@ void __noreturn efi_stub_entry(efi_handle_t handle, } } + if (efi_mem_encrypt > 0) + hdr->xloadflags |= XLF_MEM_ENCRYPTION; + status = efi_decompress_kernel(&kernel_entry); if (status != EFI_SUCCESS) { efi_err("Failed to decompress kernel\n"); From 48204aba801f1b512b3abed10b8e1a63e03f3dd1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 27 Feb 2024 16:19:15 +0100 Subject: [PATCH 21/22] x86/sme: Move early SME kernel encryption handling into .head.text The .head.text section is the initial primary entrypoint of the core kernel, and is entered with the CPU executing from a 1:1 mapping of memory. Such code must never access global variables using absolute references, as these are based on the kernel virtual mapping which is not active yet at this point. Given that the SME startup code is also called from this early execution context, move it into .head.text as well. This will allow more thorough build time checks in the future to ensure that early startup code only uses RIP-relative references to global variables. Also replace some occurrences of __pa_symbol() [which relies on the compiler generating an absolute reference, which is not guaranteed] and an open coded RIP-relative access with RIP_REL_REF(). 
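For reference, the __head annotation that replaces __init on these functions is what places them in the section described above; its definition (made available to this file via <asm/init.h>) is simply:

    #define __head __section(".head.text")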
Signed-off-by: Ard Biesheuvel
Signed-off-by: Borislav Petkov (AMD)
Tested-by: Tom Lendacky
Link: https://lore.kernel.org/r/20240227151907.387873-18-ardb+git@google.com
---
 arch/x86/include/asm/mem_encrypt.h |  8 +++---
 arch/x86/mm/mem_encrypt_identity.c | 42 ++++++++++++------------------
 2 files changed, 21 insertions(+), 29 deletions(-)

diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index b31eb9fd5954..f922b682b9b4 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -47,8 +47,8 @@ void __init sme_unmap_bootdata(char *real_mode_data);
 
 void __init sme_early_init(void);
 
-void __init sme_encrypt_kernel(struct boot_params *bp);
-void __init sme_enable(struct boot_params *bp);
+void sme_encrypt_kernel(struct boot_params *bp);
+void sme_enable(struct boot_params *bp);
 
 int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size);
 int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size);
@@ -81,8 +81,8 @@ static inline void __init sme_unmap_bootdata(char *real_mode_data) { }
 
 static inline void __init sme_early_init(void) { }
 
-static inline void __init sme_encrypt_kernel(struct boot_params *bp) { }
-static inline void __init sme_enable(struct boot_params *bp) { }
+static inline void sme_encrypt_kernel(struct boot_params *bp) { }
+static inline void sme_enable(struct boot_params *bp) { }
 
 static inline void sev_es_init_vc_handling(void) { }
 
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index d210c7fc8fa2..64b5005d49e5 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -41,6 +41,7 @@
 #include <linux/mem_encrypt.h>
 #include <linux/cc_platform.h>
 
+#include <asm/init.h>
 #include <asm/setup.h>
 #include <asm/sections.h>
 #include <asm/coco.h>
@@ -94,7 +95,7 @@ struct sme_populate_pgd_data {
  */
 static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch");
 
-static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
+static void __head sme_clear_pgd(struct sme_populate_pgd_data *ppd)
 {
 	unsigned long pgd_start, pgd_end, pgd_size;
 	pgd_t *pgd_p;
@@ -109,7 +110,7 @@ static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
 	memset(pgd_p, 0, pgd_size);
 }
 
-static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
+static pud_t __head *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
@@ -146,7 +147,7 @@ static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
 	return pud;
 }
 
-static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
+static void __head sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
 {
 	pud_t *pud;
 	pmd_t *pmd;
@@ -162,7 +163,7 @@ static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
 	set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags));
 }
 
-static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
+static void __head sme_populate_pgd(struct sme_populate_pgd_data *ppd)
 {
 	pud_t *pud;
 	pmd_t *pmd;
@@ -188,7 +189,7 @@ static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
 	set_pte(pte, __pte(ppd->paddr | ppd->pte_flags));
 }
 
-static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
+static void __head __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
 {
 	while (ppd->vaddr < ppd->vaddr_end) {
 		sme_populate_pgd_large(ppd);
@@ -198,7 +199,7 @@ static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
 	}
 }
 
-static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
+static void __head __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
 {
 	while (ppd->vaddr < ppd->vaddr_end) {
 		sme_populate_pgd(ppd);
@@ -208,7 +209,7 @@ static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
 	}
 }
 
-static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
+static void __head __sme_map_range(struct sme_populate_pgd_data *ppd,
 				   pmdval_t pmd_flags, pteval_t pte_flags)
 {
 	unsigned long vaddr_end;
@@ -232,22 +233,22 @@ static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
 	__sme_map_range_pte(ppd);
 }
 
-static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
+static void __head sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
 {
 	__sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC);
 }
 
-static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
+static void __head sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
 {
 	__sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC);
 }
 
-static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
+static void __head sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
 {
 	__sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP);
 }
 
-static unsigned long __init sme_pgtable_calc(unsigned long len)
+static unsigned long __head sme_pgtable_calc(unsigned long len)
 {
 	unsigned long entries = 0, tables = 0;
 
@@ -284,7 +285,7 @@ static unsigned long __init sme_pgtable_calc(unsigned long len)
 	return entries + tables;
 }
 
-void __init sme_encrypt_kernel(struct boot_params *bp)
+void __head sme_encrypt_kernel(struct boot_params *bp)
 {
 	unsigned long workarea_start, workarea_end, workarea_len;
 	unsigned long execute_start, execute_end, execute_len;
@@ -319,9 +320,8 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
 	 * memory from being cached.
 	 */
 
-	/* Physical addresses gives us the identity mapped virtual addresses */
-	kernel_start = __pa_symbol(_text);
-	kernel_end = ALIGN(__pa_symbol(_end), PMD_SIZE);
+	kernel_start = (unsigned long)RIP_REL_REF(_text);
+	kernel_end = ALIGN((unsigned long)RIP_REL_REF(_end), PMD_SIZE);
 	kernel_len = kernel_end - kernel_start;
 
 	initrd_start = 0;
@@ -338,14 +338,6 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
 	}
 #endif
 
-	/*
-	 * We're running identity mapped, so we must obtain the address to the
-	 * SME encryption workarea using rip-relative addressing.
-	 */
-	asm ("lea sme_workarea(%%rip), %0"
-	     : "=r" (workarea_start)
-	     : "p" (sme_workarea));
-
 	/*
 	 * Calculate required number of workarea bytes needed:
 	 *   executable encryption area size:
@@ -355,7 +347,7 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
 	 *     pagetable structures for the encryption of the kernel
 	 *     pagetable structures for workarea (in case not currently mapped)
 	 */
-	execute_start = workarea_start;
+	execute_start = workarea_start = (unsigned long)RIP_REL_REF(sme_workarea);
 	execute_end = execute_start + (PAGE_SIZE * 2) + PMD_SIZE;
 	execute_len = execute_end - execute_start;
@@ -498,7 +490,7 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
 	native_write_cr3(__native_read_cr3());
 }
 
-void __init sme_enable(struct boot_params *bp)
+void __head sme_enable(struct boot_params *bp)
 {
 	unsigned int eax, ebx, ecx, edx;
 	unsigned long feature_mask;

From 428080c9b19bfda37c478cd626dbd3851db1aff9 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Tue, 27 Feb 2024 16:19:16 +0100
Subject: [PATCH 22/22] x86/sev: Move early startup code into .head.text section

In preparation for implementing rigorous build time checks to enforce
that only code that can support it will be called from the early 1:1
mapping of memory, move SEV init code that is called in this manner to
the .head.text section.
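[ Note: a minimal sketch of the __head marker these two patches rely
  on, assuming the definition in <asm/init.h> (which the hunks below
  include where needed):

	/*
	 * Place a function in .head.text, which the linker script keeps
	 * at the start of the image so it is safe to execute from the
	 * early 1:1 mapping of memory.
	 */
	#define __head	__section(".head.text")

  The decompressor (arch/x86/boot/compressed/sev.c) builds the shared
  SEV code in an environment where everything already runs from the
  1:1 mapping, so it defines __head away, just as it already does for
  __init. ]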
Signed-off-by: Ard Biesheuvel
Signed-off-by: Borislav Petkov (AMD)
Tested-by: Tom Lendacky
Link: https://lore.kernel.org/r/20240227151907.387873-19-ardb+git@google.com
---
 arch/x86/boot/compressed/sev.c |  3 +++
 arch/x86/include/asm/sev.h     | 10 +++++-----
 arch/x86/kernel/sev-shared.c   | 23 ++++++++++-------------
 arch/x86/kernel/sev.c          | 14 ++++++++------
 4 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 073291832f44..bea0719d70f2 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -116,6 +116,9 @@ static bool fault_in_kernel_space(unsigned long address)
 #undef __init
 #define __init
 
+#undef __head
+#define __head
+
 #define __BOOT_COMPRESSED
 
 /* Basic instruction decoding support needed */
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index bed95e1f4d52..cf671138feef 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -213,16 +213,16 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate)
 struct snp_guest_request_ioctl;
 
 void setup_ghcb(void);
-void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
-					 unsigned long npages);
-void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
-					unsigned long npages);
+void early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
+				  unsigned long npages);
+void early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
+				 unsigned long npages);
 void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op);
 void snp_set_memory_shared(unsigned long vaddr, unsigned long npages);
 void snp_set_memory_private(unsigned long vaddr, unsigned long npages);
 void snp_set_wakeup_secondary_cpu(void);
 bool snp_init(struct boot_params *bp);
-void __init __noreturn snp_abort(void);
+void __noreturn snp_abort(void);
 int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio);
 void snp_accept_memory(phys_addr_t start, phys_addr_t end);
 u64 snp_get_unsupported_features(u64 status);
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index ae79f9505298..0bd7ccbe8732 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -93,7 +93,8 @@ static bool __init sev_es_check_cpu_features(void)
 	return true;
 }
 
-static void __noreturn sev_es_terminate(unsigned int set, unsigned int reason)
+static void __head __noreturn
+sev_es_terminate(unsigned int set, unsigned int reason)
 {
 	u64 val = GHCB_MSR_TERM_REQ;
 
@@ -330,13 +331,7 @@ static int sev_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid
  */
 static const struct snp_cpuid_table *snp_cpuid_get_table(void)
 {
-	void *ptr;
-
-	asm ("lea cpuid_table_copy(%%rip), %0"
-	     : "=r" (ptr)
-	     : "p" (&cpuid_table_copy));
-
-	return ptr;
+	return &RIP_REL_REF(cpuid_table_copy);
 }
 
 /*
@@ -395,7 +390,7 @@ static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
 	return xsave_size;
 }
 
-static bool
+static bool __head
 snp_cpuid_get_validated_func(struct cpuid_leaf *leaf)
 {
 	const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
@@ -532,7 +527,8 @@ static int snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
  * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value
  * should be treated as fatal by caller.
  */
-static int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
+static int __head
+snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
 {
 	const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
 
@@ -574,7 +570,7 @@ static int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_le
  * page yet, so it only supports the MSR based communication with the
  * hypervisor and only the CPUID exit-code.
  */
-void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
+void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
 {
 	unsigned int subfn = lower_bits(regs->cx, 32);
 	unsigned int fn = lower_bits(regs->ax, 32);
@@ -1025,7 +1021,8 @@ struct cc_setup_data {
  * Search for a Confidential Computing blob passed in as a setup_data entry
  * via the Linux Boot Protocol.
  */
-static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp)
+static __head
+struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp)
 {
 	struct cc_setup_data *sd = NULL;
 	struct setup_data *hdr;
@@ -1052,7 +1049,7 @@ static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp)
  * mapping needs to be updated in sync with all the changes to virtual memory
  * layout and related mapping facilities throughout the boot process.
  */
-static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
+static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
 {
 	const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table;
 	int i;
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 1ef7ae806a01..33c14aa1f06c 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -25,6 +25,7 @@
 #include <linux/psp-sev.h>
 #include <uapi/linux/sev-guest.h>
 
+#include <asm/init.h>
 #include <asm/cpu_entry_area.h>
 #include <asm/stacktrace.h>
 #include <asm/sev.h>
@@ -682,8 +683,9 @@ static u64 __init get_jump_table_addr(void)
 	return ret;
 }
 
-static void early_set_pages_state(unsigned long vaddr, unsigned long paddr,
-				  unsigned long npages, enum psc_op op)
+static void __head
+early_set_pages_state(unsigned long vaddr, unsigned long paddr,
+		      unsigned long npages, enum psc_op op)
 {
 	unsigned long paddr_end;
 	u64 val;
@@ -739,7 +741,7 @@ e_term:
 	sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
 }
 
-void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
+void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
 					 unsigned long npages)
 {
 	/*
@@ -2062,7 +2064,7 @@ fail:
  *
  * Scan for the blob in that order.
  */
-static __init struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
+static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
 {
 	struct cc_blob_sev_info *cc_info;
 
@@ -2088,7 +2090,7 @@ found_cc_info:
 	return cc_info;
 }
 
-bool __init snp_init(struct boot_params *bp)
+bool __head snp_init(struct boot_params *bp)
 {
 	struct cc_blob_sev_info *cc_info;
 
@@ -2110,7 +2112,7 @@ bool __init snp_init(struct boot_params *bp)
 	return true;
 }
 
-void __init __noreturn snp_abort(void)
+void __head __noreturn snp_abort(void)
 {
 	sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
 }