From 122333d6bd229af279cdb35d1b874b71b3b9ccfb Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 5 May 2023 15:03:32 +0300 Subject: [PATCH 1/4] x86/tdx: Wrap exit reason with hcall_func() TDX reuses VMEXIT "reasons" in its guest->host hypercall ABI. This is confusing because there might not be a VMEXIT involved at *all*. These instances are supposed to document situation and reduce confusion by wrapping VMEXIT reasons with hcall_func(). The decompression code does not follow this convention. Unify the TDX decompression code with the other TDX use of VMEXIT reasons. No functional changes. Signed-off-by: Nikolay Borisov Signed-off-by: Dave Hansen Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/all/20230505120332.1429957-1-nik.borisov%40suse.com --- arch/x86/boot/compressed/tdx.c | 4 ++-- arch/x86/coco/tdx/tdx.c | 11 ----------- arch/x86/include/asm/shared/tdx.h | 11 +++++++++++ 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/boot/compressed/tdx.c b/arch/x86/boot/compressed/tdx.c index 2d81d3cc72a1..8841b945a1e2 100644 --- a/arch/x86/boot/compressed/tdx.c +++ b/arch/x86/boot/compressed/tdx.c @@ -20,7 +20,7 @@ static inline unsigned int tdx_io_in(int size, u16 port) { struct tdx_hypercall_args args = { .r10 = TDX_HYPERCALL_STANDARD, - .r11 = EXIT_REASON_IO_INSTRUCTION, + .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION), .r12 = size, .r13 = 0, .r14 = port, @@ -36,7 +36,7 @@ static inline void tdx_io_out(int size, u16 port, u32 value) { struct tdx_hypercall_args args = { .r10 = TDX_HYPERCALL_STANDARD, - .r11 = EXIT_REASON_IO_INSTRUCTION, + .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION), .r12 = size, .r13 = 1, .r14 = port, diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index e146b599260f..15569bd32ed5 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -76,17 +76,6 @@ noinstr void __tdx_hypercall_failed(void) panic("TDVMCALL failed. TDX module bug?"); } -/* - * The TDG.VP.VMCALL-Instruction-execution sub-functions are defined - * independently from but are currently matched 1:1 with VMX EXIT_REASONs. - * Reusing the KVM EXIT_REASON macros makes it easier to connect the host and - * guest sides of these calls. - */ -static __always_inline u64 hcall_func(u64 exit_reason) -{ - return exit_reason; -} - #ifdef CONFIG_KVM_GUEST long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2, unsigned long p3, unsigned long p4) diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index 2631e01f6e0f..b415a24f0d48 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -40,5 +40,16 @@ u64 __tdx_hypercall_ret(struct tdx_hypercall_args *args); /* Called from __tdx_hypercall() for unrecoverable failure */ void __tdx_hypercall_failed(void); +/* + * The TDG.VP.VMCALL-Instruction-execution sub-functions are defined + * independently from but are currently matched 1:1 with VMX EXIT_REASONs. + * Reusing the KVM EXIT_REASON macros makes it easier to connect the host and + * guest sides of these calls. + */ +static __always_inline u64 hcall_func(u64 exit_reason) +{ + return exit_reason; +} + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_SHARED_TDX_H */ From 3f6819dd192ef4f0c568ec3e9d6d408b3fa1ad3d Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 6 Jun 2023 12:56:20 +0300 Subject: [PATCH 2/4] x86/mm: Allow guest.enc_status_change_prepare() to fail TDX code is going to provide guest.enc_status_change_prepare() that is able to fail. TDX will use the call to convert the GPA range from shared to private. This operation can fail. Add a way to return an error from the callback. Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Kuppuswamy Sathyanarayanan Link: https://lore.kernel.org/all/20230606095622.1939-2-kirill.shutemov%40linux.intel.com --- arch/x86/include/asm/x86_init.h | 2 +- arch/x86/kernel/x86_init.c | 2 +- arch/x86/mm/mem_encrypt_amd.c | 4 +++- arch/x86/mm/pat/set_memory.c | 3 ++- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 88085f369ff6..1ca9701917c5 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -150,7 +150,7 @@ struct x86_init_acpi { * @enc_cache_flush_required Returns true if a cache flush is needed before changing page encryption status */ struct x86_guest { - void (*enc_status_change_prepare)(unsigned long vaddr, int npages, bool enc); + bool (*enc_status_change_prepare)(unsigned long vaddr, int npages, bool enc); bool (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc); bool (*enc_tlb_flush_required)(bool enc); bool (*enc_cache_flush_required)(void); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index d82f4fa2f1bf..f230d4d7d8eb 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -130,7 +130,7 @@ struct x86_cpuinit_ops x86_cpuinit = { static void default_nmi_init(void) { }; -static void enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { } +static bool enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { return true; } static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return false; } static bool enc_tlb_flush_required_noop(bool enc) { return false; } static bool enc_cache_flush_required_noop(void) { return false; } diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index e0b51c09109f..4f95c449a406 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -319,7 +319,7 @@ static void enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) #endif } -static void amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool enc) +static bool amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool enc) { /* * To maintain the security guarantees of SEV-SNP guests, make sure @@ -327,6 +327,8 @@ static void amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool */ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !enc) snp_set_memory_shared(vaddr, npages); + + return true; } /* Return true unconditionally: return value doesn't matter for the SEV side */ diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 7159cf787613..b8f48ebe753c 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2151,7 +2151,8 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc) cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required()); /* Notify hypervisor that we are about to set/clr encryption attribute. */ - x86_platform.guest.enc_status_change_prepare(addr, numpages, enc); + if (!x86_platform.guest.enc_status_change_prepare(addr, numpages, enc)) + return -EIO; ret = __change_page_attr_set_clr(&cpa, 1); From 195edce08b63d293377f615f4f7f086715d2d212 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 6 Jun 2023 12:56:21 +0300 Subject: [PATCH 3/4] x86/tdx: Fix race between set_memory_encrypted() and load_unaligned_zeropad() tl;dr: There is a race in the TDX private<=>shared conversion code which could kill the TDX guest. Fix it by changing conversion ordering to eliminate the window. TDX hardware maintains metadata to track which pages are private and shared. Additionally, TDX guests use the guest x86 page tables to specify whether a given mapping is intended to be private or shared. Bad things happen when the intent and metadata do not match. So there are two thing in play: 1. "the page" -- the physical TDX page metadata 2. "the mapping" -- the guest-controlled x86 page table intent For instance, an unrecoverable exit to VMM occurs if a guest touches a private mapping that points to a shared physical page. In summary: * Private mapping => Private Page == OK (obviously) * Shared mapping => Shared Page == OK (obviously) * Private mapping => Shared Page == BIG BOOM! * Shared mapping => Private Page == OK-ish (It will read generate a recoverable #VE via handle_mmio()) Enter load_unaligned_zeropad(). It can touch memory that is adjacent but otherwise unrelated to the memory it needs to touch. It will cause one of those unrecoverable exits (aka. BIG BOOM) if it blunders into a shared mapping pointing to a private page. This is a problem when __set_memory_enc_pgtable() converts pages from shared to private. It first changes the mapping and second modifies the TDX page metadata. It's moving from: * Shared mapping => Shared Page == OK to: * Private mapping => Shared Page == BIG BOOM! This means that there is a window with a shared mapping pointing to a private page where load_unaligned_zeropad() can strike. Add a TDX handler for guest.enc_status_change_prepare(). This converts the page from shared to private *before* the page becomes private. This ensures that there is never a private mapping to a shared page. Leave a guest.enc_status_change_finish() in place but only use it for private=>shared conversions. This will delay updating the TDX metadata marking the page private until *after* the mapping matches the metadata. This also ensures that there is never a private mapping to a shared page. [ dhansen: rewrite changelog ] Fixes: 7dbde7631629 ("x86/mm/cpa: Add support for TDX shared memory") Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Kuppuswamy Sathyanarayanan Link: https://lore.kernel.org/all/20230606095622.1939-3-kirill.shutemov%40linux.intel.com --- arch/x86/coco/tdx/tdx.c | 51 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 15569bd32ed5..5b8056f6c83f 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -829,6 +829,30 @@ static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc) return true; } +static bool tdx_enc_status_change_prepare(unsigned long vaddr, int numpages, + bool enc) +{ + /* + * Only handle shared->private conversion here. + * See the comment in tdx_early_init(). + */ + if (enc) + return tdx_enc_status_changed(vaddr, numpages, enc); + return true; +} + +static bool tdx_enc_status_change_finish(unsigned long vaddr, int numpages, + bool enc) +{ + /* + * Only handle private->shared conversion here. + * See the comment in tdx_early_init(). + */ + if (!enc) + return tdx_enc_status_changed(vaddr, numpages, enc); + return true; +} + void __init tdx_early_init(void) { u64 cc_mask; @@ -856,9 +880,30 @@ void __init tdx_early_init(void) */ physical_mask &= cc_mask - 1; - x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required; - x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required; - x86_platform.guest.enc_status_change_finish = tdx_enc_status_changed; + /* + * The kernel mapping should match the TDX metadata for the page. + * load_unaligned_zeropad() can touch memory *adjacent* to that which is + * owned by the caller and can catch even _momentary_ mismatches. Bad + * things happen on mismatch: + * + * - Private mapping => Shared Page == Guest shutdown + * - Shared mapping => Private Page == Recoverable #VE + * + * guest.enc_status_change_prepare() converts the page from + * shared=>private before the mapping becomes private. + * + * guest.enc_status_change_finish() converts the page from + * private=>shared after the mapping becomes private. + * + * In both cases there is a temporary shared mapping to a private page, + * which can result in a #VE. But, there is never a private mapping to + * a shared page. + */ + x86_platform.guest.enc_status_change_prepare = tdx_enc_status_change_prepare; + x86_platform.guest.enc_status_change_finish = tdx_enc_status_change_finish; + + x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required; + x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required; pr_info("Guest detected\n"); } From 94142c9d1bdf1c18027a42758ceb6bdd59a92012 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 6 Jun 2023 12:56:22 +0300 Subject: [PATCH 4/4] x86/mm: Fix enc_status_change_finish_noop() enc_status_change_finish_noop() is now defined as always-fail, which doesn't make sense for noop. The change has no user-visible effect because it is only called if the platform has CC_ATTR_MEM_ENCRYPT. All platforms with the attribute override the callback with their own implementation. Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Kuppuswamy Sathyanarayanan Link: https://lore.kernel.org/all/20230606095622.1939-4-kirill.shutemov%40linux.intel.com --- arch/x86/kernel/x86_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index f230d4d7d8eb..64664311ac2b 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -131,7 +131,7 @@ struct x86_cpuinit_ops x86_cpuinit = { static void default_nmi_init(void) { }; static bool enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { return true; } -static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return false; } +static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return true; } static bool enc_tlb_flush_required_noop(bool enc) { return false; } static bool enc_cache_flush_required_noop(void) { return false; } static bool is_private_mmio_noop(u64 addr) {return false; }