From 1e03bff3600101bd9158d005e4313132e55bdec8 Mon Sep 17 00:00:00 2001
From: Ricardo Neri
Date: Thu, 27 Jun 2019 19:35:36 -0700
Subject: [PATCH 1/7] x86/cpu/intel: Clear cache self-snoop capability in CPUs with known errata

Processors which have self-snooping capability can handle conflicting
memory types across CPUs by snooping their own caches. However, there
exist CPU models in which having conflicting memory types still leads
to unpredictable behavior, machine check errors, or hangs.

Clear this feature on affected CPUs to prevent its use.

Suggested-by: Alan Cox
Signed-off-by: Ricardo Neri
Signed-off-by: Thomas Gleixner
Cc: Borislav Petkov
Cc: Tony Luck
Cc: "H. Peter Anvin"
Cc: Andy Shevchenko
Cc: Andi Kleen
Cc: Hans de Goede
Cc: Greg Kroah-Hartman
Cc: Jordan Borgner
Cc: "Ravi V. Shankar"
Cc: Mohammad Etemadi
Cc: Ricardo Neri
Cc: Peter Feiner
Cc: "Rafael J. Wysocki"
Link: https://lkml.kernel.org/r/1561689337-19390-2-git-send-email-ricardo.neri-calderon@linux.intel.com
---
 arch/x86/kernel/cpu/intel.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index f17c1a714779..8d6d92ebeb54 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -66,6 +66,32 @@ void check_mpx_erratum(struct cpuinfo_x86 *c)
 	}
 }

+/*
+ * Processors which have self-snooping capability can handle conflicting
+ * memory types across CPUs by snooping their own caches. However, there
+ * exist CPU models in which having conflicting memory types still leads
+ * to unpredictable behavior, machine check errors, or hangs. Clear this
+ * feature to prevent its use on machines with known errata.
+ */
+static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c)
+{
+	switch (c->x86_model) {
+	case INTEL_FAM6_CORE_YONAH:
+	case INTEL_FAM6_CORE2_MEROM:
+	case INTEL_FAM6_CORE2_MEROM_L:
+	case INTEL_FAM6_CORE2_PENRYN:
+	case INTEL_FAM6_CORE2_DUNNINGTON:
+	case INTEL_FAM6_NEHALEM:
+	case INTEL_FAM6_NEHALEM_G:
+	case INTEL_FAM6_NEHALEM_EP:
+	case INTEL_FAM6_NEHALEM_EX:
+	case INTEL_FAM6_WESTMERE:
+	case INTEL_FAM6_WESTMERE_EP:
+	case INTEL_FAM6_SANDYBRIDGE:
+		setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP);
+	}
+}
+
 static bool ring3mwait_disabled __read_mostly;

 static int __init ring3mwait_disable(char *__unused)
@@ -304,6 +330,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 	}

 	check_mpx_erratum(c);
+	check_memory_type_self_snoop_errata(c);

 	/*
 	 * Get the number of SMT siblings early from the extended topology

From fd329f276ecaad7a371d6f91b9bbea031d0c3440 Mon Sep 17 00:00:00 2001
From: Ricardo Neri
Date: Thu, 27 Jun 2019 19:35:37 -0700
Subject: [PATCH 2/7] x86/mtrr: Skip cache flushes on CPUs with cache self-snooping

Programming MTRR registers in multi-processor systems is a rather
lengthy process. Furthermore, all processors must program these
registers in lock step and with interrupts disabled; the process also
involves flushing caches and TLBs twice. As a result, it may take a
considerable amount of time.

On some platforms, this can lead to a large skew of the refined-jiffies
clock source. Early during boot, if no other clock is available (e.g.,
booting with hpet=disabled), the refined-jiffies clock source is used
to monitor the TSC clock source.
If the skew of refined-jiffies is too large, Linux wrongly assumes that
the TSC is unstable:

  clocksource: timekeeping watchdog on CPU1: Marking clocksource 'tsc-early' as unstable because the skew is too large:
  clocksource: 'refined-jiffies' wd_now: fffedc10 wd_last: fffedb90 mask: ffffffff
  clocksource: 'tsc-early' cs_now: 5eccfddebc cs_last: 5e7e3303d4 mask: ffffffffffffffff
  tsc: Marking TSC unstable due to clocksource watchdog

As per measurements, around 98% of the time needed by the procedure to
program MTRRs in multi-processor systems is spent flushing caches with
wbinvd(). As per Section 11.11.8 of the Intel 64 and IA-32
Architectures Software Developer's Manual, it is not necessary to flush
caches if the CPU supports cache self-snooping. Thus, skipping the
cache flushes can reduce the time needed to complete the programming of
the MTRR registers by several tens of milliseconds:

  Platform                        Before  After
  104-core (208 Threads) Skylake  1437ms  28ms
    2-core (  4 Threads) Haswell   114ms   2ms

Reported-by: Mohammad Etemadi
Signed-off-by: Ricardo Neri
Signed-off-by: Thomas Gleixner
Cc: Borislav Petkov
Cc: Alan Cox
Cc: Tony Luck
Cc: "H. Peter Anvin"
Cc: Andy Shevchenko
Cc: Andi Kleen
Cc: Hans de Goede
Cc: Greg Kroah-Hartman
Cc: Jordan Borgner
Cc: "Ravi V. Shankar"
Cc: Ricardo Neri
Cc: Peter Feiner
Cc: "Rafael J. Wysocki"
Link: https://lkml.kernel.org/r/1561689337-19390-3-git-send-email-ricardo.neri-calderon@linux.intel.com
---
 arch/x86/kernel/cpu/mtrr/generic.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 9356c1c9024d..aa5c064a6a22 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -743,7 +743,15 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
 	/* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
 	cr0 = read_cr0() | X86_CR0_CD;
 	write_cr0(cr0);
-	wbinvd();
+
+	/*
+	 * Cache flushing is the most time-consuming step when programming
+	 * the MTRRs. Fortunately, as per the Intel Software Developer's
+	 * Manual, we can skip it if the processor supports cache self-
+	 * snooping.
+	 */
+	if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+		wbinvd();

 	/* Save value of CR4 and clear Page Global Enable (bit 7) */
 	if (boot_cpu_has(X86_FEATURE_PGE)) {
@@ -760,7 +768,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock)

 	/* Disable MTRRs, and set the default type to uncached */
 	mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
-	wbinvd();
+
+	/* Again, only flush caches if we have to. */
+	if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+		wbinvd();
 }

 static void post_set(void) __releases(set_atomicity_lock)

From 9402eaf4c11f0b892eda7b2bcb4654ab34ce34f9 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Mon, 1 Jul 2019 20:43:19 -0700
Subject: [PATCH 3/7] selftests/x86: Test SYSCALL and SYSENTER manually with TF set

Make sure that both variants of the nasty TF-in-compat-syscall case are
exercised regardless of which vendor's CPU is running the tests. Also
change the intentional signal after SYSCALL to use ud2, which is a lot
more comprehensible.

This crashes the kernel due to an FSGSBASE bug right now.

This test *also* detects a bug in KVM when run on an Intel host. KVM
people, feel free to use it to help debug. There's a bunch of code in
this test to warn instead of going into an infinite loop when the bug
gets triggered.
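For reference, the TF trick can be sketched as a standalone program.
The get_eflags()/set_eflags() helpers below mirror the ones this patch
adds to the test; the SIGTRAP handler and the nop payload are
illustrative only, a minimal sketch rather than the test itself:

  #include <signal.h>
  #include <stdio.h>

  #define X86_EFLAGS_TF (1UL << 8)

  static volatile sig_atomic_t traps;

  static void sigtrap(int sig)
  {
          traps++;        /* one #DB trap per single-stepped instruction */
  }

  static unsigned long get_eflags(void)
  {
          unsigned long eflags;

          asm volatile ("pushf\n\tpop %0" : "=rm" (eflags));
          return eflags;
  }

  static void set_eflags(unsigned long eflags)
  {
          asm volatile ("push %0\n\tpopf" : : "rm" (eflags) : "flags");
  }

  int main(void)
  {
          signal(SIGTRAP, sigtrap);

          /* Set TF: the CPU raises #DB after each subsequent instruction. */
          set_eflags(get_eflags() | X86_EFLAGS_TF);
          asm volatile ("nop; nop; nop");
          set_eflags(get_eflags() & ~X86_EFLAGS_TF);

          printf("took %d single-step traps\n", (int)traps);
          return 0;
  }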
Reported-by: Vegard Nossum Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: "BaeChang Seok" Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Paolo Bonzini Cc: kvm@vger.kernel.org Cc: "Bae, Chang Seok" Link: https://lkml.kernel.org/r/5f5de10441ab2e3005538b4c33be9b1965d1bb63.1562035429.git.luto@kernel.org --- tools/testing/selftests/x86/Makefile | 5 +- .../testing/selftests/x86/syscall_arg_fault.c | 112 +++++++++++++++++- 2 files changed, 110 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 186520198de7..fa07d526fe39 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -12,8 +12,9 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) trivial_program.c -no-pie) TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \ check_initial_reg_state sigreturn iopl mpx-mini-test ioperm \ - protection_keys test_vdso test_vsyscall mov_ss_trap -TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \ + protection_keys test_vdso test_vsyscall mov_ss_trap \ + syscall_arg_fault +TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip diff --git a/tools/testing/selftests/x86/syscall_arg_fault.c b/tools/testing/selftests/x86/syscall_arg_fault.c index 4e25d38c8bbd..bc0ecc2e862e 100644 --- a/tools/testing/selftests/x86/syscall_arg_fault.c +++ b/tools/testing/selftests/x86/syscall_arg_fault.c @@ -15,9 +15,30 @@ #include #include +#ifdef __x86_64__ +# define WIDTH "q" +#else +# define WIDTH "l" +#endif + /* Our sigaltstack scratch space. */ static unsigned char altstack_data[SIGSTKSZ]; +static unsigned long get_eflags(void) +{ + unsigned long eflags; + asm volatile ("pushf" WIDTH "\n\tpop" WIDTH " %0" : "=rm" (eflags)); + return eflags; +} + +static void set_eflags(unsigned long eflags) +{ + asm volatile ("push" WIDTH " %0\n\tpopf" WIDTH + : : "rm" (eflags) : "flags"); +} + +#define X86_EFLAGS_TF (1UL << 8) + static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags) { @@ -35,13 +56,22 @@ static sigjmp_buf jmpbuf; static volatile sig_atomic_t n_errs; +#ifdef __x86_64__ +#define REG_AX REG_RAX +#define REG_IP REG_RIP +#else +#define REG_AX REG_EAX +#define REG_IP REG_EIP +#endif + static void sigsegv_or_sigbus(int sig, siginfo_t *info, void *ctx_void) { ucontext_t *ctx = (ucontext_t*)ctx_void; + long ax = (long)ctx->uc_mcontext.gregs[REG_AX]; - if (ctx->uc_mcontext.gregs[REG_EAX] != -EFAULT) { - printf("[FAIL]\tAX had the wrong value: 0x%x\n", - ctx->uc_mcontext.gregs[REG_EAX]); + if (ax != -EFAULT && ax != -ENOSYS) { + printf("[FAIL]\tAX had the wrong value: 0x%lx\n", + (unsigned long)ax); n_errs++; } else { printf("[OK]\tSeems okay\n"); @@ -50,9 +80,42 @@ static void sigsegv_or_sigbus(int sig, siginfo_t *info, void *ctx_void) siglongjmp(jmpbuf, 1); } +static volatile sig_atomic_t sigtrap_consecutive_syscalls; + +static void sigtrap(int sig, siginfo_t *info, void *ctx_void) +{ + /* + * KVM has some bugs that can cause us to stop making progress. + * detect them and complain, but don't infinite loop or fail the + * test. 
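+	 *
+	 * The handler below reads the two opcode bytes at the trap RIP as
+	 * a little-endian word: 0x050f is SYSCALL (0f 05) and 0x340f is
+	 * SYSENTER (0f 34). If we keep trapping with RIP parked on one of
+	 * those instructions, no forward progress is being made, so give
+	 * up after a few consecutive hits.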
+ */ + + ucontext_t *ctx = (ucontext_t*)ctx_void; + unsigned short *ip = (unsigned short *)ctx->uc_mcontext.gregs[REG_IP]; + + if (*ip == 0x340f || *ip == 0x050f) { + /* The trap was on SYSCALL or SYSENTER */ + sigtrap_consecutive_syscalls++; + if (sigtrap_consecutive_syscalls > 3) { + printf("[WARN]\tGot stuck single-stepping -- you probably have a KVM bug\n"); + siglongjmp(jmpbuf, 1); + } + } else { + sigtrap_consecutive_syscalls = 0; + } +} + static void sigill(int sig, siginfo_t *info, void *ctx_void) { - printf("[SKIP]\tIllegal instruction\n"); + ucontext_t *ctx = (ucontext_t*)ctx_void; + unsigned short *ip = (unsigned short *)ctx->uc_mcontext.gregs[REG_IP]; + + if (*ip == 0x0b0f) { + /* one of the ud2 instructions faulted */ + printf("[OK]\tSYSCALL returned normally\n"); + } else { + printf("[SKIP]\tIllegal instruction\n"); + } siglongjmp(jmpbuf, 1); } @@ -120,9 +183,48 @@ int main() "movl $-1, %%ebp\n\t" "movl $-1, %%esp\n\t" "syscall\n\t" - "pushl $0" /* make sure we segfault cleanly */ + "ud2" /* make sure we recover cleanly */ : : : "memory", "flags"); } + printf("[RUN]\tSYSENTER with TF and invalid state\n"); + sethandler(SIGTRAP, sigtrap, SA_ONSTACK); + + if (sigsetjmp(jmpbuf, 1) == 0) { + sigtrap_consecutive_syscalls = 0; + set_eflags(get_eflags() | X86_EFLAGS_TF); + asm volatile ( + "movl $-1, %%eax\n\t" + "movl $-1, %%ebx\n\t" + "movl $-1, %%ecx\n\t" + "movl $-1, %%edx\n\t" + "movl $-1, %%esi\n\t" + "movl $-1, %%edi\n\t" + "movl $-1, %%ebp\n\t" + "movl $-1, %%esp\n\t" + "sysenter" + : : : "memory", "flags"); + } + set_eflags(get_eflags() & ~X86_EFLAGS_TF); + + printf("[RUN]\tSYSCALL with TF and invalid state\n"); + if (sigsetjmp(jmpbuf, 1) == 0) { + sigtrap_consecutive_syscalls = 0; + set_eflags(get_eflags() | X86_EFLAGS_TF); + asm volatile ( + "movl $-1, %%eax\n\t" + "movl $-1, %%ebx\n\t" + "movl $-1, %%ecx\n\t" + "movl $-1, %%edx\n\t" + "movl $-1, %%esi\n\t" + "movl $-1, %%edi\n\t" + "movl $-1, %%ebp\n\t" + "movl $-1, %%esp\n\t" + "syscall\n\t" + "ud2" /* make sure we recover cleanly */ + : : : "memory", "flags"); + } + set_eflags(get_eflags() & ~X86_EFLAGS_TF); + return 0; } From dffb3f9db6b593f3ed6ab4c8d8f10e0aa6aa7a88 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 1 Jul 2019 20:43:20 -0700 Subject: [PATCH 4/7] x86/entry/64: Don't compile ignore_sysret if 32-bit emulation is enabled It's only used if !CONFIG_IA32_EMULATION, so disable it in normal configs. This will save a few bytes of text and reduce confusion. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: "BaeChang Seok" Cc: Borislav Petkov Cc: Peter Zijlstra Cc: "Bae, Chang Seok" Link: https://lkml.kernel.org/r/0f7dafa72fe7194689de5ee8cfe5d83509fabcf5.1562035429.git.luto@kernel.org --- arch/x86/entry/entry_64.S | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 7f9f5119d6b1..54b1b0468b2b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1743,11 +1743,17 @@ nmi_restore: iretq END(nmi) +#ifndef CONFIG_IA32_EMULATION +/* + * This handles SYSCALL from 32-bit code. There is no way to program + * MSRs to fully disable 32-bit SYSCALL. 
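+ *
+ * When CONFIG_IA32_EMULATION is disabled, syscall_init() points
+ * MSR_CSTAR at this stub, so a 32-bit SYSCALL (which AMD CPUs allow
+ * whenever EFER.SCE is set) fails cleanly with -ENOSYS.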
+ */
 ENTRY(ignore_sysret)
 	UNWIND_HINT_EMPTY
 	mov	$-ENOSYS, %eax
 	sysret
 END(ignore_sysret)
+#endif

 ENTRY(rewind_stack_do_exit)
 	UNWIND_HINT_FUNC

From 539bca535decb11a0861b6205c6684b8e908589b Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Mon, 1 Jul 2019 20:43:21 -0700
Subject: [PATCH 5/7] x86/entry/64: Fix and clean up paranoid_exit

paranoid_exit needs to restore CR3 before GSBASE. Doing it in the
opposite order crashes if the exception came from a context with user
GSBASE and user CR3 -- RESTORE_CR3 cannot restore user CR3 if run with
user GSBASE. This results in infinitely recursing exceptions if user
code does SYSENTER with TF set when both FSGSBASE and PTI are enabled.

The old code worked if user code just set TF without SYSENTER because
#DB from user mode is special-cased in idtentry and paranoid_exit
doesn't run.

Fix it by cleaning up the spaghetti code. All that paranoid_exit needs
to do is to disable IRQs, handle IRQ tracing, then restore CR3, and
restore GSBASE. Simply do those actions in that order.

Fixes: 708078f65721 ("x86/entry/64: Handle FSGSBASE enabled paranoid entry/exit")
Reported-by: Vegard Nossum
Signed-off-by: Chang S. Bae
Signed-off-by: Andy Lutomirski
Signed-off-by: Thomas Gleixner
Cc: Borislav Petkov
Cc: Peter Zijlstra
Cc: "H. Peter Anvin"
Cc: Andi Kleen
Cc: Ravi Shankar
Link: https://lkml.kernel.org/r/59725ceb08977359489fbed979716949ad45f616.1562035429.git.luto@kernel.org
---
 arch/x86/entry/entry_64.S | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 54b1b0468b2b..670306f588bf 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1256,31 +1256,32 @@ END(paranoid_entry)
 ENTRY(paranoid_exit)
 	UNWIND_HINT_REGS
 	DISABLE_INTERRUPTS(CLBR_ANY)
-	TRACE_IRQS_OFF_DEBUG

-	/* Handle GS depending on FSGSBASE availability */
-	ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "nop",X86_FEATURE_FSGSBASE
+	/*
+	 * The order of operations is important. IRQ tracing requires
+	 * kernel GSBASE and CR3. RESTORE_CR3 requires kernel GS base.
+	 *
+	 * NB to anyone to tries to optimize this code: this code does
+	 * not execute at all for exceptions coming from user mode. Those
+	 * exceptions go through error_exit instead.
+	 */
+	TRACE_IRQS_IRETQ_DEBUG
+	RESTORE_CR3	scratch_reg=%rax save_reg=%r14
+
+	/* Handle the three GSBASE cases. */
+	ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE

 	/* With FSGSBASE enabled, unconditionally restore GSBASE */
 	wrgsbase	%rbx
-	jmp	.Lparanoid_exit_no_swapgs;
+	jmp	restore_regs_and_return_to_kernel

 .Lparanoid_exit_checkgs:
 	/* On non-FSGSBASE systems, conditionally do SWAPGS */
 	testl	%ebx, %ebx
-	jnz	.Lparanoid_exit_no_swapgs
-	TRACE_IRQS_IRETQ
-	/* Always restore stashed CR3 value (see paranoid_entry) */
-	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
+	jnz	restore_regs_and_return_to_kernel
+
+	/* We are returning to a context with user GSBASE. */
 	SWAPGS_UNSAFE_STACK
-	jmp	.Lparanoid_exit_restore
-
-.Lparanoid_exit_no_swapgs:
-	TRACE_IRQS_IRETQ_DEBUG
-	/* Always restore stashed CR3 value (see paranoid_entry) */
-	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
-
-.Lparanoid_exit_restore:
 	jmp	restore_regs_and_return_to_kernel
 END(paranoid_exit)

From 697096b14444f458fb81212d1c82d7846e932455 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Tue, 2 Jul 2019 20:43:04 -0700
Subject: [PATCH 6/7] selftests/x86/fsgsbase: Fix some test case bugs

This refactors do_unexpected_base() to clean up some code.
It also fixes the following bugs in test_ptrace_write_gsbase(): - Incorrect printf() format string caused crashes. - Hardcoded 0x7 for the gs selector was not reliably correct. It also documents the fact that the test is expected to fail on old kernels. Fixes: a87730cc3acc ("selftests/x86/fsgsbase: Test ptracer-induced GSBASE write with FSGSBASE") Fixes: 1b6858d5a2eb ("selftests/x86/fsgsbase: Test ptracer-induced GSBASE write") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: "BaeChang Seok" Cc: Borislav Petkov Cc: Peter Zijlstra Cc: "H . Peter Anvin" Cc: Andi Kleen Cc: H. Peter Anvin Cc: "BaeChang Seok" Link: https://lkml.kernel.org/r/bab29c84f2475e2c30ddb00f1b877fcd7f4f96a8.1562125333.git.luto@kernel.org --- tools/testing/selftests/x86/fsgsbase.c | 74 ++++++++++++++------------ 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c index 21fd4f94b5b0..5ab4c60c100e 100644 --- a/tools/testing/selftests/x86/fsgsbase.c +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -35,6 +35,8 @@ static volatile sig_atomic_t want_segv; static volatile unsigned long segv_addr; +static unsigned short *shared_scratch; + static int nerrs; static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), @@ -242,16 +244,11 @@ static void do_remote_base() static __thread int set_thread_area_entry_number = -1; -static void do_unexpected_base(void) +static unsigned short load_gs(void) { /* - * The goal here is to try to arrange for GS == 0, GSBASE != - * 0, and for the the kernel the think that GSBASE == 0. - * - * To make the test as reliable as possible, this uses - * explicit descriptors. (This is not the only way. This - * could use ARCH_SET_GS with a low, nonzero base, but the - * relevant side effect of ARCH_SET_GS could change.) + * Sets GS != 0 and GSBASE != 0 but arranges for the kernel to think + * that GSBASE == 0 (i.e. thread.gsbase == 0). */ /* Step 1: tell the kernel that we have GSBASE == 0. */ @@ -271,8 +268,9 @@ static void do_unexpected_base(void) .useable = 0 }; if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) == 0) { - printf("\tother thread: using LDT slot 0\n"); + printf("\tusing LDT slot 0\n"); asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0x7)); + return 0x7; } else { /* No modify_ldt for us (configured out, perhaps) */ @@ -294,20 +292,15 @@ static void do_unexpected_base(void) if (ret != 0) { printf("[NOTE]\tcould not create a segment -- test won't do anything\n"); - return; + return 0; } - printf("\tother thread: using GDT slot %d\n", desc.entry_number); + printf("\tusing GDT slot %d\n", desc.entry_number); set_thread_area_entry_number = desc.entry_number; - asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)((desc.entry_number << 3) | 0x3))); + unsigned short gs = (unsigned short)((desc.entry_number << 3) | 0x3); + asm volatile ("mov %0, %%gs" : : "rm" (gs)); + return gs; } - - /* - * Step 3: set the selector back to zero. On AMD chips, this will - * preserve GSBASE. - */ - - asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0)); } void test_wrbase(unsigned short index, unsigned long base) @@ -346,12 +339,19 @@ static void *threadproc(void *ctx) if (ftx == 3) return NULL; - if (ftx == 1) + if (ftx == 1) { do_remote_base(); - else if (ftx == 2) - do_unexpected_base(); - else + } else if (ftx == 2) { + /* + * On AMD chips, this causes GSBASE != 0, GS == 0, and + * thread.gsbase == 0. 
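+		 *
+		 * (Loading a null selector into %gs zeroes the base on
+		 * Intel CPUs but preserves it on AMD, hence the AMD-only
+		 * caveat above.)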
+ */ + + load_gs(); + asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0)); + } else { errx(1, "helper thread got bad command"); + } ftx = 0; syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0); @@ -453,12 +453,7 @@ static void test_ptrace_write_gsbase(void) if (child == 0) { printf("[RUN]\tPTRACE_POKE(), write GSBASE from ptracer\n"); - /* - * Use the LDT setup and fetch the GSBASE from the LDT - * by switching to the (nonzero) selector (again) - */ - do_unexpected_base(); - asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0x7)); + *shared_scratch = load_gs(); if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) != 0) err(1, "PTRACE_TRACEME"); @@ -476,7 +471,7 @@ static void test_ptrace_write_gsbase(void) gs = ptrace(PTRACE_PEEKUSER, child, gs_offset, NULL); - if (gs != 0x7) { + if (gs != *shared_scratch) { nerrs++; printf("[FAIL]\tGS is not prepared with nonzero\n"); goto END; @@ -494,16 +489,24 @@ static void test_ptrace_write_gsbase(void) * selector value is changed or not by the GSBASE write in * a ptracer. */ - if (gs != 0x7) { + if (gs != *shared_scratch) { nerrs++; printf("[FAIL]\tGS changed to %lx\n", gs); + + /* + * On older kernels, poking a nonzero value into the + * base would zero the selector. On newer kernels, + * this behavior has changed -- poking the base + * changes only the base and, if FSGSBASE is not + * available, this may have no effect. + */ + if (gs == 0) + printf("\tNote: this is expected behavior on older kernels.\n"); } else if (have_fsgsbase && (base != 0xFF)) { nerrs++; printf("[FAIL]\tGSBASE changed to %lx\n", base); } else { - printf("[OK]\tGS remained 0x7 %s"); - if (have_fsgsbase) - printf("and GSBASE changed to 0xFF"); + printf("[OK]\tGS remained 0x%hx%s", *shared_scratch, have_fsgsbase ? " and GSBASE changed to 0xFF" : ""); printf("\n"); } } @@ -516,6 +519,9 @@ int main() { pthread_t thread; + shared_scratch = mmap(NULL, 4096, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_SHARED, -1, 0); + /* Probe FSGSBASE */ sethandler(SIGILL, sigill, 0); if (sigsetjmp(jmpbuf, 1) == 0) { From 049331f277fef1c3f2527c2c9afa1d285e9a1247 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Jul 2019 14:19:36 +0200 Subject: [PATCH 7/7] x86/fsgsbase: Revert FSGSBASE support The FSGSBASE series turned out to have serious bugs and there is still an open issue which is not fully understood yet. The confidence in those changes has become close to zero especially as the test cases which have been shipped with that series were obviously never run before sending the final series out to LKML. ./fsgsbase_64 >/dev/null Segmentation fault As the merge window is close, the only sane decision is to revert FSGSBASE support. The revert is necessary as this branch has been merged into perf/core already and rebasing all of that a few days before the merge window is not the most brilliant idea. I could definitely slap myself for not noticing the test case fail when merging that series, but TBH my expectations weren't that low back then. Won't happen again. 
Revert the following commits: 539bca535dec ("x86/entry/64: Fix and clean up paranoid_exit") 2c7b5ac5d5a9 ("Documentation/x86/64: Add documentation for GS/FS addressing mode") f987c955c745 ("x86/elf: Enumerate kernel FSGSBASE capability in AT_HWCAP2") 2032f1f96ee0 ("x86/cpu: Enable FSGSBASE on 64bit by default and add a chicken bit") 5bf0cab60ee2 ("x86/entry/64: Document GSBASE handling in the paranoid path") 708078f65721 ("x86/entry/64: Handle FSGSBASE enabled paranoid entry/exit") 79e1932fa3ce ("x86/entry/64: Introduce the FIND_PERCPU_BASE macro") 1d07316b1363 ("x86/entry/64: Switch CR3 before SWAPGS in paranoid entry") f60a83df4593 ("x86/process/64: Use FSGSBASE instructions on thread copy and ptrace") 1ab5f3f7fe3d ("x86/process/64: Use FSBSBASE in switch_to() if available") a86b4625138d ("x86/fsgsbase/64: Enable FSGSBASE instructions in helper functions") 8b71340d702e ("x86/fsgsbase/64: Add intrinsics for FSGSBASE instructions") b64ed19b93c3 ("x86/cpu: Add 'unsafe_fsgsbase' to enable CR4.FSGSBASE") Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Chang S. Bae Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Andi Kleen Cc: Ravi Shankar Cc: Dave Hansen Cc: H. Peter Anvin --- .../admin-guide/kernel-parameters.txt | 2 - Documentation/x86/entry_64.rst | 9 - Documentation/x86/x86_64/fsgs.rst | 199 ------------------ Documentation/x86/x86_64/index.rst | 1 - arch/x86/entry/calling.h | 40 ---- arch/x86/entry/entry_64.S | 132 +++--------- arch/x86/include/asm/fsgsbase.h | 59 ++---- arch/x86/include/asm/inst.h | 15 -- arch/x86/include/uapi/asm/hwcap2.h | 3 - arch/x86/kernel/cpu/common.c | 22 -- arch/x86/kernel/process_64.c | 119 ++--------- 11 files changed, 57 insertions(+), 544 deletions(-) delete mode 100644 Documentation/x86/x86_64/fsgs.rst diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 35bc3c3574c6..138f6664b2e2 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2857,8 +2857,6 @@ no5lvl [X86-64] Disable 5-level paging mode. Forces kernel to use 4-level paging instead. - nofsgsbase [X86] Disables FSGSBASE instructions. - no_console_suspend [HW] Never suspend the console Disable suspending of consoles during suspend and diff --git a/Documentation/x86/entry_64.rst b/Documentation/x86/entry_64.rst index b87c1d816aea..a48b3f6ebbe8 100644 --- a/Documentation/x86/entry_64.rst +++ b/Documentation/x86/entry_64.rst @@ -108,12 +108,3 @@ We try to only use IST entries and the paranoid entry code for vectors that absolutely need the more expensive check for the GS base - and we generate all 'normal' entry points with the regular (faster) paranoid=0 variant. - -On a FSGSBASE system, however, user space can set GS without kernel -interaction. It means the value of GS base itself does not imply anything, -whether a kernel value or a user space value. So, there is no longer a safe -way to check whether the exception is entering from user mode or kernel -mode in the paranoid entry code path. So the GSBASE value needs to be read -out, saved and the kernel GSBASE value written. On exit the saved GSBASE -value needs to be restored unconditionally. The non paranoid entry/exit -code still uses SWAPGS unconditionally as the state is known. diff --git a/Documentation/x86/x86_64/fsgs.rst b/Documentation/x86/x86_64/fsgs.rst deleted file mode 100644 index 380c0b5ccca2..000000000000 --- a/Documentation/x86/x86_64/fsgs.rst +++ /dev/null @@ -1,199 +0,0 @@ -.. 
SPDX-License-Identifier: GPL-2.0 - -Using FS and GS segments in user space applications -=================================================== - -The x86 architecture supports segmentation. Instructions which access -memory can use segment register based addressing mode. The following -notation is used to address a byte within a segment: - - Segment-register:Byte-address - -The segment base address is added to the Byte-address to compute the -resulting virtual address which is accessed. This allows to access multiple -instances of data with the identical Byte-address, i.e. the same code. The -selection of a particular instance is purely based on the base-address in -the segment register. - -In 32-bit mode the CPU provides 6 segments, which also support segment -limits. The limits can be used to enforce address space protections. - -In 64-bit mode the CS/SS/DS/ES segments are ignored and the base address is -always 0 to provide a full 64bit address space. The FS and GS segments are -still functional in 64-bit mode. - -Common FS and GS usage ------------------------------- - -The FS segment is commonly used to address Thread Local Storage (TLS). FS -is usually managed by runtime code or a threading library. Variables -declared with the '__thread' storage class specifier are instantiated per -thread and the compiler emits the FS: address prefix for accesses to these -variables. Each thread has its own FS base address so common code can be -used without complex address offset calculations to access the per thread -instances. Applications should not use FS for other purposes when they use -runtimes or threading libraries which manage the per thread FS. - -The GS segment has no common use and can be used freely by -applications. GCC and Clang support GS based addressing via address space -identifiers. - -Reading and writing the FS/GS base address ------------------------------------------- - -There exist two mechanisms to read and write the FS/FS base address: - - - the arch_prctl() system call - - - the FSGSBASE instruction family - -Accessing FS/GS base with arch_prctl() --------------------------------------- - - The arch_prctl(2) based mechanism is available on all 64bit CPUs and all - kernel versions. - - Reading the base: - - arch_prctl(ARCH_GET_FS, &fsbase); - arch_prctl(ARCH_GET_GS, &gsbase); - - Writing the base: - - arch_prctl(ARCH_SET_FS, fsbase); - arch_prctl(ARCH_SET_GS, gsbase); - - The ARCH_SET_GS prctl may be disabled depending on kernel configuration - and security settings. - -Accessing FS/GS base with the FSGSBASE instructions ---------------------------------------------------- - - With the Ivy Bridge CPU generation Intel introduced a new set of - instructions to access the FS and GS base registers directly from user - space. These instructions are also supported on AMD Family 17H CPUs. The - following instructions are available: - - =============== =========================== - RDFSBASE %reg Read the FS base register - RDGSBASE %reg Read the GS base register - WRFSBASE %reg Write the FS base register - WRGSBASE %reg Write the GS base register - =============== =========================== - - The instructions avoid the overhead of the arch_prctl() syscall and allow - more flexible usage of the FS/GS addressing modes in user space - applications. This does not prevent conflicts between threading libraries - and runtimes which utilize FS and applications which want to use it for - their own purpose. 
- -FSGSBASE instructions enablement -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - The instructions are enumerated in CPUID leaf 7, bit 0 of EBX. If - available /proc/cpuinfo shows 'fsgsbase' in the flag entry of the CPUs. - - The availability of the instructions does not enable them - automatically. The kernel has to enable them explicitly in CR4. The - reason for this is that older kernels make assumptions about the values in - the GS register and enforce them when GS base is set via - arch_prctl(). Allowing user space to write arbitrary values to GS base - would violate these assumptions and cause malfunction. - - On kernels which do not enable FSGSBASE the execution of the FSGSBASE - instructions will fault with a #UD exception. - - The kernel provides reliable information about the enabled state in the - ELF AUX vector. If the HWCAP2_FSGSBASE bit is set in the AUX vector, the - kernel has FSGSBASE instructions enabled and applications can use them. - The following code example shows how this detection works:: - - #include - #include - - /* Will be eventually in asm/hwcap.h */ - #ifndef HWCAP2_FSGSBASE - #define HWCAP2_FSGSBASE (1 << 1) - #endif - - .... - - unsigned val = getauxval(AT_HWCAP2); - - if (val & HWCAP2_FSGSBASE) - printf("FSGSBASE enabled\n"); - -FSGSBASE instructions compiler support -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -GCC version 4.6.4 and newer provide instrinsics for the FSGSBASE -instructions. Clang supports them as well. - - =================== =========================== - _readfsbase_u64() Read the FS base register - _readfsbase_u64() Read the GS base register - _writefsbase_u64() Write the FS base register - _writegsbase_u64() Write the GS base register - =================== =========================== - -To utilize these instrinsics must be included in the source -code and the compiler option -mfsgsbase has to be added. - -Compiler support for FS/GS based addressing -------------------------------------------- - -GCC version 6 and newer provide support for FS/GS based addressing via -Named Address Spaces. GCC implements the following address space -identifiers for x86: - - ========= ==================================== - __seg_fs Variable is addressed relative to FS - __seg_gs Variable is addressed relative to GS - ========= ==================================== - -The preprocessor symbols __SEG_FS and __SEG_GS are defined when these -address spaces are supported. Code which implements fallback modes should -check whether these symbols are defined. Usage example:: - - #ifdef __SEG_GS - - long data0 = 0; - long data1 = 1; - - long __seg_gs *ptr; - - /* Check whether FSGSBASE is enabled by the kernel (HWCAP2_FSGSBASE) */ - .... - - /* Set GS to point to data0 */ - _writegsbase_u64(&data0); - - /* Access offset 0 of GS */ - ptr = 0; - printf("data0 = %ld\n", *ptr); - - /* Set GS to point to data1 */ - _writegsbase_u64(&data1); - /* ptr still addresses offset 0! 
*/ - printf("data1 = %ld\n", *ptr); - - -Clang does not provide the GCC address space identifiers, but it provides -address spaces via an attribute based mechanism in Clang 5 and newer -versions: - - ==================================== ===================================== - __attribute__((address_space(256)) Variable is addressed relative to GS - __attribute__((address_space(257)) Variable is addressed relative to FS - ==================================== ===================================== - -FS/GS based addressing with inline assembly -------------------------------------------- - -In case the compiler does not support address spaces, inline assembly can -be used for FS/GS based addressing mode:: - - mov %fs:offset, %reg - mov %gs:offset, %reg - - mov %reg, %fs:offset - mov %reg, %gs:offset diff --git a/Documentation/x86/x86_64/index.rst b/Documentation/x86/x86_64/index.rst index a56070fc8e77..d6eaaa5a35fc 100644 --- a/Documentation/x86/x86_64/index.rst +++ b/Documentation/x86/x86_64/index.rst @@ -14,4 +14,3 @@ x86_64 Support fake-numa-for-cpusets cpu-hotplug-spec machinecheck - fsgs diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index d3fbe2dc03ea..efb0d1b1f15f 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -6,7 +6,6 @@ #include #include #include -#include /* @@ -338,12 +337,6 @@ For 32-bit we have the following conventions - kernel is built with #endif .endm -.macro SAVE_AND_SET_GSBASE scratch_reg:req save_reg:req - rdgsbase \save_reg - GET_PERCPU_BASE \scratch_reg - wrgsbase \scratch_reg -.endm - #endif /* CONFIG_X86_64 */ .macro STACKLEAK_ERASE @@ -352,39 +345,6 @@ For 32-bit we have the following conventions - kernel is built with #endif .endm -#ifdef CONFIG_SMP - -/* - * CPU/node NR is loaded from the limit (size) field of a special segment - * descriptor entry in GDT. - */ -.macro LOAD_CPU_AND_NODE_SEG_LIMIT reg:req - movq $__CPUNODE_SEG, \reg - lsl \reg, \reg -.endm - -/* - * Fetch the per-CPU GSBASE value for this processor and put it in @reg. - * We normally use %gs for accessing per-CPU data, but we are setting up - * %gs here and obviously can not use %gs itself to access per-CPU data. - */ -.macro GET_PERCPU_BASE reg:req - ALTERNATIVE \ - "LOAD_CPU_AND_NODE_SEG_LIMIT \reg", \ - "RDPID \reg", \ - X86_FEATURE_RDPID - andq $VDSO_CPUNODE_MASK, \reg - movq __per_cpu_offset(, \reg, 8), \reg -.endm - -#else - -.macro GET_PERCPU_BASE reg:req - movq pcpu_unit_offsets(%rip), \reg -.endm - -#endif /* CONFIG_SMP */ - /* * This does 'call enter_from_user_mode' unless we can avoid it based on * kernel config or using the static jump infrastructure. diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 670306f588bf..3b7a0e8d3bc0 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -38,7 +38,6 @@ #include #include #include -#include #include #include "calling.h" @@ -948,6 +947,7 @@ ENTRY(\sym) addq $\ist_offset, CPU_TSS_IST(\shift_ist) .endif + /* these procedures expect "no swapgs" flag in ebx */ .if \paranoid jmp paranoid_exit .else @@ -1164,21 +1164,24 @@ idtentry machine_check do_mce has_error_code=0 paranoid=1 #endif /* - * Save all registers in pt_regs. Return GSBASE related information - * in EBX depending on the availability of the FSGSBASE instructions: - * - * FSGSBASE R/EBX - * N 0 -> SWAPGS on exit - * 1 -> no SWAPGS on exit - * - * Y GSBASE value at entry, must be restored in paranoid_exit + * Save all registers in pt_regs, and switch gs if needed. 
+ * Use slow, but surefire "are we in kernel?" check. + * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(paranoid_entry) UNWIND_HINT_FUNC cld PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 + movl $1, %ebx + movl $MSR_GS_BASE, %ecx + rdmsr + testl %edx, %edx + js 1f /* negative -> in kernel */ + SWAPGS + xorl %ebx, %ebx +1: /* * Always stash CR3 in %r14. This value will be restored, * verbatim, at exit. Needed if paranoid_entry interrupted @@ -1188,49 +1191,9 @@ ENTRY(paranoid_entry) * This is also why CS (stashed in the "iret frame" by the * hardware at entry) can not be used: this may be a return * to kernel code, but with a user CR3 value. - * - * Switching CR3 does not depend on kernel GSBASE so it can - * be done before switching to the kernel GSBASE. This is - * required for FSGSBASE because the kernel GSBASE has to - * be retrieved from a kernel internal table. */ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 - /* - * Handling GSBASE depends on the availability of FSGSBASE. - * - * Without FSGSBASE the kernel enforces that negative GSBASE - * values indicate kernel GSBASE. With FSGSBASE no assumptions - * can be made about the GSBASE value when entering from user - * space. - */ - ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE - - /* - * Read the current GSBASE and store it in in %rbx unconditionally, - * retrieve and set the current CPUs kernel GSBASE. The stored value - * has to be restored in paranoid_exit unconditionally. - */ - SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx - ret - -.Lparanoid_entry_checkgs: - /* EBX = 1 -> kernel GSBASE active, no restore required */ - movl $1, %ebx - /* - * The kernel-enforced convention is a negative GSBASE indicates - * a kernel value. No SWAPGS needed on entry and exit. - */ - movl $MSR_GS_BASE, %ecx - rdmsr - testl %edx, %edx - jns .Lparanoid_entry_swapgs - ret - -.Lparanoid_entry_swapgs: - SWAPGS - /* EBX = 0 -> SWAPGS required on exit */ - xorl %ebx, %ebx ret END(paranoid_entry) @@ -1241,48 +1204,28 @@ END(paranoid_entry) * * We may be returning to very strange contexts (e.g. very early * in syscall entry), so checking for preemption here would - * be complicated. Fortunately, there's no good reason to try - * to handle preemption here. + * be complicated. Fortunately, we there's no good reason + * to try to handle preemption here. * - * R/EBX contains the GSBASE related information depending on the - * availability of the FSGSBASE instructions: - * - * FSGSBASE R/EBX - * N 0 -> SWAPGS on exit - * 1 -> no SWAPGS on exit - * - * Y User space GSBASE, must be restored unconditionally + * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) - - /* - * The order of operations is important. IRQ tracing requires - * kernel GSBASE and CR3. RESTORE_CR3 requires kernel GS base. - * - * NB to anyone to tries to optimize this code: this code does - * not execute at all for exceptions coming from user mode. Those - * exceptions go through error_exit instead. - */ - TRACE_IRQS_IRETQ_DEBUG - RESTORE_CR3 scratch_reg=%rax save_reg=%r14 - - /* Handle the three GSBASE cases. 
*/ - ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE - - /* With FSGSBASE enabled, unconditionally restore GSBASE */ - wrgsbase %rbx - jmp restore_regs_and_return_to_kernel - -.Lparanoid_exit_checkgs: - /* On non-FSGSBASE systems, conditionally do SWAPGS */ - testl %ebx, %ebx - jnz restore_regs_and_return_to_kernel - - /* We are returning to a context with user GSBASE. */ + TRACE_IRQS_OFF_DEBUG + testl %ebx, %ebx /* swapgs needed? */ + jnz .Lparanoid_exit_no_swapgs + TRACE_IRQS_IRETQ + /* Always restore stashed CR3 value (see paranoid_entry) */ + RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 SWAPGS_UNSAFE_STACK - jmp restore_regs_and_return_to_kernel + jmp .Lparanoid_exit_restore +.Lparanoid_exit_no_swapgs: + TRACE_IRQS_IRETQ_DEBUG + /* Always restore stashed CR3 value (see paranoid_entry) */ + RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 +.Lparanoid_exit_restore: + jmp restore_regs_and_return_to_kernel END(paranoid_exit) /* @@ -1693,27 +1636,10 @@ end_repeat_nmi: /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 - /* - * The above invocation of paranoid_entry stored the GSBASE - * related information in R/EBX depending on the availability - * of FSGSBASE. - * - * If FSGSBASE is enabled, restore the saved GSBASE value - * unconditionally, otherwise take the conditional SWAPGS path. - */ - ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE - - wrgsbase %rbx - jmp nmi_restore - -nmi_no_fsgsbase: - /* EBX == 0 -> invoke SWAPGS */ - testl %ebx, %ebx + testl %ebx, %ebx /* swapgs needed? */ jnz nmi_restore - nmi_swapgs: SWAPGS_UNSAFE_STACK - nmi_restore: POP_REGS diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h index aefd53767a5d..bca4c743de77 100644 --- a/arch/x86/include/asm/fsgsbase.h +++ b/arch/x86/include/asm/fsgsbase.h @@ -19,62 +19,35 @@ extern unsigned long x86_gsbase_read_task(struct task_struct *task); extern void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase); extern void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase); -/* Must be protected by X86_FEATURE_FSGSBASE check. 
*/ - -static __always_inline unsigned long rdfsbase(void) -{ - unsigned long fsbase; - - asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory"); - - return fsbase; -} - -static __always_inline unsigned long rdgsbase(void) -{ - unsigned long gsbase; - - asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory"); - - return gsbase; -} - -static __always_inline void wrfsbase(unsigned long fsbase) -{ - asm volatile("wrfsbase %0" :: "r" (fsbase) : "memory"); -} - -static __always_inline void wrgsbase(unsigned long gsbase) -{ - asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory"); -} - -#include - /* Helper functions for reading/writing FS/GS base */ static inline unsigned long x86_fsbase_read_cpu(void) { unsigned long fsbase; - if (static_cpu_has(X86_FEATURE_FSGSBASE)) - fsbase = rdfsbase(); - else - rdmsrl(MSR_FS_BASE, fsbase); + rdmsrl(MSR_FS_BASE, fsbase); return fsbase; } -static inline void x86_fsbase_write_cpu(unsigned long fsbase) +static inline unsigned long x86_gsbase_read_cpu_inactive(void) { - if (static_cpu_has(X86_FEATURE_FSGSBASE)) - wrfsbase(fsbase); - else - wrmsrl(MSR_FS_BASE, fsbase); + unsigned long gsbase; + + rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + + return gsbase; } -extern unsigned long x86_gsbase_read_cpu_inactive(void); -extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase); +static inline void x86_fsbase_write_cpu(unsigned long fsbase) +{ + wrmsrl(MSR_FS_BASE, fsbase); +} + +static inline void x86_gsbase_write_cpu_inactive(unsigned long gsbase) +{ + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); +} #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h index d063841a17e3..f5a796da07f8 100644 --- a/arch/x86/include/asm/inst.h +++ b/arch/x86/include/asm/inst.h @@ -306,21 +306,6 @@ .endif MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2 .endm - -.macro RDPID opd - REG_TYPE rdpid_opd_type \opd - .if rdpid_opd_type == REG_TYPE_R64 - R64_NUM rdpid_opd \opd - .else - R32_NUM rdpid_opd \opd - .endif - .byte 0xf3 - .if rdpid_opd > 7 - PFX_REX rdpid_opd 0 - .endif - .byte 0x0f, 0xc7 - MODRM 0xc0 rdpid_opd 0x7 -.endm #endif #endif diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h index c5ce54e749f6..6ebaae90e207 100644 --- a/arch/x86/include/uapi/asm/hwcap2.h +++ b/arch/x86/include/uapi/asm/hwcap2.h @@ -5,7 +5,4 @@ /* MONITOR/MWAIT enabled in Ring 3 */ #define HWCAP2_RING3MWAIT (1 << 0) -/* Kernel allows FSGSBASE instructions available in Ring 3 */ -#define HWCAP2_FSGSBASE BIT(1) - #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 637c9117d5ae..dad20bc891d5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -366,22 +366,6 @@ out: cr4_clear_bits(X86_CR4_UMIP); } -static __init int x86_nofsgsbase_setup(char *arg) -{ - /* Require an exact match without trailing characters. */ - if (strlen(arg)) - return 0; - - /* Do not emit a message if the feature is not present. */ - if (!boot_cpu_has(X86_FEATURE_FSGSBASE)) - return 1; - - setup_clear_cpu_cap(X86_FEATURE_FSGSBASE); - pr_info("FSGSBASE disabled via kernel command line\n"); - return 1; -} -__setup("nofsgsbase", x86_nofsgsbase_setup); - /* * Protection Keys are not available in 32-bit mode. */ @@ -1386,12 +1370,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_smap(c); setup_umip(c); - /* Enable FSGSBASE instructions if available. 
*/ - if (cpu_has(c, X86_FEATURE_FSGSBASE)) { - cr4_set_bits(X86_CR4_FSGSBASE); - elf_hwcap2 |= HWCAP2_FSGSBASE; - } - /* * The vendor-specific functions might have changed features. * Now we do "generic changes." diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 8f239091c15d..250e4c4ac6d9 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -161,40 +161,6 @@ enum which_selector { GS }; -/* - * Out of line to be protected from kprobes. It is not used on Xen - * paravirt. When paravirt support is needed, it needs to be renamed - * with native_ prefix. - */ -static noinline unsigned long __rdgsbase_inactive(void) -{ - unsigned long gsbase; - - lockdep_assert_irqs_disabled(); - - native_swapgs(); - gsbase = rdgsbase(); - native_swapgs(); - - return gsbase; -} -NOKPROBE_SYMBOL(__rdgsbase_inactive); - -/* - * Out of line to be protected from kprobes. It is not used on Xen - * paravirt. When paravirt support is needed, it needs to be renamed - * with native_ prefix. - */ -static noinline void __wrgsbase_inactive(unsigned long gsbase) -{ - lockdep_assert_irqs_disabled(); - - native_swapgs(); - wrgsbase(gsbase); - native_swapgs(); -} -NOKPROBE_SYMBOL(__wrgsbase_inactive); - /* * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are * not available. The goal is to be reasonably fast on non-FSGSBASE systems. @@ -244,22 +210,8 @@ static __always_inline void save_fsgs(struct task_struct *task) { savesegment(fs, task->thread.fsindex); savesegment(gs, task->thread.gsindex); - if (static_cpu_has(X86_FEATURE_FSGSBASE)) { - unsigned long flags; - - /* - * If FSGSBASE is enabled, we can't make any useful guesses - * about the base, and user code expects us to save the current - * value. Fortunately, reading the base directly is efficient. - */ - task->thread.fsbase = rdfsbase(); - local_irq_save(flags); - task->thread.gsbase = __rdgsbase_inactive(); - local_irq_restore(flags); - } else { - save_base_legacy(task, task->thread.fsindex, FS); - save_base_legacy(task, task->thread.gsindex, GS); - } + save_base_legacy(task, task->thread.fsindex, FS); + save_base_legacy(task, task->thread.gsindex, GS); } #if IS_ENABLED(CONFIG_KVM) @@ -338,22 +290,10 @@ static __always_inline void load_seg_legacy(unsigned short prev_index, static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, struct thread_struct *next) { - if (static_cpu_has(X86_FEATURE_FSGSBASE)) { - /* Update the FS and GS selectors if they could have changed. */ - if (unlikely(prev->fsindex || next->fsindex)) - loadseg(FS, next->fsindex); - if (unlikely(prev->gsindex || next->gsindex)) - loadseg(GS, next->gsindex); - - /* Update the bases. */ - wrfsbase(next->fsbase); - __wrgsbase_inactive(next->gsbase); - } else { - load_seg_legacy(prev->fsindex, prev->fsbase, - next->fsindex, next->fsbase, FS); - load_seg_legacy(prev->gsindex, prev->gsbase, - next->gsindex, next->gsbase, GS); - } + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); } static unsigned long x86_fsgsbase_read_task(struct task_struct *task, @@ -399,46 +339,13 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task, return base; } -unsigned long x86_gsbase_read_cpu_inactive(void) -{ - unsigned long gsbase; - - if (static_cpu_has(X86_FEATURE_FSGSBASE)) { - unsigned long flags; - - /* Interrupts are disabled here. 
*/ - local_irq_save(flags); - gsbase = __rdgsbase_inactive(); - local_irq_restore(flags); - } else { - rdmsrl(MSR_KERNEL_GS_BASE, gsbase); - } - - return gsbase; -} - -void x86_gsbase_write_cpu_inactive(unsigned long gsbase) -{ - if (static_cpu_has(X86_FEATURE_FSGSBASE)) { - unsigned long flags; - - /* Interrupts are disabled here. */ - local_irq_save(flags); - __wrgsbase_inactive(gsbase); - local_irq_restore(flags); - } else { - wrmsrl(MSR_KERNEL_GS_BASE, gsbase); - } -} - unsigned long x86_fsbase_read_task(struct task_struct *task) { unsigned long fsbase; if (task == current) fsbase = x86_fsbase_read_cpu(); - else if (static_cpu_has(X86_FEATURE_FSGSBASE) || - (task->thread.fsindex == 0)) + else if (task->thread.fsindex == 0) fsbase = task->thread.fsbase; else fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex); @@ -452,8 +359,7 @@ unsigned long x86_gsbase_read_task(struct task_struct *task) if (task == current) gsbase = x86_gsbase_read_cpu_inactive(); - else if (static_cpu_has(X86_FEATURE_FSGSBASE) || - (task->thread.gsindex == 0)) + else if (task->thread.gsindex == 0) gsbase = task->thread.gsbase; else gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex); @@ -493,11 +399,10 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, p->thread.sp = (unsigned long) fork_frame; p->thread.io_bitmap_ptr = NULL; - save_fsgs(me); - p->thread.fsindex = me->thread.fsindex; - p->thread.fsbase = me->thread.fsbase; - p->thread.gsindex = me->thread.gsindex; - p->thread.gsbase = me->thread.gsbase; + savesegment(gs, p->thread.gsindex); + p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase; + savesegment(fs, p->thread.fsindex); + p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase; savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));