From 3c52b5c64326d9dcfee4e10611c53ec1b1b20675 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 6 Sep 2017 17:18:08 +0200 Subject: [PATCH 01/47] x86/asm: Remove unnecessary \n\t in front of CC_SET() from asm templates There is no need for \n\t in front of CC_SET(), as the macro already includes these two. Signed-off-by: Uros Bizjak Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170906151808.5634-1-ubizjak@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/archrandom.h | 8 ++++---- arch/x86/include/asm/bitops.h | 10 +++++----- arch/x86/include/asm/percpu.h | 2 +- arch/x86/include/asm/rmwcc.h | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index 5b0579abb398..3ac991d81e74 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -45,7 +45,7 @@ static inline bool rdrand_long(unsigned long *v) bool ok; unsigned int retry = RDRAND_RETRY_LOOPS; do { - asm volatile(RDRAND_LONG "\n\t" + asm volatile(RDRAND_LONG CC_SET(c) : CC_OUT(c) (ok), "=a" (*v)); if (ok) @@ -59,7 +59,7 @@ static inline bool rdrand_int(unsigned int *v) bool ok; unsigned int retry = RDRAND_RETRY_LOOPS; do { - asm volatile(RDRAND_INT "\n\t" + asm volatile(RDRAND_INT CC_SET(c) : CC_OUT(c) (ok), "=a" (*v)); if (ok) @@ -71,7 +71,7 @@ static inline bool rdrand_int(unsigned int *v) static inline bool rdseed_long(unsigned long *v) { bool ok; - asm volatile(RDSEED_LONG "\n\t" + asm volatile(RDSEED_LONG CC_SET(c) : CC_OUT(c) (ok), "=a" (*v)); return ok; @@ -80,7 +80,7 @@ static inline bool rdseed_long(unsigned long *v) static inline bool rdseed_int(unsigned int *v) { bool ok; - asm volatile(RDSEED_INT "\n\t" + asm volatile(RDSEED_INT CC_SET(c) : CC_OUT(c) (ok), "=a" (*v)); return ok; diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 854022772c5b..8cee8db6dffb 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -142,7 +142,7 @@ static __always_inline void __clear_bit(long nr, volatile unsigned long *addr) static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) { bool negative; - asm volatile(LOCK_PREFIX "andb %2,%1\n\t" + asm volatile(LOCK_PREFIX "andb %2,%1" CC_SET(s) : CC_OUT(s) (negative), ADDR : "ir" ((char) ~(1 << nr)) : "memory"); @@ -245,7 +245,7 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long * { bool oldbit; - asm("bts %2,%1\n\t" + asm("bts %2,%1" CC_SET(c) : CC_OUT(c) (oldbit), ADDR : "Ir" (nr)); @@ -285,7 +285,7 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long { bool oldbit; - asm volatile("btr %2,%1\n\t" + asm volatile("btr %2,%1" CC_SET(c) : CC_OUT(c) (oldbit), ADDR : "Ir" (nr)); @@ -297,7 +297,7 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon { bool oldbit; - asm volatile("btc %2,%1\n\t" + asm volatile("btc %2,%1" CC_SET(c) : CC_OUT(c) (oldbit), ADDR : "Ir" (nr) : "memory"); @@ -328,7 +328,7 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l { bool oldbit; - asm volatile("bt %2,%1\n\t" + asm volatile("bt %2,%1" CC_SET(c) : CC_OUT(c) (oldbit) : "m" (*(unsigned long *)addr), "Ir" (nr)); diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 9fa03604b2b3..b21a475fd7ed 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -525,7 +525,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr, { bool oldbit; - asm volatile("bt "__percpu_arg(2)",%1\n\t" + asm volatile("bt "__percpu_arg(2)",%1" CC_SET(c) : CC_OUT(c) (oldbit) : "m" (*(unsigned long __percpu *)addr), "Ir" (nr)); diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 045f99211a99..0c411c8bbdbd 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -28,7 +28,7 @@ cc_label: \ #define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \ do { \ bool c; \ - asm volatile (fullop ";" CC_SET(cc) \ + asm volatile (fullop CC_SET(cc) \ : [counter] "+m" (var), CC_OUT(cc) (c) \ : __VA_ARGS__ : clobbers); \ return c; \ From 00d96180dc38ef872ac471c2d3e14b067cbd895d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 21:43:30 -0500 Subject: [PATCH 02/47] objtool: Don't report end of section error after an empty unwind hint If asm code specifies an UNWIND_HINT_EMPTY hint, don't warn if the section ends unexpectedly. This can happen with the xen-head.S code because the hypercall_page is "text" but it's all zeros. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Jiri Slaby Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ddafe199dd8797e40e3c2777373347eba1d65572.1505764066.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- tools/objtool/check.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index a0c518ecf085..83f370fa00c2 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1752,11 +1752,14 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, if (insn->dead_end) return 0; - insn = next_insn; - if (!insn) { + if (!next_insn) { + if (state.cfa.base == CFI_UNDEFINED) + return 0; WARN("%s: unexpected end of section", sec->name); return 1; } + + insn = next_insn; } return 0; From 17270717e80de33a884ad328fea5f407d87f6d6a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 21:43:31 -0500 Subject: [PATCH 03/47] x86/head: Remove confusing comment This comment is actively wrong and confusing. It refers to the registers' stack offsets after the pt_regs has been constructed on the stack, but this code is *before* that. At this point the stack just has the standard iret frame, for which no comment should be needed. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Jiri Slaby Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a3c267b770fc56c9b86df9c11c552848248aace2.1505764066.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 513cbb012ecc..3b04e4c99389 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -270,10 +270,6 @@ bad_address: __INIT ENTRY(early_idt_handler_array) - # 104(%rsp) %rflags - # 96(%rsp) %cs - # 88(%rsp) %rip - # 80(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 From a8b88e84d124bc92c4808e72b8b8c0e0bb538630 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 21:43:32 -0500 Subject: [PATCH 04/47] x86/head: Remove unused 'bad_address' code It's no longer possible for this code to be executed, so remove it. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Jiri Slaby Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/32a46fe92d2083700599b36872b26e7dfd7b7965.1505764066.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 3b04e4c99389..afb0a1e22d41 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -265,9 +265,6 @@ ENDPROC(start_cpu0) .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS __FINITDATA -bad_address: - jmp bad_address - __INIT ENTRY(early_idt_handler_array) i = 0 From 015a2ea5478680fc5216d56b7ff306f2a74efaf9 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 21:43:33 -0500 Subject: [PATCH 05/47] x86/head: Fix head ELF function annotations These functions aren't callable C-type functions, so don't annotate them as such. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Jiri Slaby Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/36eb182738c28514f8bf95e403d89b6413a88883.1505764066.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index afb0a1e22d41..edacd579d504 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -234,7 +234,7 @@ ENTRY(secondary_startup_64) pushq %rax # target address in negative space lretq .Lafter_lret: -ENDPROC(secondary_startup_64) +END(secondary_startup_64) #include "verify_cpu.S" @@ -277,7 +277,7 @@ ENTRY(early_idt_handler_array) i = i + 1 .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr -ENDPROC(early_idt_handler_array) +END(early_idt_handler_array) early_idt_handler_common: /* @@ -320,7 +320,7 @@ early_idt_handler_common: 20: decl early_recursion_flag(%rip) jmp restore_regs_and_iret -ENDPROC(early_idt_handler_common) +END(early_idt_handler_common) __INITDATA From e93db75a0054b23a874a12c63376753544f3fe9e Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 21:43:34 -0500 Subject: [PATCH 06/47] x86/boot: Annotate verify_cpu() as a callable function verify_cpu() is a callable function. Annotate it as such. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Jiri Slaby Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/293024b8a080832075312f38c07ccc970fc70292.1505764066.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/verify_cpu.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 014ea59aa153..3d3c2f71f617 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -33,7 +33,7 @@ #include #include -verify_cpu: +ENTRY(verify_cpu) pushf # Save caller passed flags push $0 # Kill any dangerous flags popf @@ -139,3 +139,4 @@ verify_cpu: popf # Restore caller passed flags xorl %eax, %eax ret +ENDPROC(verify_cpu) From 2582d3df95c76d3b686453baf90b64d57e87d1e8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 21:43:35 -0500 Subject: [PATCH 07/47] x86/xen: Fix xen head ELF annotations Mark the ends of the startup_xen and hypercall_page code sections. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Jiri Slaby Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3a80a394d30af43d9cefa1a29628c45ed8420c97.1505764066.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/xen/xen-head.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index a7525e95d53f..9753225289e8 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -33,7 +33,7 @@ ENTRY(startup_xen) mov $init_thread_union+THREAD_SIZE, %_ASM_SP jmp xen_start_kernel - +END(startup_xen) __FINIT #endif @@ -47,7 +47,7 @@ ENTRY(hypercall_page) .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32 #include #undef HYPERCALL - +END(hypercall_page) .popsection ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") From abbe1cac6214d81d2f4e149aba64a8760703144e Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 21:43:36 -0500 Subject: [PATCH 08/47] x86/xen: Add unwind hint annotations Add unwind hint annotations to the xen head code so the ORC unwinder can read head_64.o. hypercall_page needs empty annotations at 32-byte intervals to match the 'xen_hypercall_*' ELF functions at those locations. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Jiri Slaby Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/70ed2eb516fe9266be766d953f93c2571bca88cc.1505764066.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/xen/xen-head.S | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 9753225289e8..124941d09b2b 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -19,6 +20,7 @@ #ifdef CONFIG_XEN_PV __INIT ENTRY(startup_xen) + UNWIND_HINT_EMPTY cld /* Clear .bss */ @@ -40,7 +42,10 @@ END(startup_xen) .pushsection .text .balign PAGE_SIZE ENTRY(hypercall_page) - .skip PAGE_SIZE + .rept (PAGE_SIZE / 32) + UNWIND_HINT_EMPTY + .skip 32 + .endr #define HYPERCALL(n) \ .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \ From 2704fbb672d0d9a19414907fda7949283dcef6a1 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 21:43:37 -0500 Subject: [PATCH 09/47] x86/head: Add unwind hint annotations Jiri Slaby reported an ORC issue when unwinding from an idle task. The stack was: ffffffff811083c2 do_idle+0x142/0x1e0 ffffffff8110861d cpu_startup_entry+0x5d/0x60 ffffffff82715f58 start_kernel+0x3ff/0x407 ffffffff827153e8 x86_64_start_kernel+0x14e/0x15d ffffffff810001bf secondary_startup_64+0x9f/0xa0 The ORC unwinder errored out at secondary_startup_64 because the head code isn't annotated yet so there wasn't a corresponding ORC entry. Fix that and any other head-related unwinding issues by adding unwind hints to the head code. Reported-by: Jiri Slaby Tested-by: Jiri Slaby Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/78ef000a2f68f545d6eef44ee912edceaad82ccf.1505764066.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 1 - arch/x86/kernel/head_64.S | 14 ++++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index fd0a7895b63f..d8e2b700d1db 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -26,7 +26,6 @@ KASAN_SANITIZE_dumpstack.o := n KASAN_SANITIZE_dumpstack_$(BITS).o := n KASAN_SANITIZE_stacktrace.o := n -OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index edacd579d504..42e32c2e51bb 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -49,6 +49,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map) .code64 .globl startup_64 startup_64: + UNWIND_HINT_EMPTY /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, * and someone has loaded an identity mapped page table @@ -88,6 +89,7 @@ startup_64: addq $(early_top_pgt - __START_KERNEL_map), %rax jmp 1f ENTRY(secondary_startup_64) + UNWIND_HINT_EMPTY /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, * and someone has loaded a mapped page table. @@ -132,6 +134,7 @@ ENTRY(secondary_startup_64) movq $1f, %rax jmp *%rax 1: + UNWIND_HINT_EMPTY /* Check if nx is implemented */ movl $0x80000001, %eax @@ -246,6 +249,7 @@ END(secondary_startup_64) */ ENTRY(start_cpu0) movq initial_stack(%rip), %rsp + UNWIND_HINT_EMPTY jmp .Ljump_to_C_code ENDPROC(start_cpu0) #endif @@ -270,13 +274,18 @@ ENTRY(early_idt_handler_array) i = 0 .rept NUM_EXCEPTION_VECTORS .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 - pushq $0 # Dummy error code, to make stack frame uniform + UNWIND_HINT_IRET_REGS + pushq $0 # Dummy error code, to make stack frame uniform + .else + UNWIND_HINT_IRET_REGS offset=8 .endif pushq $i # 72(%rsp) Vector number jmp early_idt_handler_common + UNWIND_HINT_IRET_REGS i = i + 1 .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr + UNWIND_HINT_IRET_REGS offset=16 END(early_idt_handler_array) early_idt_handler_common: @@ -305,6 +314,7 @@ early_idt_handler_common: pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + UNWIND_HINT_REGS cmpq $14,%rsi /* Page fault? */ jnz 10f @@ -427,7 +437,7 @@ ENTRY(phys_base) EXPORT_SYMBOL(phys_base) #include "../../x86/xen/xen-head.S" - + __PAGE_ALIGNED_BSS NEXT_PAGE(empty_zero_page) .skip PAGE_SIZE From 1e4078f0bba46ad61b69548abe6a6faf63b89380 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 12 Oct 2017 09:24:30 +0200 Subject: [PATCH 10/47] x86/unwinder: Make CONFIG_UNWINDER_ORC=y the default in the 64-bit defconfig Increase testing coverage by turning on the primary x86 unwinder for the 64-bit defconfig. Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/configs/x86_64_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 4a4b16e56d35..eb65c248708d 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -299,6 +299,7 @@ CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_BOOT_PARAMS=y CONFIG_OPTIMIZE_INLINING=y +CONFIG_ORC_UNWINDER=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y From 127a1bea40f7f2a36bc7207ea4d51bb6b4e936fa Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 12 Oct 2017 18:06:19 -0400 Subject: [PATCH 11/47] x86/fpu/debug: Remove unused 'x86_fpu_state' and 'x86_fpu_deactivate_state' tracepoints Commit: d1898b733619 ("x86/fpu: Add tracepoints to dump FPU state at key points") ... added the 'x86_fpu_state' and 'x86_fpu_deactivate_state' trace points, but never used them. Today they are still not used. As they take up and waste memory, remove them. Signed-off-by: Steven Rostedt (VMware) Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171012180619.670b68b6@gandalf.local.home Signed-off-by: Ingo Molnar --- arch/x86/include/asm/trace/fpu.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 39f7a27bef13..6b086c39e6dc 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -33,11 +33,6 @@ DECLARE_EVENT_CLASS(x86_fpu, ) ); -DEFINE_EVENT(x86_fpu, x86_fpu_state, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - DEFINE_EVENT(x86_fpu, x86_fpu_before_save, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) @@ -73,11 +68,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_activate_state, TP_ARGS(fpu) ); -DEFINE_EVENT(x86_fpu, x86_fpu_deactivate_state, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - DEFINE_EVENT(x86_fpu, x86_fpu_init_state, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) From 11af847446ed0d131cf24d16a7ef3d5ea7a49554 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 13 Oct 2017 15:02:00 -0500 Subject: [PATCH 12/47] x86/unwind: Rename unwinder config options to 'CONFIG_UNWINDER_*' Rename the unwinder config options from: CONFIG_ORC_UNWINDER CONFIG_FRAME_POINTER_UNWINDER CONFIG_GUESS_UNWINDER to: CONFIG_UNWINDER_ORC CONFIG_UNWINDER_FRAME_POINTER CONFIG_UNWINDER_GUESS ... in order to give them a more logical config namespace. Suggested-by: Ingo Molnar Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/73972fc7e2762e91912c6b9584582703d6f1b8cc.1507924831.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- Documentation/x86/orc-unwinder.txt | 2 +- Makefile | 4 ++-- arch/x86/Kconfig | 2 +- arch/x86/Kconfig.debug | 10 +++++----- arch/x86/configs/tiny.config | 4 ++-- arch/x86/configs/x86_64_defconfig | 2 +- arch/x86/include/asm/module.h | 2 +- arch/x86/include/asm/unwind.h | 8 ++++---- arch/x86/kernel/Makefile | 6 +++--- include/asm-generic/vmlinux.lds.h | 2 +- lib/Kconfig.debug | 2 +- scripts/Makefile.build | 2 +- 12 files changed, 23 insertions(+), 23 deletions(-) diff --git a/Documentation/x86/orc-unwinder.txt b/Documentation/x86/orc-unwinder.txt index af0c9a4c65a6..cd4b29be29af 100644 --- a/Documentation/x86/orc-unwinder.txt +++ b/Documentation/x86/orc-unwinder.txt @@ -4,7 +4,7 @@ ORC unwinder Overview -------- -The kernel CONFIG_ORC_UNWINDER option enables the ORC unwinder, which is +The kernel CONFIG_UNWINDER_ORC option enables the ORC unwinder, which is similar in concept to a DWARF unwinder. The difference is that the format of the ORC data is much simpler than DWARF, which in turn allows the ORC unwinder to be much simpler and faster. diff --git a/Makefile b/Makefile index bc5c79e8e3cf..c0f723f81c06 100644 --- a/Makefile +++ b/Makefile @@ -933,8 +933,8 @@ ifdef CONFIG_STACK_VALIDATION ifeq ($(has_libelf),1) objtool_target := tools/objtool FORCE else - ifdef CONFIG_ORC_UNWINDER - $(error "Cannot generate ORC metadata for CONFIG_ORC_UNWINDER=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") + ifdef CONFIG_UNWINDER_ORC + $(error "Cannot generate ORC metadata for CONFIG_UNWINDER_ORC=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") else $(warning "Cannot use CONFIG_STACK_VALIDATION=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") endif diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 971feac13506..6b94ca0aa585 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -170,7 +170,7 @@ config X86 select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION + select HAVE_RELIABLE_STACKTRACE if X86_64 && UNWINDER_FRAME_POINTER && STACK_VALIDATION select HAVE_STACK_VALIDATION if X86_64 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 71a48a30fc84..f274dbb87c26 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -358,13 +358,13 @@ config PUNIT_ATOM_DEBUG choice prompt "Choose kernel unwinder" - default FRAME_POINTER_UNWINDER + default UNWINDER_FRAME_POINTER ---help--- This determines which method will be used for unwinding kernel stack traces for panics, oopses, bugs, warnings, perf, /proc//stack, livepatch, lockdep, and more. -config FRAME_POINTER_UNWINDER +config UNWINDER_FRAME_POINTER bool "Frame pointer unwinder" select FRAME_POINTER ---help--- @@ -379,7 +379,7 @@ config FRAME_POINTER_UNWINDER consistency model, as this is currently the only way to get a reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). -config ORC_UNWINDER +config UNWINDER_ORC bool "ORC unwinder" depends on X86_64 select STACK_VALIDATION @@ -395,7 +395,7 @@ config ORC_UNWINDER Enabling this option will increase the kernel's runtime memory usage by roughly 2-4MB, depending on your kernel config. -config GUESS_UNWINDER +config UNWINDER_GUESS bool "Guess unwinder" depends on EXPERT ---help--- @@ -410,7 +410,7 @@ config GUESS_UNWINDER endchoice config FRAME_POINTER - depends on !ORC_UNWINDER && !GUESS_UNWINDER + depends on !UNWINDER_ORC && !UNWINDER_GUESS bool endmenu diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config index 550cd5012b73..66c9e2aab16c 100644 --- a/arch/x86/configs/tiny.config +++ b/arch/x86/configs/tiny.config @@ -1,5 +1,5 @@ CONFIG_NOHIGHMEM=y # CONFIG_HIGHMEM4G is not set # CONFIG_HIGHMEM64G is not set -CONFIG_GUESS_UNWINDER=y -# CONFIG_FRAME_POINTER_UNWINDER is not set +CONFIG_UNWINDER_GUESS=y +# CONFIG_UNWINDER_FRAME_POINTER is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index eb65c248708d..e32fc1f274d8 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -299,7 +299,7 @@ CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_BOOT_PARAMS=y CONFIG_OPTIMIZE_INLINING=y -CONFIG_ORC_UNWINDER=y +CONFIG_UNWINDER_ORC=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 9eb7c718aaf8..9f05a1002aa9 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -5,7 +5,7 @@ #include struct mod_arch_specific { -#ifdef CONFIG_ORC_UNWINDER +#ifdef CONFIG_UNWINDER_ORC unsigned int num_orcs; int *orc_unwind_ip; struct orc_entry *orc_unwind; diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index e9f793e2df7a..35d67dc7b69f 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -12,11 +12,11 @@ struct unwind_state { struct task_struct *task; int graph_idx; bool error; -#if defined(CONFIG_ORC_UNWINDER) +#if defined(CONFIG_UNWINDER_ORC) bool signal, full_regs; unsigned long sp, bp, ip; struct pt_regs *regs; -#elif defined(CONFIG_FRAME_POINTER_UNWINDER) +#elif defined(CONFIG_UNWINDER_FRAME_POINTER) bool got_irq; unsigned long *bp, *orig_sp, ip; struct pt_regs *regs; @@ -50,7 +50,7 @@ void unwind_start(struct unwind_state *state, struct task_struct *task, __unwind_start(state, task, regs, first_frame); } -#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER) +#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { if (unwind_done(state)) @@ -65,7 +65,7 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) } #endif -#ifdef CONFIG_ORC_UNWINDER +#ifdef CONFIG_UNWINDER_ORC void unwind_init(void); void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index fd0a7895b63f..6209ab6deb50 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -127,9 +127,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o -obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o -obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o -obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o +obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o +obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o +obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o ### # 64 bit specific files diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 8acfc1e099e1..63e56f6c1877 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -687,7 +687,7 @@ #define BUG_TABLE #endif -#ifdef CONFIG_ORC_UNWINDER +#ifdef CONFIG_UNWINDER_ORC #define ORC_UNWIND_TABLE \ . = ALIGN(4); \ .orc_unwind_ip : AT(ADDR(.orc_unwind_ip) - LOAD_OFFSET) { \ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2689b7c50c52..7566eff22236 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -376,7 +376,7 @@ config STACK_VALIDATION that runtime stack traces are more reliable. This is also a prerequisite for generation of ORC unwind data, which - is needed for CONFIG_ORC_UNWINDER. + is needed for CONFIG_UNWINDER_ORC. For more information, see tools/objtool/Documentation/stack-validation.txt. diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 061d0c3a420a..f965f477832e 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -258,7 +258,7 @@ ifneq ($(SKIP_STACK_VALIDATION),1) __objtool_obj := $(objtree)/tools/objtool/objtool -objtool_args = $(if $(CONFIG_ORC_UNWINDER),orc generate,check) +objtool_args = $(if $(CONFIG_UNWINDER_ORC),orc generate,check) ifndef CONFIG_FRAME_POINTER objtool_args += --no-fp From fc72ae40e30327aa24eb88a24b9c7058f938bd36 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 13 Oct 2017 15:02:01 -0500 Subject: [PATCH 13/47] x86/unwind: Make CONFIG_UNWINDER_ORC=y the default in kconfig for 64-bit The ORC unwinder has been stable in testing so far. Give it much wider testing by making it the default in kconfig for x86_64. It's not yet supported for 32-bit, so leave frame pointers as the default there. Suggested-by: Ingo Molnar Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/9b1237bbe7244ed9cdf8db2dcb1253e37e1c341e.1507924831.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index f274dbb87c26..a4ff214fb760 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -358,27 +358,13 @@ config PUNIT_ATOM_DEBUG choice prompt "Choose kernel unwinder" - default UNWINDER_FRAME_POINTER + default UNWINDER_ORC if X86_64 + default UNWINDER_FRAME_POINTER if X86_32 ---help--- This determines which method will be used for unwinding kernel stack traces for panics, oopses, bugs, warnings, perf, /proc//stack, livepatch, lockdep, and more. -config UNWINDER_FRAME_POINTER - bool "Frame pointer unwinder" - select FRAME_POINTER - ---help--- - This option enables the frame pointer unwinder for unwinding kernel - stack traces. - - The unwinder itself is fast and it uses less RAM than the ORC - unwinder, but the kernel text size will grow by ~3% and the kernel's - overall performance will degrade by roughly 5-10%. - - This option is recommended if you want to use the livepatch - consistency model, as this is currently the only way to get a - reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). - config UNWINDER_ORC bool "ORC unwinder" depends on X86_64 @@ -395,6 +381,21 @@ config UNWINDER_ORC Enabling this option will increase the kernel's runtime memory usage by roughly 2-4MB, depending on your kernel config. +config UNWINDER_FRAME_POINTER + bool "Frame pointer unwinder" + select FRAME_POINTER + ---help--- + This option enables the frame pointer unwinder for unwinding kernel + stack traces. + + The unwinder itself is fast and it uses less RAM than the ORC + unwinder, but the kernel text size will grow by ~3% and the kernel's + overall performance will degrade by roughly 5-10%. + + This option is recommended if you want to use the livepatch + consistency model, as this is currently the only way to get a + reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). + config UNWINDER_GUESS bool "Guess unwinder" depends on EXPERT From cbe96375025e14fc76f9ed42ee5225120d7210f8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 13 Oct 2017 14:56:41 -0700 Subject: [PATCH 14/47] bitops: Add clear/set_bit32() to linux/bitops.h Add two simple wrappers around set_bit/clear_bit() that accept the common case of an u32 array. This avoids writing casts in all callers. Signed-off-by: Andi Kleen Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171013215645.23166-2-andi@firstfloor.org Signed-off-by: Ingo Molnar --- include/linux/bitops.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 8fbe259b197c..36794f058ba6 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -227,6 +227,32 @@ static inline unsigned long __ffs64(u64 word) return __ffs((unsigned long)word); } +/* + * clear_bit32 - Clear a bit in memory for u32 array + * @nr: Bit to clear + * @addr: u32 * address of bitmap + * + * Same as clear_bit, but avoids needing casts for u32 arrays. + */ + +static __always_inline void clear_bit32(long nr, volatile u32 *addr) +{ + clear_bit(nr, (volatile unsigned long *)addr); +} + +/* + * set_bit32 - Set a bit in memory for u32 array + * @nr: Bit to clear + * @addr: u32 * address of bitmap + * + * Same as set_bit, but avoids needing casts for u32 arrays. + */ + +static __always_inline void set_bit32(long nr, volatile u32 *addr) +{ + set_bit(nr, (volatile unsigned long *)addr); +} + #ifdef __KERNEL__ #ifndef set_mask_bits From 0b00de857a648dafe7020878c7a27cf776f5edf4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 13 Oct 2017 14:56:42 -0700 Subject: [PATCH 15/47] x86/cpuid: Add generic table for CPUID dependencies Some CPUID features depend on other features. Currently it's possible to to clear dependent features, but not clear the base features, which can cause various interesting problems. This patch implements a generic table to describe dependencies between CPUID features, to be used by all code that clears CPUID. Some subsystems (like XSAVE) had an own implementation of this, but it's better to do it all in a single place for everyone. Then clear_cpu_cap and setup_clear_cpu_cap always look up this table and clear all dependencies too. This is intended to be a practical table: only for features that make sense to clear. If someone for example clears FPU, or other features that are essentially part of the required base feature set, not much is going to work. Handling that is right now out of scope. We're only handling features which can be usefully cleared. Signed-off-by: Andi Kleen Reviewed-by: Thomas Gleixner Cc: Jonathan McDowell Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171013215645.23166-3-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 9 +-- arch/x86/include/asm/cpufeatures.h | 5 ++ arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/cpuid-deps.c | 113 +++++++++++++++++++++++++++++ 4 files changed, 123 insertions(+), 5 deletions(-) create mode 100644 arch/x86/kernel/cpu/cpuid-deps.c diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index d59c15c3defd..225fd8374fae 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -125,11 +125,10 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) #define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) -#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability)) -#define setup_clear_cpu_cap(bit) do { \ - clear_cpu_cap(&boot_cpu_data, bit); \ - set_bit(bit, (unsigned long *)cpu_caps_cleared); \ -} while (0) + +extern void setup_clear_cpu_cap(unsigned int bit); +extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); + #define setup_force_cpu_cap(bit) do { \ set_cpu_cap(&boot_cpu_data, bit); \ set_bit(bit, (unsigned long *)cpu_caps_set); \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 2519c6c801c9..401a70992060 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -21,6 +21,11 @@ * this feature bit is not displayed in /proc/cpuinfo at all. */ +/* + * When adding new features here that depend on other features, + * please update the table in kernel/cpu/cpuid-deps.c + */ + /* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ #define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ #define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index e17942c131c8..de260fae1017 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -22,6 +22,7 @@ obj-y += rdrand.o obj-y += match.o obj-y += bugs.o obj-$(CONFIG_CPU_FREQ) += aperfmperf.o +obj-y += cpuid-deps.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c new file mode 100644 index 000000000000..e48eb7313120 --- /dev/null +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -0,0 +1,113 @@ +/* Declare dependencies between CPUIDs */ +#include +#include +#include +#include + +struct cpuid_dep { + unsigned int feature; + unsigned int depends; +}; + +/* + * Table of CPUID features that depend on others. + * + * This only includes dependencies that can be usefully disabled, not + * features part of the base set (like FPU). + * + * Note this all is not __init / __initdata because it can be + * called from cpu hotplug. It shouldn't do anything in this case, + * but it's difficult to tell that to the init reference checker. + */ +const static struct cpuid_dep cpuid_deps[] = { + { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE }, + { X86_FEATURE_AVX, X86_FEATURE_XSAVE }, + { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, + { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, + { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, + { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM2, X86_FEATURE_XMM }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 }, + { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, }, + { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, + { X86_FEATURE_AES, X86_FEATURE_XMM2 }, + { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, + { X86_FEATURE_FMA, X86_FEATURE_AVX }, + { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, + {} +}; + +static inline void __clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit) +{ + clear_bit32(bit, c->x86_capability); +} + +static inline void __setup_clear_cpu_cap(unsigned int bit) +{ + clear_cpu_cap(&boot_cpu_data, bit); + set_bit32(bit, cpu_caps_cleared); +} + +static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature) +{ + if (!c) + __setup_clear_cpu_cap(feature); + else + __clear_cpu_cap(c, feature); +} + +static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) +{ + bool changed; + DECLARE_BITMAP(disable, NCAPINTS * sizeof(u32) * 8); + const struct cpuid_dep *d; + + clear_feature(c, feature); + + /* Collect all features to disable, handling dependencies */ + memset(disable, 0, sizeof(disable)); + __set_bit(feature, disable); + + /* Loop until we get a stable state. */ + do { + changed = false; + for (d = cpuid_deps; d->feature; d++) { + if (!test_bit(d->depends, disable)) + continue; + if (__test_and_set_bit(d->feature, disable)) + continue; + + changed = true; + clear_feature(c, d->feature); + } + } while (changed); +} + +void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) +{ + do_clear_cpu_cap(c, feature); +} + +void setup_clear_cpu_cap(unsigned int feature) +{ + do_clear_cpu_cap(NULL, feature); +} From 0c2a3913d6f50503f7c59d83a6219e39508cc898 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 13 Oct 2017 14:56:43 -0700 Subject: [PATCH 16/47] x86/fpu: Parse clearcpuid= as early XSAVE argument With a followon patch we want to make clearcpuid affect the XSAVE configuration. But xsave is currently initialized before arguments are parsed. Move the clearcpuid= parsing into the special early xsave argument parsing code. Since clearcpuid= contains a = we need to keep the old __setup around as a dummy, otherwise it would end up as a environment variable in init's environment. Signed-off-by: Andi Kleen Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171013215645.23166-4-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 16 +++++++--------- arch/x86/kernel/fpu/init.c | 11 +++++++++++ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c9176bae7fd8..03bb004bb15e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1301,18 +1301,16 @@ void print_cpu_info(struct cpuinfo_x86 *c) pr_cont(")\n"); } -static __init int setup_disablecpuid(char *arg) +/* + * clearcpuid= was already parsed in fpu__init_parse_early_param. + * But we need to keep a dummy __setup around otherwise it would + * show up as an environment variable for init. + */ +static __init int setup_clearcpuid(char *arg) { - int bit; - - if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32) - setup_clear_cpu_cap(bit); - else - return 0; - return 1; } -__setup("clearcpuid=", setup_disablecpuid); +__setup("clearcpuid=", setup_clearcpuid); #ifdef CONFIG_X86_64 DEFINE_PER_CPU_FIRST(union irq_stack_union, diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 7affb7e3d9a5..6abd83572b01 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -249,6 +249,10 @@ static void __init fpu__init_system_ctx_switch(void) */ static void __init fpu__init_parse_early_param(void) { + char arg[32]; + char *argptr = arg; + int bit; + if (cmdline_find_option_bool(boot_command_line, "no387")) setup_clear_cpu_cap(X86_FEATURE_FPU); @@ -266,6 +270,13 @@ static void __init fpu__init_parse_early_param(void) if (cmdline_find_option_bool(boot_command_line, "noxsaves")) setup_clear_cpu_cap(X86_FEATURE_XSAVES); + + if (cmdline_find_option(boot_command_line, "clearcpuid", arg, + sizeof(arg)) && + get_option(&argptr, &bit) && + bit >= 0 && + bit < NCAPINTS * 32) + setup_clear_cpu_cap(bit); } /* From ccb18db2ab9d923df07e7495123fe5fb02329713 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 13 Oct 2017 14:56:44 -0700 Subject: [PATCH 17/47] x86/fpu: Make XSAVE check the base CPUID features before enabling Before enabling XSAVE, not only check the XSAVE specific CPUID bits, but also the base CPUID features of the respective XSAVE feature. This allows to disable individual XSAVE states using the existing clearcpuid= option, which can be useful for performance testing and debugging, and also in general avoids inconsistencies. Signed-off-by: Andi Kleen Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171013215645.23166-5-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index f1d5476c9022..fb581292975b 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -15,6 +15,7 @@ #include #include +#include /* * Although we spell it out in here, the Processor Trace @@ -36,6 +37,19 @@ static const char *xfeature_names[] = "unknown xstate feature" , }; +static short xsave_cpuid_features[] __initdata = { + X86_FEATURE_FPU, + X86_FEATURE_XMM, + X86_FEATURE_AVX, + X86_FEATURE_MPX, + X86_FEATURE_MPX, + X86_FEATURE_AVX512F, + X86_FEATURE_AVX512F, + X86_FEATURE_AVX512F, + X86_FEATURE_INTEL_PT, + X86_FEATURE_PKU, +}; + /* * Mask of xstate features supported by the CPU and the kernel: */ @@ -726,6 +740,7 @@ void __init fpu__init_system_xstate(void) unsigned int eax, ebx, ecx, edx; static int on_boot_cpu __initdata = 1; int err; + int i; WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; @@ -759,6 +774,14 @@ void __init fpu__init_system_xstate(void) goto out_disable; } + /* + * Clear XSAVE features that are disabled in the normal CPUID. + */ + for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { + if (!boot_cpu_has(xsave_cpuid_features[i])) + xfeatures_mask &= ~BIT(i); + } + xfeatures_mask &= fpu__get_supported_xfeatures_mask(); /* Enable xstate instructions to be able to continue with initialization: */ From 73e3a7d2a7c3be29a5a22b85026f6cfa5664267f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 13 Oct 2017 14:56:45 -0700 Subject: [PATCH 18/47] x86/fpu: Remove the explicit clearing of XSAVE dependent features Clearing a CPU feature with setup_clear_cpu_cap() clears all features which depend on it. Expressing feature dependencies in one place is easier to maintain than keeping functions like fpu__xstate_clear_all_cpu_caps() up to date. The features which depend on XSAVE have their dependency expressed in the dependency table, so its sufficient to clear X86_FEATURE_XSAVE. Remove the explicit clearing of XSAVE dependent features. Signed-off-by: Andi Kleen Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171013215645.23166-6-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index fb581292975b..87a57b7642d3 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -73,26 +73,6 @@ unsigned int fpu_user_xstate_size; void fpu__xstate_clear_all_cpu_caps(void) { setup_clear_cpu_cap(X86_FEATURE_XSAVE); - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - setup_clear_cpu_cap(X86_FEATURE_XSAVEC); - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - setup_clear_cpu_cap(X86_FEATURE_AVX); - setup_clear_cpu_cap(X86_FEATURE_AVX2); - setup_clear_cpu_cap(X86_FEATURE_AVX512F); - setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA); - setup_clear_cpu_cap(X86_FEATURE_AVX512PF); - setup_clear_cpu_cap(X86_FEATURE_AVX512ER); - setup_clear_cpu_cap(X86_FEATURE_AVX512CD); - setup_clear_cpu_cap(X86_FEATURE_AVX512DQ); - setup_clear_cpu_cap(X86_FEATURE_AVX512BW); - setup_clear_cpu_cap(X86_FEATURE_AVX512VL); - setup_clear_cpu_cap(X86_FEATURE_MPX); - setup_clear_cpu_cap(X86_FEATURE_XGETBV1); - setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI); - setup_clear_cpu_cap(X86_FEATURE_PKU); - setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW); - setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS); - setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ); } /* From 6a93bb7e4a7d6670677d5b0eb980936eb9cc5d2e Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Sat, 14 Oct 2017 20:17:54 +0530 Subject: [PATCH 19/47] objtool: Print top level commands on incorrect usage Print top-level objtool commands, along with the error on incorrect command line usage. Objtool command line parser exit's with code 129, for incorrect usage. Convert the cmd_usage() exit code also, to maintain consistency across objtool. After the patch: $ ./objtool -j Unknown option: -j usage: objtool COMMAND [ARGS] Commands: check Perform stack metadata validation on an object file orc Generate in-place ORC unwind tables for an object file $ echo $? 129 Signed-off-by: Kamalesh Babulal Acked-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1507992474-16142-1-git-send-email-kamalesh@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- tools/objtool/objtool.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index 31e0f9143840..07f329919828 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -70,7 +70,7 @@ static void cmd_usage(void) printf("\n"); - exit(1); + exit(129); } static void handle_options(int *argc, const char ***argv) @@ -86,9 +86,7 @@ static void handle_options(int *argc, const char ***argv) break; } else { fprintf(stderr, "Unknown option: %s\n", cmd); - fprintf(stderr, "\n Usage: %s\n", - objtool_usage_string); - exit(1); + cmd_usage(); } (*argv)++; From 57b8b1a1856adaa849d02d547411a553a531022b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Oct 2017 19:39:35 +0200 Subject: [PATCH 20/47] x86/cpuid: Prevent out of bound access in do_clear_cpu_cap() do_clear_cpu_cap() allocates a bitmap to keep track of disabled feature dependencies. That bitmap is sized NCAPINTS * BITS_PER_INIT. The possible 'features' which can be handed in are larger than this, because after the capabilities the bug 'feature' bits occupy another 32bit. Not really obvious... So clearing any of the misfeature bits, as 32bit does for the F00F bug, accesses that bitmap out of bounds thereby corrupting the stack. Size the bitmap proper and add a sanity check to catch accidental out of bound access. Fixes: 0b00de857a64 ("x86/cpuid: Add generic table for CPUID dependencies") Reported-by: kernel test robot Signed-off-by: Thomas Gleixner Cc: Andi Kleen Cc: Borislav Petkov Link: https://lkml.kernel.org/r/20171018022023.GA12058@yexl-desktop --- arch/x86/kernel/cpu/cpuid-deps.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index e48eb7313120..c1d49842a411 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -75,11 +75,17 @@ static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature) __clear_cpu_cap(c, feature); } +/* Take the capabilities and the BUG bits into account */ +#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8) + static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) { - bool changed; - DECLARE_BITMAP(disable, NCAPINTS * sizeof(u32) * 8); + DECLARE_BITMAP(disable, MAX_FEATURE_BITS); const struct cpuid_dep *d; + bool changed; + + if (WARN_ON(feature >= MAX_FEATURE_BITS)) + return; clear_feature(c, feature); From da20ab35180780e4a6eadc804544f1fa967f3567 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 18 Oct 2017 10:21:07 -0700 Subject: [PATCH 21/47] x86/entry: Use SYSCALL_DEFINE() macros for sys_modify_ldt() We do not have tracepoints for sys_modify_ldt() because we define it directly instead of using the normal SYSCALL_DEFINEx() macros. However, there is a reason sys_modify_ldt() does not use the macros: it has an 'int' return type instead of 'unsigned long'. This is a bug, but it's a bug cemented in the ABI. What does this mean? If we return -EINVAL from a function that returns 'int', we have 0x00000000ffffffea in %rax. But, if we return -EINVAL from a function returning 'unsigned long', we end up with 0xffffffffffffffea in %rax, which is wrong. To work around this and maintain the 'int' behavior while using the SYSCALL_DEFINEx() macros, so we add a cast to 'unsigned int' in both implementations of sys_modify_ldt(). Signed-off-by: Dave Hansen Reviewed-by: Andy Lutomirski Reviewed-by: Brian Gerst Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171018172107.1A79C532@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/syscalls.h | 2 +- arch/x86/kernel/ldt.c | 16 +++++++++++++--- arch/x86/um/ldt.c | 7 +++++-- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 91dfcafe27a6..bad25bb80679 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -21,7 +21,7 @@ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); asmlinkage long sys_iopl(unsigned int); /* kernel/ldt.c */ -asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); +asmlinkage long sys_modify_ldt(int, void __user *, unsigned long); /* kernel/signal.c */ asmlinkage long sys_rt_sigreturn(void); diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index f0e64db18ac8..0402d44deb4d 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -294,8 +295,8 @@ out: return error; } -asmlinkage int sys_modify_ldt(int func, void __user *ptr, - unsigned long bytecount) +SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr , + unsigned long , bytecount) { int ret = -ENOSYS; @@ -313,5 +314,14 @@ asmlinkage int sys_modify_ldt(int func, void __user *ptr, ret = write_ldt(ptr, bytecount, 0); break; } - return ret; + /* + * The SYSCALL_DEFINE() macros give us an 'unsigned long' + * return type, but tht ABI for sys_modify_ldt() expects + * 'int'. This cast gives us an int-sized value in %rax + * for the return code. The 'unsigned' is necessary so + * the compiler does not try to sign-extend the negative + * return codes into the high half of the register when + * taking the value from int->long. + */ + return (unsigned int)ret; } diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c index 836a1eb5df43..3ee234b6234d 100644 --- a/arch/x86/um/ldt.c +++ b/arch/x86/um/ldt.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -369,7 +370,9 @@ void free_ldt(struct mm_context *mm) mm->arch.ldt.entry_count = 0; } -int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) +SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr , + unsigned long , bytecount) { - return do_modify_ldt_skas(func, ptr, bytecount); + /* See non-um modify_ldt() for why we do this cast */ + return (unsigned int)do_modify_ldt_skas(func, ptr, bytecount); } From 82c62fa0c49aa305104013cee4468772799bb391 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 20 Oct 2017 11:21:35 -0500 Subject: [PATCH 22/47] x86/asm: Don't use the confusing '.ifeq' directive I find the '.ifeq ' directive to be confusing. Reading it quickly seems to suggest its opposite meaning, or that it's missing an argument. Improve readability by replacing all of its x86 uses with '.if == 0'. Signed-off-by: Josh Poimboeuf Cc: Andrei Vagin Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/757da028e802c7e98d23fbab8d234b1063e161cf.1508516398.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 2 +- arch/x86/kernel/head_32.S | 2 +- arch/x86/kernel/head_64.S | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f6cdb7a1455e..846e84a1d1f7 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -817,7 +817,7 @@ ENTRY(\sym) ASM_CLAC - .ifeq \has_error_code + .if \has_error_code == 0 pushq $-1 /* ORIG_RAX: no syscall to restart */ .endif diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 9ed3074d0d27..6e50f87765e5 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -401,7 +401,7 @@ ENTRY(early_idt_handler_array) # 24(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 + .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 pushl $0 # Dummy error code, to make stack frame uniform .endif pushl $i # 20(%esp) Vector number diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 42e32c2e51bb..311db1a73c11 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -273,7 +273,7 @@ ENDPROC(start_cpu0) ENTRY(early_idt_handler_array) i = 0 .rept NUM_EXCEPTION_VECTORS - .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 + .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 UNWIND_HINT_IRET_REGS pushq $0 # Dummy error code, to make stack frame uniform .else From c128dbfa0f879f8ce7b79054037889b0b2240728 Mon Sep 17 00:00:00 2001 From: Gayatri Kammela Date: Mon, 30 Oct 2017 18:20:29 -0700 Subject: [PATCH 23/47] x86/cpufeatures: Enable new SSE/AVX/AVX512 CPU features Add a few new SSE/AVX/AVX512 instruction groups/features for enumeration in /proc/cpuinfo: AVX512_VBMI2, GFNI, VAES, VPCLMULQDQ, AVX512_VNNI, AVX512_BITALG. CPUID.(EAX=7,ECX=0):ECX[bit 6] AVX512_VBMI2 CPUID.(EAX=7,ECX=0):ECX[bit 8] GFNI CPUID.(EAX=7,ECX=0):ECX[bit 9] VAES CPUID.(EAX=7,ECX=0):ECX[bit 10] VPCLMULQDQ CPUID.(EAX=7,ECX=0):ECX[bit 11] AVX512_VNNI CPUID.(EAX=7,ECX=0):ECX[bit 12] AVX512_BITALG Detailed information of CPUID bits for these features can be found in the Intel Architecture Instruction Set Extensions and Future Features Programming Interface document (refer to Table 1-1. and Table 1-2.). A copy of this document is available at https://bugzilla.kernel.org/show_bug.cgi?id=197239 Signed-off-by: Gayatri Kammela Acked-by: Thomas Gleixner Cc: Andi Kleen Cc: Fenghua Yu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ravi Shankar Cc: Ricardo Neri Cc: Yang Zhong Cc: bp@alien8.de Link: http://lkml.kernel.org/r/1509412829-23380-1-git-send-email-gayatri.kammela@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 6 ++++++ arch/x86/kernel/cpu/cpuid-deps.c | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 401a70992060..b0556f882aa8 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -299,6 +299,12 @@ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ +#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ +#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ +#define X86_FEATURE_VPCLMULQDQ (16*32+ 10) /* Carry-Less Multiplication Double Quadword */ +#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */ +#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */ #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ #define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index c1d49842a411..c21f22d836ad 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -50,6 +50,12 @@ const static struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL }, + { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL }, + { X86_FEATURE_VAES, X86_FEATURE_AVX512VL }, + { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, From 1067f030994c69ca1fba8c607437c8895dcf8509 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 27 Oct 2017 13:25:28 -0700 Subject: [PATCH 24/47] x86/mm: Relocate page fault error codes to traps.h Up to this point, only fault.c used the definitions of the page fault error codes. Thus, it made sense to keep them within such file. Other portions of code might be interested in those definitions too. For instance, the User- Mode Instruction Prevention emulation code will use such definitions to emulate a page fault when it is unable to successfully copy the results of the emulated instructions to user space. While relocating the error code enumeration, the prefix X86_ is used to make it consistent with the rest of the definitions in traps.h. Of course, code using the enumeration had to be updated as well. No functional changes were performed. Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Andy Lutomirski Cc: "Michael S. Tsirkin" Cc: Peter Zijlstra Cc: Dave Hansen Cc: ricardo.neri@intel.com Cc: Paul Gortmaker Cc: Huang Rui Cc: Shuah Khan Cc: Jonathan Corbet Cc: Jiri Slaby Cc: "Ravi V. Shankar" Cc: Chris Metcalf Cc: Brian Gerst Cc: Josh Poimboeuf Cc: Chen Yucong Cc: Vlastimil Babka Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Andrew Morton Cc: "Kirill A. Shutemov" Link: https://lkml.kernel.org/r/1509135945-13762-2-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/include/asm/traps.h | 18 ++++++++ arch/x86/mm/fault.c | 88 ++++++++++++++---------------------- 2 files changed, 52 insertions(+), 54 deletions(-) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 5545f6459bf5..da3c3a3674a5 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -144,4 +144,22 @@ enum { X86_TRAP_IRET = 32, /* 32, IRET Exception */ }; +/* + * Page fault error code bits: + * + * bit 0 == 0: no page found 1: protection fault + * bit 1 == 0: read access 1: write access + * bit 2 == 0: kernel-mode access 1: user-mode access + * bit 3 == 1: use of reserved bit detected + * bit 4 == 1: fault was an instruction fetch + * bit 5 == 1: protection keys block access + */ +enum x86_pf_error_code { + X86_PF_PROT = 1 << 0, + X86_PF_WRITE = 1 << 1, + X86_PF_USER = 1 << 2, + X86_PF_RSVD = 1 << 3, + X86_PF_INSTR = 1 << 4, + X86_PF_PK = 1 << 5, +}; #endif /* _ASM_X86_TRAPS_H */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e2baeaa053a5..db71c73530bd 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -28,26 +28,6 @@ #define CREATE_TRACE_POINTS #include -/* - * Page fault error code bits: - * - * bit 0 == 0: no page found 1: protection fault - * bit 1 == 0: read access 1: write access - * bit 2 == 0: kernel-mode access 1: user-mode access - * bit 3 == 1: use of reserved bit detected - * bit 4 == 1: fault was an instruction fetch - * bit 5 == 1: protection keys block access - */ -enum x86_pf_error_code { - - PF_PROT = 1 << 0, - PF_WRITE = 1 << 1, - PF_USER = 1 << 2, - PF_RSVD = 1 << 3, - PF_INSTR = 1 << 4, - PF_PK = 1 << 5, -}; - /* * Returns 0 if mmiotrace is disabled, or if the fault is not * handled by mmiotrace: @@ -149,7 +129,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) * If it was a exec (instruction fetch) fault on NX page, then * do not ignore the fault: */ - if (error_code & PF_INSTR) + if (error_code & X86_PF_INSTR) return 0; instr = (void *)convert_ip_to_linear(current, regs); @@ -179,7 +159,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) * siginfo so userspace can discover which protection key was set * on the PTE. * - * If we get here, we know that the hardware signaled a PF_PK + * If we get here, we know that the hardware signaled a X86_PF_PK * fault and that there was a VMA once we got in the fault * handler. It does *not* guarantee that the VMA we find here * was the one that we faulted on. @@ -204,7 +184,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey) /* * force_sig_info_fault() is called from a number of * contexts, some of which have a VMA and some of which - * do not. The PF_PK handing happens after we have a + * do not. The X86_PF_PK handing happens after we have a * valid VMA, so we should never reach this without a * valid VMA. */ @@ -697,7 +677,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, if (!oops_may_print()) return; - if (error_code & PF_INSTR) { + if (error_code & X86_PF_INSTR) { unsigned int level; pgd_t *pgd; pte_t *pte; @@ -779,7 +759,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, */ if (current->thread.sig_on_uaccess_err && signal) { tsk->thread.trap_nr = X86_TRAP_PF; - tsk->thread.error_code = error_code | PF_USER; + tsk->thread.error_code = error_code | X86_PF_USER; tsk->thread.cr2 = address; /* XXX: hwpoison faults will set the wrong code. */ @@ -897,7 +877,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; /* User mode accesses just cause a SIGSEGV */ - if (error_code & PF_USER) { + if (error_code & X86_PF_USER) { /* * It's possible to have interrupts off here: */ @@ -918,7 +898,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, * Instruction fetch faults in the vsyscall page might need * emulation. */ - if (unlikely((error_code & PF_INSTR) && + if (unlikely((error_code & X86_PF_INSTR) && ((address & ~0xfff) == VSYSCALL_ADDR))) { if (emulate_vsyscall(regs, address)) return; @@ -931,7 +911,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, * are always protection faults. */ if (address >= TASK_SIZE_MAX) - error_code |= PF_PROT; + error_code |= X86_PF_PROT; if (likely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); @@ -992,11 +972,11 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code, if (!boot_cpu_has(X86_FEATURE_OSPKE)) return false; - if (error_code & PF_PK) + if (error_code & X86_PF_PK) return true; /* this checks permission keys on the VMA: */ - if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), - (error_code & PF_INSTR), foreign)) + if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), + (error_code & X86_PF_INSTR), foreign)) return true; return false; } @@ -1024,7 +1004,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, int code = BUS_ADRERR; /* Kernel mode? Handle exceptions or die: */ - if (!(error_code & PF_USER)) { + if (!(error_code & X86_PF_USER)) { no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); return; } @@ -1052,14 +1032,14 @@ static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, u32 *pkey, unsigned int fault) { - if (fatal_signal_pending(current) && !(error_code & PF_USER)) { + if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) { no_context(regs, error_code, address, 0, 0); return; } if (fault & VM_FAULT_OOM) { /* Kernel mode? Handle exceptions or die: */ - if (!(error_code & PF_USER)) { + if (!(error_code & X86_PF_USER)) { no_context(regs, error_code, address, SIGSEGV, SEGV_MAPERR); return; @@ -1084,16 +1064,16 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, static int spurious_fault_check(unsigned long error_code, pte_t *pte) { - if ((error_code & PF_WRITE) && !pte_write(*pte)) + if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) return 0; - if ((error_code & PF_INSTR) && !pte_exec(*pte)) + if ((error_code & X86_PF_INSTR) && !pte_exec(*pte)) return 0; /* * Note: We do not do lazy flushing on protection key - * changes, so no spurious fault will ever set PF_PK. + * changes, so no spurious fault will ever set X86_PF_PK. */ - if ((error_code & PF_PK)) + if ((error_code & X86_PF_PK)) return 1; return 1; @@ -1139,8 +1119,8 @@ spurious_fault(unsigned long error_code, unsigned long address) * change, so user accesses are not expected to cause spurious * faults. */ - if (error_code != (PF_WRITE | PF_PROT) - && error_code != (PF_INSTR | PF_PROT)) + if (error_code != (X86_PF_WRITE | X86_PF_PROT) && + error_code != (X86_PF_INSTR | X86_PF_PROT)) return 0; pgd = init_mm.pgd + pgd_index(address); @@ -1200,19 +1180,19 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) * always an unconditional error and can never result in * a follow-up action to resolve the fault, like a COW. */ - if (error_code & PF_PK) + if (error_code & X86_PF_PK) return 1; /* * Make sure to check the VMA so that we do not perform - * faults just to hit a PF_PK as soon as we fill in a + * faults just to hit a X86_PF_PK as soon as we fill in a * page. */ - if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), - (error_code & PF_INSTR), foreign)) + if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), + (error_code & X86_PF_INSTR), foreign)) return 1; - if (error_code & PF_WRITE) { + if (error_code & X86_PF_WRITE) { /* write, present and write, not present: */ if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; @@ -1220,7 +1200,7 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) } /* read, present: */ - if (unlikely(error_code & PF_PROT)) + if (unlikely(error_code & X86_PF_PROT)) return 1; /* read, not present: */ @@ -1243,7 +1223,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) if (!static_cpu_has(X86_FEATURE_SMAP)) return false; - if (error_code & PF_USER) + if (error_code & X86_PF_USER) return false; if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC)) @@ -1296,7 +1276,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * protection error (error_code & 9) == 0. */ if (unlikely(fault_in_kernel_space(address))) { - if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) { + if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { if (vmalloc_fault(address) >= 0) return; @@ -1324,7 +1304,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, if (unlikely(kprobes_fault(regs))) return; - if (unlikely(error_code & PF_RSVD)) + if (unlikely(error_code & X86_PF_RSVD)) pgtable_bad(regs, error_code, address); if (unlikely(smap_violation(error_code, regs))) { @@ -1350,7 +1330,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, */ if (user_mode(regs)) { local_irq_enable(); - error_code |= PF_USER; + error_code |= X86_PF_USER; flags |= FAULT_FLAG_USER; } else { if (regs->flags & X86_EFLAGS_IF) @@ -1359,9 +1339,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - if (error_code & PF_WRITE) + if (error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; - if (error_code & PF_INSTR) + if (error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; /* @@ -1381,7 +1361,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * space check, thus avoiding the deadlock: */ if (unlikely(!down_read_trylock(&mm->mmap_sem))) { - if ((error_code & PF_USER) == 0 && + if (!(error_code & X86_PF_USER) && !search_exception_tables(regs->ip)) { bad_area_nosemaphore(regs, error_code, address, NULL); return; @@ -1408,7 +1388,7 @@ retry: bad_area(regs, error_code, address); return; } - if (error_code & PF_USER) { + if (error_code & X86_PF_USER) { /* * Accessing the stack below %sp is always a bug. * The large cushion allows instructions like enter From b0ce5b8c95c83a7b98c679b117e3d6ae6f97154b Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 27 Oct 2017 13:25:29 -0700 Subject: [PATCH 25/47] x86/boot: Relocate definition of the initial state of CR0 Both head_32.S and head_64.S utilize the same value to initialize the control register CR0. Also, other parts of the kernel might want to access this initial definition (e.g., emulation code for User-Mode Instruction Prevention uses this state to provide a sane dummy value for CR0 when emulating the smsw instruction). Thus, relocate this definition to a header file from which it can be conveniently accessed. Suggested-by: Borislav Petkov Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Andy Lutomirski Cc: "Michael S. Tsirkin" Cc: Peter Zijlstra Cc: Dave Hansen Cc: ricardo.neri@intel.com Cc: linux-mm@kvack.org Cc: Paul Gortmaker Cc: Huang Rui Cc: Shuah Khan Cc: linux-arch@vger.kernel.org Cc: Jonathan Corbet Cc: Jiri Slaby Cc: "Ravi V. Shankar" Cc: Denys Vlasenko Cc: Chris Metcalf Cc: Brian Gerst Cc: Josh Poimboeuf Cc: Chen Yucong Cc: Vlastimil Babka Cc: Dave Hansen Cc: Andy Lutomirski Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Andrew Morton Cc: Linus Torvalds Link: https://lkml.kernel.org/r/1509135945-13762-3-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/include/uapi/asm/processor-flags.h | 3 +++ arch/x86/kernel/head_32.S | 3 --- arch/x86/kernel/head_64.S | 3 --- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index 185f3d10c194..39946d0a1d41 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -151,5 +151,8 @@ #define CX86_ARR_BASE 0xc4 #define CX86_RCR_BASE 0xdc +#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ + X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ + X86_CR0_PG) #endif /* _UAPI_ASM_X86_PROCESSOR_FLAGS_H */ diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 9ed3074d0d27..c3cfc655f551 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -211,9 +211,6 @@ ENTRY(startup_32_smp) #endif .Ldefault_entry: -#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ - X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ - X86_CR0_PG) movl $(CR0_STATE & ~X86_CR0_PG),%eax movl %eax,%cr0 diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 513cbb012ecc..5e1bfdd86b5b 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -149,9 +149,6 @@ ENTRY(secondary_startup_64) 1: wrmsr /* Make changes effective */ /* Setup cr0 */ -#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ - X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ - X86_CR0_PG) movl $CR0_STATE, %eax /* Make changes effective */ movq %rax, %cr0 From e27c310af5c05cf876d9cad006928076c27f54d4 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 27 Oct 2017 13:25:30 -0700 Subject: [PATCH 26/47] ptrace,x86: Make user_64bit_mode() available to 32-bit builds In its current form, user_64bit_mode() can only be used when CONFIG_X86_64 is selected. This implies that code built with CONFIG_X86_64=n cannot use it. If a piece of code needs to be built for both CONFIG_X86_64=y and CONFIG_X86_64=n and wants to use this function, it needs to wrap it in an #ifdef/#endif; potentially, in multiple places. This can be easily avoided with a single #ifdef/#endif pair within user_64bit_mode() itself. Suggested-by: Borislav Petkov Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: "Michael S. Tsirkin" Cc: Peter Zijlstra Cc: Dave Hansen Cc: ricardo.neri@intel.com Cc: Adrian Hunter Cc: Paul Gortmaker Cc: Huang Rui Cc: Qiaowei Ren Cc: Shuah Khan Cc: Kees Cook Cc: Jonathan Corbet Cc: Jiri Slaby Cc: Dmitry Vyukov Cc: "Ravi V. Shankar" Cc: Chris Metcalf Cc: Brian Gerst Cc: Arnaldo Carvalho de Melo Cc: Andy Lutomirski Cc: Colin Ian King Cc: Chen Yucong Cc: Adam Buchbinder Cc: Vlastimil Babka Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Andrew Morton Cc: Thomas Garnier Link: https://lkml.kernel.org/r/1509135945-13762-4-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/include/asm/ptrace.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 91c04c8e67fa..e2afbf689309 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -135,9 +135,9 @@ static inline int v8086_mode(struct pt_regs *regs) #endif } -#ifdef CONFIG_X86_64 static inline bool user_64bit_mode(struct pt_regs *regs) { +#ifdef CONFIG_X86_64 #ifndef CONFIG_PARAVIRT /* * On non-paravirt systems, this is the only long mode CPL 3 @@ -148,8 +148,12 @@ static inline bool user_64bit_mode(struct pt_regs *regs) /* Headers are too twisted for this to go in paravirt.h. */ return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs; #endif +#else /* !CONFIG_X86_64 */ + return false; +#endif } +#ifdef CONFIG_X86_64 #define current_user_stack_pointer() current_pt_regs()->sp #define compat_user_stack_pointer() current_pt_regs()->sp #endif From 9da78ba6b47b46428cfdfc0851511ab29c869798 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:58:58 -0700 Subject: [PATCH 27/47] x86/entry/64: Remove the restore_c_regs_and_iret label The only user was the 64-bit opportunistic SYSRET failure path, and that path didn't really need it. This change makes the opportunistic SYSRET code a bit more straightforward and gets rid of the label. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/be3006a7ad3326e3458cf1cc55d416252cbe1986.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 846e84a1d1f7..e8ef83df46e6 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -245,7 +245,6 @@ entry_SYSCALL64_slow_path: call do_syscall_64 /* returns with IRQs disabled */ return_from_SYSCALL_64: - RESTORE_EXTRA_REGS TRACE_IRQS_IRETQ /* we're about to change IF */ /* @@ -314,6 +313,7 @@ return_from_SYSCALL_64: */ syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ + RESTORE_EXTRA_REGS RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp), %rsp UNWIND_HINT_EMPTY @@ -321,7 +321,7 @@ syscall_return_via_sysret: opportunistic_sysret_failed: SWAPGS - jmp restore_c_regs_and_iret + jmp restore_regs_and_iret END(entry_SYSCALL_64) ENTRY(stub_ptregs_64) @@ -638,7 +638,6 @@ retint_kernel: */ GLOBAL(restore_regs_and_iret) RESTORE_EXTRA_REGS -restore_c_regs_and_iret: RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 INTERRUPT_RETURN From 26c4ef9c49d8a0341f6d97ce2cfdd55d1236ed29 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:58:59 -0700 Subject: [PATCH 28/47] x86/entry/64: Split the IRET-to-user and IRET-to-kernel paths These code paths will diverge soon. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/dccf8c7b3750199b4b30383c812d4e2931811509.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 34 +++++++++++++++++++++++--------- arch/x86/entry/entry_64_compat.S | 2 +- arch/x86/kernel/head_64.S | 2 +- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index e8ef83df46e6..3eeb1694210c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -321,7 +321,7 @@ syscall_return_via_sysret: opportunistic_sysret_failed: SWAPGS - jmp restore_regs_and_iret + jmp restore_regs_and_return_to_usermode END(entry_SYSCALL_64) ENTRY(stub_ptregs_64) @@ -423,7 +423,7 @@ ENTRY(ret_from_fork) call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ SWAPGS - jmp restore_regs_and_iret + jmp restore_regs_and_return_to_usermode 1: /* kernel thread */ @@ -612,7 +612,20 @@ GLOBAL(retint_user) call prepare_exit_to_usermode TRACE_IRQS_IRETQ SWAPGS - jmp restore_regs_and_iret + +GLOBAL(restore_regs_and_return_to_usermode) +#ifdef CONFIG_DEBUG_ENTRY + /* Assert that pt_regs indicates user mode. */ + testl $3, CS(%rsp) + jnz 1f + ud2 +1: +#endif + RESTORE_EXTRA_REGS + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 + INTERRUPT_RETURN + /* Returning to kernel space */ retint_kernel: @@ -632,11 +645,14 @@ retint_kernel: */ TRACE_IRQS_IRETQ -/* - * At this label, code paths which return to kernel and to user, - * which come from interrupts/exception and from syscalls, merge. - */ -GLOBAL(restore_regs_and_iret) +GLOBAL(restore_regs_and_return_to_kernel) +#ifdef CONFIG_DEBUG_ENTRY + /* Assert that pt_regs indicates kernel mode. */ + testl $3, CS(%rsp) + jz 1f + ud2 +1: +#endif RESTORE_EXTRA_REGS RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 @@ -1327,7 +1343,7 @@ ENTRY(nmi) * work, because we don't want to enable interrupts. */ SWAPGS - jmp restore_regs_and_iret + jmp restore_regs_and_return_to_usermode .Lnmi_from_kernel: /* diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index e26c25ca7756..9ca014a99968 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -337,7 +337,7 @@ ENTRY(entry_INT80_compat) /* Go back to user mode. */ TRACE_IRQS_ON SWAPGS - jmp restore_regs_and_iret + jmp restore_regs_and_return_to_usermode END(entry_INT80_compat) ENTRY(stub32_clone) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 189bf42dfa2b..08f067faa264 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -326,7 +326,7 @@ early_idt_handler_common: 20: decl early_recursion_flag(%rip) - jmp restore_regs_and_iret + jmp restore_regs_and_return_to_kernel END(early_idt_handler_common) __INITDATA From 8a055d7f411d41755ce30db5bb65b154777c4b78 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:00 -0700 Subject: [PATCH 29/47] x86/entry/64: Move SWAPGS into the common IRET-to-usermode path All of the code paths that ended up doing IRET to usermode did SWAPGS immediately beforehand. Move the SWAPGS into the common code. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/27fd6f45b7cd640de38fb9066fd0349bcd11f8e1.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 32 ++++++++++++++------------------ arch/x86/entry/entry_64_compat.S | 3 +-- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 3eeb1694210c..d6ffdc9afcbb 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -249,12 +249,14 @@ return_from_SYSCALL_64: /* * Try to use SYSRET instead of IRET if we're returning to - * a completely clean 64-bit userspace context. + * a completely clean 64-bit userspace context. If we're not, + * go to the slow exit path. */ movq RCX(%rsp), %rcx movq RIP(%rsp), %r11 - cmpq %rcx, %r11 /* RCX == RIP */ - jne opportunistic_sysret_failed + + cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */ + jne swapgs_restore_regs_and_return_to_usermode /* * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP @@ -272,14 +274,14 @@ return_from_SYSCALL_64: /* If this changed %rcx, it was not canonical */ cmpq %rcx, %r11 - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */ - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode movq R11(%rsp), %r11 cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */ - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode /* * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot @@ -300,12 +302,12 @@ return_from_SYSCALL_64: * would never get past 'stuck_here'. */ testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 - jnz opportunistic_sysret_failed + jnz swapgs_restore_regs_and_return_to_usermode /* nothing to check for RSP */ cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */ - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode /* * We win! This label is here just for ease of understanding @@ -318,10 +320,6 @@ syscall_return_via_sysret: movq RSP(%rsp), %rsp UNWIND_HINT_EMPTY USERGS_SYSRET64 - -opportunistic_sysret_failed: - SWAPGS - jmp restore_regs_and_return_to_usermode END(entry_SYSCALL_64) ENTRY(stub_ptregs_64) @@ -422,8 +420,7 @@ ENTRY(ret_from_fork) movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ - SWAPGS - jmp restore_regs_and_return_to_usermode + jmp swapgs_restore_regs_and_return_to_usermode 1: /* kernel thread */ @@ -611,9 +608,8 @@ GLOBAL(retint_user) mov %rsp,%rdi call prepare_exit_to_usermode TRACE_IRQS_IRETQ - SWAPGS -GLOBAL(restore_regs_and_return_to_usermode) +GLOBAL(swapgs_restore_regs_and_return_to_usermode) #ifdef CONFIG_DEBUG_ENTRY /* Assert that pt_regs indicates user mode. */ testl $3, CS(%rsp) @@ -621,6 +617,7 @@ GLOBAL(restore_regs_and_return_to_usermode) ud2 1: #endif + SWAPGS RESTORE_EXTRA_REGS RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 @@ -1342,8 +1339,7 @@ ENTRY(nmi) * Return back to user mode. We must *not* do the normal exit * work, because we don't want to enable interrupts. */ - SWAPGS - jmp restore_regs_and_return_to_usermode + jmp swapgs_restore_regs_and_return_to_usermode .Lnmi_from_kernel: /* diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 9ca014a99968..932b96ce1b06 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -336,8 +336,7 @@ ENTRY(entry_INT80_compat) /* Go back to user mode. */ TRACE_IRQS_ON - SWAPGS - jmp restore_regs_and_return_to_usermode + jmp swapgs_restore_regs_and_return_to_usermode END(entry_INT80_compat) ENTRY(stub32_clone) From e872045bfd9c465a8555bab4b8567d56a4d2d3bb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:01 -0700 Subject: [PATCH 30/47] x86/entry/64: Simplify reg restore code in the standard IRET paths The old code restored all the registers with movq instead of pop. In theory, this was done because some CPUs have higher movq throughput, but any gain there would be tiny and is almost certainly outweighed by the higher text size. This saves 96 bytes of text. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ad82520a207ccd851b04ba613f4f752b33ac05f7.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/calling.h | 21 +++++++++++++++++++++ arch/x86/entry/entry_64.S | 12 ++++++------ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 640aafebdc00..0b9dd8123701 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -151,6 +151,27 @@ For 32-bit we have the following conventions - kernel is built with UNWIND_HINT_REGS offset=\offset extra=0 .endm + .macro POP_EXTRA_REGS + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + .endm + + .macro POP_C_REGS + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + popq %rdi + .endm + .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 .if \rstor_r11 movq 6*8(%rsp), %r11 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d6ffdc9afcbb..925d56246071 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -618,9 +618,9 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) 1: #endif SWAPGS - RESTORE_EXTRA_REGS - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 + POP_EXTRA_REGS + POP_C_REGS + addq $8, %rsp /* skip regs->orig_ax */ INTERRUPT_RETURN @@ -650,9 +650,9 @@ GLOBAL(restore_regs_and_return_to_kernel) ud2 1: #endif - RESTORE_EXTRA_REGS - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 + POP_EXTRA_REGS + POP_C_REGS + addq $8, %rsp /* skip regs->orig_ax */ INTERRUPT_RETURN ENTRY(native_iret) From e53178328c9b96fbdbc719e78c93b5687ee007c3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:02 -0700 Subject: [PATCH 31/47] x86/entry/64: Shrink paranoid_exit_restore and make labels local paranoid_exit_restore was a copy of restore_regs_and_return_to_kernel. Merge them and make the paranoid_exit internal labels local. Keeping .Lparanoid_exit makes the code a bit shorter because it allows a 2-byte jnz instead of a 5-byte jnz. Saves 96 bytes of text. ( This is still a bit suboptimal in a non-CONFIG_TRACE_IRQFLAGS kernel, but fixing that would make the code rather messy. ) Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/510d66a1895cda9473c84b1086f0bb974f22de6a.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 925d56246071..155396443aaa 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1123,17 +1123,14 @@ ENTRY(paranoid_exit) DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF_DEBUG testl %ebx, %ebx /* swapgs needed? */ - jnz paranoid_exit_no_swapgs + jnz .Lparanoid_exit_no_swapgs TRACE_IRQS_IRETQ SWAPGS_UNSAFE_STACK - jmp paranoid_exit_restore -paranoid_exit_no_swapgs: + jmp .Lparanoid_exit_restore +.Lparanoid_exit_no_swapgs: TRACE_IRQS_IRETQ_DEBUG -paranoid_exit_restore: - RESTORE_EXTRA_REGS - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 - INTERRUPT_RETURN +.Lparanoid_exit_restore: + jmp restore_regs_and_return_to_kernel END(paranoid_exit) /* From 4fbb39108f972437c44e5ffa781b56635d496826 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:03 -0700 Subject: [PATCH 32/47] x86/entry/64: Use pop instead of movq in syscall_return_via_sysret Saves 64 bytes. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/6609b7f74ab31c36604ad746e019ea8495aec76c.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 155396443aaa..4f9b4463b3fc 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -315,10 +315,18 @@ return_from_SYSCALL_64: */ syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ - RESTORE_EXTRA_REGS - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RSP(%rsp), %rsp UNWIND_HINT_EMPTY + POP_EXTRA_REGS + popq %rsi /* skip r11 */ + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rsi /* skip rcx */ + popq %rdx + popq %rsi + popq %rdi + movq RSP-ORIG_RAX(%rsp), %rsp USERGS_SYSRET64 END(entry_SYSCALL_64) From a512210643da8082cb44181dba8b18e752bd68f0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:04 -0700 Subject: [PATCH 33/47] x86/entry/64: Merge the fast and slow SYSRET paths They did almost the same thing. Remove a bunch of pointless instructions (mostly hidden in macros) and reduce cognitive load by merging them. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1204e20233fcab9130a1ba80b3b1879b5db3fc1f.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 4f9b4463b3fc..b5a0ea63d391 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -220,10 +220,9 @@ entry_SYSCALL_64_fastpath: TRACE_IRQS_ON /* user mode is traced as IRQs on */ movq RIP(%rsp), %rcx movq EFLAGS(%rsp), %r11 - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RSP(%rsp), %rsp + addq $6*8, %rsp /* skip extra regs -- they were preserved */ UNWIND_HINT_EMPTY - USERGS_SYSRET64 + jmp .Lpop_c_regs_except_rcx_r11_and_sysret 1: /* @@ -317,6 +316,7 @@ syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ UNWIND_HINT_EMPTY POP_EXTRA_REGS +.Lpop_c_regs_except_rcx_r11_and_sysret: popq %rsi /* skip r11 */ popq %r10 popq %r9 From 471ee4832209e986029b9fabdaad57b1eecb856b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:05 -0700 Subject: [PATCH 34/47] x86/entry/64: Use POP instead of MOV to restore regs on NMI return This gets rid of the last user of the old RESTORE_..._REGS infrastructure. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/652a260f17a160789bc6a41d997f98249b73e2ab.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index b5a0ea63d391..5b2f0bc661a0 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1559,11 +1559,14 @@ end_repeat_nmi: nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: - RESTORE_EXTRA_REGS - RESTORE_C_REGS + POP_EXTRA_REGS + POP_C_REGS - /* Point RSP at the "iret" frame. */ - REMOVE_PT_GPREGS_FROM_STACK 6*8 + /* + * Skip orig_ax and the "outermost" frame to point RSP at the "iret" + * at the "iret" frame. + */ + addq $6*8, %rsp /* * Clear "NMI executing". Set DF first so that we can easily From c39858de696f0cc160a544455e8403d663d577e9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:06 -0700 Subject: [PATCH 35/47] x86/entry/64: Remove the RESTORE_..._REGS infrastructure All users of RESTORE_EXTRA_REGS, RESTORE_C_REGS and such, and REMOVE_PT_GPREGS_FROM_STACK are gone. Delete the macros. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c32672f6e47c561893316d48e06c7656b1039a36.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/calling.h | 52 ---------------------------------------- 1 file changed, 52 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 0b9dd8123701..1895a685d3dd 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -141,16 +141,6 @@ For 32-bit we have the following conventions - kernel is built with UNWIND_HINT_REGS offset=\offset .endm - .macro RESTORE_EXTRA_REGS offset=0 - movq 0*8+\offset(%rsp), %r15 - movq 1*8+\offset(%rsp), %r14 - movq 2*8+\offset(%rsp), %r13 - movq 3*8+\offset(%rsp), %r12 - movq 4*8+\offset(%rsp), %rbp - movq 5*8+\offset(%rsp), %rbx - UNWIND_HINT_REGS offset=\offset extra=0 - .endm - .macro POP_EXTRA_REGS popq %r15 popq %r14 @@ -172,48 +162,6 @@ For 32-bit we have the following conventions - kernel is built with popq %rdi .endm - .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 - .if \rstor_r11 - movq 6*8(%rsp), %r11 - .endif - .if \rstor_r8910 - movq 7*8(%rsp), %r10 - movq 8*8(%rsp), %r9 - movq 9*8(%rsp), %r8 - .endif - .if \rstor_rax - movq 10*8(%rsp), %rax - .endif - .if \rstor_rcx - movq 11*8(%rsp), %rcx - .endif - .if \rstor_rdx - movq 12*8(%rsp), %rdx - .endif - movq 13*8(%rsp), %rsi - movq 14*8(%rsp), %rdi - UNWIND_HINT_IRET_REGS offset=16*8 - .endm - .macro RESTORE_C_REGS - RESTORE_C_REGS_HELPER 1,1,1,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_RAX - RESTORE_C_REGS_HELPER 0,1,1,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_RCX - RESTORE_C_REGS_HELPER 1,0,1,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_R11 - RESTORE_C_REGS_HELPER 1,1,0,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_RCX_R11 - RESTORE_C_REGS_HELPER 1,0,0,1,1 - .endm - - .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 - subq $-(15*8+\addskip), %rsp - .endm - .macro icebp .byte 0xf1 .endm From 43e4111086a70c78bedb6ad990bee97f17b27a6e Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 2 Nov 2017 00:59:07 -0700 Subject: [PATCH 36/47] xen, x86/entry/64: Add xen NMI trap entry Instead of trying to execute any NMI via the bare metal's NMI trap handler use a Xen specific one for PV domains, like we do for e.g. debug traps. As in a PV domain the NMI is handled via the normal kernel stack this is the correct thing to do. This will enable us to get rid of the very fragile and questionable dependencies between the bare metal NMI handler and Xen assumptions believed to be broken anyway. Signed-off-by: Juergen Gross Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5baf5c0528d58402441550c5770b98e7961e7680.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 2 +- arch/x86/include/asm/traps.h | 2 +- arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 5b2f0bc661a0..a3f76ab5d0ea 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1078,6 +1078,7 @@ idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK idtentry stack_segment do_stack_segment has_error_code=1 #ifdef CONFIG_XEN +idtentry xennmi do_nmi has_error_code=0 idtentry xendebug do_debug has_error_code=0 idtentry xenint3 do_int3 has_error_code=0 #endif @@ -1240,7 +1241,6 @@ ENTRY(error_exit) END(error_exit) /* Runs on exception stack */ -/* XXX: broken on Xen PV */ ENTRY(nmi) UNWIND_HINT_IRET_REGS /* diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index da3c3a3674a5..e76ce80ca18b 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -37,9 +37,9 @@ asmlinkage void simd_coprocessor_error(void); #if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) asmlinkage void xen_divide_error(void); +asmlinkage void xen_xennmi(void); asmlinkage void xen_xendebug(void); asmlinkage void xen_xenint3(void); -asmlinkage void xen_nmi(void); asmlinkage void xen_overflow(void); asmlinkage void xen_bounds(void); asmlinkage void xen_invalid_op(void); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 69b9deff7e5c..8da4eff19c2a 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -600,7 +600,7 @@ static struct trap_array_entry trap_array[] = { #ifdef CONFIG_X86_MCE { machine_check, xen_machine_check, true }, #endif - { nmi, xen_nmi, true }, + { nmi, xen_xennmi, true }, { overflow, xen_overflow, false }, #ifdef CONFIG_IA32_EMULATION { entry_INT80_compat, xen_entry_INT80_compat, false }, diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index dae2cc33afb5..286ecc198562 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -29,7 +29,7 @@ xen_pv_trap debug xen_pv_trap xendebug xen_pv_trap int3 xen_pv_trap xenint3 -xen_pv_trap nmi +xen_pv_trap xennmi xen_pv_trap overflow xen_pv_trap bounds xen_pv_trap invalid_op From 929bacec21478a72c78e4f29f98fb799bd00105a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:08 -0700 Subject: [PATCH 37/47] x86/entry/64: De-Xen-ify our NMI code Xen PV is fundamentally incompatible with our fancy NMI code: it doesn't use IST at all, and Xen entries clobber two stack slots below the hardware frame. Drop Xen PV support from our NMI code entirely. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/bfbe711b5ae03f672f8848999a8eb2711efc7f98.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index a3f76ab5d0ea..40e9933a2d33 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1240,9 +1240,13 @@ ENTRY(error_exit) jmp retint_user END(error_exit) -/* Runs on exception stack */ +/* + * Runs on exception stack. Xen PV does not go through this path at all, + * so we can use real assembly here. + */ ENTRY(nmi) UNWIND_HINT_IRET_REGS + /* * We allow breakpoints in NMIs. If a breakpoint occurs, then * the iretq it performs will take us out of NMI context. @@ -1300,7 +1304,7 @@ ENTRY(nmi) * stacks lest we corrupt the "NMI executing" variable. */ - SWAPGS_UNSAFE_STACK + swapgs cld movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp @@ -1465,7 +1469,7 @@ nested_nmi_out: popq %rdx /* We are returning to kernel mode, so this cannot result in a fault. */ - INTERRUPT_RETURN + iretq first_nmi: /* Restore rdx. */ @@ -1496,7 +1500,7 @@ first_nmi: pushfq /* RFLAGS */ pushq $__KERNEL_CS /* CS */ pushq $1f /* RIP */ - INTERRUPT_RETURN /* continues at repeat_nmi below */ + iretq /* continues at repeat_nmi below */ UNWIND_HINT_IRET_REGS 1: #endif @@ -1571,20 +1575,22 @@ nmi_restore: /* * Clear "NMI executing". Set DF first so that we can easily * distinguish the remaining code between here and IRET from - * the SYSCALL entry and exit paths. On a native kernel, we - * could just inspect RIP, but, on paravirt kernels, - * INTERRUPT_RETURN can translate into a jump into a - * hypercall page. + * the SYSCALL entry and exit paths. + * + * We arguably should just inspect RIP instead, but I (Andy) wrote + * this code when I had the misapprehension that Xen PV supported + * NMIs, and Xen PV would break that approach. */ std movq $0, 5*8(%rsp) /* clear "NMI executing" */ /* - * INTERRUPT_RETURN reads the "iret" frame and exits the NMI - * stack in a single instruction. We are returning to kernel - * mode, so this cannot result in a fault. + * iretq reads the "iret" frame and exits the NMI stack in a + * single instruction. We are returning to kernel mode, so this + * cannot result in a fault. Similarly, we don't need to worry + * about espfix64 on the way back to kernel mode. */ - INTERRUPT_RETURN + iretq END(nmi) ENTRY(ignore_sysret) From bd7dc5a6afac719d8ce4092391eef2c7e83c2a75 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:09 -0700 Subject: [PATCH 38/47] x86/entry/32: Pull the MSR_IA32_SYSENTER_CS update code out of native_load_sp0() This causes the MSR_IA32_SYSENTER_CS write to move out of the paravirt callback. This shouldn't affect Xen PV: Xen already ignores MSR_IA32_SYSENTER_ESP writes. In any event, Xen doesn't support vm86() in a useful way. Note to any potential backporters: This patch won't break lguest, as lguest didn't have any SYSENTER support at all. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/75cf09fe03ae778532d0ca6c65aa58e66bc2f90c.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 7 ------- arch/x86/include/asm/switch_to.h | 12 ++++++++++++ arch/x86/kernel/process_32.c | 4 +++- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/vm86_32.c | 6 +++++- 5 files changed, 21 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b390ff76e58f..0167e3e35a57 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -520,13 +520,6 @@ static inline void native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) { tss->x86_tss.sp0 = thread->sp0; -#ifdef CONFIG_X86_32 - /* Only happens when SEP is enabled, no need to test "SEP"arately: */ - if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { - tss->x86_tss.ss1 = thread->sysenter_cs; - wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); - } -#endif } static inline void native_swapgs(void) diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index fcc5cd387fd1..7ae8caffbada 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -72,4 +72,16 @@ do { \ ((last) = __switch_to_asm((prev), (next))); \ } while (0) +#ifdef CONFIG_X86_32 +static inline void refresh_sysenter_cs(struct thread_struct *thread) +{ + /* Only happens when SEP is enabled, no need to test "SEP"arately: */ + if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs)) + return; + + this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs); + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); +} +#endif + #endif /* _ASM_X86_SWITCH_TO_H */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 11966251cd42..0936ed3da6b6 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -284,9 +284,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Reload esp0 and cpu_current_top_of_stack. This changes - * current_thread_info(). + * current_thread_info(). Refresh the SYSENTER configuration in + * case prev or next is vm86. */ load_sp0(tss, next); + refresh_sysenter_cs(next); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 302e7b2572d1..a6ff6d1a0110 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -464,7 +464,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ this_cpu_write(current_task, next_p); - /* Reload esp0 and ss1. This changes current_thread_info(). */ + /* Reload sp0. */ load_sp0(tss, next); /* diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 7924a5356c8a..5bc1c3ab6287 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -54,6 +54,7 @@ #include #include #include +#include /* * Known problems: @@ -149,6 +150,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) tsk->thread.sp0 = vm86->saved_sp0; tsk->thread.sysenter_cs = __KERNEL_CS; load_sp0(tss, &tsk->thread); + refresh_sysenter_cs(&tsk->thread); vm86->saved_sp0 = 0; put_cpu(); @@ -368,8 +370,10 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) /* make room for real-mode segments */ tsk->thread.sp0 += 16; - if (static_cpu_has(X86_FEATURE_SEP)) + if (static_cpu_has(X86_FEATURE_SEP)) { tsk->thread.sysenter_cs = 0; + refresh_sysenter_cs(&tsk->thread); + } load_sp0(tss, &tsk->thread); put_cpu(); From da51da189a24bb9b7e2d5a123be096e51a4695a5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:10 -0700 Subject: [PATCH 39/47] x86/entry/64: Pass SP0 directly to load_sp0() load_sp0() had an odd signature: void load_sp0(struct tss_struct *tss, struct thread_struct *thread); Simplify it to: void load_sp0(unsigned long sp0); Also simplify a few get_cpu()/put_cpu() sequences to preempt_disable()/preempt_enable(). Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2655d8b42ed940aa384fe18ee1129bbbcf730a08.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/paravirt.h | 5 ++--- arch/x86/include/asm/paravirt_types.h | 2 +- arch/x86/include/asm/processor.h | 9 ++++----- arch/x86/kernel/cpu/common.c | 4 ++-- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/vm86_32.c | 14 ++++++-------- arch/x86/xen/enlighten_pv.c | 7 +++---- 8 files changed, 20 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 12deec722cf0..43d4f90edebc 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -15,10 +15,9 @@ #include #include -static inline void load_sp0(struct tss_struct *tss, - struct thread_struct *thread) +static inline void load_sp0(unsigned long sp0) { - PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread); + PVOP_VCALL1(pv_cpu_ops.load_sp0, sp0); } /* The paravirtualized CPUID instruction. */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 280d94c36dad..a916788ac478 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -133,7 +133,7 @@ struct pv_cpu_ops { void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries); void (*free_ldt)(struct desc_struct *ldt, unsigned entries); - void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); + void (*load_sp0)(unsigned long sp0); void (*set_iopl_mask)(unsigned mask); diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 0167e3e35a57..064b84722166 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -517,9 +517,9 @@ static inline void native_set_iopl_mask(unsigned mask) } static inline void -native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) +native_load_sp0(unsigned long sp0) { - tss->x86_tss.sp0 = thread->sp0; + this_cpu_write(cpu_tss.x86_tss.sp0, sp0); } static inline void native_swapgs(void) @@ -544,10 +544,9 @@ static inline unsigned long current_top_of_stack(void) #else #define __cpuid native_cpuid -static inline void load_sp0(struct tss_struct *tss, - struct thread_struct *thread) +static inline void load_sp0(unsigned long sp0) { - native_load_sp0(tss, thread); + native_load_sp0(sp0); } #define set_iopl_mask native_set_iopl_mask diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 03bb004bb15e..4e7fb9c3bfa5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1570,7 +1570,7 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, me); - load_sp0(t, ¤t->thread); + load_sp0(current->thread.sp0); set_tss_desc(cpu, t); load_TR_desc(); load_mm_ldt(&init_mm); @@ -1625,7 +1625,7 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, curr); - load_sp0(t, thread); + load_sp0(thread->sp0); set_tss_desc(cpu, t); load_TR_desc(); load_mm_ldt(&init_mm); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0936ed3da6b6..40b85870e429 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -287,7 +287,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * current_thread_info(). Refresh the SYSENTER configuration in * case prev or next is vm86. */ - load_sp0(tss, next); + load_sp0(next->sp0); refresh_sysenter_cs(next); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index a6ff6d1a0110..2124304fb77a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -465,7 +465,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); /* Reload sp0. */ - load_sp0(tss, next); + load_sp0(next->sp0); /* * Now maybe reload the debug registers and handle I/O bitmaps diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 5bc1c3ab6287..0f1d92cd20ad 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -94,7 +94,6 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) { - struct tss_struct *tss; struct task_struct *tsk = current; struct vm86plus_struct __user *user; struct vm86 *vm86 = current->thread.vm86; @@ -146,13 +145,13 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) do_exit(SIGSEGV); } - tss = &per_cpu(cpu_tss, get_cpu()); + preempt_disable(); tsk->thread.sp0 = vm86->saved_sp0; tsk->thread.sysenter_cs = __KERNEL_CS; - load_sp0(tss, &tsk->thread); + load_sp0(tsk->thread.sp0); refresh_sysenter_cs(&tsk->thread); vm86->saved_sp0 = 0; - put_cpu(); + preempt_enable(); memcpy(®s->pt, &vm86->regs32, sizeof(struct pt_regs)); @@ -238,7 +237,6 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg) static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) { - struct tss_struct *tss; struct task_struct *tsk = current; struct vm86 *vm86 = tsk->thread.vm86; struct kernel_vm86_regs vm86regs; @@ -366,8 +364,8 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) vm86->saved_sp0 = tsk->thread.sp0; lazy_save_gs(vm86->regs32.gs); - tss = &per_cpu(cpu_tss, get_cpu()); /* make room for real-mode segments */ + preempt_disable(); tsk->thread.sp0 += 16; if (static_cpu_has(X86_FEATURE_SEP)) { @@ -375,8 +373,8 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) refresh_sysenter_cs(&tsk->thread); } - load_sp0(tss, &tsk->thread); - put_cpu(); + load_sp0(tsk->thread.sp0); + preempt_enable(); if (vm86->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 8da4eff19c2a..e7b213047724 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -810,15 +810,14 @@ static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, } } -static void xen_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) +static void xen_load_sp0(unsigned long sp0) { struct multicall_space mcs; mcs = xen_mc_entry(0); - MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); + MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0); xen_mc_issue(PARAVIRT_LAZY_CPU); - tss->x86_tss.sp0 = thread->sp0; + this_cpu_write(cpu_tss.x86_tss.sp0, sp0); } void xen_set_iopl_mask(unsigned mask) From 3500130b84a3cdc5b6796eba1daf178944935efe Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:11 -0700 Subject: [PATCH 40/47] x86/entry: Add task_top_of_stack() to find the top of a task's stack This will let us get rid of a few places that hardcode accesses to thread.sp0. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/b49b3f95a8ff858c40c9b0f5b32be0355324327d.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 064b84722166..ad59cec14239 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -795,6 +795,8 @@ static inline void spin_lock_prefetch(const void *x) #define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \ TOP_OF_KERNEL_STACK_PADDING) +#define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1)) + #ifdef CONFIG_X86_32 /* * User space process size: 3GB (default). From f16b3da1dc936c0f8121741d0a1731bf242f2f56 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:12 -0700 Subject: [PATCH 41/47] x86/xen/64, x86/entry/64: Clean up SP code in cpu_initialize_context() I'm removing thread_struct::sp0, and Xen's usage of it is slightly dubious and unnecessary. Use appropriate helpers instead. While we're at at, reorder the code slightly to make it more obvious what's going on. Signed-off-by: Andy Lutomirski Reviewed-by: Juergen Gross Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/d5b9a3da2b47c68325bd2bbe8f82d9554dee0d0f.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/xen/smp_pv.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 51471408fdd1..8c0e047d0b80 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -13,6 +13,7 @@ * single-threaded. */ #include +#include #include #include #include @@ -293,12 +294,19 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) #endif memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); + /* + * Bring up the CPU in cpu_bringup_and_idle() with the stack + * pointing just below where pt_regs would be if it were a normal + * kernel entry. + */ ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; ctxt->flags = VGCF_IN_KERNEL; ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ ctxt->user_regs.ds = __USER_DS; ctxt->user_regs.es = __USER_DS; ctxt->user_regs.ss = __KERNEL_DS; + ctxt->user_regs.cs = __KERNEL_CS; + ctxt->user_regs.esp = (unsigned long)task_pt_regs(idle); xen_copy_trap_info(ctxt->trap_ctxt); @@ -313,8 +321,13 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->gdt_frames[0] = gdt_mfn; ctxt->gdt_ents = GDT_ENTRIES; + /* + * Set SS:SP that Xen will use when entering guest kernel mode + * from guest user mode. Subsequent calls to load_sp0() can + * change this value. + */ ctxt->kernel_ss = __KERNEL_DS; - ctxt->kernel_sp = idle->thread.sp0; + ctxt->kernel_sp = task_top_of_stack(idle); #ifdef CONFIG_X86_32 ctxt->event_callback_cs = __KERNEL_CS; @@ -326,10 +339,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) (unsigned long)xen_hypervisor_callback; ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; - ctxt->user_regs.cs = __KERNEL_CS; per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); - ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir)); if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt)) BUG(); From 20bb83443ea79087b5e5f8dab4e9d80bb9bf7acb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:13 -0700 Subject: [PATCH 42/47] x86/entry/64: Stop initializing TSS.sp0 at boot In my quest to get rid of thread_struct::sp0, I want to clean up or remove all of its readers. Two of them are in cpu_init() (32-bit and 64-bit), and they aren't needed. This is because we never enter userspace at all on the threads that CPUs are initialized in. Poison the initial TSS.sp0 and stop initializing it on CPU init. The comment text mostly comes from Dave Hansen. Thanks! Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ee4a00540ad28c6cff475fbcc7769a4460acc861.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 13 ++++++++++--- arch/x86/kernel/process.c | 8 +++++++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4e7fb9c3bfa5..cdf79ab628c2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1570,9 +1570,13 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, me); - load_sp0(current->thread.sp0); + /* + * Initialize the TSS. Don't bother initializing sp0, as the initial + * task never enters user mode. + */ set_tss_desc(cpu, t); load_TR_desc(); + load_mm_ldt(&init_mm); clear_all_debug_regs(); @@ -1594,7 +1598,6 @@ void cpu_init(void) int cpu = smp_processor_id(); struct task_struct *curr = current; struct tss_struct *t = &per_cpu(cpu_tss, cpu); - struct thread_struct *thread = &curr->thread; wait_for_master_cpu(cpu); @@ -1625,9 +1628,13 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, curr); - load_sp0(thread->sp0); + /* + * Initialize the TSS. Don't bother initializing sp0, as the initial + * task never enters user mode. + */ set_tss_desc(cpu, t); load_TR_desc(); + load_mm_ldt(&init_mm); t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index bd6b85fac666..ff8a9acbcf8b 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -48,7 +48,13 @@ */ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { .x86_tss = { - .sp0 = TOP_OF_INIT_STACK, + /* + * .sp0 is only used when entering ring 0 from a lower + * privilege level. Since the init task never runs anything + * but ring 0 code, there is no need for a valid value here. + * Poison it. + */ + .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, #ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, From 46f5a10a721ce8dce8cc8fe55279b49e1c6b3288 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:14 -0700 Subject: [PATCH 43/47] x86/entry/64: Remove all remaining direct thread_struct::sp0 reads The only remaining readers in context switch code or vm86(), and they all just want to update TSS.sp0 to match the current task. Replace them all with a new helper update_sp0(). Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2d231687f4ff288c9d9e98d7861b7df374246ac3.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/switch_to.h | 6 ++++++ arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/vm86_32.c | 4 ++-- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 7ae8caffbada..54e64d909725 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -84,4 +84,10 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) } #endif +/* This is used when switching tasks or entering/exiting vm86 mode. */ +static inline void update_sp0(struct task_struct *task) +{ + load_sp0(task->thread.sp0); +} + #endif /* _ASM_X86_SWITCH_TO_H */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 40b85870e429..45bf0c5f93e1 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -287,7 +287,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * current_thread_info(). Refresh the SYSENTER configuration in * case prev or next is vm86. */ - load_sp0(next->sp0); + update_sp0(next_p); refresh_sysenter_cs(next); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 2124304fb77a..45e380958392 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -465,7 +465,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); /* Reload sp0. */ - load_sp0(next->sp0); + update_sp0(next_p); /* * Now maybe reload the debug registers and handle I/O bitmaps diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 0f1d92cd20ad..a7b44c75c642 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -148,7 +148,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) preempt_disable(); tsk->thread.sp0 = vm86->saved_sp0; tsk->thread.sysenter_cs = __KERNEL_CS; - load_sp0(tsk->thread.sp0); + update_sp0(tsk); refresh_sysenter_cs(&tsk->thread); vm86->saved_sp0 = 0; preempt_enable(); @@ -373,7 +373,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) refresh_sysenter_cs(&tsk->thread); } - load_sp0(tsk->thread.sp0); + update_sp0(tsk); preempt_enable(); if (vm86->flags & VM86_SCREEN_BITMAP) From cd493a6deb8b78eca280d05f7fa73fd69403ae29 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:15 -0700 Subject: [PATCH 44/47] x86/entry/32: Fix cpu_current_top_of_stack initialization at boot cpu_current_top_of_stack's initialization forgot about TOP_OF_KERNEL_STACK_PADDING. This bug didn't matter because the idle threads never enter user mode. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e5e370a7e6e4fddd1c4e4cf619765d96bb874b21.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ad59edd84de7..06c18fe1c09e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -961,8 +961,7 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle) #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ irq_ctx_init(cpu); - per_cpu(cpu_current_top_of_stack, cpu) = - (unsigned long)task_stack_page(idle) + THREAD_SIZE; + per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); #else initial_gs = per_cpu_offset(cpu); #endif From d375cf1530595e33961a8844192cddab913650e3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:16 -0700 Subject: [PATCH 45/47] x86/entry/64: Remove thread_struct::sp0 On x86_64, we can easily calculate sp0 when needed instead of storing it in thread_struct. On x86_32, a similar cleanup would be possible, but it would require cleaning up the vm86 code first, and that can wait for a later cleanup series. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/719cd9c66c548c4350d98a90f050aee8b17f8919.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/compat.h | 1 + arch/x86/include/asm/processor.h | 28 +++++++++------------------- arch/x86/include/asm/switch_to.h | 6 ++++++ arch/x86/kernel/process_64.c | 1 - 4 files changed, 16 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 5343c19814b3..948b6d8ec46f 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -6,6 +6,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ad59cec14239..ae2ae6d80674 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -430,7 +430,9 @@ typedef struct { struct thread_struct { /* Cached TLS descriptors: */ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; +#ifdef CONFIG_X86_32 unsigned long sp0; +#endif unsigned long sp; #ifdef CONFIG_X86_32 unsigned long sysenter_cs; @@ -797,6 +799,13 @@ static inline void spin_lock_prefetch(const void *x) #define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1)) +#define task_pt_regs(task) \ +({ \ + unsigned long __ptr = (unsigned long)task_stack_page(task); \ + __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ + ((struct pt_regs *)__ptr) - 1; \ +}) + #ifdef CONFIG_X86_32 /* * User space process size: 3GB (default). @@ -816,23 +825,6 @@ static inline void spin_lock_prefetch(const void *x) .addr_limit = KERNEL_DS, \ } -/* - * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack. - * This is necessary to guarantee that the entire "struct pt_regs" - * is accessible even if the CPU haven't stored the SS/ESP registers - * on the stack (interrupt gate does not save these registers - * when switching to the same priv ring). - * Therefore beware: accessing the ss/esp fields of the - * "struct pt_regs" is possible, but they may contain the - * completely wrong values. - */ -#define task_pt_regs(task) \ -({ \ - unsigned long __ptr = (unsigned long)task_stack_page(task); \ - __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ - ((struct pt_regs *)__ptr) - 1; \ -}) - #define KSTK_ESP(task) (task_pt_regs(task)->sp) #else @@ -866,11 +858,9 @@ static inline void spin_lock_prefetch(const void *x) #define STACK_TOP_MAX TASK_SIZE_MAX #define INIT_THREAD { \ - .sp0 = TOP_OF_INIT_STACK, \ .addr_limit = KERNEL_DS, \ } -#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) extern unsigned long KSTK_ESP(struct task_struct *task); #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 54e64d909725..010cd6e4eafc 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_SWITCH_TO_H #define _ASM_X86_SWITCH_TO_H +#include + struct task_struct; /* one of the stranger aspects of C forward declarations */ struct task_struct *__switch_to_asm(struct task_struct *prev, @@ -87,7 +89,11 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) /* This is used when switching tasks or entering/exiting vm86 mode. */ static inline void update_sp0(struct task_struct *task) { +#ifdef CONFIG_X86_32 load_sp0(task->thread.sp0); +#else + load_sp0(task_top_of_stack(task)); +#endif } #endif /* _ASM_X86_SWITCH_TO_H */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 45e380958392..eeeb34f85c25 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -274,7 +274,6 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, struct inactive_task_frame *frame; struct task_struct *me = current; - p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; childregs = task_pt_regs(p); fork_frame = container_of(childregs, struct fork_frame, regs); frame = &fork_frame->frame; From 3383642c2f9d4f5b4fa37436db4a109a1a10018c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Nov 2017 00:59:17 -0700 Subject: [PATCH 46/47] x86/traps: Use a new on_thread_stack() helper to clean up an assertion Let's keep the stack-related logic together rather than open-coding a comparison in an assertion in the traps code. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/856b15bee1f55017b8f79d3758b0d51c48a08cf8.1509609304.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 6 ++++++ arch/x86/kernel/traps.c | 3 +-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ae2ae6d80674..f10dae14f951 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -541,6 +541,12 @@ static inline unsigned long current_top_of_stack(void) #endif } +static inline bool on_thread_stack(void) +{ + return (unsigned long)(current_top_of_stack() - + current_stack_pointer) < THREAD_SIZE; +} + #ifdef CONFIG_PARAVIRT #include #else diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 67db4f43309e..42a9c4458f5d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -141,8 +141,7 @@ void ist_begin_non_atomic(struct pt_regs *regs) * will catch asm bugs and any attempt to use ist_preempt_enable * from double_fault. */ - BUG_ON((unsigned long)(current_top_of_stack() - - current_stack_pointer) >= THREAD_SIZE); + BUG_ON(!on_thread_stack()); preempt_enable_no_resched(); } From 1e4c4f610f774df6088d7c065b2dd4d22adba698 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 2 Nov 2017 13:09:26 +0100 Subject: [PATCH 47/47] x86/entry/64: Shorten TEST instructions Convert TESTL to TESTB and save 3 bytes per callsite. No functionality change. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Brian Gerst Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171102120926.4srwerqrr7g72e2k@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 40e9933a2d33..84263c79a119 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -620,7 +620,7 @@ GLOBAL(retint_user) GLOBAL(swapgs_restore_regs_and_return_to_usermode) #ifdef CONFIG_DEBUG_ENTRY /* Assert that pt_regs indicates user mode. */ - testl $3, CS(%rsp) + testb $3, CS(%rsp) jnz 1f ud2 1: @@ -653,7 +653,7 @@ retint_kernel: GLOBAL(restore_regs_and_return_to_kernel) #ifdef CONFIG_DEBUG_ENTRY /* Assert that pt_regs indicates kernel mode. */ - testl $3, CS(%rsp) + testb $3, CS(%rsp) jz 1f ud2 1: