/* SPDX-License-Identifier: GPL-2.0 */

#ifndef _ASM_X86_CPU_ENTRY_AREA_H
#define _ASM_X86_CPU_ENTRY_AREA_H

#include <linux/percpu-defs.h>
#include <asm/processor.h>
#include <asm/intel_ds.h>
#include <asm/pgtable_areas.h>

#ifdef CONFIG_X86_64

/* Macro to enforce the same ordering and stack sizes */
#define ESTACKS_MEMBERS(guardsize, optional_stack_size)		\
	char	DF_stack_guard[guardsize];			\
	char	DF_stack[EXCEPTION_STKSZ];			\
	char	NMI_stack_guard[guardsize];			\
	char	NMI_stack[EXCEPTION_STKSZ];			\
	char	DB_stack_guard[guardsize];			\
	char	DB_stack[EXCEPTION_STKSZ];			\
	char	MCE_stack_guard[guardsize];			\
	char	MCE_stack[EXCEPTION_STKSZ];			\
	char	VC_stack_guard[guardsize];			\
	char	VC_stack[optional_stack_size];			\
	char	VC2_stack_guard[guardsize];			\
	char	VC2_stack[optional_stack_size];			\
	char	IST_top_guard[guardsize];

/* The exception stacks' physical storage. No guard pages required */
struct exception_stacks {
	ESTACKS_MEMBERS(0, 0)
};

/* The effective cpu entry area mapping with guard pages. */
struct cea_exception_stacks {
	ESTACKS_MEMBERS(PAGE_SIZE, EXCEPTION_STKSZ)
};
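
/*
 * Expansion sketch (illustrative only): ESTACKS_MEMBERS(PAGE_SIZE,
 * EXCEPTION_STKSZ) lays out each stack directly above its own guard
 * region, i.e. it expands to members along the lines of:
 *
 *	char DF_stack_guard[PAGE_SIZE];
 *	char DF_stack[EXCEPTION_STKSZ];
 *	char NMI_stack_guard[PAGE_SIZE];
 *	...
 *
 * In the cea_ mapping the guard pages stay unmapped, so running off the
 * bottom of one stack faults instead of silently corrupting the next one.
 */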
/*
 * The exception stack ordering in [cea_]exception_stacks
*/
enum exception_stack_ordering {
	ESTACK_DF,
	ESTACK_NMI,
	ESTACK_DB,
	ESTACK_MCE,
	ESTACK_VC,
	ESTACK_VC2,
	N_EXCEPTION_STACKS
};
#define CEA_ESTACK_SIZE(st)					\
	sizeof(((struct cea_exception_stacks *)0)->st## _stack)

#define CEA_ESTACK_BOT(ceastp, st)				\
	((unsigned long)&(ceastp)->st## _stack)

#define CEA_ESTACK_TOP(ceastp, st)				\
	(CEA_ESTACK_BOT(ceastp, st) + CEA_ESTACK_SIZE(st))

#define CEA_ESTACK_OFFS(st)					\
	offsetof(struct cea_exception_stacks, st## _stack)

#define CEA_ESTACK_PAGES					\
	(sizeof(struct cea_exception_stacks) / PAGE_SIZE)
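
/*
 * Usage sketch, not part of the original header: a hypothetical helper
 * showing how the CEA_ESTACK_* accessors combine. It checks whether a
 * stack pointer lies within the #DF stack of a given
 * cea_exception_stacks instance.
 */
static inline bool cea_sp_on_df_stack_example(struct cea_exception_stacks *cs,
					      unsigned long sp)
{
	/* Bottom is inclusive; top is the first byte above the stack */
	return sp >= CEA_ESTACK_BOT(cs, DF) && sp < CEA_ESTACK_TOP(cs, DF);
}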
#endif
#ifdef CONFIG_X86_32
/*
 * The doublefault stack and the hardware TSS share one page, which
 * keeps struct x86_hw_tss from spanning a page boundary. The page is
 * percpu and is mapped into cpu_entry_area.
 */
struct doublefault_stack {
	unsigned long stack[(PAGE_SIZE - sizeof(struct x86_hw_tss)) / sizeof(unsigned long)];
	struct x86_hw_tss tss;
} __aligned(PAGE_SIZE);
#endif
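
/*
 * Layout sanity sketch (an assumption, not in the original): the stack
 * array is sized as PAGE_SIZE minus the TSS and the struct is
 * page-aligned, so the whole thing occupies exactly one page:
 *
 *	static_assert(sizeof(struct doublefault_stack) == PAGE_SIZE);
 */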
/*
 * cpu_entry_area is a percpu region that contains things needed by the CPU
 * and early entry/exit code.  Real types aren't used for all fields here
 * to avoid circular header dependencies.
 *
 * Every field is a virtual alias of some other allocated backing store.
 * There is no direct allocation of a struct cpu_entry_area.
 */
struct cpu_entry_area {
	char gdt[PAGE_SIZE];

	/*
	 * The GDT is just below entry_stack and thus serves (on x86_64) as
	 * a read-only guard page.  On 32-bit the GDT must be writeable, so
	 * it needs an extra guard page.
	 */
#ifdef CONFIG_X86_32
	char guard_entry_stack[PAGE_SIZE];
#endif
	struct entry_stack_page entry_stack_page;

#ifdef CONFIG_X86_32
	char guard_doublefault_stack[PAGE_SIZE];
	struct doublefault_stack doublefault_stack;
#endif

	/*
	 * On x86_64, the TSS is mapped RO.  On x86_32, it's mapped RW because
	 * we need task switches to work, and task switches write to the TSS.
	 */
	struct tss_struct tss;

#ifdef CONFIG_X86_64
	/*
	 * Exception stacks used for IST entries with guard pages.
	 */
	struct cea_exception_stacks estacks;
#endif
	/*
	 * Per CPU debug store for Intel performance monitoring.  Wastes a
	 * full page at the moment.
	 */
	struct debug_store cpu_debug_store;
	/*
	 * The actual PEBS/BTS buffers must be mapped to user space.
	 * Reserve enough fixmap PTEs.
	 */
	struct debug_store_buffers cpu_debug_buffers;
};

#define CPU_ENTRY_AREA_SIZE		(sizeof(struct cpu_entry_area))
#define CPU_ENTRY_AREA_ARRAY_SIZE	(CPU_ENTRY_AREA_SIZE * NR_CPUS)

/* Total size includes the readonly IDT mapping page as well: */
#define CPU_ENTRY_AREA_TOTAL_SIZE	(CPU_ENTRY_AREA_ARRAY_SIZE + PAGE_SIZE)
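
/*
 * The 32-bit setup code (arch/x86/mm/cpu_entry_area.c) cross-checks these
 * sizes against the fixed CPU_ENTRY_AREA_PAGES define and the address-based
 * map size with build-time asserts along the lines of ('+1' being the
 * readonly IDT mapping page):
 *
 *	BUILD_BUG_ON((CPU_ENTRY_AREA_PAGES + 1) * PAGE_SIZE != CPU_ENTRY_AREA_MAP_SIZE);
 *	BUILD_BUG_ON(CPU_ENTRY_AREA_TOTAL_SIZE != CPU_ENTRY_AREA_MAP_SIZE);
 */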

DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks);
extern void setup_cpu_entry_areas(void);
extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);

extern struct cpu_entry_area *get_cpu_entry_area(int cpu);

static inline struct entry_stack *cpu_entry_stack(int cpu)
{
	return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
}
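
/*
 * Usage sketch (illustrative): entry code runs on this per-CPU stack, e.g.
 *
 *	struct entry_stack *es = cpu_entry_stack(smp_processor_id());
 *
 * where smp_processor_id() identifies the current CPU.
 */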
#define __this_cpu_ist_top_va(name)					\
	CEA_ESTACK_TOP(__this_cpu_read(cea_exception_stacks), name)

#define __this_cpu_ist_bottom_va(name)					\
	CEA_ESTACK_BOT(__this_cpu_read(cea_exception_stacks), name)
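
/*
 * Example (illustrative): the top of the current CPU's NMI IST stack is
 *
 *	unsigned long tos = __this_cpu_ist_top_va(NMI);
 *
 * 'NMI' matches the NMI_stack member generated by ESTACKS_MEMBERS() above.
 */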
#endif /* _ASM_X86_CPU_ENTRY_AREA_H */