// SPDX-License-Identifier: GPL-2.0
/*
 * Transitional page tables for kexec and hibernate
 *
 * This file derived from: arch/arm64/kernel/hibernate.c
 *
 * Copyright (c) 2021, Microsoft Corporation.
 * Pasha Tatashin <pasha.tatashin@soleen.com>
 *
 */

/*
 * Transitional tables are used while the system transfers from one world to
 * another: for example, during hibernate restore and kexec reboots. During
 * these phases one cannot rely on the live page tables not being overwritten,
 * because hibernate and kexec may overwrite them during the transition.
 */

#include <asm/trans_pgd.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <linux/suspend.h>
#include <linux/bug.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/kfence.h>

static void *trans_alloc(struct trans_pgd_info *info)
{
	return info->trans_alloc_page(info->trans_alloc_arg);
}
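
/*
 * Illustrative sketch (not part of this file's code): callers drive these
 * helpers through struct trans_pgd_info, which carries a page-allocator
 * callback and its argument. A hypothetical caller could wire it up roughly
 * as below, with get_zeroed_page() standing in for whatever safe allocator
 * the caller really uses (hibernate, for instance, wraps get_safe_page()):
 *
 *	static void *example_trans_alloc_page(void *arg)
 *	{
 *		return (void *)get_zeroed_page(GFP_ATOMIC);
 *	}
 *
 *	static struct trans_pgd_info example_info = {
 *		.trans_alloc_page	= example_trans_alloc_page,
 *		.trans_alloc_arg	= NULL,
 *	};
 */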

static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr)
{
	pte_t pte = READ_ONCE(*src_ptep);

	if (pte_valid(pte)) {
		/*
		 * Resume will overwrite areas that may be marked
		 * read only (code, rodata). Clear the RDONLY bit from
		 * the temporary mappings we use during restore.
		 */
		set_pte(dst_ptep, pte_mkwrite_novma(pte));
	} else if ((debug_pagealloc_enabled() ||
		   is_kfence_address((void *)addr)) && !pte_none(pte)) {
		/*
		 * debug_pagealloc will have removed the PTE_VALID bit if
		 * the page isn't in use by the resume kernel. It may have
		 * been in use by the original kernel, in which case we need
		 * to put it back in our copy to do the restore.
		 *
		 * Before marking this entry valid, check that the pfn should
		 * be mapped.
		 */
		BUG_ON(!pfn_valid(pte_pfn(pte)));

		set_pte(dst_ptep, pte_mkpresent(pte_mkwrite_novma(pte)));
	}
}

static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp,
		    pmd_t *src_pmdp, unsigned long start, unsigned long end)
{
	pte_t *src_ptep;
	pte_t *dst_ptep;
	unsigned long addr = start;

	dst_ptep = trans_alloc(info);
	if (!dst_ptep)
		return -ENOMEM;
	pmd_populate_kernel(NULL, dst_pmdp, dst_ptep);
	dst_ptep = pte_offset_kernel(dst_pmdp, start);

	src_ptep = pte_offset_kernel(src_pmdp, start);
	do {
		_copy_pte(dst_ptep, src_ptep, addr);
	} while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end);

	return 0;
}

static int copy_pmd(struct trans_pgd_info *info, pud_t *dst_pudp,
		    pud_t *src_pudp, unsigned long start, unsigned long end)
{
	pmd_t *src_pmdp;
	pmd_t *dst_pmdp;
	unsigned long next;
	unsigned long addr = start;

	if (pud_none(READ_ONCE(*dst_pudp))) {
		dst_pmdp = trans_alloc(info);
		if (!dst_pmdp)
			return -ENOMEM;
		pud_populate(NULL, dst_pudp, dst_pmdp);
	}
	dst_pmdp = pmd_offset(dst_pudp, start);

	src_pmdp = pmd_offset(src_pudp, start);
	do {
		pmd_t pmd = READ_ONCE(*src_pmdp);

		next = pmd_addr_end(addr, end);
		if (pmd_none(pmd))
			continue;
		if (pmd_table(pmd)) {
			if (copy_pte(info, dst_pmdp, src_pmdp, addr, next))
				return -ENOMEM;
		} else {
			set_pmd(dst_pmdp,
				__pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY));
		}
	} while (dst_pmdp++, src_pmdp++, addr = next, addr != end);

	return 0;
}

static int copy_pud(struct trans_pgd_info *info, p4d_t *dst_p4dp,
		    p4d_t *src_p4dp, unsigned long start,
		    unsigned long end)
{
	pud_t *dst_pudp;
	pud_t *src_pudp;
	unsigned long next;
	unsigned long addr = start;

	if (p4d_none(READ_ONCE(*dst_p4dp))) {
		dst_pudp = trans_alloc(info);
		if (!dst_pudp)
			return -ENOMEM;
		p4d_populate(NULL, dst_p4dp, dst_pudp);
	}
	dst_pudp = pud_offset(dst_p4dp, start);

	src_pudp = pud_offset(src_p4dp, start);
	do {
		pud_t pud = READ_ONCE(*src_pudp);

		next = pud_addr_end(addr, end);
		if (pud_none(pud))
			continue;
		if (pud_table(pud)) {
			if (copy_pmd(info, dst_pudp, src_pudp, addr, next))
				return -ENOMEM;
		} else {
			set_pud(dst_pudp,
				__pud(pud_val(pud) & ~PUD_SECT_RDONLY));
		}
	} while (dst_pudp++, src_pudp++, addr = next, addr != end);

	return 0;
}

static int copy_p4d(struct trans_pgd_info *info, pgd_t *dst_pgdp,
		    pgd_t *src_pgdp, unsigned long start,
		    unsigned long end)
{
	p4d_t *dst_p4dp;
	p4d_t *src_p4dp;
	unsigned long next;
	unsigned long addr = start;

	dst_p4dp = p4d_offset(dst_pgdp, start);
	src_p4dp = p4d_offset(src_pgdp, start);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none(READ_ONCE(*src_p4dp)))
			continue;
		if (copy_pud(info, dst_p4dp, src_p4dp, addr, next))
			return -ENOMEM;
	} while (dst_p4dp++, src_p4dp++, addr = next, addr != end);

	return 0;
}

static int copy_page_tables(struct trans_pgd_info *info, pgd_t *dst_pgdp,
			    unsigned long start, unsigned long end)
{
	unsigned long next;
	unsigned long addr = start;
	pgd_t *src_pgdp = pgd_offset_k(start);

	dst_pgdp = pgd_offset_pgd(dst_pgdp, start);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none(READ_ONCE(*src_pgdp)))
			continue;
		if (copy_p4d(info, dst_pgdp, src_pgdp, addr, next))
			return -ENOMEM;
	} while (dst_pgdp++, src_pgdp++, addr = next, addr != end);

	return 0;
}

/*
 * Create trans_pgd and copy linear map.
 * info:	contains allocator and its argument
 * dst_pgdp:	new page table that is created, and to which map is copied.
 * start:	Start of the interval (inclusive).
 * end:		End of the interval (exclusive).
 *
 * Returns 0 on success, and -ENOMEM on failure.
 */
int trans_pgd_create_copy(struct trans_pgd_info *info, pgd_t **dst_pgdp,
			  unsigned long start, unsigned long end)
{
	int rc;
	pgd_t *trans_pgd = trans_alloc(info);

	if (!trans_pgd) {
		pr_err("Failed to allocate memory for temporary page tables.\n");
		return -ENOMEM;
	}

	rc = copy_page_tables(info, trans_pgd, start, end);
	if (!rc)
		*dst_pgdp = trans_pgd;

	return rc;
}
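
/*
 * Illustrative use (sketch): the hibernate resume path builds a copy of the
 * linear map along these lines, where "example_info" is an allocator
 * description as in the sketch near trans_alloc() above:
 *
 *	pgd_t *tmp_pg_dir;
 *	int rc;
 *
 *	rc = trans_pgd_create_copy(&example_info, &tmp_pg_dir,
 *				   PAGE_OFFSET, PAGE_END);
 *	if (rc)
 *		return rc;
 *
 * On success, tmp_pg_dir can be installed as the temporary TTBR1 table for
 * the duration of the restore.
 */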

/*
 * The page we want to idmap may be outside the range covered by VA_BITS that
 * can be built using the kernel's p?d_populate() helpers. As a one off, for a
 * single page, we build these page tables bottom up and just assume that we
 * will need the maximum T0SZ.
 *
 * Returns 0 on success, and -ENOMEM on failure.
 * On success, trans_ttbr0 contains a page table with the idmapped page and
 * t0sz is set to the maximum T0SZ for this page.
 */
int trans_pgd_idmap_page(struct trans_pgd_info *info, phys_addr_t *trans_ttbr0,
			 unsigned long *t0sz, void *page)
{
	phys_addr_t dst_addr = virt_to_phys(page);
	unsigned long pfn = __phys_to_pfn(dst_addr);
	int max_msb = (dst_addr & GENMASK(52, 48)) ? 51 : 47;
	int bits_mapped = PAGE_SHIFT - 4;
	unsigned long level_mask, prev_level_entry, *levels[4];
	int this_level, index, level_lsb, level_msb;

	dst_addr &= PAGE_MASK;
	prev_level_entry = pte_val(pfn_pte(pfn, PAGE_KERNEL_ROX));

	for (this_level = 3; this_level >= 0; this_level--) {
		levels[this_level] = trans_alloc(info);
		if (!levels[this_level])
			return -ENOMEM;

		level_lsb = ARM64_HW_PGTABLE_LEVEL_SHIFT(this_level);
		level_msb = min(level_lsb + bits_mapped, max_msb);
		level_mask = GENMASK_ULL(level_msb, level_lsb);

		index = (dst_addr & level_mask) >> level_lsb;
		*(levels[this_level] + index) = prev_level_entry;

		pfn = virt_to_pfn(levels[this_level]);
		prev_level_entry = pte_val(pfn_pte(pfn,
						   __pgprot(PMD_TYPE_TABLE)));

		if (level_msb == max_msb)
			break;
	}

	*trans_ttbr0 = phys_to_ttbr(__pfn_to_phys(pfn));
	*t0sz = TCR_T0SZ(max_msb + 1);

	return 0;
}
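
/*
 * Illustrative use (sketch): kexec and hibernate idmap the single page of
 * relocation code so it can keep running while the live page tables are
 * replaced. "reloc_code" below is a hypothetical stand-in for that page:
 *
 *	phys_addr_t trans_ttbr0;
 *	unsigned long t0sz;
 *	int rc;
 *
 *	rc = trans_pgd_idmap_page(&example_info, &trans_ttbr0, &t0sz,
 *				  reloc_code);
 *	if (rc)
 *		return rc;
 *	cpu_install_ttbr0(trans_ttbr0, t0sz);
 */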

/*
 * Create a copy of the vector table so we can call HVC_SET_VECTORS or
 * HVC_SOFT_RESTART from contexts where the table may be overwritten.
 */
int trans_pgd_copy_el2_vectors(struct trans_pgd_info *info,
			       phys_addr_t *el2_vectors)
{
	void *hyp_stub = trans_alloc(info);

	if (!hyp_stub)
		return -ENOMEM;
	*el2_vectors = virt_to_phys(hyp_stub);
	memcpy(hyp_stub, &trans_pgd_stub_vectors, ARM64_VECTOR_TABLE_LEN);
	caches_clean_inval_pou((unsigned long)hyp_stub,
			       (unsigned long)hyp_stub + ARM64_VECTOR_TABLE_LEN);
	dcache_clean_inval_poc((unsigned long)hyp_stub,
			       (unsigned long)hyp_stub + ARM64_VECTOR_TABLE_LEN);

	return 0;
}
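
/*
 * Illustrative use (sketch): a caller that needs to reset EL2 (for example
 * the kexec path on nVHE systems) first copies the stub vectors somewhere
 * that survives the transition, then hands the physical address to the
 * HVC_SET_VECTORS call. Error handling is trimmed for brevity:
 *
 *	phys_addr_t el2_vectors = 0;
 *	int rc;
 *
 *	if (is_hyp_nvhe()) {
 *		rc = trans_pgd_copy_el2_vectors(&example_info, &el2_vectors);
 *		if (rc)
 *			return rc;
 *	}
 */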