/*
 * Page table allocation functions
 *
 * Copyright IBM Corp. 2016
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/mm.h>
#include <linux/sysctl.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <asm/gmap.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

#ifdef CONFIG_PGSTE

static int page_table_allocate_pgste_min = 0;
static int page_table_allocate_pgste_max = 1;
int page_table_allocate_pgste = 0;
EXPORT_SYMBOL(page_table_allocate_pgste);
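
/*
 * The vm.allocate_pgste sysctl (/proc/sys/vm/allocate_pgste) selects
 * whether new page tables are allocated as full 4K tables with page
 * status table extensions (PGSTEs); the extra1/extra2 limits restrict
 * the value to 0 or 1.
 */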
static struct ctl_table page_table_sysctl[] = {
	{
		.procname	= "allocate_pgste",
		.data		= &page_table_allocate_pgste,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO | S_IWUSR,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &page_table_allocate_pgste_min,
		.extra2		= &page_table_allocate_pgste_max,
	},
	{ }
};

static struct ctl_table page_table_sysctl_dir[] = {
	{
		.procname	= "vm",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= page_table_sysctl,
	},
	{ }
};

static int __init page_table_register_sysctl(void)
{
	return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
}
__initcall(page_table_register_sysctl);

#endif /* CONFIG_PGSTE */
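
/*
 * Allocate an order-2 page block (16K) for a 2048-entry region or
 * segment table and return its address.
 */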
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, 2);

	if (!page)
		return NULL;
	arch_set_page_dat(page, 2);
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, 2);
}
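
/*
 * Runs on each CPU via on_each_cpu(): reload the user ASCE if this
 * CPU is currently running the upgraded mm, then flush the local TLB.
 */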
static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;

	if (current->active_mm == mm) {
		clear_user_asce();
		set_user_asce(mm);
	}
	__tlb_flush_local();
}
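
/*
 * Add page table levels until mm->context.asce_limit covers "end".
 * mm->pgd and mm->context.asce are switched together under
 * mm->page_table_lock so that switch_mm() always reads a consistent
 * ASCE; afterwards all CPUs are notified so that those running the
 * mm reload the new ASCE.
 */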
int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
{
	unsigned long *table, *pgd;
	int rc, notify;

	/* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
	VM_BUG_ON(mm->context.asce_limit < _REGION2_SIZE);
	if (end >= TASK_SIZE_MAX)
		return -ENOMEM;
	rc = 0;
	notify = 0;
	while (mm->context.asce_limit < end) {
		table = crst_table_alloc(mm);
		if (!table) {
			rc = -ENOMEM;
			break;
		}
		spin_lock_bh(&mm->page_table_lock);
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit == _REGION2_SIZE) {
			crst_table_init(table, _REGION2_ENTRY_EMPTY);
			p4d_populate(mm, (p4d_t *) table, (pud_t *) pgd);
			mm->pgd = (pgd_t *) table;
			mm->context.asce_limit = _REGION1_SIZE;
			mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
				_ASCE_USER_BITS | _ASCE_TYPE_REGION2;
		} else {
			crst_table_init(table, _REGION1_ENTRY_EMPTY);
			pgd_populate(mm, (pgd_t *) table, (p4d_t *) pgd);
			mm->pgd = (pgd_t *) table;
			mm->context.asce_limit = -PAGE_SIZE;
			mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
				_ASCE_USER_BITS | _ASCE_TYPE_REGION1;
		}
		notify = 1;
		spin_unlock_bh(&mm->page_table_lock);
	}
	if (notify)
		on_each_cpu(__crst_table_upgrade, mm, 0);
	return rc;
}
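
/*
 * Drop a compat task from a 3-level to a 2-level page table: point
 * mm->pgd at the remaining segment table, install the shrunk ASCE and
 * reload it on the current CPU if necessary.
 */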
void crst_table_downgrade(struct mm_struct *mm)
{
	pgd_t *pgd;

	/* downgrade should only happen from 3 to 2 levels (compat only) */
	VM_BUG_ON(mm->context.asce_limit != _REGION2_SIZE);

	if (current->active_mm == mm) {
		clear_user_asce();
		__tlb_flush_mm(mm);
	}

	pgd = mm->pgd;
	mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
	mm->context.asce_limit = _REGION3_SIZE;
	mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
			   _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
	crst_table_free(mm, (unsigned long *) pgd);

	if (current->active_mm == mm)
		set_user_asce(mm);
}
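
/* Atomically XOR @bits into *v and return the new value. */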
static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

#ifdef CONFIG_PGSTE

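/*
 * Allocate a full 4K page table with PGSTEs: the lower 2K holds the
 * PTEs, the upper 2K the corresponding page status table entries.
 */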
struct page *page_table_alloc_pgste(struct mm_struct *mm)
{
	struct page *page;
	unsigned long *table;

	page = alloc_page(GFP_KERNEL);
	if (page) {
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
		clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	}
	return page;
}

void page_table_free_pgste(struct page *page)
{
	__free_page(page);
}

#endif /* CONFIG_PGSTE */

/*
 * page table entry allocation/free routines.
 */
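
/*
 * A 4K page can hold two 2K page table fragments. Bits 0 and 1 of
 * page->_mapcount record which halves are in use; bits 4 and 5 mark
 * halves that have been handed to page_table_free_rcu() and are still
 * waiting for __tlb_remove_table() to release them.
 */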
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	unsigned long *table;
	struct page *page;
	unsigned int mask, bit;

	/* Try to get a fragment of a 4K page as a 2K page table */
	if (!mm_alloc_pgste(mm)) {
		table = NULL;
		spin_lock_bh(&mm->context.lock);
		if (!list_empty(&mm->context.pgtable_list)) {
			page = list_first_entry(&mm->context.pgtable_list,
						struct page, lru);
			mask = atomic_read(&page->_mapcount);
			mask = (mask | (mask >> 4)) & 3;
			if (mask != 3) {
				table = (unsigned long *) page_to_phys(page);
				bit = mask & 1;		/* =1 -> second 2K */
				if (bit)
					table += PTRS_PER_PTE;
				atomic_xor_bits(&page->_mapcount, 1U << bit);
				list_del(&page->lru);
			}
		}
		spin_unlock_bh(&mm->context.lock);
		if (table)
			return table;
	}
	/* Allocate a fresh page */
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return NULL;
	if (!pgtable_page_ctor(page)) {
		__free_page(page);
		return NULL;
	}
	arch_set_page_dat(page, 0);
	/* Initialize page table */
	table = (unsigned long *) page_to_phys(page);
	if (mm_alloc_pgste(mm)) {
		/* Return 4K page table with PGSTEs */
		atomic_set(&page->_mapcount, 3);
		clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
		clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	} else {
		/* Return the first 2K fragment of the page */
		atomic_set(&page->_mapcount, 1);
		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
		spin_lock_bh(&mm->context.lock);
		list_add(&page->lru, &mm->context.pgtable_list);
		spin_unlock_bh(&mm->context.lock);
	}
	return table;
}
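
/*
 * Release a 2K page table fragment; the containing 4K page is freed
 * only when neither half is in use or pending RCU removal. A full
 * page table with PGSTEs is freed immediately.
 */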
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (!mm_alloc_pgste(mm)) {
		/* Free 2K page table fragment of a 4K page */
		bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
		spin_lock_bh(&mm->context.lock);
		mask = atomic_xor_bits(&page->_mapcount, 1U << bit);
		if (mask & 3)
			list_add(&page->lru, &mm->context.pgtable_list);
		else
			list_del(&page->lru);
		spin_unlock_bh(&mm->context.lock);
		if (mask != 0)
			return;
	}

	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	__free_page(page);
}
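
/*
 * Defer the release of a page table until after the TLB flush and the
 * RCU grace period; the 2K half (or the value 3 for a full table with
 * PGSTEs) is encoded in the low bits of the pointer handed to
 * tlb_remove_table().
 */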
void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
			 unsigned long vmaddr)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (mm_alloc_pgste(mm)) {
		gmap_unlink(mm, table, vmaddr);
		table = (unsigned long *) (__pa(table) | 3);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
	spin_lock_bh(&mm->context.lock);
	mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit);
	if (mask & 3)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	else
		list_del(&page->lru);
	spin_unlock_bh(&mm->context.lock);
	table = (unsigned long *) (__pa(table) | (1U << bit));
	tlb_remove_table(tlb, table);
}
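
/*
 * Final release after the grace period: the low two bits of the
 * pointer select between an order-2 region/segment table (0), one 2K
 * fragment (1 or 2) and a full 4K page table with PGSTEs (3).
 */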
static void __tlb_remove_table(void *_table)
{
	unsigned int mask = (unsigned long) _table & 3;
	void *table = (void *)((unsigned long) _table ^ mask);
	struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT);

	switch (mask) {
	case 0:		/* pmd, pud, or p4d */
		free_pages((unsigned long) table, 2);
		break;
	case 1:		/* lower 2K of a 4K page table */
	case 2:		/* higher 2K of a 4K page table */
		if (atomic_xor_bits(&page->_mapcount, mask << 4) != 0)
			break;
		/* fallthrough */
	case 3:		/* 4K page table with pgstes */
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
		break;
	}
}

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}
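
/*
 * Hand the current batch over to RCU; the queued tables are released
 * by tlb_remove_table_rcu() once the grace period has expired.
 */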
void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}
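
/*
 * Queue a table for batched RCU freeing. If no batch page can be
 * allocated, fall back to a lazy mm flush followed by an
 * IPI-synchronized direct free.
 */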
void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	tlb->mm->context.flush_mm = 1;
	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm_lazy(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_flush_mmu(tlb);
}