/*
 *  flexible mmap layout support
 *
 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 *
 *
 * Started by Ingo Molnar <mingo@elte.hu>
 */

#include <linux/elf-randomize.h>
#include <linux/personality.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/compat.h>
#include <linux/security.h>
#include <asm/pgalloc.h>

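/*
 * Maximum number of bytes by which the top of the stack may be shifted
 * for randomization, or 0 if stack randomization is disabled for this
 * task.
 */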
static unsigned long stack_maxrandom_size(void)
{
	if (!(current->flags & PF_RANDOMIZE))
		return 0;
	if (current->personality & ADDR_NO_RANDOMIZE)
		return 0;
	return STACK_RND_MASK << PAGE_SHIFT;
}

/*
 * Top of mmap area (just below the process stack).
 *
 * Leave at least a ~32 MB hole.
 */
#define MIN_GAP (32*1024*1024)
#define MAX_GAP (STACK_TOP/6*5)

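/*
 * Use the legacy bottom-up layout if the ADDR_COMPAT_LAYOUT personality
 * bit is set, the stack may grow without limit, or the legacy layout was
 * requested via the legacy_va_layout sysctl.
 */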
static inline int mmap_is_legacy(void)
{
	if (current->personality & ADDR_COMPAT_LAYOUT)
		return 1;
	if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
		return 1;
	return sysctl_legacy_va_layout;
}

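/*
 * Random, page-aligned offset applied to the mmap base, limited by
 * MMAP_RND_MASK.
 */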
unsigned long arch_mmap_rnd(void)
{
	return (get_random_int() & MMAP_RND_MASK) << PAGE_SHIFT;
}

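/* Bottom-up layout: the mmap area starts at TASK_UNMAPPED_BASE plus the random offset. */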
static unsigned long mmap_base_legacy(unsigned long rnd)
{
	return TASK_UNMAPPED_BASE + rnd;
}

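/*
 * Top-down layout: place the mmap base below the stack, leaving room for
 * the stack rlimit (clamped between MIN_GAP and MAX_GAP), the maximum
 * stack randomization and the mmap random offset.
 */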
static inline unsigned long mmap_base(unsigned long rnd)
{
	unsigned long gap = rlimit(RLIMIT_STACK);

	if (gap < MIN_GAP)
		gap = MIN_GAP;
	else if (gap > MAX_GAP)
		gap = MAX_GAP;
	gap &= PAGE_MASK;
	return STACK_TOP - stack_maxrandom_size() - rnd - gap;
}

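/*
 * Bottom-up search for a free range between mm->mmap_base and TASK_SIZE.
 * File-backed and shared mappings are additionally aligned according to
 * MMAP_ALIGN_MASK, with the file offset (pgoff) as alignment offset.
 */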
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct vm_unmapped_area_info info;

	if (len > TASK_SIZE - mmap_min_addr)
		return -ENOMEM;

	if (flags & MAP_FIXED)
		return addr;

	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
		    (!vma || addr + len <= vma->vm_start))
			return addr;
	}

	info.flags = 0;
	info.length = len;
	info.low_limit = mm->mmap_base;
	info.high_limit = TASK_SIZE;
	if (filp || (flags & MAP_SHARED))
		info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT;
	else
		info.align_mask = 0;
	info.align_offset = pgoff << PAGE_SHIFT;
	return vm_unmapped_area(&info);
}

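/*
 * Top-down search below mm->mmap_base, with the same alignment handling
 * as the bottom-up variant; falls back to a bottom-up search if the
 * top-down search fails.
 */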
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
			  const unsigned long len, const unsigned long pgoff,
			  const unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	unsigned long addr = addr0;
	struct vm_unmapped_area_info info;

	/* requested length too big for entire address space */
	if (len > TASK_SIZE - mmap_min_addr)
		return -ENOMEM;

	if (flags & MAP_FIXED)
		return addr;

	/* requesting a specific address */
	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
				(!vma || addr + len <= vma->vm_start))
			return addr;
	}

	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
	info.length = len;
	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
	info.high_limit = mm->mmap_base;
	if (filp || (flags & MAP_SHARED))
		info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT;
	else
		info.align_mask = 0;
	info.align_offset = pgoff << PAGE_SHIFT;
	addr = vm_unmapped_area(&info);

	/*
	 * A failed mmap() very likely causes application failure,
	 * so fall back to the bottom-up function here. This scenario
	 * can happen with large stack limits and large mmap()
	 * allocations.
	 */
	if (addr & ~PAGE_MASK) {
		VM_BUG_ON(addr != -ENOMEM);
		info.flags = 0;
		info.low_limit = TASK_UNMAPPED_BASE;
		info.high_limit = TASK_SIZE;
		addr = vm_unmapped_area(&info);
	}

	return addr;
}

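/*
 * For 64-bit tasks that have not yet reached TASK_MAX_SIZE: if the
 * requested mapping would end at or above the current TASK_SIZE,
 * upgrade the page table to 4 levels before the mapping is created.
 */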
int s390_mmap_check(unsigned long addr, unsigned long len, unsigned long flags)
{
	if (is_compat_task() || TASK_SIZE >= TASK_MAX_SIZE)
		return 0;
	if (!(flags & MAP_FIXED))
		addr = 0;
	if ((addr + len) >= TASK_SIZE)
		return crst_table_upgrade(current->mm);
	return 0;
}

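/*
 * s390_get_unmapped_area() and s390_get_unmapped_area_topdown() wrap the
 * generic searches: if the search fails with -ENOMEM and the 64-bit
 * address space can still grow, the page table is upgraded to 4 levels
 * and the search is retried once.
 */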
static unsigned long
s390_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	unsigned long area;
	int rc;

	area = arch_get_unmapped_area(filp, addr, len, pgoff, flags);
	if (!(area & ~PAGE_MASK))
		return area;
	if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < TASK_MAX_SIZE) {
		/* Upgrade the page table to 4 levels and retry. */
		rc = crst_table_upgrade(mm);
		if (rc)
			return (unsigned long) rc;
		area = arch_get_unmapped_area(filp, addr, len, pgoff, flags);
	}
	return area;
}

static unsigned long
s390_get_unmapped_area_topdown(struct file *filp, const unsigned long addr,
			       const unsigned long len, const unsigned long pgoff,
			       const unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	unsigned long area;
	int rc;

	area = arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags);
	if (!(area & ~PAGE_MASK))
		return area;
	if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < TASK_MAX_SIZE) {
		/* Upgrade the page table to 4 levels and retry. */
		rc = crst_table_upgrade(mm);
		if (rc)
			return (unsigned long) rc;
		area = arch_get_unmapped_area_topdown(filp, addr, len,
						      pgoff, flags);
	}
	return area;
}

/*
 * This function, called very early during the creation of a new
 * process VM image, sets up which VM layout function to use:
 */
void arch_pick_mmap_layout(struct mm_struct *mm)
{
	unsigned long random_factor = 0UL;

	if (current->flags & PF_RANDOMIZE)
		random_factor = arch_mmap_rnd();

	/*
	 * Fall back to the standard layout if the personality
	 * bit is set, or if the expected stack growth is unlimited:
	 */
	if (mmap_is_legacy()) {
		mm->mmap_base = mmap_base_legacy(random_factor);
		mm->get_unmapped_area = s390_get_unmapped_area;
	} else {
		mm->mmap_base = mmap_base(random_factor);
		mm->get_unmapped_area = s390_get_unmapped_area_topdown;
	}
}