2009-04-10 22:33:10 +04:00
# include <linux/initrd.h>
2009-03-04 12:46:40 +03:00
# include <linux/ioport.h>
2009-03-03 14:15:06 +03:00
# include <linux/swap.h>
2009-03-04 12:46:40 +03:00
2009-03-03 14:15:06 +03:00
# include <asm/cacheflush.h>
2009-03-05 15:55:05 +03:00
# include <asm/e820.h>
2009-03-05 15:55:08 +03:00
# include <asm/init.h>
2009-03-03 14:15:06 +03:00
# include <asm/page.h>
2009-03-04 12:46:40 +03:00
# include <asm/page_types.h>
2009-03-03 14:15:06 +03:00
# include <asm/sections.h>
2009-05-06 16:06:47 +04:00
# include <asm/setup.h>
2009-03-03 14:15:06 +03:00
# include <asm/system.h>
2009-03-05 15:55:05 +03:00
# include <asm/tlbflush.h>
2009-04-28 17:00:50 +04:00
# include <asm/tlb.h>
2009-07-01 18:24:23 +04:00
# include <asm/proto.h>
2009-04-28 17:00:50 +04:00
/*
 * One struct mmu_gather per CPU, named mmu_gathers.
 * NOTE(review): presumably consumed by the generic TLB-gather code pulled
 * in through <asm/tlb.h>; no user is visible in this part of the file.
 */
DEFINE_PER_CPU ( struct mmu_gather , mmu_gathers ) ;
2009-03-05 15:55:05 +03:00
2009-03-05 15:55:06 +03:00
/*
 * Window reserved for the early kernel page tables, kept as page frame
 * numbers (find_early_table_space() shifts the address found by
 * find_e820_area() right by PAGE_SHIFT before storing it):
 *
 *   e820_table_start - first pfn of the window
 *   e820_table_end   - starts equal to e820_table_start; the range
 *                      start..end is what init_memory_mapping() passes to
 *                      reserve_early().  NOTE(review): presumably advanced
 *                      by the page-table allocator elsewhere - confirm.
 *   e820_table_top   - pfn just past the window (start + size in pages)
 */
unsigned long __initdata e820_table_start ;
unsigned long __meminitdata e820_table_end ;
unsigned long __meminitdata e820_table_top ;
2009-03-05 15:55:05 +03:00
/*
 * Tested by init_memory_mapping(): while it is zero the early-boot-only
 * steps run (table space search, reserve_early(), early_memtest()).
 * It is not written anywhere in this part of the file.
 */
int after_bootmem ;
/*
 * Whether the direct mapping may use 1G pages; init_memory_mapping() copies
 * it into use_gbpages.  Initialized to 1 only when CONFIG_DIRECT_GBPAGES is
 * set, otherwise left zero-initialized.
 */
int direct_gbpages
# ifdef CONFIG_DIRECT_GBPAGES
= 1
# endif
;
2009-04-21 12:39:27 +04:00
/* Set to 1 by set_nx() once the NX bit has been switched on in EFER. */
int nx_enabled;

#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
/* Non-zero when the user asked for NX to stay off ("noexec=off"). */
static int disable_nx __cpuinitdata;

/*
 * noexec = on|off
 *
 * Control non-executable mappings for processes.
 *
 * on      Enable
 * off     Disable
 *
 * Early-parameter handler.  Returns -EINVAL when no value was supplied and
 * 0 otherwise; a value that is neither "on" nor "off" is silently ignored.
 *
 * The keywords and the parameter name must not carry any padding: the
 * comparison lengths (2 and 3) are exactly strlen("on") and strlen("off"),
 * so a padded literal could never match what the user typed.
 */
static int __init noexec_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (!strncmp(str, "on", 2)) {
		__supported_pte_mask |= _PAGE_NX;
		disable_nx = 0;
	} else if (!strncmp(str, "off", 3)) {
		disable_nx = 1;
		__supported_pte_mask &= ~_PAGE_NX;
	}

	return 0;
}
early_param("noexec", noexec_setup);
#endif
#ifdef CONFIG_X86_PAE
/*
 * Enable NX (execute-disable) on PAE kernels when the CPU offers it and the
 * user did not disable it.
 *
 * CPUID leaf 0x80000000 reports the highest supported extended leaf in EAX;
 * leaf 0x80000001 may be queried as soon as that value is >= 0x80000001
 * (a strict '>' would wrongly skip CPUs whose highest extended leaf is
 * exactly 0x80000001).  Bit 20 of EDX from leaf 0x80000001 is then tested
 * before EFER_NX is set in MSR_EFER.
 *
 * On success nx_enabled is set to 1 and _PAGE_NX is added to
 * __supported_pte_mask.
 */
static void __init set_nx(void)
{
	unsigned int v[4], l, h;

	if (!cpu_has_pae || cpuid_eax(0x80000000) < 0x80000001)
		return;

	cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
	if (!(v[3] & (1 << 20)) || disable_nx)
		return;

	rdmsr(MSR_EFER, l, h);
	l |= EFER_NX;
	wrmsr(MSR_EFER, l, h);

	nx_enabled = 1;
	__supported_pte_mask |= _PAGE_NX;
}
#else
/* Without CONFIG_X86_PAE there is nothing to do here. */
static inline void set_nx(void)
{
}
#endif
#ifdef CONFIG_X86_64
/*
 * Drop _PAGE_NX from __supported_pte_mask unless EFER_NX is set in
 * MSR_EFER and NX has not been disabled through disable_nx.
 */
void __cpuinit check_efer(void)
{
	unsigned long efer_val;
	int nx_usable;

	rdmsrl(MSR_EFER, efer_val);

	nx_usable = (efer_val & EFER_NX) && !disable_nx;
	if (!nx_usable)
		__supported_pte_mask &= ~_PAGE_NX;
}
#endif
2009-03-05 15:55:05 +03:00
/*
 * Work out how many bytes of pud/pmd/pte tables are needed to map memory
 * up to @end, find a free e820 area of that size and record it (as page
 * frame numbers) in e820_table_start / e820_table_end / e820_table_top.
 *
 * @end:         upper end of the range to be mapped
 * @use_pse:     non-zero if 2M pages will be used, so ptes are only needed
 *               for the part of @end that is not PMD aligned
 * @use_gbpages: non-zero if 1G pages will be used, so pmds are only needed
 *               for the part of @end that is not PUD aligned
 *
 * Panics when no suitable area is found.
 */
static void __init find_early_table_space(unsigned long end, int use_pse,
					  int use_gbpages)
{
	unsigned long nr_pud, nr_pmd, nr_pte;
	unsigned long table_bytes, search_base;
	unsigned long pmd_span = end;	/* bytes that need pmd entries */
	unsigned long pte_span = end;	/* bytes that need pte entries */

	nr_pud = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	table_bytes = roundup(nr_pud * sizeof(pud_t), PAGE_SIZE);

	/* With 1G pages only the remainder above the last PUD boundary counts. */
	if (use_gbpages)
		pmd_span = end - ((end >> PUD_SHIFT) << PUD_SHIFT);
	nr_pmd = (pmd_span + PMD_SIZE - 1) >> PMD_SHIFT;
	table_bytes += roundup(nr_pmd * sizeof(pmd_t), PAGE_SIZE);

	/* With 2M pages only the remainder above the last PMD boundary counts. */
	if (use_pse) {
		pte_span = end - ((end >> PMD_SHIFT) << PMD_SHIFT);
#ifdef CONFIG_X86_32
		/* 32-bit accounts for one extra PMD worth of 4k mappings. */
		pte_span += PMD_SIZE;
#endif
	}
	nr_pte = (pte_span + PAGE_SIZE - 1) >> PAGE_SHIFT;
	table_bytes += roundup(nr_pte * sizeof(pte_t), PAGE_SIZE);

#ifdef CONFIG_X86_32
	/* for fixmap */
	table_bytes += roundup(__end_of_fixed_addresses * sizeof(pte_t),
			       PAGE_SIZE);
#endif

	/*
	 * RED-PEN putting page tables only on node 0 could
	 * cause a hotspot and fill up ZONE_DMA. The page tables
	 * need roughly 0.5KB per GB.
	 */
#ifdef CONFIG_X86_32
	search_base = 0x7000;
#else
	search_base = 0x8000;
#endif

	e820_table_start = find_e820_area(search_base,
					  max_pfn_mapped << PAGE_SHIFT,
					  table_bytes, PAGE_SIZE);
	if (e820_table_start == -1UL)
		panic(" Cannot find space for the kernel page tables ");

	/* From here on the window is tracked in page frame numbers. */
	e820_table_start >>= PAGE_SHIFT;
	e820_table_end = e820_table_start;
	e820_table_top = e820_table_start + (table_bytes >> PAGE_SHIFT);

	printk(KERN_DEBUG " kernel direct mapping tables up to %lx @ %lx-%lx \n ",
	       end, e820_table_start << PAGE_SHIFT,
	       e820_table_top << PAGE_SHIFT);
}
/*
 * One piece of the range handled by init_memory_mapping().
 * start/end are byte addresses (save_mr() stores pfn << PAGE_SHIFT);
 * page_size_mask holds (1 << PG_LEVEL_*) bits for the large page sizes
 * allowed in this piece, 0 meaning small pages only.
 */
struct map_range {
unsigned long start ;
unsigned long end ;
unsigned page_size_mask ;
} ;
/*
 * Capacity of the map_range array in init_memory_mapping(): that function
 * can produce a head, a 2M piece and a tail on 32-bit (3), plus a 1G piece
 * and a second 2M piece on 64-bit (5).  save_mr() panics beyond this.
 */
# ifdef CONFIG_X86_32
# define NR_RANGE_MR 3
# else /* CONFIG_X86_64 */
# define NR_RANGE_MR 5
# endif
2009-03-12 15:40:06 +03:00
/*
 * Append the pfn range start_pfn..end_pfn to @mr as byte addresses,
 * tagged with @page_size_mask.
 *
 * An empty range (start_pfn >= end_pfn) is ignored.  Panics when the
 * array already holds NR_RANGE_MR entries.
 *
 * Returns the number of entries in use afterwards.
 */
static int __meminit save_mr(struct map_range *mr, int nr_range,
			     unsigned long start_pfn, unsigned long end_pfn,
			     unsigned long page_size_mask)
{
	struct map_range *slot;

	if (start_pfn >= end_pfn)
		return nr_range;

	if (nr_range >= NR_RANGE_MR)
		panic(" run out of range for init_memory_mapping \n ");

	slot = &mr[nr_range];
	slot->start = start_pfn << PAGE_SHIFT;
	slot->end = end_pfn << PAGE_SHIFT;
	slot->page_size_mask = page_size_mask;

	return nr_range + 1;
}
/*
 * Set up the direct mapping of the physical memory start..end at
 * PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory. To access them they are temporarily mapped.
 *
 * The range is cut into at most NR_RANGE_MR pieces by alignment (a small
 * page head, a 2M piece, on 64-bit a 1G piece and a second 2M piece, and a
 * small page tail), neighbouring pieces with the same page size mask are
 * merged, and every piece is handed to kernel_physical_mapping_init().
 *
 * Returns ret >> PAGE_SHIFT, where ret is "end" on 32-bit and the value
 * returned by the last kernel_physical_mapping_init() call on 64-bit
 * (NOTE(review): presumably the last mapped address - confirm there).
 */
unsigned long __init_refok init_memory_mapping ( unsigned long start ,
unsigned long end )
{
/* Bitmask of (1 << PG_LEVEL_*) large page sizes that may be used. */
unsigned long page_size_mask = 0 ;
unsigned long start_pfn , end_pfn ;
2009-03-05 18:04:26 +03:00
unsigned long ret = 0 ;
2009-03-05 15:55:05 +03:00
/* Byte address up to which pieces have been recorded so far. */
unsigned long pos ;
struct map_range mr [ NR_RANGE_MR ] ;
int nr_range , i ;
int use_pse , use_gbpages ;
printk ( KERN_INFO " init_memory_mapping: %016lx-%016lx \n " , start , end ) ;
2008-04-04 02:53:23 +04:00
# if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
2009-03-05 15:55:05 +03:00
/*
 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
 * This will simplify cpa(), which otherwise needs to support splitting
 * large pages into small in interrupt context, etc.
 * The same small-page-only choice is made for CONFIG_KMEMCHECK.
 */
use_pse = use_gbpages = 0 ;
# else
use_pse = cpu_has_pse ;
use_gbpages = direct_gbpages ;
# endif
/* Turn on NX where available; set_nx() is an empty stub without PAE. */
set_nx ( ) ;
if ( nx_enabled )
printk ( KERN_INFO " NX (Execute Disable) protection: active \n " ) ;
/* Enable PSE if available */
if ( cpu_has_pse )
set_in_cr4 ( X86_CR4_PSE ) ;
/* Enable PGE if available */
if ( cpu_has_pge ) {
set_in_cr4 ( X86_CR4_PGE ) ;
__supported_pte_mask | = _PAGE_GLOBAL ;
}
if ( use_gbpages )
page_size_mask | = 1 < < PG_LEVEL_1G ;
if ( use_pse )
page_size_mask | = 1 < < PG_LEVEL_2M ;
memset ( mr , 0 , sizeof ( mr ) ) ;
nr_range = 0 ;
/* head if not big page alignment ? */
start_pfn = start > > PAGE_SHIFT ;
pos = start_pfn < < PAGE_SHIFT ;
# ifdef CONFIG_X86_32
/*
 * Don't use a large page for the first 2/4MB of memory
 * because there are often fixed size MTRRs in there
 * and overlapping MTRRs into large pages can cause
 * slowdowns.
 */
if ( pos = = 0 )
end_pfn = 1 < < ( PMD_SHIFT - PAGE_SHIFT ) ;
else
end_pfn = ( ( pos + ( PMD_SIZE - 1 ) ) > > PMD_SHIFT )
< < ( PMD_SHIFT - PAGE_SHIFT ) ;
# else /* CONFIG_X86_64 */
/* Head ends at the next PMD (2M) boundary at or above pos. */
end_pfn = ( ( pos + ( PMD_SIZE - 1 ) ) > > PMD_SHIFT )
< < ( PMD_SHIFT - PAGE_SHIFT ) ;
# endif
/* Never let the head run past the requested end. */
if ( end_pfn > ( end > > PAGE_SHIFT ) )
end_pfn = end > > PAGE_SHIFT ;
if ( start_pfn < end_pfn ) {
/* Mask 0: the head is mapped with small pages only. */
nr_range = save_mr ( mr , nr_range , start_pfn , end_pfn , 0 ) ;
pos = end_pfn < < PAGE_SHIFT ;
}
/* big page (2M) range */
start_pfn = ( ( pos + ( PMD_SIZE - 1 ) ) > > PMD_SHIFT )
< < ( PMD_SHIFT - PAGE_SHIFT ) ;
# ifdef CONFIG_X86_32
/* 32-bit: the 2M piece runs to the last PMD boundary below end. */
end_pfn = ( end > > PMD_SHIFT ) < < ( PMD_SHIFT - PAGE_SHIFT ) ;
# else /* CONFIG_X86_64 */
/*
 * 64-bit: the 2M piece runs to the next PUD (1G) boundary, capped at the
 * last PMD boundary below end.
 */
end_pfn = ( ( pos + ( PUD_SIZE - 1 ) ) > > PUD_SHIFT )
< < ( PUD_SHIFT - PAGE_SHIFT ) ;
if ( end_pfn > ( ( end > > PMD_SHIFT ) < < ( PMD_SHIFT - PAGE_SHIFT ) ) )
end_pfn = ( ( end > > PMD_SHIFT ) < < ( PMD_SHIFT - PAGE_SHIFT ) ) ;
# endif
if ( start_pfn < end_pfn ) {
nr_range = save_mr ( mr , nr_range , start_pfn , end_pfn ,
page_size_mask & ( 1 < < PG_LEVEL_2M ) ) ;
pos = end_pfn < < PAGE_SHIFT ;
}
# ifdef CONFIG_X86_64
/* big page (1G) range */
start_pfn = ( ( pos + ( PUD_SIZE - 1 ) ) > > PUD_SHIFT )
< < ( PUD_SHIFT - PAGE_SHIFT ) ;
end_pfn = ( end > > PUD_SHIFT ) < < ( PUD_SHIFT - PAGE_SHIFT ) ;
if ( start_pfn < end_pfn ) {
/* Both 1G and 2M are allowed here, subject to page_size_mask. */
nr_range = save_mr ( mr , nr_range , start_pfn , end_pfn ,
page_size_mask &
( ( 1 < < PG_LEVEL_2M ) | ( 1 < < PG_LEVEL_1G ) ) ) ;
pos = end_pfn < < PAGE_SHIFT ;
}
/* tail is not big page (1G) alignment */
start_pfn = ( ( pos + ( PMD_SIZE - 1 ) ) > > PMD_SHIFT )
< < ( PMD_SHIFT - PAGE_SHIFT ) ;
end_pfn = ( end > > PMD_SHIFT ) < < ( PMD_SHIFT - PAGE_SHIFT ) ;
if ( start_pfn < end_pfn ) {
nr_range = save_mr ( mr , nr_range , start_pfn , end_pfn ,
page_size_mask & ( 1 < < PG_LEVEL_2M ) ) ;
pos = end_pfn < < PAGE_SHIFT ;
}
# endif
/* tail is not big page (2M) alignment */
start_pfn = pos > > PAGE_SHIFT ;
end_pfn = end > > PAGE_SHIFT ;
/* save_mr() itself ignores the tail when it is empty. */
nr_range = save_mr ( mr , nr_range , start_pfn , end_pfn , 0 ) ;
/* try to merge same page size and continuous */
for ( i = 0 ; nr_range > 1 & & i < nr_range - 1 ; i + + ) {
unsigned long old_start ;
if ( mr [ i ] . end ! = mr [ i + 1 ] . start | |
mr [ i ] . page_size_mask ! = mr [ i + 1 ] . page_size_mask )
continue ;
/* move it */
old_start = mr [ i ] . start ;
memmove ( & mr [ i ] , & mr [ i + 1 ] ,
( nr_range - 1 - i ) * sizeof ( struct map_range ) ) ;
/*
 * The successor now sits in slot i; give it the merged start and step i
 * back so that the loop's i++ re-examines the same slot against its new
 * neighbour.
 */
mr [ i - - ] . start = old_start ;
nr_range - - ;
}
/* Log the final layout, labelling each piece by its largest page size. */
for ( i = 0 ; i < nr_range ; i + + )
printk ( KERN_DEBUG " %010lx - %010lx page %s \n " ,
mr [ i ] . start , mr [ i ] . end ,
( mr [ i ] . page_size_mask & ( 1 < < PG_LEVEL_1G ) ) ? " 1G " : (
( mr [ i ] . page_size_mask & ( 1 < < PG_LEVEL_2M ) ) ? " 2M " : " 4k " ) ) ;
/*
 * Find space for the kernel direct mapping tables.
 *
 * Later we should allocate these tables in the local node of the
 * memory mapped. Unfortunately this is done currently before the
 * nodes are discovered.
 */
if ( ! after_bootmem )
find_early_table_space ( end , use_pse , use_gbpages ) ;
# ifdef CONFIG_X86_32
/* 32-bit: the per-piece return value is not used; ret is simply end. */
for ( i = 0 ; i < nr_range ; i + + )
2009-03-05 15:55:07 +03:00
kernel_physical_mapping_init ( mr [ i ] . start , mr [ i ] . end ,
mr [ i ] . page_size_mask ) ;
2009-03-05 15:55:05 +03:00
ret = end ;
# else /* CONFIG_X86_64 */
/* 64-bit: ret keeps the value returned for the last piece. */
for ( i = 0 ; i < nr_range ; i + + )
ret = kernel_physical_mapping_init ( mr [ i ] . start , mr [ i ] . end ,
mr [ i ] . page_size_mask ) ;
# endif
# ifdef CONFIG_X86_32
early_ioremap_page_table_range_init ( ) ;
load_cr3 ( swapper_pg_dir ) ;
# endif
# ifdef CONFIG_X86_64
2009-05-06 16:06:47 +04:00
/* Only for the early call that maps from physical address 0. */
if ( ! after_bootmem & & ! start ) {
pud_t * pud ;
pmd_t * pmd ;
2009-03-05 15:55:05 +03:00
/* Snapshot CR4 after the PSE/PGE updates made above. */
mmu_cr4_features = read_cr4 ( ) ;
2009-05-06 16:06:47 +04:00
/*
 * _brk_end cannot change anymore, but it and _end may be
 * located on different 2M pages. cleanup_highmap(), however,
 * can only consider _end when it runs, so destroy any
 * mappings beyond _brk_end here.
 */
pud = pud_offset ( pgd_offset_k ( _brk_end ) , _brk_end ) ;
pmd = pmd_offset ( pud , _brk_end - 1 ) ;
/*
 * Pre-increment: the pmd covering _brk_end - 1 is kept; every following
 * pmd up to and including the one covering _end - 1 is cleared.
 */
while ( + + pmd < = pmd_offset ( pud , ( unsigned long ) _end - 1 ) )
pmd_clear ( pmd ) ;
}
2009-03-05 15:55:05 +03:00
# endif
__flush_tlb_all ( ) ;
2009-03-05 15:55:06 +03:00
/* Reserve the part of the early table window recorded as in use. */
if ( ! after_bootmem & & e820_table_end > e820_table_start )
reserve_early ( e820_table_start < < PAGE_SHIFT ,
e820_table_end < < PAGE_SHIFT , " PGTABLE " ) ;
2009-03-05 15:55:05 +03:00
if ( ! after_bootmem )
early_memtest ( start , end ) ;
return ret > > PAGE_SHIFT ;
}
2009-03-03 14:15:06 +03:00
2009-03-04 12:46:40 +03:00
/*
* devmem_is_allowed ( ) checks to see if / dev / mem access to a certain address
* is valid . The argument is a physical page number .
*
*
* On x86 , access has to be given to the first megabyte of ram because that area
* contains bios code and data regions used by X and dosemu and similar apps .
* Access has to be given to non - kernel - ram areas as well , these contain the PCI
* mmio resources as well as potential bios / acpi data regions .
*/
int devmem_is_allowed ( unsigned long pagenr )
{
if ( pagenr < = 256 )
return 1 ;
if ( iomem_is_exclusive ( pagenr < < PAGE_SHIFT ) )
return 0 ;
if ( ! page_is_ram ( pagenr ) )
return 1 ;
return 0 ;
}
2009-03-03 14:15:06 +03:00
void free_init_pages ( char * what , unsigned long begin , unsigned long end )
{
unsigned long addr = begin ;
if ( addr > = end )
return ;
/*
* If debugging page accesses then do not free this memory but
* mark them not present - any buggy init - section access will
* create a kernel page fault :
*/
# ifdef CONFIG_DEBUG_PAGEALLOC
printk ( KERN_INFO " debug: unmapping init memory %08lx..%08lx \n " ,
begin , PAGE_ALIGN ( end ) ) ;
set_memory_np ( begin , ( end - begin ) > > PAGE_SHIFT ) ;
# else
/*
* We just marked the kernel text read only above , now that
* we are going to free part of that , we need to make that
* writeable first .
*/
set_memory_rw ( begin , ( end - begin ) > > PAGE_SHIFT ) ;
printk ( KERN_INFO " Freeing %s: %luk freed \n " , what , ( end - begin ) > > 10 ) ;
for ( ; addr < end ; addr + = PAGE_SIZE ) {
ClearPageReserved ( virt_to_page ( addr ) ) ;
init_page_count ( virt_to_page ( addr ) ) ;
memset ( ( void * ) ( addr & ~ ( PAGE_SIZE - 1 ) ) ,
POISON_FREE_INITMEM , PAGE_SIZE ) ;
free_page ( addr ) ;
totalram_pages + + ;
}
# endif
}
/*
 * Hand the kernel's init section, delimited by the __init_begin and
 * __init_end symbols, to free_init_pages().
 */
void free_initmem(void)
{
	unsigned long init_lo = (unsigned long)(&__init_begin);
	unsigned long init_hi = (unsigned long)(&__init_end);

	free_init_pages(" unused kernel memory ", init_lo, init_hi);
}
2009-03-04 12:13:40 +03:00
#ifdef CONFIG_BLK_DEV_INITRD
/*
 * Release the memory occupied by the initrd image, start..end, through
 * free_init_pages().
 */
void free_initrd_mem(unsigned long start, unsigned long end)
{
	char *label = " initrd memory ";

	free_init_pages(label, start, end);
}
#endif