/*
 * Initialize MMU support.
 *
 * Copyright (C) 1998-2003 Hewlett-Packard Co
 *      David Mosberger-Tang <davidm@hpl.hp.com>
 */
#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/init.h>

#include <linux/bootmem.h>
#include <linux/efi.h>
#include <linux/elf.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/module.h>
#include <linux/personality.h>
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/proc_fs.h>
#include <linux/bitops.h>

#include <asm/a.out.h>
#include <asm/dma.h>
#include <asm/ia32.h>
#include <asm/io.h>
#include <asm/machvec.h>
#include <asm/numa.h>
#include <asm/patch.h>
#include <asm/pgalloc.h>
#include <asm/sal.h>
#include <asm/sections.h>
#include <asm/system.h>
#include <asm/tlb.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/mca.h>

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

DEFINE_PER_CPU(unsigned long *, __pgtable_quicklist);
DEFINE_PER_CPU(long, __pgtable_quicklist_size);

extern void ia64_tlb_init (void);

unsigned long MAX_DMA_ADDRESS = PAGE_OFFSET + 0x100000000UL;

#ifdef CONFIG_VIRTUAL_MEM_MAP
unsigned long vmalloc_end = VMALLOC_END_INIT;
EXPORT_SYMBOL(vmalloc_end);
struct page *vmem_map;
EXPORT_SYMBOL(vmem_map);
#endif

struct page *zero_page_memmap_ptr;      /* map entry for zero page */
EXPORT_SYMBOL(zero_page_memmap_ptr);

#define MIN_PGT_PAGES                   25UL
#define MAX_PGT_FREES_PER_PASS          16L
#define PGT_FRACTION_OF_NODE_MEM        16
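
/*
 * Sizing policy for the per-CPU page-table quicklist: cache at most
 * 1/PGT_FRACTION_OF_NODE_MEM of the node's free pages (but never trim
 * below MIN_PGT_PAGES), and free at most MAX_PGT_FREES_PER_PASS pages
 * per trimming pass so that each pass stays short.
 */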
static inline long
max_pgt_pages(void)
{
        u64 node_free_pages, max_pgt_pages;

#ifndef CONFIG_NUMA
        node_free_pages = nr_free_pages();
#else
        node_free_pages = nr_free_pages_pgdat(NODE_DATA(numa_node_id()));
#endif
        max_pgt_pages = node_free_pages / PGT_FRACTION_OF_NODE_MEM;
        max_pgt_pages = max(max_pgt_pages, MIN_PGT_PAGES);
        return max_pgt_pages;
}

static inline long
min_pages_to_free(void)
{
        long pages_to_free;

        pages_to_free = pgtable_quicklist_size - max_pgt_pages();
        pages_to_free = min(pages_to_free, MAX_PGT_FREES_PER_PASS);
        return pages_to_free;
}
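
/*
 * Trim the per-CPU page-table quicklist back toward max_pgt_pages().
 * Preemption is briefly re-enabled between passes so that a long trim
 * does not keep preemption disabled for the whole loop.
 */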
void
check_pgt_cache(void)
{
        long pages_to_free;

        if (unlikely(pgtable_quicklist_size <= MIN_PGT_PAGES))
                return;

        preempt_disable();
        while (unlikely((pages_to_free = min_pages_to_free()) > 0)) {
                while (pages_to_free--) {
                        free_page((unsigned long)pgtable_quicklist_alloc());
                }
                preempt_enable();
                preempt_disable();
        }
        preempt_enable();
}
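
/*
 * Make the i-cache coherent with the d-cache for an executable page.
 * PG_arch_1 records that the page's i-cache has already been flushed,
 * so the flush is done at most once per page.
 */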
void
lazy_mmu_prot_update (pte_t pte)
{
        unsigned long addr;
        struct page *page;

        if (!pte_exec(pte))
                return;                         /* not an executable page... */

        page = pte_page(pte);
        addr = (unsigned long) page_address(page);

        if (test_bit(PG_arch_1, &page->flags))
                return;                         /* i-cache is already coherent with d-cache */

        flush_icache_range(addr, addr + PAGE_SIZE);
        set_bit(PG_arch_1, &page->flags);       /* mark page as clean */
}
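
/*
 * Compute the bottom of the register backing store from the RLIMIT_STACK
 * hard limit (rounded down to a 16-byte boundary), clamped to
 * MAX_USER_STACK_SIZE.
 */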
inline void
ia64_set_rbs_bot (void)
{
        unsigned long stack_size = current->signal->rlim[RLIMIT_STACK].rlim_max & -16;

        if (stack_size > MAX_USER_STACK_SIZE)
                stack_size = MAX_USER_STACK_SIZE;
        current->thread.rbs_bot = STACK_TOP - stack_size;
}

/*
 * This performs some platform-dependent address space initialization.
 * On IA-64, we want to setup the VM area for the register backing
 * store (which grows upwards) and install the gateway page which is
 * used for signal trampolines, etc.
 */
void
ia64_init_addr_space (void)
{
        struct vm_area_struct *vma;

        ia64_set_rbs_bot();

        /*
         * If we're out of memory and kmem_cache_alloc() returns NULL, we simply ignore
         * the problem.  When the process attempts to write to the register backing store
         * for the first time, it will get a SEGFAULT in this case.
         */
        vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
        if (vma) {
                memset(vma, 0, sizeof(*vma));
                vma->vm_mm = current->mm;
                vma->vm_start = current->thread.rbs_bot & PAGE_MASK;
                vma->vm_end = vma->vm_start + PAGE_SIZE;
                vma->vm_page_prot = protection_map[VM_DATA_DEFAULT_FLAGS & 0x7];
                vma->vm_flags = VM_DATA_DEFAULT_FLAGS | VM_GROWSUP;
                down_write(&current->mm->mmap_sem);
                if (insert_vm_struct(current->mm, vma)) {
                        up_write(&current->mm->mmap_sem);
                        kmem_cache_free(vm_area_cachep, vma);
                        return;
                }
                up_write(&current->mm->mmap_sem);
        }

        /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */
        if (!(current->personality & MMAP_PAGE_ZERO)) {
                vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
                if (vma) {
                        memset(vma, 0, sizeof(*vma));
                        vma->vm_mm = current->mm;
                        vma->vm_end = PAGE_SIZE;
                        vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT);
                        vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | VM_RESERVED;
                        down_write(&current->mm->mmap_sem);
                        if (insert_vm_struct(current->mm, vma)) {
                                up_write(&current->mm->mmap_sem);
                                kmem_cache_free(vm_area_cachep, vma);
                                return;
                        }
                        up_write(&current->mm->mmap_sem);
                }
        }
}
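
/*
 * Return the memory occupied by the __init text and data sections to the
 * page allocator once initialization is complete.
 */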
void
free_initmem (void)
{
        unsigned long addr, eaddr;

        addr = (unsigned long) ia64_imva(__init_begin);
        eaddr = (unsigned long) ia64_imva(__init_end);
        while (addr < eaddr) {
                ClearPageReserved(virt_to_page(addr));
                set_page_count(virt_to_page(addr), 1);
                free_page(addr);
                ++totalram_pages;
                addr += PAGE_SIZE;
        }
        printk(KERN_INFO "Freeing unused kernel memory: %ldkB freed\n",
               (__init_end - __init_begin) >> 10);
}

void
free_initrd_mem (unsigned long start, unsigned long end)
{
        struct page *page;
        /*
         * EFI uses 4KB pages while the kernel can use 4KB or bigger.
         * Thus EFI and the kernel may have different page sizes. It is
         * therefore possible to have the initrd share the same page as
         * the end of the kernel (given current setup).
         *
         * To avoid freeing/using the wrong page (kernel sized) we:
         *      - align up the beginning of initrd
         *      - align down the end of initrd
         *
         *  |             |
         *  |=============| a000
         *  |             |
         *  |             |
         *  |             | 9000
         *  |/////////////|
         *  |/////////////|
         *  |=============| 8000
         *  |///INITRD////|
         *  |/////////////|
         *  |/////////////| 7000
         *  |             |
         *  |KKKKKKKKKKKKK|
         *  |=============| 6000
         *  |KKKKKKKKKKKKK|
         *  |KKKKKKKKKKKKK|
         *  K=kernel using 8KB pages
         *
         * In this example, we must free page 8000 ONLY. So we must align up
         * initrd_start and keep initrd_end as is.
         */
        start = PAGE_ALIGN(start);
        end = end & PAGE_MASK;

        if (start < end)
                printk(KERN_INFO "Freeing initrd memory: %ldkB freed\n", (end - start) >> 10);

        for (; start < end; start += PAGE_SIZE) {
                if (!virt_addr_valid(start))
                        continue;
                page = virt_to_page(start);
                ClearPageReserved(page);
                set_page_count(page, 1);
                free_page(start);
                ++totalram_pages;
        }
}

/*
 * This installs a clean page in the kernel's page table.
 */
struct page *
put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (!PageReserved(page))
                printk(KERN_ERR "put_kernel_page: page at 0x%p not in reserved memory\n",
                       page_address(page));

        pgd = pgd_offset_k(address);            /* note: this is NOT pgd_offset()! */

        spin_lock(&init_mm.page_table_lock);
        {
                pud = pud_alloc(&init_mm, pgd, address);
                if (!pud)
                        goto out;
                pmd = pmd_alloc(&init_mm, pud, address);
                if (!pmd)
                        goto out;
                pte = pte_alloc_map(&init_mm, pmd, address);
                if (!pte)
                        goto out;
                if (!pte_none(*pte)) {
                        pte_unmap(pte);
                        goto out;
                }
                set_pte(pte, mk_pte(page, pgprot));
                pte_unmap(pte);
        }
  out:  spin_unlock(&init_mm.page_table_lock);
        /* no need for flush_tlb */
        return page;
}
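
/*
 * Map the gate page(s) into the kernel page table and patch the gate code
 * via ia64_patch_gate(); the comment below explains why the page is
 * mapped twice.
 */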
static void
setup_gate (void)
{
        struct page *page;

        /*
         * Map the gate page twice: once read-only to export the ELF
         * headers etc. and once execute-only page to enable
         * privilege-promotion via "epc":
         */
        page = virt_to_page(ia64_imva(__start_gate_section));
        put_kernel_page(page, GATE_ADDR, PAGE_READONLY);
#ifdef HAVE_BUGGY_SEGREL
        page = virt_to_page(ia64_imva(__start_gate_section + PAGE_SIZE));
        put_kernel_page(page, GATE_ADDR + PAGE_SIZE, PAGE_GATE);
#else
        put_kernel_page(page, GATE_ADDR + PERCPU_PAGE_SIZE, PAGE_GATE);

        /* Fill in the holes (if any) with read-only zero pages: */
        {
                unsigned long addr;

                for (addr = GATE_ADDR + PAGE_SIZE;
                     addr < GATE_ADDR + PERCPU_PAGE_SIZE;
                     addr += PAGE_SIZE)
                {
                        put_kernel_page(ZERO_PAGE(0), addr,
                                        PAGE_READONLY);
                        put_kernel_page(ZERO_PAGE(0), addr + PERCPU_PAGE_SIZE,
                                        PAGE_READONLY);
                }
        }
#endif
        ia64_patch_gate();
}
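
/*
 * Per-CPU MMU setup: pin a translation register for the per-CPU data
 * area, program the PTA (which locates the virtually mapped linear page
 * table / VHPT), and initialize the TLB.  Run as part of per-CPU
 * initialization.
 */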
void __devinit
ia64_mmu_init (void *my_cpu_data)
{
        unsigned long psr, pta, impl_va_bits;
        extern void __devinit tlb_init (void);

#ifdef CONFIG_DISABLE_VHPT
#       define VHPT_ENABLE_BIT  0
#else
#       define VHPT_ENABLE_BIT  1
#endif

        /* Pin mapping for percpu area into TLB */
        psr = ia64_clear_ic();
        ia64_itr(0x2, IA64_TR_PERCPU_DATA, PERCPU_ADDR,
                 pte_val(pfn_pte(__pa(my_cpu_data) >> PAGE_SHIFT, PAGE_KERNEL)),
                 PERCPU_PAGE_SHIFT);

        ia64_set_psr(psr);
        ia64_srlz_i();

        /*
         * Check if the virtually mapped linear page table (VMLPT) overlaps with a mapped
         * address space.  The IA-64 architecture guarantees that at least 50 bits of
         * virtual address space are implemented but if we pick a large enough page size
         * (e.g., 64KB), the mapped address space is big enough that it will overlap with
         * VMLPT.  I assume that once we run on machines big enough to warrant 64KB pages,
         * IMPL_VA_MSB will be significantly bigger, so this is unlikely to become a
         * problem in practice.  Alternatively, we could truncate the top of the mapped
         * address space to not permit mappings that would overlap with the VMLPT.
         * --davidm 00/12/06
         */
#       define pte_bits                 3
#       define mapped_space_bits        (3*(PAGE_SHIFT - pte_bits) + PAGE_SHIFT)
        /*
         * The virtual page table has to cover the entire implemented address space within
         * a region even though not all of this space may be mappable.  The reason for
         * this is that the Access bit and Dirty bit fault handlers perform
         * non-speculative accesses to the virtual page table, so the address range of the
         * virtual page table itself needs to be covered by the virtual page table.
         */
#       define vmlpt_bits               (impl_va_bits - PAGE_SHIFT + pte_bits)
#       define POW2(n)                  (1ULL << (n))
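
        /*
         * Illustrative arithmetic (assuming 16KB pages, i.e. PAGE_SHIFT == 14,
         * and 8-byte PTEs): mapped_space_bits = 3*(14 - 3) + 14 = 47, so a
         * three-level page table maps 2^47 bytes per region, while the
         * architectural minimum impl_va_bits of 51 gives
         * vmlpt_bits = 51 - 14 + 3 = 40, i.e. a 1TB VMLPT placed just below
         * the top of the region, well clear of the mapped space.
         */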
        impl_va_bits = ffz(~(local_cpu_data->unimpl_va_mask | (7UL << 61)));

        if (impl_va_bits < 51 || impl_va_bits > 61)
                panic("CPU has bogus IMPL_VA_MSB value of %lu!\n", impl_va_bits - 1);

        /* place the VMLPT at the end of each page-table mapped region: */
        pta = POW2(61) - POW2(vmlpt_bits);

        if (POW2(mapped_space_bits) >= pta)
                panic("mm/init: overlap between virtually mapped linear page table and "
                      "mapped kernel space!");
        /*
         * Set the (virtually mapped linear) page table address.  Bit
         * 8 selects between the short and long format, bits 2-7 the
         * size of the table, and bit 0 whether the VHPT walker is
         * enabled.
         */
        ia64_set_pta(pta | (0 << 8) | (vmlpt_bits << 2) | VHPT_ENABLE_BIT);

        ia64_tlb_init();

#ifdef  CONFIG_HUGETLB_PAGE
        ia64_set_rr(HPAGE_REGION_BASE, HPAGE_SHIFT << 2);
        ia64_srlz_d();
#endif
}

#ifdef CONFIG_VIRTUAL_MEM_MAP
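
/*
 * efi_memmap_walk() callback: build the kernel page-table entries backing
 * the part of the virtual mem_map (vmem_map) that covers physical memory
 * [start, end), allocating intermediate levels and backing pages from the
 * node's bootmem allocator.
 */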
int
create_mem_map_page_table (u64 start, u64 end, void *arg)
{
        unsigned long address, start_page, end_page;
        struct page *map_start, *map_end;
        int node;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        map_start = vmem_map + (__pa(start) >> PAGE_SHIFT);
        map_end   = vmem_map + (__pa(end) >> PAGE_SHIFT);

        start_page = (unsigned long) map_start & PAGE_MASK;
        end_page = PAGE_ALIGN((unsigned long) map_end);
        node = paddr_to_nid(__pa(start));

        for (address = start_page; address < end_page; address += PAGE_SIZE) {
                pgd = pgd_offset_k(address);
                if (pgd_none(*pgd))
                        pgd_populate(&init_mm, pgd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
                pud = pud_offset(pgd, address);

                if (pud_none(*pud))
                        pud_populate(&init_mm, pud, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
                pmd = pmd_offset(pud, address);

                if (pmd_none(*pmd))
                        pmd_populate_kernel(&init_mm, pmd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
                pte = pte_offset_kernel(pmd, address);

                if (pte_none(*pte))
                        set_pte(pte, pfn_pte(__pa(alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE)) >> PAGE_SHIFT,
                                             PAGE_KERNEL));
        }
        return 0;
}

struct memmap_init_callback_data {
        struct page *start;
        struct page *end;
        int nid;
        unsigned long zone;
};
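
/*
 * efi_memmap_walk() callback: initialize the struct pages for the part of
 * [start, end) that falls inside the zone described by *arg, rounding the
 * range out to the mem_map page boundaries as explained in the comment in
 * the body.
 */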
static int
virtual_memmap_init (u64 start, u64 end, void *arg)
{
        struct memmap_init_callback_data *args;
        struct page *map_start, *map_end;

        args = (struct memmap_init_callback_data *) arg;

        map_start = vmem_map + (__pa(start) >> PAGE_SHIFT);
        map_end   = vmem_map + (__pa(end) >> PAGE_SHIFT);

        if (map_start < args->start)
                map_start = args->start;
        if (map_end > args->end)
                map_end = args->end;

        /*
         * We have to initialize "out of bounds" struct page elements that fit completely
         * on the same pages that were allocated for the "in bounds" elements because they
         * may be referenced later (and found to be "reserved").
         */
        map_start -= ((unsigned long) map_start & (PAGE_SIZE - 1)) / sizeof(struct page);
        map_end += ((PAGE_ALIGN((unsigned long) map_end) - (unsigned long) map_end)
                    / sizeof(struct page));

        if (map_start < map_end)
                memmap_init_zone((unsigned long)(map_end - map_start),
                                 args->nid, args->zone, page_to_pfn(map_start));
        return 0;
}
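
/*
 * With a virtual mem_map, initialize only the struct pages that correspond
 * to actual memory by walking the EFI memory map; otherwise fall back to
 * the generic memmap_init_zone().
 */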
void
memmap_init (unsigned long size, int nid, unsigned long zone,
             unsigned long start_pfn)
{
        if (!vmem_map)
                memmap_init_zone(size, nid, zone, start_pfn);
        else {
                struct page *start;
                struct memmap_init_callback_data args;

                start = pfn_to_page(start_pfn);
                args.start = start;
                args.end = start + size;
                args.nid = nid;
                args.zone = zone;

                efi_memmap_walk(virtual_memmap_init, &args);
        }
}
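
/*
 * A pfn is valid if its mem_map entry is backed by a mapped page: probe it
 * with __get_user() so a hole in the virtual mem_map faults harmlessly, and
 * also probe the last byte when the entry straddles a page boundary.
 */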
int
ia64_pfn_valid (unsigned long pfn)
{
        char byte;
        struct page *pg = pfn_to_page(pfn);

        return     (__get_user(byte, (char __user *) pg) == 0)
                && ((((u64)pg & PAGE_MASK) == (((u64)(pg + 1) - 1) & PAGE_MASK))
                        || (__get_user(byte, (char __user *) (pg + 1) - 1) == 0));
}
EXPORT_SYMBOL(ia64_pfn_valid);
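
/*
 * efi_memmap_walk() callback: record in *arg the largest gap seen between
 * successive memory descriptors.  Callers presumably use the result to
 * decide whether a virtual mem_map is worthwhile.
 */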
int
find_largest_hole (u64 start, u64 end, void *arg)
{
        u64 *max_gap = arg;
        static u64 last_end = PAGE_OFFSET;

        /* NOTE: this algorithm assumes efi memmap table is ordered */

        if (*max_gap < (start - last_end))
                *max_gap = start - last_end;
        last_end = end;
        return 0;
}
#endif /* CONFIG_VIRTUAL_MEM_MAP */
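
/*
 * efi_memmap_walk() callback: count the reserved pages in [start, end) and
 * add them to the running total passed in via arg.
 */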
static int
count_reserved_pages (u64 start, u64 end, void *arg)
{
        unsigned long num_reserved = 0;
        unsigned long *count = arg;

        for (; start < end; start += PAGE_SIZE)
                if (PageReserved(virt_to_page(start)))
                        ++num_reserved;
        *count += num_reserved;
        return 0;
}

/*
 * Boot command-line option "nolwsys" can be used to disable the use of any light-weight
 * system call handler.  When this option is in effect, all fsyscalls will end up bubbling
 * down into the kernel and calling the normal (heavy-weight) syscall handler.  This is
 * useful for performance testing, but conceivably could also come in handy for debugging
 * purposes.
 */
static int nolwsys;

static int __init
nolwsys_setup (char *s)
{
        nolwsys = 1;
        return 1;
}

__setup("nolwsys", nolwsys_setup);
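
/*
 * Final memory initialization: hand the bootmem pages to the page
 * allocator, report the memory layout, patch the fsyscall table, and set
 * up the gate area.
 */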
void
mem_init (void)
{
        long reserved_pages, codesize, datasize, initsize;
        pg_data_t *pgdat;
        int i;
        static struct kcore_list kcore_mem, kcore_vmem, kcore_kernel;

        BUG_ON(PTRS_PER_PGD * sizeof(pgd_t) != PAGE_SIZE);
        BUG_ON(PTRS_PER_PMD * sizeof(pmd_t) != PAGE_SIZE);
        BUG_ON(PTRS_PER_PTE * sizeof(pte_t) != PAGE_SIZE);

#ifdef CONFIG_PCI
        /*
         * This needs to be called _after_ the command line has been parsed but _before_
         * any drivers that may need the PCI DMA interface are initialized or bootmem has
         * been freed.
         */
        platform_dma_init();
#endif

#ifndef CONFIG_DISCONTIGMEM
        if (!mem_map)
                BUG();
        max_mapnr = max_low_pfn;
#endif

        high_memory = __va(max_low_pfn * PAGE_SIZE);

        kclist_add(&kcore_mem, __va(0), max_low_pfn * PAGE_SIZE);
        kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END - VMALLOC_START);
        kclist_add(&kcore_kernel, _stext, _end - _stext);

        for_each_pgdat(pgdat)
                totalram_pages += free_all_bootmem_node(pgdat);

        reserved_pages = 0;
        efi_memmap_walk(count_reserved_pages, &reserved_pages);

        codesize = (unsigned long) _etext - (unsigned long) _stext;
        datasize = (unsigned long) _edata - (unsigned long) _etext;
        initsize = (unsigned long) __init_end - (unsigned long) __init_begin;

        printk(KERN_INFO "Memory: %luk/%luk available (%luk code, %luk reserved, "
               "%luk data, %luk init)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT - 10),
               num_physpages << (PAGE_SHIFT - 10), codesize >> 10,
               reserved_pages << (PAGE_SHIFT - 10), datasize >> 10, initsize >> 10);

        /*
         * For fsyscall entry points with no light-weight handler, use the ordinary
         * (heavy-weight) handler, but mark it by setting bit 0, so the fsyscall entry
         * code can tell them apart.
         */
        for (i = 0; i < NR_syscalls; ++i) {
                extern unsigned long fsyscall_table[NR_syscalls];
                extern unsigned long sys_call_table[NR_syscalls];

                if (!fsyscall_table[i] || nolwsys)
                        fsyscall_table[i] = sys_call_table[i] | 1;
        }
        setup_gate();

#ifdef CONFIG_IA32_SUPPORT
        ia32_mem_init();
#endif
}