/*
 *  linux/arch/i386/mm/pgtable.c
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
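
/*
 * Dump a summary of memory usage to the kernel log: free areas and swap,
 * per-node page counts gathered by walking every online pgdat under its
 * resize lock, and a handful of global VM statistics.
 */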
void show_mem(void)
{
	int total = 0, reserved = 0;
	int shared = 0, cached = 0;
	int highmem = 0;
	struct page *page;
	pg_data_t *pgdat;
	unsigned long i;
	unsigned long flags;

	printk(KERN_INFO "Mem-info:\n");
	show_free_areas();
	printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages << (PAGE_SHIFT-10));
	for_each_online_pgdat(pgdat) {
		pgdat_resize_lock(pgdat, &flags);
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			page = pgdat_page_nr(pgdat, i);
			total++;
			if (PageHighMem(page))
				highmem++;
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
		}
		pgdat_resize_unlock(pgdat, &flags);
	}
	printk(KERN_INFO "%d pages of RAM\n", total);
	printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
	printk(KERN_INFO "%d reserved pages\n", reserved);
	printk(KERN_INFO "%d pages shared\n", shared);
	printk(KERN_INFO "%d pages swap cached\n", cached);
	printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
	printk(KERN_INFO "%lu pages writeback\n",
		global_page_state(NR_WRITEBACK));
	printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
	printk(KERN_INFO "%lu pages slab\n",
		global_page_state(NR_SLAB_RECLAIMABLE) +
		global_page_state(NR_SLAB_UNRECLAIMABLE));
	printk(KERN_INFO "%lu pages pagetables\n",
		global_page_state(NR_PAGETABLE));
}

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = swapper_pg_dir + pgd_index(vaddr);
	if (pgd_none(*pgd)) {
		BUG();
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		BUG();
		return;
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		BUG();
		return;
	}
	pte = pte_offset_kernel(pmd, vaddr);
	/* <pfn,flags> stored as-is, to permit clearing entries */
	set_pte(pte, pfn_pte(pfn, flags));

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

/*
 * Associate a large virtual page frame with a given physical page frame
 * and protection flags for that frame. pfn is for the base of the page,
 * vaddr is what the page gets mapped to - both must be properly aligned.
 * The pmd must already be instantiated. Assumes PAE mode.
 */
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	if (vaddr & (PMD_SIZE-1)) {		/* vaddr is misaligned */
		printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
		return; /* BUG(); */
	}
	if (pfn & (PTRS_PER_PTE-1)) {		/* pfn is misaligned */
		printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
		return; /* BUG(); */
	}
	pgd = swapper_pg_dir + pgd_index(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
		return; /* BUG(); */
	}
	pud = pud_offset(pgd, vaddr);
	pmd = pmd_offset(pud, vaddr);
	set_pmd(pmd, pfn_pmd(pfn, flags));
	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}
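
/*
 * Unless CONFIG_COMPAT_VDSO pins the fixmap, __FIXADDR_TOP is a variable
 * so that reserve_top_address() below can lower it and leave a hole at
 * the top of the kernel address space (e.g. for a hypervisor).
 */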
static int fixmaps;
#ifndef CONFIG_COMPAT_VDSO
unsigned long __FIXADDR_TOP = 0xfffff000;
EXPORT_SYMBOL(__FIXADDR_TOP);
#endif
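
/*
 * Install one fixmap entry: translate the fixmap index to its virtual
 * address and map it to @phys with @flags, bumping 'fixmaps' so the
 * fixmap area can no longer be relocated afterwards.
 */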
void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
		BUG();
		return;
	}
	set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
	fixmaps++;
}

/**
 * reserve_top_address - reserves a hole in the top of kernel address space
 * @reserve - size of hole to reserve
 *
 * Can be used to relocate the fixmap area and poke a hole in the top
 * of kernel address space to make room for a hypervisor.
 */
void reserve_top_address(unsigned long reserve)
{
	BUG_ON(fixmaps > 0);
#ifdef CONFIG_COMPAT_VDSO
	BUG_ON(reserve != 0);
#else
	__FIXADDR_TOP = -reserve - PAGE_SIZE;
	__VMALLOC_RESERVE += reserve;
#endif
}
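
/*
 * PTE page allocation: kernel page tables come from lowmem and are
 * returned as a virtual address, while user PTE pages are returned as a
 * struct page and may be placed in highmem when CONFIG_HIGHPTE is set.
 * Both are zero-filled so every entry starts out not present.
 */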
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
	return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
}

struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
	struct page *pte;

#ifdef CONFIG_HIGHPTE
	pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
#else
	pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
#endif
	return pte;
}
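
/* Constructor for the PAE pmd cache: a freshly allocated pmd starts out empty. */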
void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
{
	memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
}

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * The locking scheme was chosen on the basis of manfred's
 * recommendations and having no core impact whatsoever.
 * -- wli
 */
DEFINE_SPINLOCK(pgd_lock);
struct page *pgd_list;
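
/*
 * pgd_list is a singly linked list threaded through the pgd pages'
 * struct page: page->index holds the next page, while page->private
 * holds the address of whatever pointer currently references the page
 * (&pgd_list for the head, the previous page's index field otherwise),
 * so a pgd can be unlinked without walking the list.
 */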
static inline void pgd_list_add(pgd_t *pgd)
{
	struct page *page = virt_to_page(pgd);

	page->index = (unsigned long)pgd_list;
	if (pgd_list)
		set_page_private(pgd_list, (unsigned long)&page->index);
	pgd_list = page;
	set_page_private(page, (unsigned long)&pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
	struct page *next, **pprev, *page = virt_to_page(pgd);

	next = (struct page *)page->index;
	pprev = (struct page **)page_private(page);
	*pprev = next;
	if (next)
		set_page_private(next, (unsigned long)pprev);
}
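
/*
 * pgd_cache constructor: every new pgd gets the kernel part of
 * swapper_pg_dir copied into it. In the non-PAE case the user entries
 * are cleared and the pgd is linked into pgd_list under pgd_lock, so
 * that later changes to kernel page tables (see the comment above
 * pgd_lock) can be propagated to every pgd; with PAE the kernel pmd is
 * shared and no list insertion is needed.
 */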
void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
{
	unsigned long flags;

	if (PTRS_PER_PMD == 1) {
		memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
		spin_lock_irqsave(&pgd_lock, flags);
	}

	clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
			swapper_pg_dir + USER_PTRS_PER_PGD,
			KERNEL_PGD_PTRS);

	if (PTRS_PER_PMD > 1)
		return;

	pgd_list_add(pgd);
	spin_unlock_irqrestore(&pgd_lock, flags);
}

/* never called when PTRS_PER_PMD > 1 */
void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
{
	unsigned long flags; /* can be called from interrupt context */

	spin_lock_irqsave(&pgd_lock, flags);
	pgd_list_del(pgd);
	spin_unlock_irqrestore(&pgd_lock, flags);
}
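
/*
 * With PAE (PTRS_PER_PMD > 1) each user pgd entry gets its own
 * preallocated pmd page; the pgd entry is set to the pmd's physical
 * address with the low (present) bit or'd in, which is why pgd_free()
 * below subtracts 1 before converting back with __va().
 */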
pgd_t *pgd_alloc(struct mm_struct *mm)
{
	int i;
	pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);

	if (PTRS_PER_PMD == 1 || !pgd)
		return pgd;

	for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
		pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
		if (!pmd)
			goto out_oom;
		set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
	}
	return pgd;

out_oom:
	for (i--; i >= 0; i--)
		kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
	kmem_cache_free(pgd_cache, pgd);
	return NULL;
}

void pgd_free(pgd_t *pgd)
{
	int i;

	/* in the PAE case user pgd entries are overwritten before usage */
	if (PTRS_PER_PMD > 1)
		for (i = 0; i < USER_PTRS_PER_PGD; ++i)
			kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
	/* in the non-PAE case, free_pgtables() clears user pgd entries */
	kmem_cache_free(pgd_cache, pgd);
}