/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

static void enqueue_huge_page(struct page *page)
{
        int nid = page_to_nid(page);
        list_add(&page->lru, &hugepage_freelists[nid]);
        free_huge_pages++;
        free_huge_pages_node[nid]++;
}

static struct page *dequeue_huge_page(struct vm_area_struct *vma,
                                unsigned long address)
{
        int nid = numa_node_id();
        struct page *page = NULL;
        struct zonelist *zonelist = huge_zonelist(vma, address);
        struct zone **z;

        for (z = zonelist->zones; *z; z++) {
                nid = (*z)->zone_pgdat->node_id;
                if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
                    !list_empty(&hugepage_freelists[nid]))
                        break;
        }

        if (*z) {
                page = list_entry(hugepage_freelists[nid].next,
                                  struct page, lru);
                list_del(&page->lru);
                free_huge_pages--;
                free_huge_pages_node[nid]--;
        }
        return page;
}

static struct page *alloc_fresh_huge_page(void)
{
        static int nid = 0;
        struct page *page;

        page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
                                HUGETLB_PAGE_ORDER);
        nid = (nid + 1) % num_online_nodes();
        if (page) {
                spin_lock(&hugetlb_lock);
                nr_huge_pages++;
                nr_huge_pages_node[page_to_nid(page)]++;
                spin_unlock(&hugetlb_lock);
        }
        return page;
}

/*
 * Compound-page destructor: alloc_huge_page() stores a pointer to this
 * function in page[1].lru.next, so it is called when the last reference
 * to a huge page is dropped.
 */
void free_huge_page(struct page *page)
{
        BUG_ON(page_count(page));

        INIT_LIST_HEAD(&page->lru);
        page[1].lru.next = NULL;                        /* reset dtor */

        spin_lock(&hugetlb_lock);
        enqueue_huge_page(page);
        spin_unlock(&hugetlb_lock);
}

struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
{
        struct page *page;
        int i;

        spin_lock(&hugetlb_lock);
        page = dequeue_huge_page(vma, addr);
        if (!page) {
                spin_unlock(&hugetlb_lock);
                return NULL;
        }
        spin_unlock(&hugetlb_lock);
        set_page_count(page, 1);
        page[1].lru.next = (void *)free_huge_page;      /* set dtor */
        for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); ++i)
                clear_user_highpage(&page[i], addr);
        return page;
}

static int __init hugetlb_init(void)
{
        unsigned long i;
        struct page *page;

        if (HPAGE_SHIFT == 0)
                return 0;

        for (i = 0; i < MAX_NUMNODES; ++i)
                INIT_LIST_HEAD(&hugepage_freelists[i]);

        for (i = 0; i < max_huge_pages; ++i) {
                page = alloc_fresh_huge_page();
                if (!page)
                        break;
                spin_lock(&hugetlb_lock);
                enqueue_huge_page(page);
                spin_unlock(&hugetlb_lock);
        }
        max_huge_pages = free_huge_pages = nr_huge_pages = i;
        printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
        return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
        if (sscanf(s, "%lu", &max_huge_pages) <= 0)
                max_huge_pages = 0;
        return 1;
}
__setup("hugepages=", hugetlb_setup);
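
/*
 * Example (illustrative value): booting with "hugepages=64" on the kernel
 * command line makes hugetlb_init() try to preallocate 64 huge pages; if
 * fewer can be allocated, max_huge_pages is lowered to the count obtained.
 */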

#ifdef CONFIG_SYSCTL
static void update_and_free_page(struct page *page)
{
        int i;

        nr_huge_pages--;
        nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
        for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
                page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
                                1 << PG_referenced | 1 << PG_dirty |
                                1 << PG_active | 1 << PG_reserved |
                                1 << PG_private | 1 << PG_writeback);
                set_page_count(&page[i], 0);
        }
        set_page_count(page, 1);
        __free_pages(page, HUGETLB_PAGE_ORDER);
}

#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
        int i, nid;

        for (i = 0; i < MAX_NUMNODES; ++i) {
                struct page *page, *next;
                list_for_each_entry_safe(page, next,
                                         &hugepage_freelists[i], lru) {
                        if (PageHighMem(page))
                                continue;
                        list_del(&page->lru);
                        update_and_free_page(page);
                        nid = page_zone(page)->zone_pgdat->node_id;
                        free_huge_pages--;
                        free_huge_pages_node[nid]--;
                        if (count >= nr_huge_pages)
                                return;
                }
        }
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

static unsigned long set_max_huge_pages(unsigned long count)
{
        while (count > nr_huge_pages) {
                struct page *page = alloc_fresh_huge_page();
                if (!page)
                        return nr_huge_pages;
                spin_lock(&hugetlb_lock);
                enqueue_huge_page(page);
                spin_unlock(&hugetlb_lock);
        }
        if (count >= nr_huge_pages)
                return nr_huge_pages;

        spin_lock(&hugetlb_lock);
        try_to_free_low(count);
        while (count < nr_huge_pages) {
                struct page *page = dequeue_huge_page(NULL, 0);
                if (!page)
                        break;
                update_and_free_page(page);
        }
        spin_unlock(&hugetlb_lock);
        return nr_huge_pages;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
                           struct file *file, void __user *buffer,
                           size_t *length, loff_t *ppos)
{
        proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
        max_huge_pages = set_max_huge_pages(max_huge_pages);
        return 0;
}
#endif /* CONFIG_SYSCTL */
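
/*
 * The handler above is hooked up outside this file as the nr_hugepages
 * sysctl, so e.g. "echo 32 > /proc/sys/vm/nr_hugepages" (illustrative
 * value) ends up in set_max_huge_pages(32); the value that can then be
 * read back is the pool size actually reached.
 */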

int hugetlb_report_meminfo(char *buf)
{
        return sprintf(buf,
                        "HugePages_Total: %5lu\n"
                        "HugePages_Free:  %5lu\n"
                        "Hugepagesize:    %5lu kB\n",
                        nr_huge_pages,
                        free_huge_pages,
                        HPAGE_SIZE / 1024);
}
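
/*
 * Illustrative output produced by the format above (values made up;
 * Hugepagesize depends on the architecture's HPAGE_SIZE):
 *
 *      HugePages_Total:    64
 *      HugePages_Free:     10
 *      Hugepagesize:     2048 kB
 */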

int hugetlb_report_node_meminfo(int nid, char *buf)
{
        return sprintf(buf,
                "Node %d HugePages_Total: %5u\n"
                "Node %d HugePages_Free:  %5u\n",
                nid, nr_huge_pages_node[nid],
                nid, free_huge_pages_node[nid]);
}

int is_hugepage_mem_enough(size_t size)
{
        return (size + ~HPAGE_MASK) / HPAGE_SIZE <= free_huge_pages;
}
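
/*
 * Worked example (assuming 2 MB huge pages, i.e. HPAGE_SIZE == 2 MB):
 * ~HPAGE_MASK equals HPAGE_SIZE - 1, so a request of size = 5 MB becomes
 * (5 MB + 2 MB - 1) / 2 MB = 3, i.e. at least 3 free huge pages are needed.
 */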

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
        return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we
 * get this far.
 */
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
                                unsigned long address, int *unused)
{
        BUG();
        return NULL;
}

struct vm_operations_struct hugetlb_vm_ops = {
        .nopage = hugetlb_nopage,
};

static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
                                int writable)
{
        pte_t entry;

        if (writable) {
                entry =
                    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
        } else {
                entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
        }
        entry = pte_mkyoung(entry);
        entry = pte_mkhuge(entry);

        return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
                                   unsigned long address, pte_t *ptep)
{
        pte_t entry;

        entry = pte_mkwrite(pte_mkdirty(*ptep));
        ptep_set_access_flags(vma, address, ptep, entry, 1);
        update_mmu_cache(vma, address, entry);
        lazy_mmu_prot_update(entry);
}

int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                            struct vm_area_struct *vma)
{
        pte_t *src_pte, *dst_pte, entry;
        struct page *ptepage;
        unsigned long addr;
        int cow;

        cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

        for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
                src_pte = huge_pte_offset(src, addr);
                if (!src_pte)
                        continue;
                dst_pte = huge_pte_alloc(dst, addr);
                if (!dst_pte)
                        goto nomem;
                spin_lock(&dst->page_table_lock);
                spin_lock(&src->page_table_lock);
                if (!pte_none(*src_pte)) {
                        if (cow)
                                ptep_set_wrprotect(src, addr, src_pte);
                        entry = *src_pte;
                        ptepage = pte_page(entry);
                        get_page(ptepage);
                        add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
                        set_huge_pte_at(dst, addr, dst_pte, entry);
                }
                spin_unlock(&src->page_table_lock);
                spin_unlock(&dst->page_table_lock);
        }
        return 0;

nomem:
        return -ENOMEM;
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
                          unsigned long end)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
        pte_t *ptep;
        pte_t pte;
        struct page *page;

        WARN_ON(!is_vm_hugetlb_page(vma));
        BUG_ON(start & ~HPAGE_MASK);
        BUG_ON(end & ~HPAGE_MASK);

        spin_lock(&mm->page_table_lock);

        /* Update high watermark before we lower rss */
        update_hiwater_rss(mm);

        for (address = start; address < end; address += HPAGE_SIZE) {
                ptep = huge_pte_offset(mm, address);
                if (!ptep)
                        continue;

                pte = huge_ptep_get_and_clear(mm, address, ptep);
                if (pte_none(pte))
                        continue;

                page = pte_page(pte);
                put_page(page);
                add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
        }

        spin_unlock(&mm->page_table_lock);
        flush_tlb_range(vma, start, end);
}

static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pte_t *ptep, pte_t pte)
{
        struct page *old_page, *new_page;
        int i, avoidcopy;

        old_page = pte_page(pte);

        /* If no-one else is actually using this page, avoid the copy
         * and just make the page writable */
        avoidcopy = (page_count(old_page) == 1);
        if (avoidcopy) {
                set_huge_ptep_writable(vma, address, ptep);
                return VM_FAULT_MINOR;
        }

        page_cache_get(old_page);
        new_page = alloc_huge_page(vma, address);

        if (!new_page) {
                page_cache_release(old_page);
                return VM_FAULT_OOM;
        }

        spin_unlock(&mm->page_table_lock);
        for (i = 0; i < HPAGE_SIZE / PAGE_SIZE; i++)
                copy_user_highpage(new_page + i, old_page + i,
                                   address + i * PAGE_SIZE);
        spin_lock(&mm->page_table_lock);

        ptep = huge_pte_offset(mm, address & HPAGE_MASK);
        if (likely(pte_same(*ptep, pte))) {
                /* Break COW */
                set_huge_pte_at(mm, address, ptep,
                                make_huge_pte(vma, new_page, 1));
                /* Make the old page be freed below */
                new_page = old_page;
        }
        page_cache_release(new_page);
        page_cache_release(old_page);
        return VM_FAULT_MINOR;
}

int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pte_t *ptep, int write_access)
{
        int ret = VM_FAULT_SIGBUS;
        unsigned long idx;
        unsigned long size;
        struct page *page;
        struct address_space *mapping;
        pte_t new_pte;

        mapping = vma->vm_file->f_mapping;
        /* Page-cache index of the faulting huge page, in HPAGE_SIZE units */
        idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
                + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

        /*
         * Use page lock to guard against racing truncation
         * before we get page_table_lock.
         */
retry:
        page = find_lock_page(mapping, idx);
        if (!page) {
                if (hugetlb_get_quota(mapping))
                        goto out;
                page = alloc_huge_page(vma, address);
                if (!page) {
                        hugetlb_put_quota(mapping);
                        ret = VM_FAULT_OOM;
                        goto out;
                }

                if (vma->vm_flags & VM_SHARED) {
                        int err;

                        err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
                        if (err) {
                                put_page(page);
                                hugetlb_put_quota(mapping);
                                if (err == -EEXIST)
                                        goto retry;
                                goto out;
                        }
                } else
                        lock_page(page);
        }

        spin_lock(&mm->page_table_lock);
        size = i_size_read(mapping->host) >> HPAGE_SHIFT;
        if (idx >= size)
                goto backout;

        ret = VM_FAULT_MINOR;
        if (!pte_none(*ptep))
                goto backout;

        add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
        new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
                                && (vma->vm_flags & VM_SHARED)));
        set_huge_pte_at(mm, address, ptep, new_pte);

        if (write_access && !(vma->vm_flags & VM_SHARED)) {
                /* Optimization, do the COW without a second fault */
                ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
        }

        spin_unlock(&mm->page_table_lock);
        unlock_page(page);
out:
        return ret;

backout:
        spin_unlock(&mm->page_table_lock);
        hugetlb_put_quota(mapping);
        unlock_page(page);
        put_page(page);
        goto out;
}

int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, int write_access)
{
        pte_t *ptep;
        pte_t entry;
        int ret;

        ptep = huge_pte_alloc(mm, address);
        if (!ptep)
                return VM_FAULT_OOM;

        entry = *ptep;
        if (pte_none(entry))
                return hugetlb_no_page(mm, vma, address, ptep, write_access);

        ret = VM_FAULT_MINOR;

        spin_lock(&mm->page_table_lock);
        /* Check for a racing update before calling hugetlb_cow */
        if (likely(pte_same(entry, *ptep)))
                if (write_access && !pte_write(entry))
                        ret = hugetlb_cow(mm, vma, address, ptep, entry);
        spin_unlock(&mm->page_table_lock);

        return ret;
}

int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        struct page **pages, struct vm_area_struct **vmas,
                        unsigned long *position, int *length, int i)
{
        unsigned long vpfn, vaddr = *position;
        int remainder = *length;

        vpfn = vaddr / PAGE_SIZE;
        spin_lock(&mm->page_table_lock);
        while (vaddr < vma->vm_end && remainder) {
                pte_t *pte;
                struct page *page;

                /*
                 * Some archs (sparc64, sh*) have multiple pte_ts to
                 * each hugepage.  We have to make sure we get the
                 * first, for the page indexing below to work.
                 */
                pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

                if (!pte || pte_none(*pte)) {
                        int ret;

                        spin_unlock(&mm->page_table_lock);
                        ret = hugetlb_fault(mm, vma, vaddr, 0);
                        spin_lock(&mm->page_table_lock);
                        if (ret == VM_FAULT_MINOR)
                                continue;

                        remainder = 0;
                        if (!i)
                                i = -EFAULT;
                        break;
                }

                if (pages) {
                        page = &pte_page(*pte)[vpfn % (HPAGE_SIZE / PAGE_SIZE)];
                        get_page(page);
                        pages[i] = page;
                }

                if (vmas)
                        vmas[i] = vma;

                vaddr += PAGE_SIZE;
                ++vpfn;
                --remainder;
                ++i;
        }
        spin_unlock(&mm->page_table_lock);
        *length = remainder;
        *position = vaddr;

        return i;
}