2005-04-16 15:20:36 -07:00
/*
 *  linux/mm/page_io.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95,
 *  Asynchronous swapping added 30.12.95. Stephen Tweedie
 *  Removed race in async swapping. 14.4.1996. Bruno Haible
 *  Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
 *  Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman
 */
# include <linux/mm.h>
# include <linux/kernel_stat.h>
# include <linux/pagemap.h>
# include <linux/swap.h>
# include <linux/bio.h>
# include <linux/swapops.h>
# include <linux/writeback.h>
# include <asm/pgtable.h>
2005-10-07 07:46:04 +01:00
/*
 * Build a one-page bio aimed at the swap slot encoded in @index.
 *
 * @gfp_flags: allocation flags handed to bio_alloc()
 * @index:     raw swp_entry_t value; its type selects the swap area and
 *             its offset selects the slot within that area
 * @page:      the page to attach as the bio's only segment
 * @end_io:    completion callback stored in the bio
 *
 * Returns the initialised bio, or NULL if bio_alloc() failed.
 */
static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index,
				struct page *page, bio_end_io_t end_io)
{
	struct swap_info_struct *sis;
	swp_entry_t entry = { .val = index, };
	struct bio *bio = bio_alloc(gfp_flags, 1);

	if (!bio)
		return NULL;

	sis = get_swap_info_struct(swp_type(entry));

	/* PAGE_SIZE >> 9 is the number of 512-byte units in one page. */
	bio->bi_sector = map_swap_page(sis, swp_offset(entry)) *
			 (PAGE_SIZE >> 9);
	bio->bi_bdev = sis->bdev;

	/* Exactly one segment: the whole page, starting at offset 0. */
	bio->bi_io_vec[0].bv_page = page;
	bio->bi_io_vec[0].bv_len = PAGE_SIZE;
	bio->bi_io_vec[0].bv_offset = 0;
	bio->bi_vcnt = 1;
	bio->bi_idx = 0;
	bio->bi_size = PAGE_SIZE;

	bio->bi_end_io = end_io;
	return bio;
}
2007-09-27 12:47:43 +02:00
static void end_swap_bio_write ( struct bio * bio , int err )
2005-04-16 15:20:36 -07:00
{
const int uptodate = test_bit ( BIO_UPTODATE , & bio - > bi_flags ) ;
struct page * page = bio - > bi_io_vec [ 0 ] . bv_page ;
2006-09-25 23:31:26 -07:00
if ( ! uptodate ) {
2005-04-16 15:20:36 -07:00
SetPageError ( page ) ;
2006-09-25 23:31:26 -07:00
/*
* We failed to write the page out to swap - space .
* Re - dirty the page in order to avoid it being reclaimed .
* Also print a dire warning that things will go BAD ( tm )
* very quickly .
*
* Also clear PG_reclaim to avoid rotate_reclaimable_page ( )
*/
set_page_dirty ( page ) ;
printk ( KERN_ALERT " Write-error on swap-device (%u:%u:%Lu) \n " ,
imajor ( bio - > bi_bdev - > bd_inode ) ,
iminor ( bio - > bi_bdev - > bd_inode ) ,
( unsigned long long ) bio - > bi_sector ) ;
ClearPageReclaim ( page ) ;
}
2005-04-16 15:20:36 -07:00
end_page_writeback ( page ) ;
bio_put ( bio ) ;
}
2007-09-27 12:47:43 +02:00
void end_swap_bio_read ( struct bio * bio , int err )
2005-04-16 15:20:36 -07:00
{
const int uptodate = test_bit ( BIO_UPTODATE , & bio - > bi_flags ) ;
struct page * page = bio - > bi_io_vec [ 0 ] . bv_page ;
if ( ! uptodate ) {
SetPageError ( page ) ;
ClearPageUptodate ( page ) ;
2006-09-25 23:31:26 -07:00
printk ( KERN_ALERT " Read-error on swap-device (%u:%u:%Lu) \n " ,
imajor ( bio - > bi_bdev - > bd_inode ) ,
iminor ( bio - > bi_bdev - > bd_inode ) ,
( unsigned long long ) bio - > bi_sector ) ;
2005-04-16 15:20:36 -07:00
} else {
SetPageUptodate ( page ) ;
}
unlock_page ( page ) ;
bio_put ( bio ) ;
}
/*
* We may have stale swap cache pages in memory : notice
* them here and get rid of the unnecessary final write .
*/
int swap_writepage ( struct page * page , struct writeback_control * wbc )
{
struct bio * bio ;
int ret = 0 , rw = WRITE ;
if ( remove_exclusive_swap_page ( page ) ) {
unlock_page ( page ) ;
goto out ;
}
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 18:16:40 -07:00
bio = get_swap_bio ( GFP_NOIO , page_private ( page ) , page ,
end_swap_bio_write ) ;
2005-04-16 15:20:36 -07:00
if ( bio = = NULL ) {
set_page_dirty ( page ) ;
unlock_page ( page ) ;
ret = - ENOMEM ;
goto out ;
}
if ( wbc - > sync_mode = = WB_SYNC_ALL )
rw | = ( 1 < < BIO_RW_SYNC ) ;
2006-06-30 01:55:45 -07:00
count_vm_event ( PSWPOUT ) ;
2005-04-16 15:20:36 -07:00
set_page_writeback ( page ) ;
unlock_page ( page ) ;
submit_bio ( rw , bio ) ;
out :
return ret ;
}
int swap_readpage ( struct file * file , struct page * page )
{
struct bio * bio ;
int ret = 0 ;
BUG_ON ( ! PageLocked ( page ) ) ;
mm: fix PageUptodate data race
After running SetPageUptodate, preceeding stores to the page contents to
actually bring it uptodate may not be ordered with the store to set the
page uptodate.
Therefore, another CPU which checks PageUptodate is true, then reads the
page contents can get stale data.
Fix this by having an smp_wmb before SetPageUptodate, and smp_rmb after
PageUptodate.
Many places that test PageUptodate, do so with the page locked, and this
would be enough to ensure memory ordering in those places if
SetPageUptodate were only called while the page is locked. Unfortunately
that is not always the case for some filesystems, but it could be an idea
for the future.
Also bring the handling of anonymous page uptodateness in line with that of
file backed page management, by marking anon pages as uptodate when they
_are_ uptodate, rather than when our implementation requires that they be
marked as such. Doing allows us to get rid of the smp_wmb's in the page
copying functions, which were especially added for anonymous pages for an
analogous memory ordering problem. Both file and anonymous pages are
handled with the same barriers.
FAQ:
Q. Why not do this in flush_dcache_page?
A. Firstly, flush_dcache_page handles only one side (the smb side) of the
ordering protocol; we'd still need smp_rmb somewhere. Secondly, hiding away
memory barriers in a completely unrelated function is nasty; at least in the
PageUptodate macros, they are located together with (half) the operations
involved in the ordering. Thirdly, the smp_wmb is only required when first
bringing the page uptodate, wheras flush_dcache_page should be called each time
it is written to through the kernel mapping. It is logically the wrong place to
put it.
Q. Why does this increase my text size / reduce my performance / etc.
A. Because it is adding the necessary instructions to eliminate the data-race.
Q. Can it be improved?
A. Yes, eg. if you were to create a rule that all SetPageUptodate operations
run under the page lock, we could avoid the smp_rmb places where PageUptodate
is queried under the page lock. Requires audit of all filesystems and at least
some would need reworking. That's great you're interested, I'm eagerly awaiting
your patches.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-04 22:29:34 -08:00
BUG_ON ( PageUptodate ( page ) ) ;
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 18:16:40 -07:00
bio = get_swap_bio ( GFP_KERNEL , page_private ( page ) , page ,
end_swap_bio_read ) ;
2005-04-16 15:20:36 -07:00
if ( bio = = NULL ) {
unlock_page ( page ) ;
ret = - ENOMEM ;
goto out ;
}
2006-06-30 01:55:45 -07:00
count_vm_event ( PSWPIN ) ;
2005-04-16 15:20:36 -07:00
submit_bio ( READ , bio ) ;
out :
return ret ;
}