2005-04-17 02:20:36 +04:00
/*
2006-10-04 01:01:26 +04:00
 * mm/page-writeback.c
2005-04-17 02:20:36 +04:00
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains functions related to writing back dirty pages at the
 * address_space level.
 *
 * 10Apr2002	akpm@zip.com.au
 *		Initial version
 */
# include <linux/kernel.h>
# include <linux/module.h>
# include <linux/spinlock.h>
# include <linux/fs.h>
# include <linux/mm.h>
# include <linux/swap.h>
# include <linux/slab.h>
# include <linux/pagemap.h>
# include <linux/writeback.h>
# include <linux/init.h>
# include <linux/backing-dev.h>
# include <linux/blkdev.h>
# include <linux/mpage.h>
2006-09-26 10:30:57 +04:00
# include <linux/rmap.h>
2005-04-17 02:20:36 +04:00
# include <linux/percpu.h>
# include <linux/notifier.h>
# include <linux/smp.h>
# include <linux/sysctl.h>
# include <linux/cpu.h>
# include <linux/syscalls.h>
2006-08-29 22:05:54 +04:00
# include <linux/buffer_head.h>
2006-08-29 22:06:09 +04:00
# include <linux/pagevec.h>
2005-04-17 02:20:36 +04:00
/*
 * Upper bound on the number of pages written out by a single
 * bdflush/kupdate pass.  Capping the batch keeps I_LOCK from being held
 * against one inode for very long, which would stall a userspace task that
 * has been forced to throttle against that inode.  The writeback loops also
 * re-evaluate the dirty state after each batch of this size.
 */
#define MAX_WRITEBACK_PAGES	1024

/*
 * Number of pages a CPU may dirty before balance_dirty_pages_ratelimited
 * checks whether writeback or throttling has to be forced.
 */
static long ratelimit_pages = 32;
2006-01-19 04:42:26 +03:00
/*
 * Non-zero while dirty memory may be over the limit.  Set and cleared by
 * balance_dirty_pages(); read by the ratelimiting path to tighten its
 * per-CPU threshold.
 */
static int dirty_exceeded __cacheline_aligned_in_smp;
2005-04-17 02:20:36 +04:00
/*
 * Number of pages balance_dirty_pages() tries to write when it decides the
 * caller must do non-background writeback itself.  This is one and a half
 * times ratelimit_pages, so that reasonably large amounts of I/O are
 * submitted.
 */
static inline long sync_writeback_pages(void)
{
	long base = ratelimit_pages;

	return base + base / 2;
}
/* The following parameters are exported via /proc/sys/vm */

/*
 * Percentage of memory at which background writeback (via pdflush) starts.
 */
int dirty_background_ratio = 10;

/*
 * Percentage of memory at which the task generating dirty data starts
 * writing it back itself.
 */
int vm_dirty_ratio = 40;
/*
2006-04-11 09:54:35 +04:00
* The interval between ` kupdate ' - style writebacks , in jiffies
2005-04-17 02:20:36 +04:00
*/
2006-03-24 14:15:48 +03:00
int dirty_writeback_interval = 5 * HZ ;
2005-04-17 02:20:36 +04:00
/*
2006-04-11 09:54:35 +04:00
* The longest number of jiffies for which data is allowed to remain dirty
2005-04-17 02:20:36 +04:00
*/
2006-03-24 14:15:48 +03:00
int dirty_expire_interval = 30 * HZ ;
2005-04-17 02:20:36 +04:00
/*
* Flag that makes the machine dump writes / reads and block dirtyings .
*/
int block_dump ;
/*
2006-03-24 14:15:49 +03:00
* Flag that puts the machine in " laptop mode " . Doubles as a timeout in jiffies :
* a full sync is triggered after this time elapses without any disk activity .
2005-04-17 02:20:36 +04:00
*/
int laptop_mode ;
EXPORT_SYMBOL ( laptop_mode ) ;
/* End of sysctl-exported parameters */
static void background_writeout ( unsigned long _min_pages ) ;
/*
* Work out the current dirty - memory clamping and background writeout
* thresholds .
*
* The main aim here is to lower them aggressively if there is a lot of mapped
* memory around . To avoid stressing page reclaim with lots of unreclaimable
* pages . It is better to clamp down on writers than to start swapping , and
* performing lots of scanning .
*
* We only allow 1 / 2 of the currently - unmapped memory to be dirtied .
*
* We don ' t permit the clamping level to fall below 5 % - that is getting rather
* excessive .
*
* We make sure that the background writeout level is below the adjusted
* clamping level .
*/
static void
2006-06-30 12:55:42 +04:00
get_dirty_limits ( long * pbackground , long * pdirty ,
struct address_space * mapping )
2005-04-17 02:20:36 +04:00
{
int background_ratio ; /* Percentages */
int dirty_ratio ;
int unmapped_ratio ;
long background ;
long dirty ;
2006-09-29 13:01:24 +04:00
unsigned long available_memory = vm_total_pages ;
2005-04-17 02:20:36 +04:00
struct task_struct * tsk ;
# ifdef CONFIG_HIGHMEM
/*
* If this mapping can only allocate from low memory ,
* we exclude high memory from our count .
*/
if ( mapping & & ! ( mapping_gfp_mask ( mapping ) & __GFP_HIGHMEM ) )
available_memory - = totalhigh_pages ;
# endif
2006-06-30 12:55:42 +04:00
unmapped_ratio = 100 - ( ( global_page_state ( NR_FILE_MAPPED ) +
global_page_state ( NR_ANON_PAGES ) ) * 100 ) /
2006-09-29 13:01:24 +04:00
vm_total_pages ;
2005-04-17 02:20:36 +04:00
dirty_ratio = vm_dirty_ratio ;
if ( dirty_ratio > unmapped_ratio / 2 )
dirty_ratio = unmapped_ratio / 2 ;
if ( dirty_ratio < 5 )
dirty_ratio = 5 ;
background_ratio = dirty_background_ratio ;
if ( background_ratio > = dirty_ratio )
background_ratio = dirty_ratio / 2 ;
background = ( background_ratio * available_memory ) / 100 ;
dirty = ( dirty_ratio * available_memory ) / 100 ;
tsk = current ;
if ( tsk - > flags & PF_LESS_THROTTLE | | rt_task ( tsk ) ) {
background + = background / 4 ;
dirty + = dirty / 4 ;
}
* pbackground = background ;
* pdirty = dirty ;
}
/*
* balance_dirty_pages ( ) must be called by processes which are generating dirty
* data . It looks at the number of dirty pages in the machine and will force
* the caller to perform writeback if the system is over ` vm_dirty_ratio ' .
* If we ' re over ` background_thresh ' then pdflush is woken to perform some
* writeout .
*/
static void balance_dirty_pages ( struct address_space * mapping )
{
long nr_reclaimable ;
long background_thresh ;
long dirty_thresh ;
unsigned long pages_written = 0 ;
unsigned long write_chunk = sync_writeback_pages ( ) ;
struct backing_dev_info * bdi = mapping - > backing_dev_info ;
for ( ; ; ) {
struct writeback_control wbc = {
. bdi = bdi ,
. sync_mode = WB_SYNC_NONE ,
. older_than_this = NULL ,
. nr_to_write = write_chunk ,
[PATCH] writeback: fix range handling
When a writeback_control's `start' and `end' fields are used to
indicate a one-byte-range starting at file offset zero, the required
values of .start=0,.end=0 mean that the ->writepages() implementation
has no way of telling that it is being asked to perform a range
request. Because we're currently overloading (start == 0 && end == 0)
to mean "this is not a write-a-range request".
To make all this sane, the patch changes range of writeback_control.
So caller does: If it is calling ->writepages() to write pages, it
sets range (range_start/end or range_cyclic) always.
And if range_cyclic is true, ->writepages() thinks the range is
cyclic, otherwise it just uses range_start and range_end.
This patch does,
- Add LLONG_MAX, LLONG_MIN, ULLONG_MAX to include/linux/kernel.h
-1 is usually ok for range_end (type is long long). But, if someone did,
range_end += val; range_end is "val - 1"
u64val = range_end >> bits; u64val is "~(0ULL)"
or something, they are wrong. So, this adds LLONG_MAX to avoid nasty
things, and uses LLONG_MAX for range_end.
- All callers of ->writepages() sets range_start/end or range_cyclic.
- Fix updates of ->writeback_index. It seems already bit strange.
If it starts at 0 and ended by check of nr_to_write, this last
index may reduce chance to scan end of file. So, this updates
->writeback_index only if range_cyclic is true or whole-file is
scanned.
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Nathan Scott <nathans@sgi.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: Steven French <sfrench@us.ibm.com>
Cc: "Vladimir V. Saveliev" <vs@namesys.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 13:03:26 +04:00
. range_cyclic = 1 ,
2005-04-17 02:20:36 +04:00
} ;
2006-06-30 12:55:42 +04:00
get_dirty_limits ( & background_thresh , & dirty_thresh , mapping ) ;
nr_reclaimable = global_page_state ( NR_FILE_DIRTY ) +
global_page_state ( NR_UNSTABLE_NFS ) ;
if ( nr_reclaimable + global_page_state ( NR_WRITEBACK ) < =
dirty_thresh )
break ;
2005-04-17 02:20:36 +04:00
2006-01-19 04:42:26 +03:00
if ( ! dirty_exceeded )
dirty_exceeded = 1 ;
2005-04-17 02:20:36 +04:00
/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
* Unstable writes are a feature of certain networked
* filesystems ( i . e . NFS ) in which data may have been
* written to the server ' s write cache , but has not yet
* been flushed to permanent storage .
*/
if ( nr_reclaimable ) {
writeback_inodes ( & wbc ) ;
2006-06-30 12:55:42 +04:00
get_dirty_limits ( & background_thresh ,
& dirty_thresh , mapping ) ;
nr_reclaimable = global_page_state ( NR_FILE_DIRTY ) +
global_page_state ( NR_UNSTABLE_NFS ) ;
if ( nr_reclaimable +
global_page_state ( NR_WRITEBACK )
< = dirty_thresh )
break ;
2005-04-17 02:20:36 +04:00
pages_written + = write_chunk - wbc . nr_to_write ;
if ( pages_written > = write_chunk )
break ; /* We've done our duty */
}
blk_congestion_wait ( WRITE , HZ / 10 ) ;
}
2006-06-30 12:55:42 +04:00
if ( nr_reclaimable + global_page_state ( NR_WRITEBACK )
< = dirty_thresh & & dirty_exceeded )
dirty_exceeded = 0 ;
2005-04-17 02:20:36 +04:00
if ( writeback_in_progress ( bdi ) )
return ; /* pdflush is already working this queue */
/*
* In laptop mode , we wait until hitting the higher threshold before
* starting background writeout , and then write out all the way down
* to the lower threshold . So slow writers cause minimal disk activity .
*
* In normal mode , we start background writeout at the lower
* background_thresh , to keep the amount of dirty memory low .
*/
if ( ( laptop_mode & & pages_written ) | |
( ! laptop_mode & & ( nr_reclaimable > background_thresh ) ) )
pdflush_operation ( background_writeout , 0 ) ;
}
2006-09-26 10:30:58 +04:00
/*
 * Mark @page dirty and, if set_page_dirty() returned non-zero and the page
 * has a mapping, run the rate-limited dirty balancing for that mapping.
 */
void set_page_dirty_balance(struct page *page)
{
	struct address_space *mapping;

	if (!set_page_dirty(page))
		return;

	mapping = page_mapping(page);
	if (mapping)
		balance_dirty_pages_ratelimited(mapping);
}
2005-04-17 02:20:36 +04:00
/**
2006-03-24 14:18:10 +03:00
* balance_dirty_pages_ratelimited_nr - balance dirty memory state
2005-05-01 19:59:26 +04:00
* @ mapping : address_space which was dirtied
2006-04-02 15:59:55 +04:00
* @ nr_pages_dirtied : number of pages which the caller has just dirtied
2005-04-17 02:20:36 +04:00
*
* Processes which are dirtying memory should call in here once for each page
* which was newly dirtied . The function will periodically check the system ' s
* dirty state and will initiate writeback if needed .
*
* On really big machines , get_writeback_state is expensive , so try to avoid
* calling it too often ( ratelimiting ) . But once we ' re over the dirty memory
* limit we decrease the ratelimiting by a lot , to prevent individual processes
* from overshooting the limit by ( ratelimit_pages ) each .
*/
2006-03-24 14:18:10 +03:00
void balance_dirty_pages_ratelimited_nr ( struct address_space * mapping ,
unsigned long nr_pages_dirtied )
2005-04-17 02:20:36 +04:00
{
2006-03-24 14:18:10 +03:00
static DEFINE_PER_CPU ( unsigned long , ratelimits ) = 0 ;
unsigned long ratelimit ;
unsigned long * p ;
2005-04-17 02:20:36 +04:00
ratelimit = ratelimit_pages ;
if ( dirty_exceeded )
ratelimit = 8 ;
/*
* Check the rate limiting . Also , we do not want to throttle real - time
* tasks in balance_dirty_pages ( ) . Period .
*/
2006-03-24 14:18:10 +03:00
preempt_disable ( ) ;
p = & __get_cpu_var ( ratelimits ) ;
* p + = nr_pages_dirtied ;
if ( unlikely ( * p > = ratelimit ) ) {
* p = 0 ;
preempt_enable ( ) ;
2005-04-17 02:20:36 +04:00
balance_dirty_pages ( mapping ) ;
return ;
}
2006-03-24 14:18:10 +03:00
preempt_enable ( ) ;
2005-04-17 02:20:36 +04:00
}
2006-03-24 14:18:10 +03:00
EXPORT_SYMBOL ( balance_dirty_pages_ratelimited_nr ) ;
2005-04-17 02:20:36 +04:00
void throttle_vm_writeout ( void )
{
long background_thresh ;
long dirty_thresh ;
for ( ; ; ) {
2006-06-30 12:55:42 +04:00
get_dirty_limits ( & background_thresh , & dirty_thresh , NULL ) ;
2005-04-17 02:20:36 +04:00
/*
* Boost the allowable dirty threshold a bit for page
* allocators so they don ' t get DoS ' ed by heavy writers
*/
dirty_thresh + = dirty_thresh / 10 ; /* wheeee... */
2006-06-30 12:55:42 +04:00
if ( global_page_state ( NR_UNSTABLE_NFS ) +
global_page_state ( NR_WRITEBACK ) < = dirty_thresh )
break ;
2005-04-17 02:20:36 +04:00
blk_congestion_wait ( WRITE , HZ / 10 ) ;
}
}
/*
* writeback at least _min_pages , and keep writing until the amount of dirty
* memory is less than the background threshold , or until we ' re all clean .
*/
static void background_writeout ( unsigned long _min_pages )
{
long min_pages = _min_pages ;
struct writeback_control wbc = {
. bdi = NULL ,
. sync_mode = WB_SYNC_NONE ,
. older_than_this = NULL ,
. nr_to_write = 0 ,
. nonblocking = 1 ,
[PATCH] writeback: fix range handling
When a writeback_control's `start' and `end' fields are used to
indicate a one-byte-range starting at file offset zero, the required
values of .start=0,.end=0 mean that the ->writepages() implementation
has no way of telling that it is being asked to perform a range
request. Because we're currently overloading (start == 0 && end == 0)
to mean "this is not a write-a-range request".
To make all this sane, the patch changes range of writeback_control.
So caller does: If it is calling ->writepages() to write pages, it
sets range (range_start/end or range_cyclic) always.
And if range_cyclic is true, ->writepages() thinks the range is
cyclic, otherwise it just uses range_start and range_end.
This patch does,
- Add LLONG_MAX, LLONG_MIN, ULLONG_MAX to include/linux/kernel.h
-1 is usually ok for range_end (type is long long). But, if someone did,
range_end += val; range_end is "val - 1"
u64val = range_end >> bits; u64val is "~(0ULL)"
or something, they are wrong. So, this adds LLONG_MAX to avoid nasty
things, and uses LLONG_MAX for range_end.
- All callers of ->writepages() sets range_start/end or range_cyclic.
- Fix updates of ->writeback_index. It seems already bit strange.
If it starts at 0 and ended by check of nr_to_write, this last
index may reduce chance to scan end of file. So, this updates
->writeback_index only if range_cyclic is true or whole-file is
scanned.
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Nathan Scott <nathans@sgi.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: Steven French <sfrench@us.ibm.com>
Cc: "Vladimir V. Saveliev" <vs@namesys.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 13:03:26 +04:00
. range_cyclic = 1 ,
2005-04-17 02:20:36 +04:00
} ;
for ( ; ; ) {
long background_thresh ;
long dirty_thresh ;
2006-06-30 12:55:42 +04:00
get_dirty_limits ( & background_thresh , & dirty_thresh , NULL ) ;
if ( global_page_state ( NR_FILE_DIRTY ) +
global_page_state ( NR_UNSTABLE_NFS ) < background_thresh
2005-04-17 02:20:36 +04:00
& & min_pages < = 0 )
break ;
wbc . encountered_congestion = 0 ;
wbc . nr_to_write = MAX_WRITEBACK_PAGES ;
wbc . pages_skipped = 0 ;
writeback_inodes ( & wbc ) ;
min_pages - = MAX_WRITEBACK_PAGES - wbc . nr_to_write ;
if ( wbc . nr_to_write > 0 | | wbc . pages_skipped > 0 ) {
/* Wrote less than expected */
blk_congestion_wait ( WRITE , HZ / 10 ) ;
if ( ! wbc . encountered_congestion )
break ;
}
}
}
/*
* Start writeback of ` nr_pages ' pages . If ` nr_pages ' is zero , write back
* the whole world . Returns 0 if a pdflush thread was dispatched . Returns
* - 1 if all pdflush threads were busy .
*/
2005-06-29 07:44:55 +04:00
int wakeup_pdflush ( long nr_pages )
2005-04-17 02:20:36 +04:00
{
2006-06-30 12:55:42 +04:00
if ( nr_pages = = 0 )
nr_pages = global_page_state ( NR_FILE_DIRTY ) +
global_page_state ( NR_UNSTABLE_NFS ) ;
2005-04-17 02:20:36 +04:00
return pdflush_operation ( background_writeout , nr_pages ) ;
}
/* Timer callbacks, defined further down. */
static void wb_timer_fn(unsigned long unused);
static void laptop_timer_fn(unsigned long unused);

/* Periodic (kupdate-style) writeback timer and laptop-mode flush timer. */
static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
2005-04-17 02:20:36 +04:00
/*
* Periodic writeback of " old " data .
*
* Define " old " : the first time one of an inode ' s pages is dirtied , we mark the
* dirtying - time in the inode ' s address_space . So this periodic writeback code
* just walks the superblock inode list , writing back any inodes which are
* older than a specific point in time .
*
2006-03-24 14:15:48 +03:00
* Try to run once per dirty_writeback_interval . But if a writeback event
* takes longer than a dirty_writeback_interval interval , then leave a
2005-04-17 02:20:36 +04:00
* one - second gap .
*
* older_than_this takes precedence over nr_to_write . So we ' ll only write back
* all dirty pages if they are all attached to " old " mappings .
*/
static void wb_kupdate ( unsigned long arg )
{
unsigned long oldest_jif ;
unsigned long start_jif ;
unsigned long next_jif ;
long nr_to_write ;
struct writeback_control wbc = {
. bdi = NULL ,
. sync_mode = WB_SYNC_NONE ,
. older_than_this = & oldest_jif ,
. nr_to_write = 0 ,
. nonblocking = 1 ,
. for_kupdate = 1 ,
[PATCH] writeback: fix range handling
When a writeback_control's `start' and `end' fields are used to
indicate a one-byte-range starting at file offset zero, the required
values of .start=0,.end=0 mean that the ->writepages() implementation
has no way of telling that it is being asked to perform a range
request. Because we're currently overloading (start == 0 && end == 0)
to mean "this is not a write-a-range request".
To make all this sane, the patch changes range of writeback_control.
So caller does: If it is calling ->writepages() to write pages, it
sets range (range_start/end or range_cyclic) always.
And if range_cyclic is true, ->writepages() thinks the range is
cyclic, otherwise it just uses range_start and range_end.
This patch does,
- Add LLONG_MAX, LLONG_MIN, ULLONG_MAX to include/linux/kernel.h
-1 is usually ok for range_end (type is long long). But, if someone did,
range_end += val; range_end is "val - 1"
u64val = range_end >> bits; u64val is "~(0ULL)"
or something, they are wrong. So, this adds LLONG_MAX to avoid nasty
things, and uses LLONG_MAX for range_end.
- All callers of ->writepages() sets range_start/end or range_cyclic.
- Fix updates of ->writeback_index. It seems already bit strange.
If it starts at 0 and ended by check of nr_to_write, this last
index may reduce chance to scan end of file. So, this updates
->writeback_index only if range_cyclic is true or whole-file is
scanned.
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Nathan Scott <nathans@sgi.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: Steven French <sfrench@us.ibm.com>
Cc: "Vladimir V. Saveliev" <vs@namesys.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 13:03:26 +04:00
. range_cyclic = 1 ,
2005-04-17 02:20:36 +04:00
} ;
sync_supers ( ) ;
2006-03-24 14:15:48 +03:00
oldest_jif = jiffies - dirty_expire_interval ;
2005-04-17 02:20:36 +04:00
start_jif = jiffies ;
2006-03-24 14:15:48 +03:00
next_jif = start_jif + dirty_writeback_interval ;
2006-06-30 12:55:42 +04:00
nr_to_write = global_page_state ( NR_FILE_DIRTY ) +
global_page_state ( NR_UNSTABLE_NFS ) +
2005-04-17 02:20:36 +04:00
( inodes_stat . nr_inodes - inodes_stat . nr_unused ) ;
while ( nr_to_write > 0 ) {
wbc . encountered_congestion = 0 ;
wbc . nr_to_write = MAX_WRITEBACK_PAGES ;
writeback_inodes ( & wbc ) ;
if ( wbc . nr_to_write > 0 ) {
if ( wbc . encountered_congestion )
blk_congestion_wait ( WRITE , HZ / 10 ) ;
else
break ; /* All the old data is written */
}
nr_to_write - = MAX_WRITEBACK_PAGES - wbc . nr_to_write ;
}
if ( time_before ( next_jif , jiffies + HZ ) )
next_jif = jiffies + HZ ;
2006-03-24 14:15:48 +03:00
if ( dirty_writeback_interval )
2005-04-17 02:20:36 +04:00
mod_timer ( & wb_timer , next_jif ) ;
}
/*
 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
 *
 * Lets proc_dointvec_userhz_jiffies() do the actual read or write and
 * returns its result, so that a failed access is reported to the caller
 * instead of being turned into success.
 *
 * The periodic writeback timer is only touched after a successful write:
 * a non-zero interval re-arms it one interval from now, a zero interval
 * cancels it.  Reads leave the timer alone; re-arming on every read would
 * let a process that polls the sysctl keep pushing periodic writeback
 * back.
 */
int dirty_writeback_centisecs_handler(ctl_table *table, int write,
		struct file *file, void __user *buffer, size_t *length,
		loff_t *ppos)
{
	int ret;

	ret = proc_dointvec_userhz_jiffies(table, write, file, buffer,
					   length, ppos);
	if (ret || !write)
		return ret;

	if (dirty_writeback_interval)
		mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
	else
		del_timer(&wb_timer);

	return 0;
}
/*
 * wb_timer callback: hand wb_kupdate() to pdflush.  If pdflush_operation()
 * reports failure (negative return), try again in one second.
 */
static void wb_timer_fn(unsigned long unused)
{
	int rc = pdflush_operation(wb_kupdate, 0);

	if (rc < 0)
		mod_timer(&wb_timer, jiffies + HZ);	/* delay 1 second */
}
/* pdflush work function used in laptop mode: perform a full sys_sync(). */
static void laptop_flush(unsigned long unused)
{
	sys_sync();
}
/* laptop_mode_wb_timer callback: hand laptop_flush() to pdflush. */
static void laptop_timer_fn(unsigned long unused)
{
	pdflush_operation(laptop_flush, 0);
}
/*
* We ' ve spun up the disk and we ' re in laptop mode : schedule writeback
* of all dirty data a few seconds from now . If the flush is already scheduled
* then push it back - the user is still using the disk .
*/
void laptop_io_completion ( void )
{
2006-03-24 14:15:49 +03:00
mod_timer ( & laptop_mode_wb_timer , jiffies + laptop_mode ) ;
2005-04-17 02:20:36 +04:00
}
/*
 * We're in laptop mode and we've just synced.  The sync's writes will have
 * caused another writeback to be scheduled by laptop_io_completion().
 * Nothing needs to be written back anymore, so cancel that pending timer.
 */
void laptop_sync_completion(void)
{
	del_timer(&laptop_mode_wb_timer);
}
/*
* If ratelimit_pages is too high then we can get into dirty - data overload
* if a large number of processes all perform writes at the same time .
* If it is too low then SMP machines will call the ( expensive )
* get_writeback_state too often .
*
* Here we set ratelimit_pages to a level which ensures that when all CPUs are
* dirtying in parallel , we cannot go more than 3 % ( 1 / 32 ) over the dirty memory
* thresholds before writeback cuts in .
*
* But the limit should not be set too high . Because it also controls the
* amount of memory which the balance_dirty_pages ( ) caller has to write back .
* If this is too large then the caller will block on the IO queue all the
* time . So limit it to four megabytes - the balance_dirty_pages ( ) caller
* will write six megabyte chunks , max .
*/
2006-09-29 13:01:25 +04:00
void writeback_set_ratelimit ( void )
2005-04-17 02:20:36 +04:00
{
2006-09-29 13:01:24 +04:00
ratelimit_pages = vm_total_pages / ( num_online_cpus ( ) * 32 ) ;
2005-04-17 02:20:36 +04:00
if ( ratelimit_pages < 16 )
ratelimit_pages = 16 ;
if ( ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024 )
ratelimit_pages = ( 4096 * 1024 ) / PAGE_CACHE_SIZE ;
}
2006-06-27 13:54:10 +04:00
static int __cpuinit
2005-04-17 02:20:36 +04:00
ratelimit_handler ( struct notifier_block * self , unsigned long u , void * v )
{
2006-09-29 13:01:25 +04:00
writeback_set_ratelimit ( ) ;
2005-04-17 02:20:36 +04:00
return 0 ;
}
2006-06-27 13:54:09 +04:00
/* Notifier block hooking ratelimit_handler() into CPU notifications. */
static struct notifier_block __cpuinitdata ratelimit_nb = {
	.notifier_call	= ratelimit_handler,
	.next		= NULL,
};
/*
* If the machine has a large highmem : lowmem ratio then scale back the default
* dirty memory thresholds : allowing too much dirty highmem pins an excessive
* number of buffer_heads .
*/
void __init page_writeback_init ( void )
{
long buffer_pages = nr_free_buffer_pages ( ) ;
long correction ;
2006-09-29 13:01:24 +04:00
correction = ( 100 * 4 * buffer_pages ) / vm_total_pages ;
2005-04-17 02:20:36 +04:00
if ( correction < 100 ) {
dirty_background_ratio * = correction ;
dirty_background_ratio / = 100 ;
vm_dirty_ratio * = correction ;
vm_dirty_ratio / = 100 ;
if ( dirty_background_ratio < = 0 )
dirty_background_ratio = 1 ;
if ( vm_dirty_ratio < = 0 )
vm_dirty_ratio = 1 ;
}
2006-03-24 14:15:48 +03:00
mod_timer ( & wb_timer , jiffies + dirty_writeback_interval ) ;
2006-09-29 13:01:25 +04:00
writeback_set_ratelimit ( ) ;
2005-04-17 02:20:36 +04:00
register_cpu_notifier ( & ratelimit_nb ) ;
}
2006-08-29 22:06:09 +04:00
/**
* generic_writepages - walk the list of dirty pages of the given
* address space and writepage ( ) all of them .
*
* @ mapping : address space structure to write
* @ wbc : subtract the number of written pages from * @ wbc - > nr_to_write
*
* This is a library function , which implements the writepages ( )
* address_space_operation .
*
* If a page is already under I / O , generic_writepages ( ) skips it , even
* if it ' s dirty . This is desirable behaviour for memory - cleaning writeback ,
* but it is INCORRECT for data - integrity system calls such as fsync ( ) . fsync ( )
* and msync ( ) need to guarantee that all the data which was dirty at the time
* the call was made get new I / O started against them . If wbc - > sync_mode is
* WB_SYNC_ALL then we were called for data integrity and we must wait for
* existing IO to complete .
*
* Derived from mpage_writepages ( ) - if you fix this you should check that
* also !
*/
int generic_writepages ( struct address_space * mapping ,
struct writeback_control * wbc )
{
struct backing_dev_info * bdi = mapping - > backing_dev_info ;
int ret = 0 ;
int done = 0 ;
int ( * writepage ) ( struct page * page , struct writeback_control * wbc ) ;
struct pagevec pvec ;
int nr_pages ;
pgoff_t index ;
pgoff_t end ; /* Inclusive */
int scanned = 0 ;
int range_whole = 0 ;
if ( wbc - > nonblocking & & bdi_write_congested ( bdi ) ) {
wbc - > encountered_congestion = 1 ;
return 0 ;
}
writepage = mapping - > a_ops - > writepage ;
/* deal with chardevs and other special file */
if ( ! writepage )
return 0 ;
pagevec_init ( & pvec , 0 ) ;
if ( wbc - > range_cyclic ) {
index = mapping - > writeback_index ; /* Start from prev offset */
end = - 1 ;
} else {
index = wbc - > range_start > > PAGE_CACHE_SHIFT ;
end = wbc - > range_end > > PAGE_CACHE_SHIFT ;
if ( wbc - > range_start = = 0 & & wbc - > range_end = = LLONG_MAX )
range_whole = 1 ;
scanned = 1 ;
}
retry :
while ( ! done & & ( index < = end ) & &
( nr_pages = pagevec_lookup_tag ( & pvec , mapping , & index ,
PAGECACHE_TAG_DIRTY ,
min ( end - index , ( pgoff_t ) PAGEVEC_SIZE - 1 ) + 1 ) ) ) {
unsigned i ;
scanned = 1 ;
for ( i = 0 ; i < nr_pages ; i + + ) {
struct page * page = pvec . pages [ i ] ;
/*
* At this point we hold neither mapping - > tree_lock nor
* lock on the page itself : the page may be truncated or
* invalidated ( changing page - > mapping to NULL ) , or even
* swizzled back from swapper_space to tmpfs file
* mapping
*/
lock_page ( page ) ;
if ( unlikely ( page - > mapping ! = mapping ) ) {
unlock_page ( page ) ;
continue ;
}
if ( ! wbc - > range_cyclic & & page - > index > end ) {
done = 1 ;
unlock_page ( page ) ;
continue ;
}
if ( wbc - > sync_mode ! = WB_SYNC_NONE )
wait_on_page_writeback ( page ) ;
if ( PageWriteback ( page ) | |
! clear_page_dirty_for_io ( page ) ) {
unlock_page ( page ) ;
continue ;
}
ret = ( * writepage ) ( page , wbc ) ;
if ( ret ) {
if ( ret = = - ENOSPC )
set_bit ( AS_ENOSPC , & mapping - > flags ) ;
else
set_bit ( AS_EIO , & mapping - > flags ) ;
}
if ( unlikely ( ret = = AOP_WRITEPAGE_ACTIVATE ) )
unlock_page ( page ) ;
if ( ret | | ( - - ( wbc - > nr_to_write ) < = 0 ) )
done = 1 ;
if ( wbc - > nonblocking & & bdi_write_congested ( bdi ) ) {
wbc - > encountered_congestion = 1 ;
done = 1 ;
}
}
pagevec_release ( & pvec ) ;
cond_resched ( ) ;
}
if ( ! scanned & & ! done ) {
/*
* We hit the last page and there is more work to be done : wrap
* back to the start of the file
*/
scanned = 1 ;
index = 0 ;
goto retry ;
}
if ( wbc - > range_cyclic | | ( range_whole & & wbc - > nr_to_write > 0 ) )
mapping - > writeback_index = index ;
return ret ;
}
EXPORT_SYMBOL ( generic_writepages ) ;
2005-04-17 02:20:36 +04:00
int do_writepages ( struct address_space * mapping , struct writeback_control * wbc )
{
2005-11-17 02:07:01 +03:00
int ret ;
2005-04-17 02:20:36 +04:00
if ( wbc - > nr_to_write < = 0 )
return 0 ;
2005-11-17 02:07:01 +03:00
wbc - > for_writepages = 1 ;
2005-04-17 02:20:36 +04:00
if ( mapping - > a_ops - > writepages )
2006-09-26 10:30:57 +04:00
ret = mapping - > a_ops - > writepages ( mapping , wbc ) ;
2005-11-17 02:07:01 +03:00
else
ret = generic_writepages ( mapping , wbc ) ;
wbc - > for_writepages = 0 ;
return ret ;
2005-04-17 02:20:36 +04:00
}
/**
* write_one_page - write out a single page and optionally wait on I / O
*
2005-05-01 19:59:26 +04:00
* @ page : the page to write
* @ wait : if true , wait on writeout
2005-04-17 02:20:36 +04:00
*
* The page must be locked by the caller and will be unlocked upon return .
*
* write_one_page ( ) returns a negative error code if I / O failed .
*/
int write_one_page ( struct page * page , int wait )
{
struct address_space * mapping = page - > mapping ;
int ret = 0 ;
struct writeback_control wbc = {
. sync_mode = WB_SYNC_ALL ,
. nr_to_write = 1 ,
} ;
BUG_ON ( ! PageLocked ( page ) ) ;
if ( wait )
wait_on_page_writeback ( page ) ;
if ( clear_page_dirty_for_io ( page ) ) {
page_cache_get ( page ) ;
ret = mapping - > a_ops - > writepage ( page , & wbc ) ;
if ( ret = = 0 & & wait ) {
wait_on_page_writeback ( page ) ;
if ( PageError ( page ) )
ret = - EIO ;
}
page_cache_release ( page ) ;
} else {
unlock_page ( page ) ;
}
return ret ;
}
EXPORT_SYMBOL ( write_one_page ) ;
/*
* For address_spaces which do not use buffers . Just tag the page as dirty in
* its radix tree .
*
* This is also used when a single buffer is being dirtied : we want to set the
* page dirty in that case , but not all the buffers . This is a " bottom-up "
* dirtying , whereas __set_page_dirty_buffers ( ) is a " top-down " dirtying .
*
* Most callers have locked the page , which pins the address_space in memory .
* But zap_pte_range ( ) does not lock the page , however in that case the
* mapping is pinned by the vma ' s - > vm_file reference .
*
* We take care to handle the case where the page was truncated from the
* mapping by re - checking page_mapping ( ) insode tree_lock .
*/
int __set_page_dirty_nobuffers ( struct page * page )
{
if ( ! TestSetPageDirty ( page ) ) {
struct address_space * mapping = page_mapping ( page ) ;
struct address_space * mapping2 ;
if ( mapping ) {
write_lock_irq ( & mapping - > tree_lock ) ;
mapping2 = page_mapping ( page ) ;
if ( mapping2 ) { /* Race with truncate? */
BUG_ON ( mapping2 ! = mapping ) ;
if ( mapping_cap_account_dirty ( mapping ) )
2006-06-30 12:55:39 +04:00
__inc_zone_page_state ( page ,
NR_FILE_DIRTY ) ;
2005-04-17 02:20:36 +04:00
radix_tree_tag_set ( & mapping - > page_tree ,
page_index ( page ) , PAGECACHE_TAG_DIRTY ) ;
}
write_unlock_irq ( & mapping - > tree_lock ) ;
if ( mapping - > host ) {
/* !PageAnon && !swapper_space */
__mark_inode_dirty ( mapping - > host ,
I_DIRTY_PAGES ) ;
}
}
2006-03-24 14:18:11 +03:00
return 1 ;
2005-04-17 02:20:36 +04:00
}
2006-03-24 14:18:11 +03:00
return 0 ;
2005-04-17 02:20:36 +04:00
}
EXPORT_SYMBOL ( __set_page_dirty_nobuffers ) ;
/*
* When a writepage implementation decides that it doesn ' t want to write this
* page for some reason , it should redirty the locked page via
* redirty_page_for_writepage ( ) and it should then unlock the page and return 0
*/
int redirty_page_for_writepage ( struct writeback_control * wbc , struct page * page )
{
wbc - > pages_skipped + + ;
return __set_page_dirty_nobuffers ( page ) ;
}
EXPORT_SYMBOL ( redirty_page_for_writepage ) ;
/*
 * Mark a page dirty through its mapping's ->set_page_dirty a_op.
 *
 * If the mapping doesn't provide a set_page_dirty a_op, assume that it wants
 * buffer_heads and use __set_page_dirty_buffers().  When CONFIG_BLOCK is not
 * set there are no buffer_heads to fall back on, so use
 * __set_page_dirty_nobuffers() instead of calling through a NULL pointer.
 *
 * A page without a mapping only has its dirty flag set.
 *
 * Returns the value of the handler that was called.  For a page without a
 * mapping, returns 1 if the dirty flag was newly set and 0 if the page was
 * already dirty.
 */
int fastcall set_page_dirty(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	if (likely(mapping)) {
		int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;

		if (!spd) {
#ifdef CONFIG_BLOCK
			spd = __set_page_dirty_buffers;
#else
			/* No block layer, hence no buffer_heads. */
			spd = __set_page_dirty_nobuffers;
#endif
		}
		return (*spd)(page);
	}

	/* No mapping: test the flag first, then try to set it. */
	if (!PageDirty(page)) {
		if (!TestSetPageDirty(page))
			return 1;
	}
	return 0;
}
EXPORT_SYMBOL(set_page_dirty);
/*
 * set_page_dirty() is racy if the caller holds no reference against
 * page->mapping->host and the page is unlocked: another CPU could truncate
 * the page off the mapping and then free the mapping.
 *
 * Usually the page _is_ locked, or the caller is a user-space process which
 * holds a reference on the inode by having an open file.
 *
 * In the remaining cases the page should be locked before running
 * set_page_dirty(); this helper takes the page lock around the call and
 * returns set_page_dirty()'s result.
 */
int set_page_dirty_lock(struct page *page)
{
	int dirtied;

	lock_page_nosync(page);
	dirtied = set_page_dirty(page);
	unlock_page(page);

	return dirtied;
}
EXPORT_SYMBOL(set_page_dirty_lock);
/*
* Clear a page ' s dirty flag , while caring for dirty memory accounting .
* Returns true if the page was previously dirty .
*/
int test_clear_page_dirty ( struct page * page )
{
struct address_space * mapping = page_mapping ( page ) ;
unsigned long flags ;
if ( mapping ) {
write_lock_irqsave ( & mapping - > tree_lock , flags ) ;
if ( TestClearPageDirty ( page ) ) {
radix_tree_tag_clear ( & mapping - > page_tree ,
page_index ( page ) ,
PAGECACHE_TAG_DIRTY ) ;
2006-06-30 12:55:39 +04:00
write_unlock_irqrestore ( & mapping - > tree_lock , flags ) ;
2006-09-26 10:30:57 +04:00
/*
* We can continue to use ` mapping ' here because the
* page is locked , which pins the address_space
*/
if ( mapping_cap_account_dirty ( mapping ) ) {
page_mkclean ( page ) ;
dec_zone_page_state ( page , NR_FILE_DIRTY ) ;
}
2005-04-17 02:20:36 +04:00
return 1 ;
}
write_unlock_irqrestore ( & mapping - > tree_lock , flags ) ;
return 0 ;
}
return TestClearPageDirty ( page ) ;
}
EXPORT_SYMBOL ( test_clear_page_dirty ) ;
/*
* Clear a page ' s dirty flag , while caring for dirty memory accounting .
* Returns true if the page was previously dirty .
*
* This is for preparing to put the page under writeout . We leave the page
* tagged as dirty in the radix tree so that a concurrent write - for - sync
* can discover it via a PAGECACHE_TAG_DIRTY walk . The - > writepage
* implementation will run either set_page_writeback ( ) or set_page_dirty ( ) ,
* at which stage we bring the page ' s dirty flag and radix - tree dirty tag
* back into sync .
*
* This incoherency between the page ' s dirty flag and radix - tree tag is
* unfortunate , but it only exists while the page is locked .
*/
int clear_page_dirty_for_io ( struct page * page )
{
struct address_space * mapping = page_mapping ( page ) ;
if ( mapping ) {
if ( TestClearPageDirty ( page ) ) {
2006-09-26 10:30:57 +04:00
if ( mapping_cap_account_dirty ( mapping ) ) {
page_mkclean ( page ) ;
2006-06-30 12:55:39 +04:00
dec_zone_page_state ( page , NR_FILE_DIRTY ) ;
2006-09-26 10:30:57 +04:00
}
2005-04-17 02:20:36 +04:00
return 1 ;
}
return 0 ;
}
return TestClearPageDirty ( page ) ;
}
2005-11-18 12:10:53 +03:00
EXPORT_SYMBOL ( clear_page_dirty_for_io ) ;
2005-04-17 02:20:36 +04:00
int test_clear_page_writeback ( struct page * page )
{
struct address_space * mapping = page_mapping ( page ) ;
int ret ;
if ( mapping ) {
unsigned long flags ;
write_lock_irqsave ( & mapping - > tree_lock , flags ) ;
ret = TestClearPageWriteback ( page ) ;
if ( ret )
radix_tree_tag_clear ( & mapping - > page_tree ,
page_index ( page ) ,
PAGECACHE_TAG_WRITEBACK ) ;
write_unlock_irqrestore ( & mapping - > tree_lock , flags ) ;
} else {
ret = TestClearPageWriteback ( page ) ;
}
return ret ;
}
int test_set_page_writeback ( struct page * page )
{
struct address_space * mapping = page_mapping ( page ) ;
int ret ;
if ( mapping ) {
unsigned long flags ;
write_lock_irqsave ( & mapping - > tree_lock , flags ) ;
ret = TestSetPageWriteback ( page ) ;
if ( ! ret )
radix_tree_tag_set ( & mapping - > page_tree ,
page_index ( page ) ,
PAGECACHE_TAG_WRITEBACK ) ;
if ( ! PageDirty ( page ) )
radix_tree_tag_clear ( & mapping - > page_tree ,
page_index ( page ) ,
PAGECACHE_TAG_DIRTY ) ;
write_unlock_irqrestore ( & mapping - > tree_lock , flags ) ;
} else {
ret = TestSetPageWriteback ( page ) ;
}
return ret ;
}
EXPORT_SYMBOL ( test_set_page_writeback ) ;
2006-08-23 04:06:24 +04:00
/*
 * Wake up tasks that are being throttled due to writeback congestion.
 * This simply forwards to blk_congestion_end() for the WRITE direction.
 */
void writeback_congestion_end(void)
{
	blk_congestion_end(WRITE);
}
EXPORT_SYMBOL(writeback_congestion_end);
2005-04-17 02:20:36 +04:00
/*
* Return true if any of the pages in the mapping are marged with the
* passed tag .
*/
int mapping_tagged ( struct address_space * mapping , int tag )
{
unsigned long flags ;
int ret ;
read_lock_irqsave ( & mapping - > tree_lock , flags ) ;
ret = radix_tree_tagged ( & mapping - > page_tree , tag ) ;
read_unlock_irqrestore ( & mapping - > tree_lock , flags ) ;
return ret ;
}
EXPORT_SYMBOL ( mapping_tagged ) ;