/*
 * background writeback - scan btree for dirty data and write it to the backing
 * device
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "writeback.h"

#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/kthread.h>

#include <trace/events/bcache.h>

/* Rate limiting */
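
/*
 * The writeback rate is driven by a PD (proportional-derivative) controller:
 * the proportional term is the difference between the current number of dirty
 * sectors and the target (writeback_percent of the cache, scaled by this
 * backing device's share of it), and the derivative term is an EWMA of how
 * fast that difference is changing.  Their sum is applied as a delta to the
 * current writeback rate, clamped to at least 1.
 */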
static void __update_writeback_rate(struct cached_dev *dc)
{
	struct cache_set *c = dc->disk.c;
	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
	uint64_t cache_dirty_target =
		div_u64(cache_sectors * dc->writeback_percent, 100);

	int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
				   c->cached_dev_sectors);

	/* PD controller */

	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
	int64_t derivative = dirty - dc->disk.sectors_dirty_last;
	int64_t proportional = dirty - target;
	int64_t change;

	dc->disk.sectors_dirty_last = dirty;

	/* Scale to sectors per second */

	proportional *= dc->writeback_rate_update_seconds;
	proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse);

	derivative = div_s64(derivative, dc->writeback_rate_update_seconds);

	derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
			      (dc->writeback_rate_d_term /
			       dc->writeback_rate_update_seconds) ?: 1, 0);

	derivative *= dc->writeback_rate_d_term;
	derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse);

	change = proportional + derivative;

	/* Don't increase writeback rate if the device isn't keeping up */
	if (change > 0 &&
	    time_after64(local_clock(),
			 dc->writeback_rate.next + NSEC_PER_MSEC))
		change = 0;

	dc->writeback_rate.rate =
		clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
			1, NSEC_PER_MSEC);

	dc->writeback_rate_proportional = proportional;
	dc->writeback_rate_derivative = derivative;
	dc->writeback_rate_change = change;
	dc->writeback_rate_target = target;
}

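/*
 * Delayed-work callback: periodically recompute the writeback rate while the
 * device has dirty data and writeback_percent is nonzero, then reschedule
 * itself.
 */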
static void update_writeback_rate(struct work_struct *work)
{
	struct cached_dev *dc = container_of(to_delayed_work(work),
					     struct cached_dev,
					     writeback_rate_update);

	down_read(&dc->writeback_lock);

	if (atomic_read(&dc->has_dirty) &&
	    dc->writeback_percent)
		__update_writeback_rate(dc);

	up_read(&dc->writeback_lock);

	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);
}

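/*
 * How long to wait before issuing the next writeback bio for @sectors
 * sectors; returns 0 (no throttling) when the device is being detached or
 * writeback_percent is 0.
 */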
static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
{
	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
	    !dc->writeback_percent)
		return 0;

	return bch_next_delay(&dc->writeback_rate, sectors);
}

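/*
 * State for a single in-flight writeback: the closure tracks the
 * read-from-cache -> write-to-backing -> update-btree sequence, and the bio
 * (with inline bvecs) is reused for both the read and the write.
 */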
struct dirty_io {
	struct closure		cl;
	struct cached_dev	*dc;
	struct bio		bio;
};

static void dirty_init(struct keybuf_key *w)
{
	struct dirty_io *io = w->private;
	struct bio *bio = &io->bio;

	bio_init(bio);
	if (!io->dc->writeback_percent)
		bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	bio->bi_iter.bi_size	= KEY_SIZE(&w->key) << 9;
	bio->bi_max_vecs	= DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS);
	bio->bi_private		= w;
	bio->bi_io_vec		= bio->bi_inline_vecs;
	bch_bio_map(bio, NULL);
}

static void dirty_io_destructor(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	kfree(io);
}

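/*
 * Last stage of the pipeline: free the data pages, then clear the dirty bit
 * in the btree by inserting a clean copy of the key.  If the insert fails
 * (presumably because the extent was overwritten while the writeback was in
 * flight) it is counted as a writeback collision.
 */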
static void write_dirty_finish(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;
	struct cached_dev *dc = io->dc;
	struct bio_vec *bv;
	int i;

	bio_for_each_segment_all(bv, &io->bio, i)
		__free_page(bv->bv_page);

	/* This is kind of a dumb way of signalling errors. */
	if (KEY_DIRTY(&w->key)) {
		int ret;
		unsigned i;
		struct keylist keys;

		bch_keylist_init(&keys);

		bkey_copy(keys.top, &w->key);
		SET_KEY_DIRTY(keys.top, false);
		bch_keylist_push(&keys);

		for (i = 0; i < KEY_PTRS(&w->key); i++)
			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);

		ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key);

		if (ret)
			trace_bcache_writeback_collision(&w->key);

		atomic_long_inc(ret
				? &dc->disk.c->writeback_keys_failed
				: &dc->disk.c->writeback_keys_done);
	}

	bch_keybuf_del(&dc->writeback_keys, w);
	up(&dc->in_flight);

	closure_return_with_destructor(cl, dirty_io_destructor);
}

static void dirty_endio(struct bio *bio, int error)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	if (error)
		SET_KEY_DIRTY(&w->key, false);

	closure_put(&io->cl);
}

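/*
 * Second stage: the dirty data has been read from the cache; reinitialize the
 * bio and write it out to the backing device at the extent's start offset,
 * then continue on to write_dirty_finish().
 */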
static void write_dirty(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;

	dirty_init(w);
	io->bio.bi_rw		= WRITE;
	io->bio.bi_iter.bi_sector = KEY_START(&w->key);
	io->bio.bi_bdev		= io->dc->bdev;
	io->bio.bi_end_io	= dirty_endio;

	closure_bio_submit(&io->bio, cl, &io->dc->disk);

	continue_at(cl, write_dirty_finish, system_wq);
}

static void read_dirty_endio(struct bio *bio, int error)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
			    error, "reading dirty data from cache");

	dirty_endio(bio, error);
}

static void read_dirty_submit(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);

	closure_bio_submit(&io->bio, cl, &io->dc->disk);

	continue_at(cl, write_dirty, system_wq);
}

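/*
 * Drive the writeback pipeline: for each key in the refilled writeback_keys,
 * read the dirty extent from the cache device and hand it off (via a closure)
 * to write_dirty().  The issue rate is throttled by writeback_delay() and the
 * number of IOs in flight is bounded by dc->in_flight; the final
 * closure_sync() waits for everything outstanding before the caller refills
 * again.
 */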
static void read_dirty(struct cached_dev *dc)
{
	unsigned delay = 0;
	struct keybuf_key *w;
	struct dirty_io *io;
	struct closure cl;

	closure_init_stack(&cl);

	/*
	 * XXX: if we error, background writeback just spins. Should use some
	 * mempools.
	 */

	while (!kthread_should_stop()) {
		try_to_freeze();

		w = bch_keybuf_next(&dc->writeback_keys);
		if (!w)
			break;

		BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));

		if (KEY_START(&w->key) != dc->last_read ||
		    jiffies_to_msecs(delay) > 50)
			while (!kthread_should_stop() && delay)
				delay = schedule_timeout_uninterruptible(delay);

		dc->last_read	= KEY_OFFSET(&w->key);

		io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
			     GFP_KERNEL);
		if (!io)
			goto err;

		w->private	= io;
		io->dc		= dc;

		dirty_init(w);
		io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
		io->bio.bi_bdev		= PTR_CACHE(dc->disk.c,
						    &w->key, 0)->bdev;
		io->bio.bi_rw		= READ;
		io->bio.bi_end_io	= read_dirty_endio;

		if (bio_alloc_pages(&io->bio, GFP_KERNEL))
			goto err_free;

		trace_bcache_writeback(&w->key);

		down(&dc->in_flight);
		closure_call(&io->cl, read_dirty_submit, NULL, &cl);

		delay = writeback_delay(dc, KEY_SIZE(&w->key));
	}

	if (0) {
err_free:
		kfree(w->private);
err:
		bch_keybuf_del(&dc->writeback_keys, w);
	}

	/*
	 * Wait for outstanding writeback IOs to finish (and keybuf slots to be
	 * freed) before refilling again
	 */

	closure_sync(&cl);
}

/* Scan for dirty data */
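
/*
 * Update the per-stripe dirty sector counters for a backing device and keep
 * the full_dirty_stripes bitmap (used by refill_full_stripes()) in sync.
 * nr_sectors may be negative when sectors are being marked clean.
 */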
void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
				  uint64_t offset, int nr_sectors)
{
	struct bcache_device *d = c->devices[inode];
	unsigned stripe_offset, stripe, sectors_dirty;

	if (!d)
		return;

	stripe = offset_to_stripe(d, offset);
	stripe_offset = offset & (d->stripe_size - 1);

	while (nr_sectors) {
		int s = min_t(unsigned, abs(nr_sectors),
			      d->stripe_size - stripe_offset);

		if (nr_sectors < 0)
			s = -s;

		if (stripe >= d->nr_stripes)
			return;

		sectors_dirty = atomic_add_return(s,
					d->stripe_sectors_dirty + stripe);
		if (sectors_dirty == d->stripe_size)
			set_bit(stripe, d->full_dirty_stripes);
		else
			clear_bit(stripe, d->full_dirty_stripes);

		nr_sectors -= s;
		stripe_offset = 0;
		stripe++;
	}
}

static bool dirty_pred(struct keybuf *buf, struct bkey *k)
{
	return KEY_DIRTY(k);
}

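/*
 * When partial stripe writes are expensive for the backing device (e.g. a
 * parity RAID array), prefer writing back stripes that are completely dirty:
 * scan the full_dirty_stripes bitmap starting at last_scanned, wrapping
 * around at most once, and refill the keybuf only from those ranges.
 */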
static void refill_full_stripes(struct cached_dev *dc)
{
	struct keybuf *buf = &dc->writeback_keys;
	unsigned start_stripe, stripe, next_stripe;
	bool wrapped = false;

	stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));

	if (stripe >= dc->disk.nr_stripes)
		stripe = 0;

	start_stripe = stripe;

	while (1) {
		stripe = find_next_bit(dc->disk.full_dirty_stripes,
				       dc->disk.nr_stripes, stripe);

		if (stripe == dc->disk.nr_stripes)
			goto next;

		next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
						 dc->disk.nr_stripes, stripe);

		buf->last_scanned = KEY(dc->disk.id,
					stripe * dc->disk.stripe_size, 0);

		bch_refill_keybuf(dc->disk.c, buf,
				  &KEY(dc->disk.id,
				       next_stripe * dc->disk.stripe_size, 0),
				  dirty_pred);

		if (array_freelist_empty(&buf->freelist))
			return;

		stripe = next_stripe;
next:
		if (wrapped && stripe > start_stripe)
			return;

		if (stripe == dc->disk.nr_stripes) {
			stripe = 0;
			wrapped = true;
		}
	}
}

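/*
 * Refill the writeback keybuf with dirty keys for this device.  Returns true
 * if the scan started from offset 0 and reached the end of the device's key
 * range, i.e. the whole index has been searched.
 */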
static bool refill_dirty(struct cached_dev *dc)
{
	struct keybuf *buf = &dc->writeback_keys;
	struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
	bool searched_from_start = false;

	if (dc->partial_stripes_expensive) {
		refill_full_stripes(dc);
		if (array_freelist_empty(&buf->freelist))
			return false;
	}

	if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
		buf->last_scanned = KEY(dc->disk.id, 0, 0);
		searched_from_start = true;
	}

	bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);

	return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start;
}

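/*
 * Per-device writeback thread: sleep while there is no dirty data (or
 * writeback is disabled and we aren't detaching); otherwise refill the keybuf
 * and write dirty extents back via read_dirty().  When a full index scan
 * leaves no dirty keys behind, the backing device superblock is marked clean.
 */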
static int bch_writeback_thread(void *arg)
{
	struct cached_dev *dc = arg;
	bool searched_full_index;

	while (!kthread_should_stop()) {
		down_write(&dc->writeback_lock);
		if (!atomic_read(&dc->has_dirty) ||
		    (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
		     !dc->writeback_running)) {
			up_write(&dc->writeback_lock);
			set_current_state(TASK_INTERRUPTIBLE);

			if (kthread_should_stop())
				return 0;

			try_to_freeze();
			schedule();
			continue;
		}

		searched_full_index = refill_dirty(dc);

		if (searched_full_index &&
		    RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
			atomic_set(&dc->has_dirty, 0);
			cached_dev_put(dc);
			SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
			bch_write_bdev_super(dc, NULL);
		}

		up_write(&dc->writeback_lock);

		bch_ratelimit_reset(&dc->writeback_rate);
		read_dirty(dc);

		if (searched_full_index) {
			unsigned delay = dc->writeback_delay * HZ;

			while (delay &&
			       !kthread_should_stop() &&
			       !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
				delay = schedule_timeout_uninterruptible(delay);
		}
	}

	return 0;
}

/* Init */

struct sectors_dirty_init {
	struct btree_op	op;
	unsigned	inode;
};

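/*
 * Walk the btree keys belonging to one backing device and rebuild its
 * in-memory dirty sector counts from the KEY_DIRTY extents found there.
 */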
static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
				 struct bkey *k)
{
	struct sectors_dirty_init *op = container_of(_op,
						struct sectors_dirty_init, op);
	if (KEY_INODE(k) > op->inode)
		return MAP_DONE;

	if (KEY_DIRTY(k))
		bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
					     KEY_START(k), KEY_SIZE(k));

	return MAP_CONTINUE;
}

void bch_sectors_dirty_init(struct cached_dev *dc)
{
	struct sectors_dirty_init op;

	bch_btree_op_init(&op.op, -1);
	op.inode = dc->disk.id;

	bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
			   sectors_dirty_init_fn, 0);

	dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
}

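/*
 * Set up writeback state for a cached device: in_flight caps concurrent
 * writeback IOs at 64, the PD controller gets its default terms, the
 * writeback thread is created, and the periodic rate-update work is
 * scheduled.
 */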
int bch_cached_dev_writeback_init(struct cached_dev *dc)
{
	sema_init(&dc->in_flight, 64);
	init_rwsem(&dc->writeback_lock);
	bch_keybuf_init(&dc->writeback_keys);

	dc->writeback_metadata		= true;
	dc->writeback_running		= true;
	dc->writeback_percent		= 10;
	dc->writeback_delay		= 30;
	dc->writeback_rate.rate		= 1024;

	dc->writeback_rate_update_seconds = 5;
	dc->writeback_rate_d_term	= 30;
	dc->writeback_rate_p_term_inverse = 6000;

	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
					      "bcache_writeback");
	if (IS_ERR(dc->writeback_thread))
		return PTR_ERR(dc->writeback_thread);

	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);

	return 0;
}