/*
 * bcache journalling code, for btree insertions
 *
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"

#include <trace/events/bcache.h>

/*
 * Journal replay/recovery:
 *
 * This code is all driven from run_cache_set(); we first read the journal
 * entries, do some other stuff, then we mark all the keys in the journal
 * entries (same as garbage collection would), then we replay them - reinserting
 * them into the cache in precisely the same order as they appear in the
 * journal.
 *
 * We only journal keys that go in leaf nodes, which simplifies things quite a
 * bit.
 */

static void journal_read_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;

	closure_put(cl);
}

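/*
 * Read one journal bucket from @ca into @list: the bucket is read in
 * PAGE_SECTORS << JSET_BITS chunks, each jset's magic, size and checksum are
 * validated, and valid entries are spliced into @list in sequence order
 * (entries older than the newest last_seq seen are dropped from the head).
 * Returns 1 if at least one entry was added, 0 if none, or a negative errno
 * on allocation failure.
 */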
static int journal_read_bucket(struct cache *ca, struct list_head *list,
			       unsigned bucket_index)
{
	struct journal_device *ja = &ca->journal;
	struct bio *bio = &ja->bio;

	struct journal_replay *i;
	struct jset *j, *data = ca->set->journal.w[0].data;
	struct closure cl;
	unsigned len, left, offset = 0;
	int ret = 0;
	sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);

	closure_init_stack(&cl);

	pr_debug("reading %u", bucket_index);

	while (offset < ca->sb.bucket_size) {
reread:		left = ca->sb.bucket_size - offset;
		len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS);

		bio_reset(bio);
		bio->bi_iter.bi_sector	= bucket + offset;
		bio->bi_bdev		= ca->bdev;
		bio->bi_rw		= READ;
		bio->bi_iter.bi_size	= len << 9;

		bio->bi_end_io		= journal_read_endio;
		bio->bi_private		= &cl;
		bch_bio_map(bio, data);

		closure_bio_submit(bio, &cl, ca);
		closure_sync(&cl);

		/* This function could be simpler now since we no longer write
		 * journal entries that overlap bucket boundaries; this means
		 * the start of a bucket will always have a valid journal entry
		 * if it has any journal entries at all.
		 */

		j = data;
		while (len) {
			struct list_head *where;
			size_t blocks, bytes = set_bytes(j);

			if (j->magic != jset_magic(&ca->sb)) {
				pr_debug("%u: bad magic", bucket_index);
				return ret;
			}

			if (bytes > left << 9 ||
			    bytes > PAGE_SIZE << JSET_BITS) {
				pr_info("%u: too big, %zu bytes, offset %u",
					bucket_index, bytes, offset);
				return ret;
			}

			if (bytes > len << 9)
				goto reread;

			if (j->csum != csum_set(j)) {
				pr_info("%u: bad csum, %zu bytes, offset %u",
					bucket_index, bytes, offset);
				return ret;
			}

			blocks = set_blocks(j, block_bytes(ca->set));

			while (!list_empty(list)) {
				i = list_first_entry(list,
					struct journal_replay, list);
				if (i->j.seq >= j->last_seq)
					break;
				list_del(&i->list);
				kfree(i);
			}

			list_for_each_entry_reverse(i, list, list) {
				if (j->seq == i->j.seq)
					goto next_set;

				if (j->seq < i->j.last_seq)
					goto next_set;

				if (j->seq > i->j.seq) {
					where = &i->list;
					goto add;
				}
			}

			where = list;
add:
			i = kmalloc(offsetof(struct journal_replay, j) +
				    bytes, GFP_KERNEL);
			if (!i)
				return -ENOMEM;
			memcpy(&i->j, j, bytes);
			list_add(&i->list, where);
			ret = 1;

			ja->seq[bucket_index] = j->seq;
next_set:
			offset	+= blocks * ca->sb.block_size;
			len	-= blocks * ca->sb.block_size;
			j = ((void *) j) + blocks * block_bytes(ca);
		}
	}

	return ret;
}

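/*
 * For each cache device, locate the journal buckets containing valid
 * entries: probe buckets in golden-ratio-hash order, fall back to a linear
 * scan, then binary search and a reverse sweep to pick up every bucket that
 * holds entries, recording the newest sequence number per bucket so that
 * journal_reclaim() knows which bucket to allocate next.
 */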
int bch_journal_read(struct cache_set *c, struct list_head *list)
{
#define read_bucket(b)							\
	({								\
		int ret = journal_read_bucket(ca, list, b);		\
		__set_bit(b, bitmap);					\
		if (ret < 0)						\
			return ret;					\
		ret;							\
	})

	struct cache *ca;
	unsigned iter;

	for_each_cache(ca, c, iter) {
		struct journal_device *ja = &ca->journal;
		unsigned long bitmap[SB_JOURNAL_BUCKETS / BITS_PER_LONG];
		unsigned i, l, r, m;
		uint64_t seq;

		bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
		pr_debug("%u journal buckets", ca->sb.njournal_buckets);

		/*
		 * Read journal buckets ordered by golden ratio hash to quickly
		 * find a sequence of buckets with valid journal entries
		 */
		for (i = 0; i < ca->sb.njournal_buckets; i++) {
			l = (i * 2654435769U) % ca->sb.njournal_buckets;

			if (test_bit(l, bitmap))
				break;

			if (read_bucket(l))
				goto bsearch;
		}

		/*
		 * If that fails, check all the buckets we haven't checked
		 * already
		 */
		pr_debug("falling back to linear search");

		for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets);
		     l < ca->sb.njournal_buckets;
		     l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, l + 1))
			if (read_bucket(l))
				goto bsearch;

		/* no journal entries on this device? */
		if (l == ca->sb.njournal_buckets)
			continue;
bsearch:
		BUG_ON(list_empty(list));

		/* Binary search */
		m = l;
		r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
		pr_debug("starting binary search, l %u r %u", l, r);

		while (l + 1 < r) {
			seq = list_entry(list->prev, struct journal_replay,
					 list)->j.seq;

			m = (l + r) >> 1;
			read_bucket(m);

			if (seq != list_entry(list->prev, struct journal_replay,
					      list)->j.seq)
				l = m;
			else
				r = m;
		}

		/*
		 * Read buckets in reverse order until we stop finding more
		 * journal entries
		 */
		pr_debug("finishing up: m %u njournal_buckets %u",
			 m, ca->sb.njournal_buckets);
		l = m;

		while (1) {
			if (!l--)
				l = ca->sb.njournal_buckets - 1;

			if (l == m)
				break;

			if (test_bit(l, bitmap))
				continue;

			if (!read_bucket(l))
				break;
		}

		seq = 0;

		for (i = 0; i < ca->sb.njournal_buckets; i++)
			if (ja->seq[i] > seq) {
				seq = ja->seq[i];
				/*
				 * When journal_reclaim() goes to allocate for
				 * the first time, it'll use the bucket after
				 * ja->cur_idx
				 */
				ja->cur_idx = i;
				ja->last_idx = ja->discard_idx = (i + 1) %
					ca->sb.njournal_buckets;

			}
	}

	if (!list_empty(list))
		c->journal.seq = list_entry(list->prev,
					    struct journal_replay,
					    list)->j.seq;

	return 0;
#undef read_bucket
}

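/*
 * Re-establish journal pins and bucket pin counts for the entries about to
 * be replayed: push a pin refcount onto journal.pin for each sequence
 * number, and for every valid extent key bump the pin on the buckets it
 * points to and mark the key as the initial GC pass would.
 */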
void bch_journal_mark(struct cache_set *c, struct list_head *list)
{
	atomic_t p = { 0 };
	struct bkey *k;
	struct journal_replay *i;
	struct journal *j = &c->journal;
	uint64_t last = j->seq;

	/*
	 * journal.pin should never fill up - we never write a journal
	 * entry when it would fill up. But if for some reason it does, we
	 * iterate over the list in reverse order so that we can just skip that
	 * refcount instead of bugging.
	 */

	list_for_each_entry_reverse(i, list, list) {
		BUG_ON(last < i->j.seq);
		i->pin = NULL;

		while (last-- != i->j.seq)
			if (fifo_free(&j->pin) > 1) {
				fifo_push_front(&j->pin, p);
				atomic_set(&fifo_front(&j->pin), 0);
			}

		if (fifo_free(&j->pin) > 1) {
			fifo_push_front(&j->pin, p);
			i->pin = &fifo_front(&j->pin);
			atomic_set(i->pin, 1);
		}

		for (k = i->j.start;
		     k < bset_bkey_last(&i->j);
		     k = bkey_next(k))
			if (!__bch_extent_invalid(c, k)) {
				unsigned j;

				for (j = 0; j < KEY_PTRS(k); j++)
					if (ptr_available(c, k, j))
						atomic_inc(&PTR_BUCKET(c, k, j)->pin);

				bch_initial_mark_key(c, 0, k);
			}
	}
}

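/*
 * Reinsert the journalled keys into the btree in the order they appear in
 * the journal: each key is wrapped in a single-key keylist and passed to
 * bch_btree_insert() along with the entry's journal pin. The replay list is
 * freed before returning.
 */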
int bch_journal_replay(struct cache_set *s, struct list_head *list)
{
	int ret = 0, keys = 0, entries = 0;
	struct bkey *k;
	struct journal_replay *i =
		list_entry(list->prev, struct journal_replay, list);

	uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
	struct keylist keylist;

	list_for_each_entry(i, list, list) {
		BUG_ON(i->pin && atomic_read(i->pin) != 1);

		cache_set_err_on(n != i->j.seq, s,
"bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)",
				 n, i->j.seq - 1, start, end);

		for (k = i->j.start;
		     k < bset_bkey_last(&i->j);
		     k = bkey_next(k)) {
			trace_bcache_journal_replay_key(k);

			bch_keylist_init_single(&keylist, k);

			ret = bch_btree_insert(s, &keylist, i->pin, NULL);
			if (ret)
				goto err;

			BUG_ON(!bch_keylist_empty(&keylist));
			keys++;

			cond_resched();
		}

		if (i->pin)
			atomic_dec(i->pin);
		n = i->j.seq + 1;
		entries++;
	}

	pr_info("journal replay done, %i keys in %i entries, seq %llu",
		keys, entries, end);
err:
	while (!list_empty(list)) {
		i = list_first_entry(list, struct journal_replay, list);
		list_del(&i->list);
		kfree(i);
	}

	return ret;
}

/* Journalling */

static void btree_flush_write(struct cache_set *c)
{
	/*
	 * Try to find the btree node that references the oldest journal
	 * entry; best is our current candidate and is locked if non-NULL:
	 */
	struct btree *b, *best;
	unsigned i;
retry:
	best = NULL;

	for_each_cached_btree(b, c, i)
		if (btree_current_write(b)->journal) {
			if (!best)
				best = b;
			else if (journal_pin_cmp(c,
					btree_current_write(best)->journal,
					btree_current_write(b)->journal)) {
				best = b;
			}
		}

	b = best;
	if (b) {
		mutex_lock(&b->write_lock);
		if (!btree_current_write(b)->journal) {
			mutex_unlock(&b->write_lock);
			/* We raced */
			goto retry;
		}

		__bch_btree_node_write(b, NULL);
		mutex_unlock(&b->write_lock);
	}
}

#define last_seq(j)	((j)->seq - fifo_used(&(j)->pin) + 1)

static void journal_discard_endio(struct bio *bio, int error)
{
	struct journal_device *ja =
		container_of(bio, struct journal_device, discard_bio);
	struct cache *ca = container_of(ja, struct cache, journal);

	atomic_set(&ja->discard_in_flight, DISCARD_DONE);

	closure_wake_up(&ca->set->journal.wait);
	closure_put(&ca->set->cl);
}

static void journal_discard_work(struct work_struct *work)
{
	struct journal_device *ja =
		container_of(work, struct journal_device, discard_work);

	submit_bio(0, &ja->discard_bio);
}

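/*
 * Advance the discard state machine for @ca: if discards are disabled just
 * advance discard_idx, otherwise issue (via the discard workqueue) a discard
 * for the next reclaimed journal bucket, tracking progress through
 * discard_in_flight.
 */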
static void do_journal_discard(struct cache *ca)
{
	struct journal_device *ja = &ca->journal;
	struct bio *bio = &ja->discard_bio;

	if (!ca->discard) {
		ja->discard_idx = ja->last_idx;
		return;
	}

	switch (atomic_read(&ja->discard_in_flight)) {
	case DISCARD_IN_FLIGHT:
		return;

	case DISCARD_DONE:
		ja->discard_idx = (ja->discard_idx + 1) %
			ca->sb.njournal_buckets;

		atomic_set(&ja->discard_in_flight, DISCARD_READY);
		/* fallthrough */

	case DISCARD_READY:
		if (ja->discard_idx == ja->last_idx)
			return;

		atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);

		bio_init(bio);
		bio->bi_iter.bi_sector	= bucket_to_sector(ca->set,
						ca->sb.d[ja->discard_idx]);
		bio->bi_bdev		= ca->bdev;
		bio->bi_rw		= REQ_WRITE | REQ_DISCARD;
		bio->bi_max_vecs	= 1;
		bio->bi_io_vec		= bio->bi_inline_vecs;
		bio->bi_iter.bi_size	= bucket_bytes(ca);
		bio->bi_end_io		= journal_discard_endio;

		closure_get(&ca->set->cl);
		INIT_WORK(&ja->discard_work, journal_discard_work);
		schedule_work(&ja->discard_work);
	}
}

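/*
 * Free up journal space: pop fully released pins off journal.pin, advance
 * each device's last_idx past buckets whose entries are no longer needed,
 * kick off discards, and if the current journal bucket is exhausted pick the
 * next bucket on each device for c->journal.key.
 */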
static void journal_reclaim(struct cache_set *c)
{
	struct bkey *k = &c->journal.key;
	struct cache *ca;
	uint64_t last_seq;
	unsigned iter, n = 0;
	atomic_t p;

	while (!atomic_read(&fifo_front(&c->journal.pin)))
		fifo_pop(&c->journal.pin, p);

	last_seq = last_seq(&c->journal);

	/* Update last_idx */

	for_each_cache(ca, c, iter) {
		struct journal_device *ja = &ca->journal;

		while (ja->last_idx != ja->cur_idx &&
		       ja->seq[ja->last_idx] < last_seq)
			ja->last_idx = (ja->last_idx + 1) %
				ca->sb.njournal_buckets;
	}

	for_each_cache(ca, c, iter)
		do_journal_discard(ca);

	if (c->journal.blocks_free)
		goto out;

	/*
	 * Allocate:
	 * XXX: Sort by free journal space
	 */

	for_each_cache(ca, c, iter) {
		struct journal_device *ja = &ca->journal;
		unsigned next = (ja->cur_idx + 1) % ca->sb.njournal_buckets;

		/* No space available on this device */
		if (next == ja->discard_idx)
			continue;

		ja->cur_idx = next;
		k->ptr[n++] = PTR(0,
				  bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
				  ca->sb.nr_this_dev);
	}

	bkey_init(k);
	SET_KEY_PTRS(k, n);

	if (n)
		c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
out:
	if (!journal_full(&c->journal))
		__closure_wake_up(&c->journal.wait);
}

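/*
 * Switch to the other in-memory journal write buffer and open a new journal
 * entry: push a fresh pin, bump the sequence number and reset the new
 * buffer's state.
 */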
void bch_journal_next(struct journal *j)
{
	atomic_t p = { 1 };

	j->cur = (j->cur == j->w)
		? &j->w[1]
		: &j->w[0];

	/*
	 * The fifo_push() needs to happen at the same time as j->seq is
	 * incremented for last_seq() to be calculated correctly
	 */
	BUG_ON(!fifo_push(&j->pin, p));
	atomic_set(&fifo_back(&j->pin), 1);

	j->cur->data->seq	= ++j->seq;
	j->cur->dirty		= false;
	j->cur->need_write	= false;
	j->cur->data->keys	= 0;

	if (fifo_full(&j->pin))
		pr_debug("journal_pin full (%zu)", fifo_used(&j->pin));
}

static void journal_write_endio(struct bio *bio, int error)
{
	struct journal_write *w = bio->bi_private;

	cache_set_err_on(error, w->c, "journal io error");
	closure_put(&w->c->journal.io);
}

static void journal_write(struct closure *);

static void journal_write_done(struct closure *cl)
{
	struct journal *j = container_of(cl, struct journal, io);
	struct journal_write *w = (j->cur == j->w)
		? &j->w[1]
		: &j->w[0];

	__closure_wake_up(&w->wait);
	continue_at_nobarrier(cl, journal_write, system_wq);
}

static void journal_write_unlock(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, journal.io);

	c->journal.io_in_flight = 0;
	spin_unlock(&c->journal.lock);
}

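/*
 * Write out the current journal entry. Called with journal.lock held; the
 * lock is dropped before the bios are submitted. If the journal is full this
 * instead reclaims space, flushes a btree node and retries via
 * journal_write().
 */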
static void journal_write_unlocked(struct closure *cl)
	__releases(c->journal.lock)
{
	struct cache_set *c = container_of(cl, struct cache_set, journal.io);
	struct cache *ca;
	struct journal_write *w = c->journal.cur;
	struct bkey *k = &c->journal.key;
	unsigned i, sectors = set_blocks(w->data, block_bytes(c)) *
		c->sb.block_size;

	struct bio *bio;
	struct bio_list list;
	bio_list_init(&list);

	if (!w->need_write) {
		closure_return_with_destructor(cl, journal_write_unlock);
	} else if (journal_full(&c->journal)) {
		journal_reclaim(c);
		spin_unlock(&c->journal.lock);

		btree_flush_write(c);
		continue_at(cl, journal_write, system_wq);
	}

	c->journal.blocks_free -= set_blocks(w->data, block_bytes(c));

	w->data->btree_level = c->root->level;

	bkey_copy(&w->data->btree_root, &c->root->key);
	bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);

	for_each_cache(ca, c, i)
		w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];

	w->data->magic		= jset_magic(&c->sb);
	w->data->version	= BCACHE_JSET_VERSION;
	w->data->last_seq	= last_seq(&c->journal);
	w->data->csum		= csum_set(w->data);

	for (i = 0; i < KEY_PTRS(k); i++) {
		ca = PTR_CACHE(c, k, i);
		bio = &ca->journal.bio;

		atomic_long_add(sectors, &ca->meta_sectors_written);

		bio_reset(bio);
		bio->bi_iter.bi_sector	= PTR_OFFSET(k, i);
		bio->bi_bdev		= ca->bdev;
		bio->bi_rw		= REQ_WRITE | REQ_SYNC | REQ_META | REQ_FLUSH | REQ_FUA;
		bio->bi_iter.bi_size	= sectors << 9;

		bio->bi_end_io		= journal_write_endio;
		bio->bi_private		= w;
		bch_bio_map(bio, w->data);

		trace_bcache_journal_write(bio);
		bio_list_add(&list, bio);

		SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);

		ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
	}

	atomic_dec_bug(&fifo_back(&c->journal.pin));
	bch_journal_next(&c->journal);
	journal_reclaim(c);

	spin_unlock(&c->journal.lock);

	while ((bio = bio_list_pop(&list)))
		closure_bio_submit(bio, cl, c->cache[0]);

	continue_at(cl, journal_write_done, NULL);
}

static void journal_write(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, journal.io);

	spin_lock(&c->journal.lock);
	journal_write_unlocked(cl);
}

static void journal_try_write(struct cache_set *c)
	__releases(c->journal.lock)
{
	struct closure *cl = &c->journal.io;
	struct journal_write *w = c->journal.cur;

	w->need_write = true;

	if (!c->journal.io_in_flight) {
		c->journal.io_in_flight = 1;
		closure_call(cl, journal_write_unlocked, NULL, &c->cl);
	} else {
		spin_unlock(&c->journal.lock);
	}
}

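/*
 * Return a journal_write with room for @nkeys more keys, waiting as needed:
 * either kick off the current (non-empty) entry, or reclaim journal space
 * and flush a btree node, then retry. Returns with journal.lock held.
 */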
static struct journal_write *journal_wait_for_write(struct cache_set *c,
						    unsigned nkeys)
{
	size_t sectors;
	struct closure cl;
	bool wait = false;

	closure_init_stack(&cl);

	spin_lock(&c->journal.lock);

	while (1) {
		struct journal_write *w = c->journal.cur;

		sectors = __set_blocks(w->data, w->data->keys + nkeys,
				       block_bytes(c)) * c->sb.block_size;

		if (sectors <= min_t(size_t,
				     c->journal.blocks_free * c->sb.block_size,
				     PAGE_SECTORS << JSET_BITS))
			return w;

		if (wait)
			closure_wait(&c->journal.wait, &cl);

		if (!journal_full(&c->journal)) {
			if (wait)
				trace_bcache_journal_entry_full(c);

			/*
			 * XXX: If we were inserting so many keys that they
			 * won't fit in an _empty_ journal write, we'll
			 * deadlock. For now, handle this in
			 * bch_keylist_realloc() - but something to think about.
			 */
			BUG_ON(!w->data->keys);

			journal_try_write(c); /* unlocks */
		} else {
			if (wait)
				trace_bcache_journal_full(c);

			journal_reclaim(c);
			spin_unlock(&c->journal.lock);

			btree_flush_write(c);
		}

		closure_sync(&cl);
		spin_lock(&c->journal.lock);
		wait = true;
	}
}

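/*
 * Delayed-work callback: write out the current journal entry if it still has
 * unwritten (dirty) keys.
 */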
static void journal_write_work(struct work_struct *work)
{
	struct cache_set *c = container_of(to_delayed_work(work),
					   struct cache_set,
					   journal.work);
	spin_lock(&c->journal.lock);
	if (c->journal.cur->dirty)
		journal_try_write(c);
	else
		spin_unlock(&c->journal.lock);
}

/*
 * Entry point to the journalling code - bio_insert() and btree_invalidate()
 * pass bch_journal() a list of keys to be journalled, and then bch_journal()
 * hands those same keys off to btree_insert_async()
 */

atomic_t *bch_journal(struct cache_set *c,
		      struct keylist *keys,
		      struct closure *parent)
{
	struct journal_write *w;
	atomic_t *ret;

	if (!CACHE_SYNC(&c->sb))
		return NULL;

	w = journal_wait_for_write(c, bch_keylist_nkeys(keys));

	memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys));
	w->data->keys += bch_keylist_nkeys(keys);

	ret = &fifo_back(&c->journal.pin);
	atomic_inc(ret);

	if (parent) {
		closure_wait(&w->wait, parent);
		journal_try_write(c);
	} else if (!w->dirty) {
		w->dirty = true;
		schedule_delayed_work(&c->journal.work,
				      msecs_to_jiffies(c->journal_delay_ms));
		spin_unlock(&c->journal.lock);
	} else {
		spin_unlock(&c->journal.lock);
	}

	return ret;
}

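/*
 * Journal an empty keylist: forces the current journal entry (which also
 * carries the btree root, uuid bucket and prio bucket pointers) to be
 * written; if @cl is non-NULL it waits on that write.
 */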
void bch_journal_meta(struct cache_set *c, struct closure *cl)
{
	struct keylist keys;
	atomic_t *ref;

	bch_keylist_init(&keys);

	ref = bch_journal(c, &keys, cl);
	if (ref)
		atomic_dec_bug(ref);
}

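/*
 * Free the journal's in-memory write buffers and pin fifo; teardown
 * counterpart to bch_journal_alloc().
 */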
void bch_journal_free(struct cache_set *c)
{
	free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
	free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
	free_fifo(&c->journal.pin);
}

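/*
 * Allocate the journal's pin fifo and its two JSET_BITS-page write buffers,
 * and initialize the lock and the delayed flush work.
 */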
int bch_journal_alloc(struct cache_set *c)
{
	struct journal *j = &c->journal;

	spin_lock_init(&j->lock);
	INIT_DELAYED_WORK(&j->work, journal_write_work);

	c->journal_delay_ms = 100;

	j->w[0].c = c;
	j->w[1].c = c;

	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
	    !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
	    !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
		return -ENOMEM;

	return 0;
}