/*
 * bcache setup/teardown code, and some metadata io - read a superblock and
 * figure out what to do with it.
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */
#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"
#include "request.h"
#include "writeback.h"

#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/debugfs.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/sysfs.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
static const char bcache_magic[] = {
	0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
	0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
};

static const char invalid_uuid[] = {
	0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
	0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
};

/* Default is -1; we skip past it for struct cached_dev's cache mode */
const char * const bch_cache_modes[] = {
	"default",
	"writethrough",
	"writeback",
	"writearound",
	"none",
	NULL
};

static struct kobject *bcache_kobj;
struct mutex bch_register_lock;
LIST_HEAD(bch_cache_sets);
static LIST_HEAD(uncached_devices);

static int bcache_major;
static DEFINE_IDA(bcache_minor);
static wait_queue_head_t unregister_wait;
struct workqueue_struct *bcache_wq;

#define BTREE_MAX_PAGES		(256 * 1024 / PAGE_SIZE)

/* Superblock */
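
/*
 * read_super() reads the superblock from the start of @bdev into @sb,
 * validating the magic, checksum and the version-specific fields. It
 * returns an error string on failure or NULL on success; on success *res
 * holds a reference to the page backing the superblock so the same page
 * can be reused when the superblock is written back.
 */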
static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
			      struct page **res)
{
	const char *err;
	struct cache_sb *s;
	struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
	unsigned i;

	if (!bh)
		return "IO error";

	s = (struct cache_sb *) bh->b_data;

	sb->offset = le64_to_cpu(s->offset);
	sb->version = le64_to_cpu(s->version);

	memcpy(sb->magic, s->magic, 16);
	memcpy(sb->uuid, s->uuid, 16);
	memcpy(sb->set_uuid, s->set_uuid, 16);
	memcpy(sb->label, s->label, SB_LABEL_SIZE);

	sb->flags = le64_to_cpu(s->flags);
	sb->seq = le64_to_cpu(s->seq);
	sb->last_mount = le32_to_cpu(s->last_mount);
	sb->first_bucket = le16_to_cpu(s->first_bucket);
	sb->keys = le16_to_cpu(s->keys);

	for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
		sb->d[i] = le64_to_cpu(s->d[i]);

	pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
		 sb->version, sb->flags, sb->seq, sb->keys);

	err = "Not a bcache superblock";
	if (sb->offset != SB_SECTOR)
		goto err;

	if (memcmp(sb->magic, bcache_magic, 16))
		goto err;

	err = "Too many journal buckets";
	if (sb->keys > SB_JOURNAL_BUCKETS)
		goto err;

	err = "Bad checksum";
	if (s->csum != csum_set(s))
		goto err;

	err = "Bad UUID";
	if (bch_is_zero(sb->uuid, 16))
		goto err;

	sb->block_size = le16_to_cpu(s->block_size);

	err = "Superblock block size smaller than device block size";
	if (sb->block_size << 9 < bdev_logical_block_size(bdev))
		goto err;

	switch (sb->version) {
	case BCACHE_SB_VERSION_BDEV:
		sb->data_offset = BDEV_DATA_START_DEFAULT;
		break;
	case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
		sb->data_offset = le64_to_cpu(s->data_offset);

		err = "Bad data offset";
		if (sb->data_offset < BDEV_DATA_START_DEFAULT)
			goto err;

		break;
	case BCACHE_SB_VERSION_CDEV:
	case BCACHE_SB_VERSION_CDEV_WITH_UUID:
		sb->nbuckets = le64_to_cpu(s->nbuckets);
		sb->bucket_size = le16_to_cpu(s->bucket_size);

		sb->nr_in_set = le16_to_cpu(s->nr_in_set);
		sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);

		err = "Too many buckets";
		if (sb->nbuckets > LONG_MAX)
			goto err;

		err = "Not enough buckets";
		if (sb->nbuckets < 1 << 7)
			goto err;

		err = "Bad block/bucket size";
		if (!is_power_of_2(sb->block_size) ||
		    sb->block_size > PAGE_SECTORS ||
		    !is_power_of_2(sb->bucket_size) ||
		    sb->bucket_size < PAGE_SECTORS)
			goto err;

		err = "Invalid superblock: device too small";
		if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets)
			goto err;

		err = "Bad UUID";
		if (bch_is_zero(sb->set_uuid, 16))
			goto err;

		err = "Bad cache device number in set";
		if (!sb->nr_in_set ||
		    sb->nr_in_set <= sb->nr_this_dev ||
		    sb->nr_in_set > MAX_CACHES_PER_SET)
			goto err;

		err = "Journal buckets not sequential";
		for (i = 0; i < sb->keys; i++)
			if (sb->d[i] != sb->first_bucket + i)
				goto err;

		err = "Too many journal buckets";
		if (sb->first_bucket + sb->keys > sb->nbuckets)
			goto err;

		err = "Invalid superblock: first bucket comes before end of super";
		if (sb->first_bucket * sb->bucket_size < 16)
			goto err;

		break;
	default:
		err = "Unsupported superblock version";
		goto err;
	}

	sb->last_mount = get_seconds();
	err = NULL;

	get_page(bh->b_page);
	*res = bh->b_page;
err:
	put_bh(bh);
	return err;
}
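
/*
 * Superblock write path: __write_super() serializes the in-memory cache_sb
 * back into the page attached to the bio and submits it as a REQ_META write
 * to SB_SECTOR. Callers take the relevant sb_write_mutex and use a closure,
 * so the semaphore is only released from the closure destructor once the
 * write has completed.
 */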
static void write_bdev_super_endio(struct bio *bio)
{
	struct cached_dev *dc = bio->bi_private;
	/* XXX: error checking */

	closure_put(&dc->sb_write);
}

static void __write_super(struct cache_sb *sb, struct bio *bio)
{
	struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
	unsigned i;

	bio->bi_iter.bi_sector = SB_SECTOR;
	bio->bi_iter.bi_size = SB_SIZE;
	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
	bch_bio_map(bio, NULL);

	out->offset = cpu_to_le64(sb->offset);
	out->version = cpu_to_le64(sb->version);

	memcpy(out->uuid, sb->uuid, 16);
	memcpy(out->set_uuid, sb->set_uuid, 16);
	memcpy(out->label, sb->label, SB_LABEL_SIZE);

	out->flags = cpu_to_le64(sb->flags);
	out->seq = cpu_to_le64(sb->seq);

	out->last_mount = cpu_to_le32(sb->last_mount);
	out->first_bucket = cpu_to_le16(sb->first_bucket);
	out->keys = cpu_to_le16(sb->keys);

	for (i = 0; i < sb->keys; i++)
		out->d[i] = cpu_to_le64(sb->d[i]);

	out->csum = csum_set(out);

	pr_debug("ver %llu, flags %llu, seq %llu",
		 sb->version, sb->flags, sb->seq);

	submit_bio(bio);
}

static void bch_write_bdev_super_unlock(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);

	up(&dc->sb_write_mutex);
}

void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
{
	struct closure *cl = &dc->sb_write;
	struct bio *bio = &dc->sb_bio;

	down(&dc->sb_write_mutex);
	closure_init(cl, parent);

	bio_reset(bio);
	bio->bi_bdev = dc->bdev;
	bio->bi_end_io = write_bdev_super_endio;
	bio->bi_private = dc;

	closure_get(cl);
	__write_super(&dc->sb, bio);

	closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
}

static void write_super_endio(struct bio *bio)
{
	struct cache *ca = bio->bi_private;

	bch_count_io_errors(ca, bio->bi_error, "writing superblock");
	closure_put(&ca->set->sb_write);
}

static void bcache_write_super_unlock(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, sb_write);

	up(&c->sb_write_mutex);
}

void bcache_write_super(struct cache_set *c)
{
	struct closure *cl = &c->sb_write;
	struct cache *ca;
	unsigned i;

	down(&c->sb_write_mutex);
	closure_init(cl, &c->cl);

	c->sb.seq++;

	for_each_cache(ca, c, i) {
		struct bio *bio = &ca->sb_bio;

		ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
		ca->sb.seq = c->sb.seq;
		ca->sb.last_mount = c->sb.last_mount;

		SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));

		bio_reset(bio);
		bio->bi_bdev = ca->bdev;
		bio->bi_end_io = write_super_endio;
		bio->bi_private = ca;

		closure_get(cl);
		__write_super(&ca->sb, bio);
	}

	closure_return_with_destructor(cl, bcache_write_super_unlock);
}

/* UUID io */
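
/*
 * The uuid array maps each attached backing device (or flash-only volume)
 * to a slot in c->uuids. uuid_io() reads or writes the whole array: a
 * write goes out to every cache device the key points to, while a read is
 * satisfied from the first pointer only.
 */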
static void uuid_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;
	struct cache_set *c = container_of(cl, struct cache_set, uuid_write);

	cache_set_err_on(bio->bi_error, c, "accessing uuids");
	bch_bbio_free(bio, c);
	closure_put(cl);
}

static void uuid_io_unlock(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, uuid_write);

	up(&c->uuid_write_mutex);
}

static void uuid_io(struct cache_set *c, int op, unsigned long op_flags,
		    struct bkey *k, struct closure *parent)
{
	struct closure *cl = &c->uuid_write;
	struct uuid_entry *u;
	unsigned i;
	char buf[80];

	BUG_ON(!parent);
	down(&c->uuid_write_mutex);
	closure_init(cl, parent);

	for (i = 0; i < KEY_PTRS(k); i++) {
		struct bio *bio = bch_bbio_alloc(c);

		bio->bi_opf = REQ_SYNC|REQ_META|op_flags;
		bio->bi_iter.bi_size = KEY_SIZE(k) << 9;

		bio->bi_end_io = uuid_endio;
		bio->bi_private = cl;
		bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
		bch_bio_map(bio, c->uuids);

		bch_submit_bbio(bio, c, k, i);

		if (op != REQ_OP_WRITE)
			break;
	}

	bch_extent_to_text(buf, sizeof(buf), k);
	pr_debug("%s UUIDs at %s", op == REQ_OP_WRITE ? "wrote" : "read", buf);

	for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
		if (!bch_is_zero(u->uuid, 16))
			pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
				 u - c->uuids, u->uuid, u->label,
				 u->first_reg, u->last_reg, u->invalidated);

	closure_return_with_destructor(cl, uuid_io_unlock);
}

static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
{
	struct bkey *k = &j->uuid_bucket;

	if (__bch_btree_ptr_invalid(c, k))
		return "bad uuid pointer";

	bkey_copy(&c->uuid_bucket, k);
	uuid_io(c, REQ_OP_READ, READ_SYNC, k, cl);

	if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
		struct uuid_entry_v0 *u0 = (void *) c->uuids;
		struct uuid_entry *u1 = (void *) c->uuids;
		int i;

		closure_sync(cl);

		/*
		 * Since the new uuid entry is bigger than the old, we have to
		 * convert starting at the highest memory address and work down
		 * in order to do it in place
		 */
		for (i = c->nr_uuids - 1;
		     i >= 0;
		     --i) {
			memcpy(u1[i].uuid, u0[i].uuid, 16);
			memcpy(u1[i].label, u0[i].label, 32);

			u1[i].first_reg = u0[i].first_reg;
			u1[i].last_reg = u0[i].last_reg;
			u1[i].invalidated = u0[i].invalidated;

			u1[i].flags = 0;
			u1[i].sectors = 0;
		}
	}

	return NULL;
}

static int __uuid_write(struct cache_set *c)
{
	BKEY_PADDED(key) k;
	struct closure cl;
	closure_init_stack(&cl);

	lockdep_assert_held(&bch_register_lock);

	if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
		return 1;

	SET_KEY_SIZE(&k.key, c->sb.bucket_size);
	uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl);
	closure_sync(&cl);

	bkey_copy(&c->uuid_bucket, &k.key);
	bkey_put(c, &k.key);
	return 0;
}

int bch_uuid_write(struct cache_set *c)
{
	int ret = __uuid_write(c);

	if (!ret)
		bch_journal_meta(c, NULL);

	return ret;
}

static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
{
	struct uuid_entry *u;

	for (u = c->uuids;
	     u < c->uuids + c->nr_uuids; u++)
		if (!memcmp(u->uuid, uuid, 16))
			return u;

	return NULL;
}

static struct uuid_entry *uuid_find_empty(struct cache_set *c)
{
	static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
	return uuid_find(c, zero_uuid);
}

/*
 * Bucket priorities/gens:
 *
 * For each bucket, we store on disk its
 *    8 bit gen
 *   16 bit priority
 *
 * See alloc.c for an explanation of the gen. The priority is used to implement
 * lru (and in the future other) cache replacement policies; for most purposes
 * it's just an opaque integer.
 *
 * The gens and the priorities don't have a whole lot to do with each other, and
 * it's actually the gens that must be written out at specific times - it's no
 * big deal if the priorities don't get written, if we lose them we just reuse
 * buckets in suboptimal order.
 *
 * On disk they're stored in a packed array, and in as many buckets are required
 * to fit them all. The buckets we use to store them form a list; the journal
 * header points to the first bucket, the first bucket points to the second
 * bucket, et cetera.
 *
 * This code is used by the allocation code; periodically (whenever it runs out
 * of buckets to allocate from) the allocation code will invalidate some
 * buckets, but it can't use those buckets until their new gens are safely on
 * disk.
 */
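
/*
 * On-disk layout, as implied by bch_prio_write() below: each prio bucket
 * holds a struct prio_set - a checksum over everything after the csum
 * field, a magic number, a sequence number, the next bucket in the chain,
 * and a packed array of {prio, gen} pairs, prios_per_bucket() of them per
 * bucket.
 */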

static void prio_endio(struct bio *bio)
{
	struct cache *ca = bio->bi_private;

	cache_set_err_on(bio->bi_error, ca->set, "accessing priorities");
	bch_bbio_free(bio, ca->set);
	closure_put(&ca->prio);
}

static void prio_io(struct cache *ca, uint64_t bucket, int op,
		    unsigned long op_flags)
{
	struct closure *cl = &ca->prio;
	struct bio *bio = bch_bbio_alloc(ca->set);

	closure_init_stack(cl);

	bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size;
	bio->bi_bdev = ca->bdev;
	bio->bi_iter.bi_size = bucket_bytes(ca);

	bio->bi_end_io = prio_endio;
	bio->bi_private = ca;
	bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
	bch_bio_map(bio, ca->disk_buckets);

	closure_bio_submit(bio, &ca->prio);
	closure_sync(cl);
}

void bch_prio_write(struct cache *ca)
{
	int i;
	struct bucket *b;
	struct closure cl;

	closure_init_stack(&cl);

	lockdep_assert_held(&ca->set->bucket_lock);

	ca->disk_buckets->seq++;

	atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
			&ca->meta_sectors_written);

	//pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
	//	 fifo_used(&ca->free_inc), fifo_used(&ca->unused));

	for (i = prio_buckets(ca) - 1; i >= 0; --i) {
		long bucket;
		struct prio_set *p = ca->disk_buckets;
		struct bucket_disk *d = p->data;
		struct bucket_disk *end = d + prios_per_bucket(ca);

		for (b = ca->buckets + i * prios_per_bucket(ca);
		     b < ca->buckets + ca->sb.nbuckets && d < end;
		     b++, d++) {
			d->prio = cpu_to_le16(b->prio);
			d->gen = b->gen;
		}

		p->next_bucket = ca->prio_buckets[i + 1];
		p->magic = pset_magic(&ca->sb);
		p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);

		bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
		BUG_ON(bucket == -1);

		mutex_unlock(&ca->set->bucket_lock);
		prio_io(ca, bucket, REQ_OP_WRITE, 0);
		mutex_lock(&ca->set->bucket_lock);

		ca->prio_buckets[i] = bucket;
		atomic_dec_bug(&ca->buckets[bucket].pin);
	}

	mutex_unlock(&ca->set->bucket_lock);

	bch_journal_meta(ca->set, &cl);
	closure_sync(&cl);

	mutex_lock(&ca->set->bucket_lock);

	/*
	 * Don't want the old priorities to get garbage collected until after we
	 * finish writing the new ones, and they're journalled
	 */
	for (i = 0; i < prio_buckets(ca); i++) {
		if (ca->prio_last_buckets[i])
			__bch_bucket_free(ca,
				&ca->buckets[ca->prio_last_buckets[i]]);

		ca->prio_last_buckets[i] = ca->prio_buckets[i];
	}
}
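
/*
 * prio_read() walks the chain of prio buckets, starting from the bucket
 * recorded in the journal, and restores the in-memory prio/gen of every
 * bucket. csum/magic mismatches are only warned about, consistent with the
 * comment above: losing priorities just means buckets get reused in
 * suboptimal order.
 */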
static void prio_read(struct cache *ca, uint64_t bucket)
{
	struct prio_set *p = ca->disk_buckets;
	struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
	struct bucket *b;
	unsigned bucket_nr = 0;

	for (b = ca->buckets;
	     b < ca->buckets + ca->sb.nbuckets;
	     b++, d++) {
		if (d == end) {
			ca->prio_buckets[bucket_nr] = bucket;
			ca->prio_last_buckets[bucket_nr] = bucket;
			bucket_nr++;

			prio_io(ca, bucket, REQ_OP_READ, READ_SYNC);

			if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8))
				pr_warn("bad csum reading priorities");

			if (p->magic != pset_magic(&ca->sb))
				pr_warn("bad magic reading priorities");

			bucket = p->next_bucket;
			d = p->data;
		}

		b->prio = le16_to_cpu(d->prio);
		b->gen = b->last_gc = d->gen;
	}
}

/* Bcache device */

static int open_dev(struct block_device *b, fmode_t mode)
{
	struct bcache_device *d = b->bd_disk->private_data;

	if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
		return -ENXIO;

	closure_get(&d->cl);
	return 0;
}

static void release_dev(struct gendisk *b, fmode_t mode)
{
	struct bcache_device *d = b->private_data;

	closure_put(&d->cl);
}

static int ioctl_dev(struct block_device *b, fmode_t mode,
		     unsigned int cmd, unsigned long arg)
{
	struct bcache_device *d = b->bd_disk->private_data;

	return d->ioctl(d, mode, cmd, arg);
}

static const struct block_device_operations bcache_ops = {
	.open		= open_dev,
	.release	= release_dev,
	.ioctl		= ioctl_dev,
	.owner		= THIS_MODULE,
};

void bcache_device_stop(struct bcache_device *d)
{
	if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
		closure_queue(&d->cl);
}

static void bcache_device_unlink(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
		unsigned i;
		struct cache *ca;

		sysfs_remove_link(&d->c->kobj, d->name);
		sysfs_remove_link(&d->kobj, "cache");

		for_each_cache(ca, d->c, i)
			bd_unlink_disk_holder(ca->bdev, d->disk);
	}
}

static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
			       const char *name)
{
	unsigned i;
	struct cache *ca;

	for_each_cache(ca, d->c, i)
		bd_link_disk_holder(ca->bdev, d->disk);

	snprintf(d->name, BCACHEDEVNAME_SIZE,
		 "%s%u", name, d->id);

	WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
	     sysfs_create_link(&c->kobj, &d->kobj, d->name),
	     "Couldn't create device <-> cache set symlinks");

	clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
}

static void bcache_device_detach(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
		struct uuid_entry *u = d->c->uuids + d->id;

		SET_UUID_FLASH_ONLY(u, 0);
		memcpy(u->uuid, invalid_uuid, 16);
		u->invalidated = cpu_to_le32(get_seconds());
		bch_uuid_write(d->c);
	}

	bcache_device_unlink(d);

	d->c->devices[d->id] = NULL;
	closure_put(&d->c->caching);
	d->c = NULL;
}

static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
				 unsigned id)
{
	d->id = id;
	d->c = c;
	c->devices[id] = d;

	closure_get(&c->caching);
}

static void bcache_device_free(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	pr_info("%s stopped", d->disk->disk_name);

	if (d->c)
		bcache_device_detach(d);
	if (d->disk && d->disk->flags & GENHD_FL_UP)
		del_gendisk(d->disk);
	if (d->disk && d->disk->queue)
		blk_cleanup_queue(d->disk->queue);
	if (d->disk) {
		ida_simple_remove(&bcache_minor, d->disk->first_minor);
		put_disk(d->disk);
	}

	if (d->bio_split)
		bioset_free(d->bio_split);
	kvfree(d->full_dirty_stripes);
	kvfree(d->stripe_sectors_dirty);

	closure_debug_destroy(&d->cl);
}
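
/*
 * bcache_device_init() sets up the generic block device shared by backing
 * devices and flash-only volumes: the dirty-stripe bookkeeping arrays, a
 * minor number from the bcache_minor ida, the gendisk and request queue,
 * and the queue limits.
 */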
static int bcache_device_init(struct bcache_device *d, unsigned block_size,
			      sector_t sectors)
{
	struct request_queue *q;
	size_t n;
	int minor;

	if (!d->stripe_size)
		d->stripe_size = 1 << 31;

	d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);

	if (!d->nr_stripes ||
	    d->nr_stripes > INT_MAX ||
	    d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) {
		pr_err("nr_stripes too large");
		return -ENOMEM;
	}

	n = d->nr_stripes * sizeof(atomic_t);
	d->stripe_sectors_dirty = n < PAGE_SIZE << 6
		? kzalloc(n, GFP_KERNEL)
		: vzalloc(n);
	if (!d->stripe_sectors_dirty)
		return -ENOMEM;

	n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
	d->full_dirty_stripes = n < PAGE_SIZE << 6
		? kzalloc(n, GFP_KERNEL)
		: vzalloc(n);
	if (!d->full_dirty_stripes)
		return -ENOMEM;

	minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL);
	if (minor < 0)
		return minor;

	if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
	    !(d->disk = alloc_disk(1))) {
		ida_simple_remove(&bcache_minor, minor);
		return -ENOMEM;
	}

	set_capacity(d->disk, sectors);
	snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor);

	d->disk->major = bcache_major;
	d->disk->first_minor = minor;
	d->disk->fops = &bcache_ops;
	d->disk->private_data = d;

	q = blk_alloc_queue(GFP_KERNEL);
	if (!q)
		return -ENOMEM;

	blk_queue_make_request(q, NULL);
	d->disk->queue = q;
	q->queuedata = d;
	q->backing_dev_info.congested_data = d;
	q->limits.max_hw_sectors = UINT_MAX;
	q->limits.max_sectors = UINT_MAX;
	q->limits.max_segment_size = UINT_MAX;
	q->limits.max_segments = BIO_MAX_PAGES;
	blk_queue_max_discard_sectors(q, UINT_MAX);
	q->limits.discard_granularity = 512;
	q->limits.io_min = block_size;
	q->limits.logical_block_size = block_size;
	q->limits.physical_block_size = block_size;
	set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
	clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags);
	set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);

	blk_queue_write_cache(q, true, true);

	return 0;
}

/* Cached device */

static void calc_cached_dev_sectors(struct cache_set *c)
{
	uint64_t sectors = 0;
	struct cached_dev *dc;

	list_for_each_entry(dc, &c->cached_devs, list)
		sectors += bdev_sectors(dc->bdev);

	c->cached_dev_sectors = sectors;
}

void bch_cached_dev_run(struct cached_dev *dc)
{
	struct bcache_device *d = &dc->disk;
	char buf[SB_LABEL_SIZE + 1];
	char *env[] = {
		"DRIVER=bcache",
		kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
		NULL,
		NULL,
	};

	memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
	buf[SB_LABEL_SIZE] = '\0';
	env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);

	if (atomic_xchg(&dc->running, 1)) {
		kfree(env[1]);
		kfree(env[2]);
		return;
	}

	if (!d->c &&
	    BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
		struct closure cl;
		closure_init_stack(&cl);

		SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
		bch_write_bdev_super(dc, &cl);
		closure_sync(&cl);
	}

	add_disk(d->disk);
	bd_link_disk_holder(dc->bdev, dc->disk.disk);
	/* won't show up in the uevent file, use udevadm monitor -e instead
	 * only class / kset properties are persistent */
	kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
	kfree(env[1]);
	kfree(env[2]);

	if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
	    sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
		pr_debug("error creating sysfs link");
}

static void cached_dev_detach_finish(struct work_struct *w)
{
	struct cached_dev *dc = container_of(w, struct cached_dev, detach);
	char buf[BDEVNAME_SIZE];
	struct closure cl;
	closure_init_stack(&cl);

	BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
	BUG_ON(atomic_read(&dc->count));

	mutex_lock(&bch_register_lock);

	memset(&dc->sb.set_uuid, 0, 16);
	SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);

	bch_write_bdev_super(dc, &cl);
	closure_sync(&cl);

	bcache_device_detach(&dc->disk);
	list_move(&dc->list, &uncached_devices);

	clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
	clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);

	mutex_unlock(&bch_register_lock);

	pr_info("Caching disabled for %s", bdevname(dc->bdev, buf));

	/* Drop ref we took in cached_dev_detach() */
	closure_put(&dc->disk.cl);
}

void bch_cached_dev_detach(struct cached_dev *dc)
{
	lockdep_assert_held(&bch_register_lock);

	if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
		return;

	if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
		return;

	/*
	 * Block the device from being closed and freed until we're finished
	 * detaching
	 */
	closure_get(&dc->disk.cl);

	bch_writeback_queue(dc);
	cached_dev_put(dc);
}
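
/*
 * Attach a backing device to a cache set: find (or allocate) its slot in
 * the uuid array, write the updated uuids and the backing superblock, then
 * start the writeback thread. writeback_lock is held across the setup so
 * the thread can't run before the device is fully linked.
 */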
int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
{
	uint32_t rtime = cpu_to_le32(get_seconds());
	struct uuid_entry *u;
	char buf[BDEVNAME_SIZE];

	bdevname(dc->bdev, buf);

	if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))
		return -ENOENT;

	if (dc->disk.c) {
		pr_err("Can't attach %s: already attached", buf);
		return -EINVAL;
	}

	if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
		pr_err("Can't attach %s: shutting down", buf);
		return -EINVAL;
	}

	if (dc->sb.block_size < c->sb.block_size) {
		/* Will die */
		pr_err("Couldn't attach %s: block size less than set's block size",
		       buf);
		return -EINVAL;
	}

	u = uuid_find(c, dc->sb.uuid);

	if (u &&
	    (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
	     BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
		memcpy(u->uuid, invalid_uuid, 16);
		u->invalidated = cpu_to_le32(get_seconds());
		u = NULL;
	}

	if (!u) {
		if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
			pr_err("Couldn't find uuid for %s in set", buf);
			return -ENOENT;
		}

		u = uuid_find_empty(c);
		if (!u) {
			pr_err("Not caching %s, no room for UUID", buf);
			return -EINVAL;
		}
	}

	/* Deadlocks since we're called via sysfs...
	sysfs_remove_file(&dc->kobj, &sysfs_attach);
	 */

	if (bch_is_zero(u->uuid, 16)) {
		struct closure cl;
		closure_init_stack(&cl);

		memcpy(u->uuid, dc->sb.uuid, 16);
		memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
		u->first_reg = u->last_reg = rtime;
		bch_uuid_write(c);

		memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);

		bch_write_bdev_super(dc, &cl);
		closure_sync(&cl);
	} else {
		u->last_reg = rtime;
		bch_uuid_write(c);
	}

	bcache_device_attach(&dc->disk, c, u - c->uuids);
	list_move(&dc->list, &c->cached_devs);
	calc_cached_dev_sectors(c);

	smp_wmb();
	/*
	 * dc->c must be set before dc->count != 0 - paired with the mb in
	 * cached_dev_get()
	 */
	atomic_set(&dc->count, 1);

	/* Block writeback thread, but spawn it */
	down_write(&dc->writeback_lock);
	if (bch_cached_dev_writeback_start(dc)) {
		up_write(&dc->writeback_lock);
		return -ENOMEM;
	}

	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
		bch_sectors_dirty_init(dc);
		atomic_set(&dc->has_dirty, 1);
		atomic_inc(&dc->count);
		bch_writeback_queue(dc);
	}

	bch_cached_dev_run(dc);
	bcache_device_link(&dc->disk, c, "bdev");

	/* Allow the writeback thread to proceed */
	up_write(&dc->writeback_lock);

	pr_info("Caching %s as %s on set %pU",
		bdevname(dc->bdev, buf), dc->disk.disk->disk_name,
		dc->disk.c->sb.set_uuid);
	return 0;
}

void bch_cached_dev_release(struct kobject *kobj)
{
	struct cached_dev *dc = container_of(kobj, struct cached_dev,
					     disk.kobj);
	kfree(dc);
	module_put(THIS_MODULE);
}

static void cached_dev_free(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);

	cancel_delayed_work_sync(&dc->writeback_rate_update);
	if (!IS_ERR_OR_NULL(dc->writeback_thread))
		kthread_stop(dc->writeback_thread);

	mutex_lock(&bch_register_lock);

	if (atomic_read(&dc->running))
		bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
	bcache_device_free(&dc->disk);
	list_del(&dc->list);

	mutex_unlock(&bch_register_lock);

	if (!IS_ERR_OR_NULL(dc->bdev))
		blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);

	wake_up(&unregister_wait);

	kobject_put(&dc->disk.kobj);
}

static void cached_dev_flush(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
	struct bcache_device *d = &dc->disk;

	mutex_lock(&bch_register_lock);
	bcache_device_unlink(d);
	mutex_unlock(&bch_register_lock);

	bch_cache_accounting_destroy(&dc->accounting);
	kobject_del(&d->kobj);

	continue_at(cl, cached_dev_free, system_wq);
}

static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
{
	int ret;
	struct io *io;
	struct request_queue *q = bdev_get_queue(dc->bdev);

	__module_get(THIS_MODULE);
	INIT_LIST_HEAD(&dc->list);
	closure_init(&dc->disk.cl, NULL);
	set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
	kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
	INIT_WORK(&dc->detach, cached_dev_detach_finish);
	sema_init(&dc->sb_write_mutex, 1);
	INIT_LIST_HEAD(&dc->io_lru);
	spin_lock_init(&dc->io_lock);
	bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);

	dc->sequential_cutoff = 4 << 20;

	for (io = dc->io; io < dc->io + RECENT_IO; io++) {
		list_add(&io->lru, &dc->io_lru);
		hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
	}

	dc->disk.stripe_size = q->limits.io_opt >> 9;

	if (dc->disk.stripe_size)
		dc->partial_stripes_expensive =
			q->limits.raid_partial_stripes_expensive;

	ret = bcache_device_init(&dc->disk, block_size,
			dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
	if (ret)
		return ret;

	set_capacity(dc->disk.disk,
		     dc->bdev->bd_part->nr_sects - dc->sb.data_offset);

	dc->disk.disk->queue->backing_dev_info.ra_pages =
		max(dc->disk.disk->queue->backing_dev_info.ra_pages,
		    q->backing_dev_info.ra_pages);

	bch_cached_dev_request_init(dc);
	bch_cached_dev_writeback_init(dc);
	return 0;
}

/* Cached device - bcache superblock */
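
/*
 * register_bdev() is called with a superblock that has already been read
 * and identified as a backing device: it takes its own reference on the
 * superblock page, initializes the cached_dev, adds the sysfs kobjects and
 * then tries to attach the device to every registered cache set.
 */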
static void register_bdev(struct cache_sb *sb, struct page *sb_page,
			  struct block_device *bdev,
			  struct cached_dev *dc)
{
	char name[BDEVNAME_SIZE];
	const char *err = "cannot allocate memory";
	struct cache_set *c;

	memcpy(&dc->sb, sb, sizeof(struct cache_sb));
	dc->bdev = bdev;
	dc->bdev->bd_holder = dc;

	bio_init(&dc->sb_bio);
	dc->sb_bio.bi_max_vecs = 1;
	dc->sb_bio.bi_io_vec = dc->sb_bio.bi_inline_vecs;
	dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
	get_page(sb_page);

	if (cached_dev_init(dc, sb->block_size << 9))
		goto err;

	err = "error creating kobject";
	if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
			"bcache"))
		goto err;
	if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
		goto err;

	pr_info("registered backing device %s", bdevname(bdev, name));

	list_add(&dc->list, &uncached_devices);
	list_for_each_entry(c, &bch_cache_sets, list)
		bch_cached_dev_attach(dc, c);

	if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
	    BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
		bch_cached_dev_run(dc);

	return;
err:
	pr_notice("error opening %s: %s", bdevname(bdev, name), err);
	bcache_device_stop(&dc->disk);
}

/* Flash only volumes */

void bch_flash_dev_release(struct kobject *kobj)
{
	struct bcache_device *d = container_of(kobj, struct bcache_device,
					       kobj);
	kfree(d);
}

static void flash_dev_free(struct closure *cl)
{
	struct bcache_device *d = container_of(cl, struct bcache_device, cl);
	mutex_lock(&bch_register_lock);
	bcache_device_free(d);
	mutex_unlock(&bch_register_lock);
	kobject_put(&d->kobj);
}

static void flash_dev_flush(struct closure *cl)
{
	struct bcache_device *d = container_of(cl, struct bcache_device, cl);

	mutex_lock(&bch_register_lock);
	bcache_device_unlink(d);
	mutex_unlock(&bch_register_lock);
	kobject_del(&d->kobj);
	continue_at(cl, flash_dev_free, system_wq);
}
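
/*
 * flash_dev_run() creates the block device for a flash-only volume: a
 * device with no backing disk whose data lives entirely in the cache set's
 * btree, sized from the sectors field of its uuid entry.
 */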
static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
{
	struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
					  GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	closure_init(&d->cl, NULL);
	set_closure_fn(&d->cl, flash_dev_flush, system_wq);

	kobject_init(&d->kobj, &bch_flash_dev_ktype);

	if (bcache_device_init(d, block_bytes(c), u->sectors))
		goto err;

	bcache_device_attach(d, c, u - c->uuids);
	bch_flash_dev_request_init(d);
	add_disk(d->disk);

	if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
		goto err;

	bcache_device_link(d, c, "volume");

	return 0;
err:
	kobject_put(&d->kobj);
	return -ENOMEM;
}

static int flash_devs_run(struct cache_set *c)
{
	int ret = 0;
	struct uuid_entry *u;

	for (u = c->uuids;
	     u < c->uuids + c->nr_uuids && !ret;
	     u++)
		if (UUID_FLASH_ONLY(u))
			ret = flash_dev_run(c, u);

	return ret;
}

int bch_flash_dev_create(struct cache_set *c, uint64_t size)
{
	struct uuid_entry *u;

	if (test_bit(CACHE_SET_STOPPING, &c->flags))
		return -EINTR;

	if (!test_bit(CACHE_SET_RUNNING, &c->flags))
		return -EPERM;

	u = uuid_find_empty(c);
	if (!u) {
		pr_err("Can't create volume, no room for UUID");
		return -EINVAL;
	}

	get_random_bytes(u->uuid, 16);
	memset(u->label, 0, 32);
	u->first_reg = u->last_reg = cpu_to_le32(get_seconds());

	SET_UUID_FLASH_ONLY(u, 1);
	u->sectors = size >> 9;

	bch_uuid_write(c);

	return flash_dev_run(c, u);
}

/* Cache set */

__printf(2, 3)
bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
{
	va_list args;

	if (c->on_error != ON_ERROR_PANIC &&
	    test_bit(CACHE_SET_STOPPING, &c->flags))
		return false;

	/* XXX: we can be called from atomic context
	acquire_console_sem();
	*/

	printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid);

	va_start(args, fmt);
	vprintk(fmt, args);
	va_end(args);

	printk(", disabling caching\n");

	if (c->on_error == ON_ERROR_PANIC)
		panic("panic forced after error\n");

	bch_cache_set_unregister(c);
	return true;
}

void bch_cache_set_release(struct kobject *kobj)
{
	struct cache_set *c = container_of(kobj, struct cache_set, kobj);
	kfree(c);
	module_put(THIS_MODULE);
}

static void cache_set_free(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, cl);
	struct cache *ca;
	unsigned i;

	if (!IS_ERR_OR_NULL(c->debug))
		debugfs_remove(c->debug);

	bch_open_buckets_free(c);
	bch_btree_cache_free(c);
	bch_journal_free(c);

	for_each_cache(ca, c, i)
		if (ca) {
			ca->set = NULL;
			c->cache[ca->sb.nr_this_dev] = NULL;
			kobject_put(&ca->kobj);
		}

	bch_bset_sort_state_free(&c->sort);
	free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));

	if (c->moving_gc_wq)
		destroy_workqueue(c->moving_gc_wq);
	if (c->bio_split)
		bioset_free(c->bio_split);
	if (c->fill_iter)
		mempool_destroy(c->fill_iter);
	if (c->bio_meta)
		mempool_destroy(c->bio_meta);
	if (c->search)
		mempool_destroy(c->search);
	kfree(c->devices);

	mutex_lock(&bch_register_lock);
	list_del(&c->list);
	mutex_unlock(&bch_register_lock);

	pr_info("Cache set %pU unregistered", c->sb.set_uuid);
	wake_up(&unregister_wait);

	closure_debug_destroy(&c->cl);
	kobject_put(&c->kobj);
}
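
/*
 * cache_set_flush() runs as part of the shutdown closure chain: it writes
 * out any dirty btree nodes, stops the gc and allocator threads and flushes
 * the final journal entry before cache_set_free() tears the rest down.
 */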
static void cache_set_flush(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, caching);
	struct cache *ca;
	struct btree *b;
	unsigned i;

	if (!c)
		closure_return(cl);

	bch_cache_accounting_destroy(&c->accounting);

	kobject_put(&c->internal);
	kobject_del(&c->kobj);

	if (c->gc_thread)
		kthread_stop(c->gc_thread);

	if (!IS_ERR_OR_NULL(c->root))
		list_add(&c->root->list, &c->btree_cache);

	/* Should skip this if we're unregistering because of an error */
	list_for_each_entry(b, &c->btree_cache, list) {
		mutex_lock(&b->write_lock);
		if (btree_node_dirty(b))
			__bch_btree_node_write(b, NULL);
		mutex_unlock(&b->write_lock);
	}

	for_each_cache(ca, c, i)
		if (ca->alloc_thread)
			kthread_stop(ca->alloc_thread);

	if (c->journal.cur) {
		cancel_delayed_work_sync(&c->journal.work);
		/* flush last journal entry if needed */
		c->journal.work.work.func(&c->journal.work.work);
	}

	closure_return(cl);
}

static void __cache_set_unregister(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, caching);
	struct cached_dev *dc;
	size_t i;

	mutex_lock(&bch_register_lock);

	for (i = 0; i < c->nr_uuids; i++)
		if (c->devices[i]) {
			if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
			    test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
				dc = container_of(c->devices[i],
						  struct cached_dev, disk);
				bch_cached_dev_detach(dc);
			} else {
				bcache_device_stop(c->devices[i]);
			}
		}

	mutex_unlock(&bch_register_lock);

	continue_at(cl, cache_set_flush, system_wq);
}

void bch_cache_set_stop(struct cache_set *c)
{
	if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
		closure_queue(&c->caching);
}

void bch_cache_set_unregister(struct cache_set *c)
{
	set_bit(CACHE_SET_UNREGISTERING, &c->flags);
	bch_cache_set_stop(c);
}

#define alloc_bucket_pages(gfp, c)			\
	((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))

struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
{
	int iter_size;
	struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
	if (!c)
		return NULL;

	__module_get(THIS_MODULE);
	closure_init(&c->cl, NULL);
	set_closure_fn(&c->cl, cache_set_free, system_wq);

	closure_init(&c->caching, &c->cl);
	set_closure_fn(&c->caching, __cache_set_unregister, system_wq);

	/* Maybe create continue_at_noreturn() and use it here? */
	closure_set_stopped(&c->cl);
	closure_put(&c->cl);

	kobject_init(&c->kobj, &bch_cache_set_ktype);
	kobject_init(&c->internal, &bch_cache_set_internal_ktype);

	bch_cache_accounting_init(&c->accounting, &c->cl);

	memcpy(c->sb.set_uuid, sb->set_uuid, 16);
	c->sb.block_size = sb->block_size;
	c->sb.bucket_size = sb->bucket_size;
	c->sb.nr_in_set = sb->nr_in_set;
	c->sb.last_mount = sb->last_mount;
	c->bucket_bits = ilog2(sb->bucket_size);
	c->block_bits = ilog2(sb->block_size);
	c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);

	c->btree_pages = bucket_pages(c);
	if (c->btree_pages > BTREE_MAX_PAGES)
		c->btree_pages = max_t(int, c->btree_pages / 4,
				       BTREE_MAX_PAGES);

	sema_init(&c->sb_write_mutex, 1);
	mutex_init(&c->bucket_lock);
	init_waitqueue_head(&c->btree_cache_wait);
	init_waitqueue_head(&c->bucket_wait);
	sema_init(&c->uuid_write_mutex, 1);

	spin_lock_init(&c->btree_gc_time.lock);
	spin_lock_init(&c->btree_split_time.lock);
	spin_lock_init(&c->btree_read_time.lock);

	bch_moving_init_cache_set(c);

	INIT_LIST_HEAD(&c->list);
	INIT_LIST_HEAD(&c->cached_devs);
	INIT_LIST_HEAD(&c->btree_cache);
	INIT_LIST_HEAD(&c->btree_cache_freeable);
	INIT_LIST_HEAD(&c->btree_cache_freed);
	INIT_LIST_HEAD(&c->data_buckets);

	c->search = mempool_create_slab_pool(32, bch_search_cache);
	if (!c->search)
		goto err;

	iter_size = (sb->bucket_size / sb->block_size + 1) *
		sizeof(struct btree_iter_set);

	if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) ||
	    !(c->bio_meta = mempool_create_kmalloc_pool(2,
				sizeof(struct bbio) + sizeof(struct bio_vec) *
				bucket_pages(c))) ||
	    !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
	    !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
	    !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
	    !(c->moving_gc_wq = alloc_workqueue("bcache_gc",
						WQ_MEM_RECLAIM, 0)) ||
	    bch_journal_alloc(c) ||
	    bch_btree_cache_alloc(c) ||
	    bch_open_buckets_alloc(c) ||
	    bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
		goto err;

	c->congested_read_threshold_us = 2000;
	c->congested_write_threshold_us = 20000;
	c->error_limit = 8 << IO_ERROR_SHIFT;

	return c;
err:
	bch_cache_set_unregister(c);
	return NULL;
}
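
/*
 * run_cache_set() brings a complete cache set online. For a set that was
 * previously in sync it replays the journal: priorities and uuids are read
 * back, the btree root is fetched and checked, and only then are the
 * allocator threads started. For a brand new set it invalidates existing
 * data and writes fresh priorities, uuids and an empty btree root instead.
 * Finally the gc thread is started and any waiting backing devices are
 * attached.
 */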
static void run_cache_set(struct cache_set *c)
{
	const char *err = "cannot allocate memory";
	struct cached_dev *dc, *t;
	struct cache *ca;
	struct closure cl;
	unsigned i;

	closure_init_stack(&cl);

	for_each_cache(ca, c, i)
		c->nbuckets += ca->sb.nbuckets;

	if (CACHE_SYNC(&c->sb)) {
		LIST_HEAD(journal);
		struct bkey *k;
		struct jset *j;

		err = "cannot allocate memory for journal";
		if (bch_journal_read(c, &journal))
			goto err;

		pr_debug("btree_journal_read() done");

		err = "no journal entries found";
		if (list_empty(&journal))
			goto err;

		j = &list_entry(journal.prev, struct journal_replay, list)->j;

		err = "IO error reading priorities";
		for_each_cache(ca, c, i)
			prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);

		/*
		 * If prio_read() fails it'll call cache_set_error and we'll
		 * tear everything down right away, but if we perhaps checked
		 * sooner we could avoid journal replay.
		 */

		k = &j->btree_root;

		err = "bad btree root";
		if (__bch_btree_ptr_invalid(c, k))
			goto err;

		err = "error reading btree root";
		c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true, NULL);
		if (IS_ERR_OR_NULL(c->root))
			goto err;

		list_del_init(&c->root->list);
		rw_unlock(true, c->root);

		err = uuid_read(c, j, &cl);
		if (err)
			goto err;

		err = "error in recovery";
		if (bch_btree_check(c))
			goto err;

		bch_journal_mark(c, &journal);
		bch_initial_gc_finish(c);
		pr_debug("btree_check() done");

		/*
		 * bcache_journal_next() can't happen sooner, or
		 * btree_gc_finish() will give spurious errors about last_gc >
		 * gc_gen - this is a hack but oh well.
		 */
		bch_journal_next(&c->journal);

		err = "error starting allocator thread";
		for_each_cache(ca, c, i)
			if (bch_cache_allocator_start(ca))
				goto err;

		/*
		 * First place it's safe to allocate: btree_check() and
		 * btree_gc_finish() have to run before we have buckets to
		 * allocate, and bch_bucket_alloc_set() might cause a journal
		 * entry to be written so bcache_journal_next() has to be called
		 * first.
		 *
		 * If the uuids were in the old format we have to rewrite them
		 * before the next journal entry is written:
		 */
		if (j->version < BCACHE_JSET_VERSION_UUID)
			__uuid_write(c);

		bch_journal_replay(c, &journal);
	} else {
		pr_notice("invalidating existing data");

		for_each_cache(ca, c, i) {
			unsigned j;

			ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
					      2, SB_JOURNAL_BUCKETS);

			for (j = 0; j < ca->sb.keys; j++)
				ca->sb.d[j] = ca->sb.first_bucket + j;
		}

		bch_initial_gc_finish(c);

		err = "error starting allocator thread";
		for_each_cache(ca, c, i)
			if (bch_cache_allocator_start(ca))
				goto err;

		mutex_lock(&c->bucket_lock);
		for_each_cache(ca, c, i)
			bch_prio_write(ca);
		mutex_unlock(&c->bucket_lock);

		err = "cannot allocate new UUID bucket";
		if (__uuid_write(c))
			goto err;

		err = "cannot allocate new btree root";
		c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
		if (IS_ERR_OR_NULL(c->root))
			goto err;

		mutex_lock(&c->root->write_lock);
		bkey_copy_key(&c->root->key, &MAX_KEY);
		bch_btree_node_write(c->root, &cl);
		mutex_unlock(&c->root->write_lock);

		bch_btree_set_root(c->root);
		rw_unlock(true, c->root);

		/*
		 * We don't want to write the first journal entry until
		 * everything is set up - fortunately journal entries won't be
		 * written until the SET_CACHE_SYNC() here:
		 */
		SET_CACHE_SYNC(&c->sb, true);

		bch_journal_next(&c->journal);
		bch_journal_meta(c, &cl);
	}

	err = "error starting gc thread";
	if (bch_gc_thread_start(c))
		goto err;

	closure_sync(&cl);
	c->sb.last_mount = get_seconds();
	bcache_write_super(c);

	list_for_each_entry_safe(dc, t, &uncached_devices, list)
		bch_cached_dev_attach(dc, c);

	flash_devs_run(c);

	set_bit(CACHE_SET_RUNNING, &c->flags);
	return;
err:
	closure_sync(&cl);
	/* XXX: test this, it's broken */
	bch_cache_set_error(c, "%s", err);
}
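
/*
 * register_cache_set() looks up an existing cache_set with a matching
 * set_uuid (or allocates a new one), links the cache into it via sysfs,
 * and once every member listed in the superblock has shown up calls
 * run_cache_set().
 */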
static bool can_attach_cache ( struct cache * ca , struct cache_set * c )
{
return ca - > sb . block_size = = c - > sb . block_size & &
2013-10-23 00:19:23 +04:00
ca - > sb . bucket_size = = c - > sb . bucket_size & &
2013-03-24 03:11:31 +04:00
ca - > sb . nr_in_set = = c - > sb . nr_in_set ;
}
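/*
 * Attach a cache to the set identified by its set_uuid, allocating the
 * cache_set (and its sysfs kobjects) if this is the first member seen;
 * once all nr_in_set members are present, run_cache_set() brings the set
 * online.
 */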
static const char * register_cache_set ( struct cache * ca )
{
char buf [ 12 ] ;
const char * err = " cannot allocate memory " ;
struct cache_set * c ;
list_for_each_entry ( c , & bch_cache_sets , list )
if ( ! memcmp ( c - > sb . set_uuid , ca - > sb . set_uuid , 16 ) ) {
if ( c - > cache [ ca - > sb . nr_this_dev ] )
return " duplicate cache set member " ;
if ( ! can_attach_cache ( ca , c ) )
return " cache sb does not match set " ;
if ( ! CACHE_SYNC ( & ca - > sb ) )
SET_CACHE_SYNC ( & c - > sb , false ) ;
goto found ;
}
c = bch_cache_set_alloc ( & ca - > sb ) ;
if ( ! c )
return err ;
err = " error creating kobject " ;
if ( kobject_add ( & c - > kobj , bcache_kobj , " %pU " , c - > sb . set_uuid ) | |
kobject_add ( & c - > internal , & c - > kobj , " internal " ) )
goto err ;
if ( bch_cache_accounting_add_kobjs ( & c - > accounting , & c - > kobj ) )
goto err ;
bch_debug_init_cache_set ( c ) ;
list_add ( & c - > list , & bch_cache_sets ) ;
found :
sprintf ( buf , " cache%i " , ca - > sb . nr_this_dev ) ;
if ( sysfs_create_link ( & ca - > kobj , & c - > kobj , " set " ) | |
sysfs_create_link ( & c - > kobj , & ca - > kobj , buf ) )
goto err ;
if ( ca - > sb . seq > c - > sb . seq ) {
c - > sb . version = ca - > sb . version ;
memcpy ( c - > sb . set_uuid , ca - > sb . set_uuid , 16 ) ;
c - > sb . flags = ca - > sb . flags ;
c - > sb . seq = ca - > sb . seq ;
pr_debug ( " set version = %llu " , c - > sb . version ) ;
}
2014-06-12 06:44:49 +04:00
kobject_get ( & ca - > kobj ) ;
2013-03-24 03:11:31 +04:00
ca - > set = c ;
ca - > set - > cache [ ca - > sb . nr_this_dev ] = ca ;
c - > cache_by_alloc [ c - > caches_loaded + + ] = ca ;
if ( c - > caches_loaded = = c - > sb . nr_in_set )
run_cache_set ( c ) ;
return NULL ;
err :
bch_cache_set_unregister ( c ) ;
return err ;
}
/* Cache device */
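/*
 * Final teardown for a struct cache, run when its kobject's refcount drops
 * to zero: detach it from its set, free everything cache_alloc() set up,
 * release the underlying block device and drop the module reference.
 */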
void bch_cache_release ( struct kobject * kobj )
{
struct cache * ca = container_of ( kobj , struct cache , kobj ) ;
2013-12-17 13:29:34 +04:00
unsigned i ;
2013-03-24 03:11:31 +04:00
2014-06-20 02:05:59 +04:00
if ( ca - > set ) {
BUG_ON ( ca - > set - > cache [ ca - > sb . nr_this_dev ] ! = ca ) ;
2013-03-24 03:11:31 +04:00
ca - > set - > cache [ ca - > sb . nr_this_dev ] = NULL ;
2014-06-20 02:05:59 +04:00
}
2013-03-24 03:11:31 +04:00
free_pages ( ( unsigned long ) ca - > disk_buckets , ilog2 ( bucket_pages ( ca ) ) ) ;
kfree ( ca - > prio_buckets ) ;
vfree ( ca - > buckets ) ;
free_heap ( & ca - > heap ) ;
free_fifo ( & ca - > free_inc ) ;
2013-12-17 13:29:34 +04:00
for ( i = 0 ; i < RESERVE_NR ; i + + )
free_fifo ( & ca - > free [ i ] ) ;
2013-03-24 03:11:31 +04:00
if ( ca - > sb_bio . bi_inline_vecs [ 0 ] . bv_page )
put_page ( ca - > sb_bio . bi_io_vec [ 0 ] . bv_page ) ;
2014-07-08 00:03:36 +04:00
if ( ! IS_ERR_OR_NULL ( ca - > bdev ) )
2013-03-24 03:11:31 +04:00
blkdev_put ( ca - > bdev , FMODE_READ | FMODE_WRITE | FMODE_EXCL ) ;
kfree ( ca ) ;
module_put ( THIS_MODULE ) ;
}
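/*
 * Allocate the in-memory structures for a cache: the free-bucket fifos are
 * sized off nbuckets (free = roundup_pow_of_two(nbuckets) >> 10), plus the
 * bucket array, the prio bucket arrays and the buffer used for prio
 * reads and writes; returns -ENOMEM if any allocation fails.
 */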
2016-07-04 04:23:25 +03:00
static int cache_alloc ( struct cache * ca )
2013-03-24 03:11:31 +04:00
{
size_t free ;
struct bucket * b ;
__module_get ( THIS_MODULE ) ;
kobject_init ( & ca - > kobj , & bch_cache_ktype ) ;
bio_init ( & ca - > journal . bio ) ;
ca - > journal . bio . bi_max_vecs = 8 ;
ca - > journal . bio . bi_io_vec = ca - > journal . bio . bi_inline_vecs ;
2013-12-17 13:29:34 +04:00
free = roundup_pow_of_two ( ca - > sb . nbuckets ) > > 10 ;
2013-03-24 03:11:31 +04:00
2013-12-17 13:29:34 +04:00
if ( ! init_fifo ( & ca - > free [ RESERVE_BTREE ] , 8 , GFP_KERNEL ) | |
! init_fifo ( & ca - > free [ RESERVE_PRIO ] , prio_buckets ( ca ) , GFP_KERNEL ) | |
! init_fifo ( & ca - > free [ RESERVE_MOVINGGC ] , free , GFP_KERNEL ) | |
! init_fifo ( & ca - > free [ RESERVE_NONE ] , free , GFP_KERNEL ) | |
2013-03-24 03:11:31 +04:00
! init_fifo ( & ca - > free_inc , free < < 2 , GFP_KERNEL ) | |
! init_heap ( & ca - > heap , free < < 3 , GFP_KERNEL ) | |
2013-05-15 11:11:26 +04:00
! ( ca - > buckets = vzalloc ( sizeof ( struct bucket ) *
2013-03-24 03:11:31 +04:00
ca - > sb . nbuckets ) ) | |
! ( ca - > prio_buckets = kzalloc ( sizeof ( uint64_t ) * prio_buckets ( ca ) *
2 , GFP_KERNEL ) ) | |
2013-11-24 11:11:25 +04:00
! ( ca - > disk_buckets = alloc_bucket_pages ( GFP_KERNEL , ca ) ) )
2013-05-15 11:11:26 +04:00
return - ENOMEM ;
2013-03-24 03:11:31 +04:00
ca - > prio_last_buckets = ca - > prio_buckets + prio_buckets ( ca ) ;
for_each_bucket ( b , ca )
atomic_set ( & b - > pin , 0 ) ;
return 0 ;
}
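/*
 * Tie a freshly opened cache device to its in-memory struct cache: copy in
 * the superblock, pin the superblock page for later rewrites, honour the
 * discard setting, add the kobject under the block device and then try to
 * join (or create) a cache set under bch_register_lock.
 */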
2016-02-27 01:33:56 +03:00
static int register_cache ( struct cache_sb * sb , struct page * sb_page ,
2014-06-20 02:05:59 +04:00
struct block_device * bdev , struct cache * ca )
2013-03-24 03:11:31 +04:00
{
char name [ BDEVNAME_SIZE ] ;
2016-02-27 01:33:56 +03:00
const char * err = NULL ;
int ret = 0 ;
2013-03-24 03:11:31 +04:00
2013-05-15 11:11:26 +04:00
memcpy ( & ca - > sb , sb , sizeof ( struct cache_sb ) ) ;
2013-03-24 03:11:31 +04:00
ca - > bdev = bdev ;
ca - > bdev - > bd_holder = ca ;
2013-05-15 11:11:26 +04:00
bio_init ( & ca - > sb_bio ) ;
ca - > sb_bio . bi_max_vecs = 1 ;
ca - > sb_bio . bi_io_vec = ca - > sb_bio . bi_inline_vecs ;
ca - > sb_bio . bi_io_vec [ 0 ] . bv_page = sb_page ;
get_page ( sb_page ) ;
2013-03-24 03:11:31 +04:00
if ( blk_queue_discard ( bdev_get_queue ( ca - > bdev ) ) )
ca - > discard = CACHE_DISCARD ( & ca - > sb ) ;
2016-07-04 04:23:25 +03:00
ret = cache_alloc ( ca ) ;
2016-02-27 01:33:56 +03:00
if ( ret ! = 0 )
2013-05-15 11:11:26 +04:00
goto err ;
2016-02-27 01:33:56 +03:00
if ( kobject_add ( & ca - > kobj , & part_to_dev ( bdev - > bd_part ) - > kobj , " bcache " ) ) {
err = " error calling kobject_add " ;
ret = - ENOMEM ;
goto out ;
}
2013-03-24 03:11:31 +04:00
2014-03-18 05:58:55 +04:00
mutex_lock ( & bch_register_lock ) ;
2013-03-24 03:11:31 +04:00
err = register_cache_set ( ca ) ;
2014-03-18 05:58:55 +04:00
mutex_unlock ( & bch_register_lock ) ;
2016-02-27 01:33:56 +03:00
if ( err ) {
ret = - ENODEV ;
goto out ;
}
2013-03-24 03:11:31 +04:00
pr_info ( " registered cache device %s " , bdevname ( bdev , name ) ) ;
2016-02-27 01:33:56 +03:00
2014-06-12 06:44:49 +04:00
out :
kobject_put ( & ca - > kobj ) ;
2016-02-27 01:33:56 +03:00
2013-03-24 03:11:31 +04:00
err :
2016-02-27 01:33:56 +03:00
if ( err )
pr_notice ( " error opening %s: %s " , bdevname ( bdev , name ) , err ) ;
return ret ;
2013-03-24 03:11:31 +04:00
}
/* Global interfaces/init */
static ssize_t register_bcache ( struct kobject * , struct kobj_attribute * ,
const char * , size_t ) ;
kobj_attribute_write ( register , register_bcache ) ;
kobj_attribute_write ( register_quiet , register_bcache ) ;
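/*
 * register_bcache(), defined further down, is the store handler for both
 * attributes; with the kobject created in bcache_init() this is typically
 * driven from userspace as "echo /dev/sdX > /sys/fs/bcache/register".
 * register_quiet only differs in silently ignoring a device that is
 * already open.
 */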
2013-05-04 14:19:41 +04:00
static bool bch_is_open_backing ( struct block_device * bdev ) {
struct cache_set * c , * tc ;
struct cached_dev * dc , * t ;
list_for_each_entry_safe ( c , tc , & bch_cache_sets , list )
list_for_each_entry_safe ( dc , t , & c - > cached_devs , list )
if ( dc - > bdev = = bdev )
return true ;
list_for_each_entry_safe ( dc , t , & uncached_devices , list )
if ( dc - > bdev = = bdev )
return true ;
return false ;
}
static bool bch_is_open_cache ( struct block_device * bdev ) {
struct cache_set * c , * tc ;
struct cache * ca ;
unsigned i ;
list_for_each_entry_safe ( c , tc , & bch_cache_sets , list )
for_each_cache ( ca , c , i )
if ( ca - > bdev = = bdev )
return true ;
return false ;
}
static bool bch_is_open ( struct block_device * bdev ) {
return bch_is_open_cache ( bdev ) | | bch_is_open_backing ( bdev ) ;
}
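/*
 * The helpers above let the error path below distinguish a device that is
 * already registered with bcache from one that is simply busy.
 * register_bcache() itself opens the device exclusively, reads its
 * superblock and hands it to register_bdev() for backing devices or
 * register_cache() for cache devices.
 */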
2013-03-24 03:11:31 +04:00
static ssize_t register_bcache ( struct kobject * k , struct kobj_attribute * attr ,
const char * buffer , size_t size )
{
ssize_t ret = size ;
const char * err = " cannot allocate memory " ;
char * path = NULL ;
struct cache_sb * sb = NULL ;
struct block_device * bdev = NULL ;
struct page * sb_page = NULL ;
if ( ! try_module_get ( THIS_MODULE ) )
return - EBUSY ;
if ( ! ( path = kstrndup ( buffer , size , GFP_KERNEL ) ) | |
! ( sb = kmalloc ( sizeof ( struct cache_sb ) , GFP_KERNEL ) ) )
goto err ;
err = " failed to open device " ;
bdev = blkdev_get_by_path ( strim ( path ) ,
FMODE_READ | FMODE_WRITE | FMODE_EXCL ,
sb ) ;
2013-05-15 11:11:26 +04:00
if ( IS_ERR ( bdev ) ) {
2013-05-04 14:19:41 +04:00
if ( bdev = = ERR_PTR ( - EBUSY ) ) {
bdev = lookup_bdev ( strim ( path ) ) ;
2014-07-13 20:08:59 +04:00
mutex_lock ( & bch_register_lock ) ;
2013-05-04 14:19:41 +04:00
if ( ! IS_ERR ( bdev ) & & bch_is_open ( bdev ) )
err = " device already registered " ;
else
err = " device busy " ;
2014-07-13 20:08:59 +04:00
mutex_unlock ( & bch_register_lock ) ;
2015-11-30 05:40:23 +03:00
if ( attr = = & ksysfs_register_quiet )
goto out ;
2013-05-04 14:19:41 +04:00
}
2013-03-24 03:11:31 +04:00
goto err ;
2013-05-15 11:11:26 +04:00
}
err = " failed to set blocksize " ;
if ( set_blocksize ( bdev , 4096 ) )
goto err_close ;
2013-03-24 03:11:31 +04:00
err = read_super ( sb , bdev , & sb_page ) ;
if ( err )
goto err_close ;
2013-04-12 02:14:35 +04:00
if ( SB_IS_BDEV ( sb ) ) {
2013-03-24 03:11:31 +04:00
struct cached_dev * dc = kzalloc ( sizeof ( * dc ) , GFP_KERNEL ) ;
2013-05-15 11:11:26 +04:00
if ( ! dc )
goto err_close ;
2013-03-24 03:11:31 +04:00
2014-03-18 05:58:55 +04:00
mutex_lock ( & bch_register_lock ) ;
2013-05-15 11:11:26 +04:00
register_bdev ( sb , sb_page , bdev , dc ) ;
2014-03-18 05:58:55 +04:00
mutex_unlock ( & bch_register_lock ) ;
2013-03-24 03:11:31 +04:00
} else {
struct cache * ca = kzalloc ( sizeof ( * ca ) , GFP_KERNEL ) ;
2013-05-15 11:11:26 +04:00
if ( ! ca )
goto err_close ;
2013-03-24 03:11:31 +04:00
2016-02-27 01:33:56 +03:00
if ( register_cache ( sb , sb_page , bdev , ca ) ! = 0 )
goto err_close ;
2013-03-24 03:11:31 +04:00
}
2013-05-15 11:11:26 +04:00
out :
if ( sb_page )
2013-03-24 03:11:31 +04:00
put_page ( sb_page ) ;
kfree ( sb ) ;
kfree ( path ) ;
module_put ( THIS_MODULE ) ;
return ret ;
2013-05-15 11:11:26 +04:00
err_close :
blkdev_put ( bdev , FMODE_READ | FMODE_WRITE | FMODE_EXCL ) ;
err :
2015-11-30 05:40:23 +03:00
pr_info ( " error opening %s: %s " , path , err ) ;
2013-05-15 11:11:26 +04:00
ret = - EINVAL ;
goto out ;
2013-03-24 03:11:31 +04:00
}
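/*
 * Reboot notifier: on shutdown, halt or power-off, stop every cache set
 * and uncached backing device and wait up to two seconds (2 * HZ) on
 * unregister_wait for the teardown to complete before letting the reboot
 * proceed.
 */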
static int bcache_reboot ( struct notifier_block * n , unsigned long code , void * x )
{
if ( code = = SYS_DOWN | |
code = = SYS_HALT | |
code = = SYS_POWER_OFF ) {
DEFINE_WAIT ( wait ) ;
unsigned long start = jiffies ;
bool stopped = false ;
struct cache_set * c , * tc ;
struct cached_dev * dc , * tdc ;
mutex_lock ( & bch_register_lock ) ;
if ( list_empty ( & bch_cache_sets ) & &
list_empty ( & uncached_devices ) )
goto out ;
pr_info ( " Stopping all devices: " ) ;
list_for_each_entry_safe ( c , tc , & bch_cache_sets , list )
bch_cache_set_stop ( c ) ;
list_for_each_entry_safe ( dc , tdc , & uncached_devices , list )
bcache_device_stop ( & dc - > disk ) ;
/* What's a condition variable? */
while ( 1 ) {
long timeout = start + 2 * HZ - jiffies ;
stopped = list_empty ( & bch_cache_sets ) & &
list_empty ( & uncached_devices ) ;
if ( timeout < 0 | | stopped )
break ;
prepare_to_wait ( & unregister_wait , & wait ,
TASK_UNINTERRUPTIBLE ) ;
mutex_unlock ( & bch_register_lock ) ;
schedule_timeout ( timeout ) ;
mutex_lock ( & bch_register_lock ) ;
}
finish_wait ( & unregister_wait , & wait ) ;
if ( stopped )
pr_info ( " All devices stopped " ) ;
else
pr_notice ( " Timeout waiting for devices to be closed " ) ;
out :
mutex_unlock ( & bch_register_lock ) ;
}
return NOTIFY_DONE ;
}
static struct notifier_block reboot = {
. notifier_call = bcache_reboot ,
. priority = INT_MAX , /* before any real devices */
} ;
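/*
 * Module teardown: undo bcache_init() roughly in reverse, tolerating a
 * partially completed init (each resource is only released if it was
 * actually set up).
 */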
static void bcache_exit ( void )
{
bch_debug_exit ( ) ;
bch_request_exit ( ) ;
if ( bcache_kobj )
kobject_put ( bcache_kobj ) ;
if ( bcache_wq )
destroy_workqueue ( bcache_wq ) ;
2013-07-09 04:53:26 +04:00
if ( bcache_major )
unregister_blkdev ( bcache_major , " bcache " ) ;
2013-03-24 03:11:31 +04:00
unregister_reboot_notifier ( & reboot ) ;
}
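/*
 * Module init: after setting up the register lock and wait queue, register
 * the reboot notifier, grab a dynamic block major, then create the shared
 * workqueue, the /sys/fs/bcache kobject with its register files, and
 * initialise the request and debug code; any failure unwinds through
 * bcache_exit().
 */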
static int __init bcache_init ( void )
{
static const struct attribute * files [ ] = {
& ksysfs_register . attr ,
& ksysfs_register_quiet . attr ,
NULL
} ;
mutex_init ( & bch_register_lock ) ;
init_waitqueue_head ( & unregister_wait ) ;
register_reboot_notifier ( & reboot ) ;
2013-03-25 22:46:43 +04:00
closure_debug_init ( ) ;
2013-03-24 03:11:31 +04:00
bcache_major = register_blkdev ( 0 , " bcache " ) ;
2015-11-30 04:21:57 +03:00
if ( bcache_major < 0 ) {
unregister_reboot_notifier ( & reboot ) ;
2013-03-24 03:11:31 +04:00
return bcache_major ;
2015-11-30 04:21:57 +03:00
}
2013-03-24 03:11:31 +04:00
2016-06-07 23:27:19 +03:00
if ( ! ( bcache_wq = alloc_workqueue ( " bcache " , WQ_MEM_RECLAIM , 0 ) ) | |
2013-03-24 03:11:31 +04:00
! ( bcache_kobj = kobject_create_and_add ( " bcache " , fs_kobj ) ) | |
sysfs_create_files ( bcache_kobj , files ) | |
bch_request_init ( ) | |
bch_debug_init ( bcache_kobj ) )
goto err ;
return 0 ;
err :
bcache_exit ( ) ;
return - ENOMEM ;
}
module_exit ( bcache_exit ) ;
module_init ( bcache_init ) ;