2018-01-30 15:13:48 +03:00
/*
* Copyright ( C ) 2018 Red Hat , Inc . All rights reserved .
*
* This file is part of LVM2 .
*
* This copyrighted material is made available to anyone wishing to use ,
* modify , copy , or redistribute it subject to the terms and conditions
* of the GNU Lesser General Public License v .2 .1 .
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program ; if not , write to the Free Software Foundation ,
* Inc . , 51 Franklin Street , Fifth Floor , Boston , MA 02110 - 1301 USA
*/
2018-01-30 13:46:08 +03:00
# define _GNU_SOURCE
2018-05-14 12:30:20 +03:00
# include "lib/device/bcache.h"
2018-05-30 16:17:26 +03:00
# include "base/data-struct/radix-tree.h"
2018-05-16 15:43:02 +03:00
# include "lib/log/lvm-logging.h"
2018-05-14 12:30:20 +03:00
# include "lib/log/log.h"
2018-05-02 21:45:06 +03:00
2018-01-30 13:46:08 +03:00
# include <errno.h>
# include <fcntl.h>
# include <sys/stat.h>
# include <sys/types.h>
# include <stdbool.h>
# include <stdlib.h>
# include <stdio.h>
# include <stdint.h>
# include <libaio.h>
# include <unistd.h>
# include <linux/fs.h>
# include <sys/ioctl.h>
# include <sys/user.h>
# define SECTOR_SHIFT 9L
//----------------------------------------------------------------
2018-05-01 15:21:53 +03:00
/*
 * Emit a warning that names the failed call and appends the message text
 * for the current value of errno.
 */
static void log_sys_warn(const char *call)
{
	const char *reason = strerror(errno);

	log_warn(" %s failed: %s ", call, reason);
}
2018-01-30 13:46:08 +03:00
// Assumes the list is not empty.
static inline struct dm_list * _list_pop ( struct dm_list * head )
{
struct dm_list * l ;
l = head - > n ;
dm_list_del ( l ) ;
return l ;
}
//----------------------------------------------------------------
/*
 * Bookkeeping wrapper around a single libaio iocb.
 */
struct control_block {
	struct dm_list list;	/* links this entry on a cb_set list */
	void *context;		/* opaque caller value stored by _cb_alloc() */
	struct iocb cb;		/* the iocb handed to io_submit() */
};
/*
 * A fixed pool of control blocks.  Each element of vec is linked on either
 * the free list or the allocated list.
 *
 * NOTE(review): this declaration also defines a file-scope object called
 * control_block_set which nothing in the visible code references; confirm
 * it is unused before removing it.
 */
struct cb_set {
	struct dm_list free;		/* entries available to _cb_alloc() */
	struct dm_list allocated;	/* entries currently handed out */
	struct control_block *vec;	/* backing array for every entry */
} control_block_set;
static struct cb_set * _cb_set_create ( unsigned nr )
{
int i ;
2018-06-08 15:40:53 +03:00
struct cb_set * cbs = malloc ( sizeof ( * cbs ) ) ;
2018-01-30 13:46:08 +03:00
if ( ! cbs )
return NULL ;
2018-06-08 15:40:53 +03:00
cbs - > vec = malloc ( nr * sizeof ( * cbs - > vec ) ) ;
2018-01-30 13:46:08 +03:00
if ( ! cbs - > vec ) {
2018-06-08 15:40:53 +03:00
free ( cbs ) ;
2018-01-30 13:46:08 +03:00
return NULL ;
}
dm_list_init ( & cbs - > free ) ;
dm_list_init ( & cbs - > allocated ) ;
for ( i = 0 ; i < nr ; i + + )
dm_list_add ( & cbs - > free , & cbs - > vec [ i ] . list ) ;
return cbs ;
}
2018-01-30 15:13:48 +03:00
static void _cb_set_destroy ( struct cb_set * cbs )
2018-01-30 13:46:08 +03:00
{
2018-01-30 15:13:48 +03:00
// We know this is always called after a wait_all. So there should
// never be in flight IO.
2018-01-30 13:46:08 +03:00
if ( ! dm_list_empty ( & cbs - > allocated ) ) {
2018-01-30 15:13:48 +03:00
// bail out
2018-01-30 13:46:08 +03:00
log_error ( " async io still in flight " ) ;
2018-01-30 15:13:48 +03:00
return ;
2018-01-30 13:46:08 +03:00
}
2018-06-08 15:40:53 +03:00
free ( cbs - > vec ) ;
free ( cbs ) ;
2018-01-30 13:46:08 +03:00
}
static struct control_block * _cb_alloc ( struct cb_set * cbs , void * context )
{
struct control_block * cb ;
if ( dm_list_empty ( & cbs - > free ) )
return NULL ;
cb = dm_list_item ( _list_pop ( & cbs - > free ) , struct control_block ) ;
cb - > context = context ;
dm_list_add ( & cbs - > allocated , & cb - > list ) ;
return cb ;
}
static void _cb_free ( struct cb_set * cbs , struct control_block * cb )
{
dm_list_del ( & cb - > list ) ;
dm_list_add_h ( & cbs - > free , & cb - > list ) ;
}
/*
 * Recover the control_block that embeds the given iocb as its cb member.
 */
static struct control_block *_iocb_to_cb(struct iocb *icb)
{
	struct control_block *owner;

	owner = dm_list_struct_base(icb, struct control_block, cb);

	return owner;
}
//----------------------------------------------------------------
2018-02-05 19:04:23 +03:00
struct async_engine {
struct io_engine e ;
2018-01-30 13:46:08 +03:00
io_context_t aio_context ;
struct cb_set * cbs ;
2018-05-17 12:05:10 +03:00
unsigned page_mask ;
2018-01-30 13:46:08 +03:00
} ;
2018-02-05 19:04:23 +03:00
static struct async_engine * _to_async ( struct io_engine * e )
2018-01-30 13:46:08 +03:00
{
2018-02-05 19:04:23 +03:00
return container_of ( e , struct async_engine , e ) ;
2018-01-30 13:46:08 +03:00
}
2018-02-05 19:04:23 +03:00
static void _async_destroy ( struct io_engine * ioe )
2018-01-30 13:46:08 +03:00
{
2018-02-01 17:52:43 +03:00
int r ;
2018-02-05 19:04:23 +03:00
struct async_engine * e = _to_async ( ioe ) ;
2018-02-01 17:52:43 +03:00
2018-01-30 13:46:08 +03:00
_cb_set_destroy ( e - > cbs ) ;
2018-02-01 17:52:43 +03:00
// io_destroy is really slow
r = io_destroy ( e - > aio_context ) ;
if ( r )
log_sys_warn ( " io_destroy " ) ;
2018-06-08 15:40:53 +03:00
free ( e ) ;
2018-01-30 13:46:08 +03:00
}
2018-02-05 19:04:23 +03:00
/*
 * Submit one asynchronous read or write covering sectors [sb, se) of fd.
 *
 * data must be aligned according to the engine's page mask.  context is
 * stored in the control block and passed to the completion callback by
 * _async_wait().
 *
 * Returns false if the buffer is misaligned, no control block is free, or
 * io_submit() fails with anything other than -EAGAIN (which is retried).
 */
static bool _async_issue(struct io_engine *ioe, enum dir d, int fd,
			 sector_t sb, sector_t se, void *data, void *context)
{
	int rc;
	struct iocb *batch[1];
	struct iocb *io;
	struct control_block *cb;
	struct async_engine *e = _to_async(ioe);

	if (((uintptr_t) data) & e->page_mask) {
		log_warn(" misaligned data buffer ");
		return false;
	}

	cb = _cb_alloc(e->cbs, context);
	if (!cb) {
		log_warn(" couldn't allocate control block ");
		return false;
	}

	/* Describe the transfer in the iocb embedded in the control block. */
	io = &cb->cb;
	memset(io, 0, sizeof(*io));
	io->aio_fildes = (int) fd;
	io->u.c.buf = data;
	io->u.c.offset = sb << SECTOR_SHIFT;
	io->u.c.nbytes = (se - sb) << SECTOR_SHIFT;
	if (d == DIR_READ)
		io->aio_lio_opcode = IO_CMD_PREAD;
	else
		io->aio_lio_opcode = IO_CMD_PWRITE;

	batch[0] = io;

	/* Keep resubmitting for as long as the kernel answers -EAGAIN. */
	for (;;) {
		rc = io_submit(e->aio_context, 1, batch);
		if (rc != -EAGAIN)
			break;
	}

	if (rc < 0) {
		_cb_free(e->cbs, cb);
		return false;
	}

	return true;
}
2018-08-24 22:46:51 +03:00
/*
* MAX_IO is returned to the layer above via bcache_max_prefetches ( ) which
* tells the caller how many devices to submit io for concurrently . There will
* be an open file descriptor for each of these , so keep it low enough to avoid
* reaching the default max open file limit ( 1024 ) when there are over 1024
* devices being scanned .
*/
# define MAX_IO 256
2018-02-02 15:06:14 +03:00
# define MAX_EVENT 64
2018-01-30 13:46:08 +03:00
2018-02-05 19:04:23 +03:00
/*
 * Block until at least one submitted IO has completed, then report every
 * completion that was collected (at most MAX_EVENT per call) through fn.
 *
 * For each event fn receives the context given to _async_issue() and:
 *   0          if the whole request was transferred,
 *   ev->res    (a negative value) if the kernel reported an error,
 *   0          if the transfer was short but at least one sector long,
 *   -ENODATA   otherwise.
 *
 * Returns false if io_getevents() fails; -EINTR is retried.
 */
static bool _async_wait(struct io_engine *ioe, io_complete_fn fn)
{
	int i, r;
	struct io_event event[MAX_EVENT];
	struct control_block *cb;
	struct async_engine *e = _to_async(ioe);

	memset(&event, 0, sizeof(event));
	do {
		r = io_getevents(e->aio_context, 1, MAX_EVENT, event, NULL);
	} while (r == -EINTR);

	if (r < 0) {
		/*
		 * libaio returns the error as a negative errno value and
		 * leaves errno alone; publish it so that log_sys_warn()
		 * prints the real reason.
		 */
		errno = -r;
		log_sys_warn(" io_getevents ");
		return false;
	}

	for (i = 0; i < r; i++) {
		struct io_event *ev = event + i;

		cb = _iocb_to_cb((struct iocb *) ev->obj);

		if (ev->res == cb->cb.u.c.nbytes)
			fn((void *) cb->context, 0);

		else if ((int) ev->res < 0)
			fn(cb->context, (int) ev->res);

		// FIXME: dct added this. a short read is ok?!
		else if (ev->res >= (1 << SECTOR_SHIFT)) {
			/* minimum acceptable read is 1 sector */
			fn((void *) cb->context, 0);

		} else {
			fn(cb->context, -ENODATA);
		}

		_cb_free(e->cbs, cb);
	}

	return true;
}
2018-02-20 18:33:27 +03:00
static unsigned _async_max_io ( struct io_engine * e )
2018-02-08 19:10:31 +03:00
{
2018-02-20 18:33:27 +03:00
return MAX_IO ;
2018-02-08 19:10:31 +03:00
}
2018-02-20 18:33:27 +03:00
struct io_engine * create_async_io_engine ( void )
2018-02-05 19:04:23 +03:00
{
int r ;
2018-06-08 15:40:53 +03:00
struct async_engine * e = malloc ( sizeof ( * e ) ) ;
2018-02-05 19:04:23 +03:00
if ( ! e )
return NULL ;
e - > e . destroy = _async_destroy ;
e - > e . issue = _async_issue ;
e - > e . wait = _async_wait ;
2018-02-08 19:10:31 +03:00
e - > e . max_io = _async_max_io ;
2018-02-05 19:04:23 +03:00
e - > aio_context = 0 ;
2018-02-20 18:33:27 +03:00
r = io_setup ( MAX_IO , & e - > aio_context ) ;
2018-02-05 19:04:23 +03:00
if ( r < 0 ) {
log_warn ( " io_setup failed " ) ;
2018-06-08 15:40:53 +03:00
free ( e ) ;
2018-02-05 19:04:23 +03:00
return NULL ;
}
2018-02-20 18:33:27 +03:00
e - > cbs = _cb_set_create ( MAX_IO ) ;
2018-02-05 19:04:23 +03:00
if ( ! e - > cbs ) {
log_warn ( " couldn't create control block set " ) ;
2018-06-08 15:40:53 +03:00
free ( e ) ;
2018-02-05 19:04:23 +03:00
return NULL ;
}
2018-05-17 12:05:10 +03:00
e - > page_mask = sysconf ( _SC_PAGESIZE ) - 1 ;
2018-02-05 19:04:23 +03:00
return & e - > e ;
}
2018-01-30 13:46:08 +03:00
//----------------------------------------------------------------
2018-05-10 16:29:26 +03:00
/*
 * Record of one IO performed by the synchronous engine, kept until
 * _sync_wait() reports it.
 */
struct sync_io {
	struct dm_list list;	/* links the record on sync_engine.complete */
	void *context;		/* opaque caller value passed to the callback */
};
/*
 * io_engine implementation that performs IO with blocking read()/write().
 */
struct sync_engine {
	struct io_engine e;		/* embedded generic engine; see _to_sync() */
	struct dm_list complete;	/* sync_io records awaiting _sync_wait() */
};
/*
 * Convert a generic engine pointer into the sync_engine that embeds it.
 */
static struct sync_engine *_to_sync(struct io_engine *e)
{
	struct sync_engine *se = container_of(e, struct sync_engine, e);

	return se;
}
/*
 * Free a synchronous engine.  Only the engine structure itself is released.
 */
static void _sync_destroy(struct io_engine *ioe)
{
	free(_to_sync(ioe));
}
/*
 * Perform a blocking read or write of sectors [sb, se) on fd.
 *
 * The transfer is carried out before this function returns; on success a
 * sync_io record holding context is queued so that _sync_wait() can report
 * the completion.
 *
 * Returns false (with nothing queued and nothing leaked) if the record
 * cannot be allocated, the seek fails, read()/write() fails, or the
 * transfer stops short of the requested length.
 */
static bool _sync_issue(struct io_engine *ioe, enum dir d, int fd,
			sector_t sb, sector_t se, void *data, void *context)
{
	ssize_t rv;
	off_t off;
	uint64_t pos = 0;
	uint64_t len = (se - sb) * 512, where;
	struct sync_engine *e = _to_sync(ioe);
	struct sync_io *io = malloc(sizeof(*io));

	if (!io) {
		log_warn(" unable to allocate sync_io ");
		return false;
	}

	where = sb * 512;

	/* Keep the result as off_t: an int would truncate large offsets. */
	off = lseek(fd, where, SEEK_SET);
	if (off < 0) {
		log_warn(" unable to seek to position %llu ", (unsigned long long) where);
		free(io);
		return false;
	}

	while (pos < len) {
		/* Continue after whatever an earlier partial transfer moved. */
		if (d == DIR_READ)
			rv = read(fd, (char *) data + pos, len - pos);
		else
			rv = write(fd, (char *) data + pos, len - pos);

		if (rv < 0) {
			/* Transient conditions are reported through errno. */
			if ((errno == EINTR) || (errno == EAGAIN))
				continue;

			log_warn(" io failed %d ", (int) rv);
			free(io);
			return false;
		}

		/* No progress (e.g. end of file): stop instead of spinning. */
		if (!rv)
			break;

		pos += rv;
	}

	if (pos < len) {
		log_warn(" short io %u bytes remaining ", (unsigned) (len - pos));
		free(io);
		return false;
	}

	io->context = context;
	dm_list_add(&e->complete, &io->list);

	return true;
}
/*
 * Report every IO queued by _sync_issue() as successfully completed.
 *
 * Each queued record has fn called with its context and a zero error, and
 * is then unlinked and freed.  Always returns true.
 */
static bool _sync_wait(struct io_engine *ioe, io_complete_fn fn)
{
	struct sync_engine *e = _to_sync(ioe);
	struct sync_io *done, *next;

	dm_list_iterate_items_safe(done, next, &e->complete) {
		fn(done->context, 0);
		dm_list_del(&done->list);
		free(done);
	}

	return true;
}
/*
 * The synchronous engine advertises a concurrency limit of one IO.
 * The engine argument is not consulted.
 */
static unsigned _sync_max_io(struct io_engine *e)
{
	const unsigned limit = 1;

	return limit;
}
struct io_engine * create_sync_io_engine ( void )
{
2018-06-08 15:40:53 +03:00
struct sync_engine * e = malloc ( sizeof ( * e ) ) ;
2018-05-10 16:29:26 +03:00
if ( ! e )
return NULL ;
e - > e . destroy = _sync_destroy ;
e - > e . issue = _sync_issue ;
e - > e . wait = _sync_wait ;
e - > e . max_io = _sync_max_io ;
dm_list_init ( & e - > complete ) ;
return & e - > e ;
}
//----------------------------------------------------------------
2018-01-30 13:46:08 +03:00
# define MIN_BLOCKS 16
# define WRITEBACK_LOW_THRESHOLD_PERCENT 33
# define WRITEBACK_HIGH_THRESHOLD_PERCENT 66
//----------------------------------------------------------------
/*
 * Allocate len bytes aligned to alignment using posix_memalign().
 *
 * Returns NULL if the allocation fails.  Release the memory with free().
 */
static void *_alloc_aligned(size_t len, size_t alignment)
{
	void *mem = NULL;

	if (posix_memalign(&mem, alignment, len) != 0)
		return NULL;

	return mem;
}
//----------------------------------------------------------------
static bool _test_flags ( struct block * b , unsigned bits )
{
return ( b - > flags & bits ) ! = 0 ;
}
static void _set_flags ( struct block * b , unsigned bits )
{
b - > flags | = bits ;
}
static void _clear_flags ( struct block * b , unsigned bits )
{
b - > flags & = ~ bits ;
}
//----------------------------------------------------------------
/*
 * Bits stored in a block's flags field.
 */
enum block_flags {
	BF_IO_PENDING = 0x1,	/* set while an IO for the block is outstanding */
	BF_DIRTY = 0x2,		/* block data still has to be written back */
};
struct bcache {
sector_t block_sectors ;
uint64_t nr_data_blocks ;
uint64_t nr_cache_blocks ;
2018-02-02 15:06:14 +03:00
unsigned max_io ;
2018-01-30 13:46:08 +03:00
struct io_engine * engine ;
void * raw_data ;
struct block * raw_blocks ;
/*
* Lists that categorise the blocks .
*/
unsigned nr_locked ;
unsigned nr_dirty ;
unsigned nr_io_pending ;
struct dm_list free ;
struct dm_list errored ;
struct dm_list dirty ;
struct dm_list clean ;
struct dm_list io_pending ;
2018-05-30 16:17:26 +03:00
struct radix_tree * rtree ;
2018-01-30 13:46:08 +03:00
/*
* Statistics
*/
unsigned read_hits ;
unsigned read_misses ;
unsigned write_zeroes ;
unsigned write_hits ;
unsigned write_misses ;
unsigned prefetches ;
} ;
//----------------------------------------------------------------
2018-05-30 16:17:26 +03:00
/*
 * Radix tree key for a cached block: the file descriptor followed by the
 * block index.  The struct is packed so the key contains no padding bytes.
 */
struct key_parts {
	uint32_t fd;
	uint64_t b;
} __attribute__ ((packed));

/*
 * Gives byte-wise access to a key so it can be handed to the radix tree.
 * The byte view is sized from key_parts itself so that the two can never
 * drift apart if the key layout changes.
 */
union key {
	struct key_parts parts;
	uint8_t bytes[sizeof(struct key_parts)];
};
2018-01-30 13:46:08 +03:00
2018-05-30 16:17:26 +03:00
static struct block * _block_lookup ( struct bcache * cache , int fd , uint64_t i )
2018-01-30 13:46:08 +03:00
{
2018-05-30 16:17:26 +03:00
union key k ;
union radix_value v ;
2018-01-30 13:46:08 +03:00
2018-05-30 16:17:26 +03:00
k . parts . fd = fd ;
k . parts . b = i ;
2018-01-30 13:46:08 +03:00
2018-05-30 16:17:26 +03:00
if ( radix_tree_lookup ( cache - > rtree , k . bytes , k . bytes + sizeof ( k . bytes ) , & v ) )
return v . ptr ;
2018-01-30 13:46:08 +03:00
2018-05-30 16:17:26 +03:00
return NULL ;
2018-01-30 13:46:08 +03:00
}
2018-05-30 16:17:26 +03:00
static bool _block_insert ( struct block * b )
2018-01-30 13:46:08 +03:00
{
2018-05-30 16:17:26 +03:00
union key k ;
union radix_value v ;
2018-01-30 13:46:08 +03:00
2018-05-30 16:17:26 +03:00
k . parts . fd = b - > fd ;
k . parts . b = b - > index ;
v . ptr = b ;
2018-01-30 13:46:08 +03:00
2018-05-30 16:17:26 +03:00
return radix_tree_insert ( b - > cache - > rtree , k . bytes , k . bytes + sizeof ( k . bytes ) , v ) ;
2018-01-30 13:46:08 +03:00
}
2018-05-30 16:17:26 +03:00
static void _block_remove ( struct block * b )
2018-01-30 13:46:08 +03:00
{
2018-05-30 16:17:26 +03:00
union key k ;
2018-01-30 13:46:08 +03:00
2018-05-30 16:17:26 +03:00
k . parts . fd = b - > fd ;
k . parts . b = b - > index ;
2018-01-30 13:46:08 +03:00
2018-05-30 16:17:26 +03:00
radix_tree_remove ( b - > cache - > rtree , k . bytes , k . bytes + sizeof ( k . bytes ) ) ;
2018-01-30 13:46:08 +03:00
}
//----------------------------------------------------------------
2018-05-17 12:05:10 +03:00
static bool _init_free_list ( struct bcache * cache , unsigned count , unsigned pgsize )
2018-01-30 13:46:08 +03:00
{
unsigned i ;
size_t block_size = cache - > block_sectors < < SECTOR_SHIFT ;
unsigned char * data =
2018-05-16 23:19:03 +03:00
( unsigned char * ) _alloc_aligned ( count * block_size , pgsize ) ;
2018-01-30 13:46:08 +03:00
/* Allocate the data for each block. We page align the data. */
if ( ! data )
2018-01-30 15:13:48 +03:00
return false ;
2018-01-30 13:46:08 +03:00
2018-06-08 15:40:53 +03:00
cache - > raw_blocks = malloc ( count * sizeof ( * cache - > raw_blocks ) ) ;
2018-06-26 18:04:18 +03:00
if ( ! cache - > raw_blocks ) {
free ( data ) ;
return false ;
}
2018-01-30 13:46:08 +03:00
2018-06-26 18:04:18 +03:00
cache - > raw_data = data ;
2018-01-30 13:46:08 +03:00
for ( i = 0 ; i < count ; i + + ) {
struct block * b = cache - > raw_blocks + i ;
b - > cache = cache ;
b - > data = data + ( block_size * i ) ;
dm_list_add ( & cache - > free , & b - > list ) ;
}
2018-01-30 15:13:48 +03:00
return true ;
2018-01-30 13:46:08 +03:00
}
static void _exit_free_list ( struct bcache * cache )
{
2018-06-08 15:40:53 +03:00
free ( cache - > raw_data ) ;
free ( cache - > raw_blocks ) ;
2018-01-30 13:46:08 +03:00
}
static struct block * _alloc_block ( struct bcache * cache )
{
2018-02-02 17:34:45 +03:00
if ( dm_list_empty ( & cache - > free ) )
return NULL ;
return dm_list_struct_base ( _list_pop ( & cache - > free ) , struct block , list ) ;
2018-01-30 13:46:08 +03:00
}
2018-05-30 16:17:26 +03:00
static void _free_block ( struct block * b )
{
dm_list_add ( & b - > cache - > free , & b - > list ) ;
}
2018-01-30 13:46:08 +03:00
/*----------------------------------------------------------------
* Clean / dirty list management .
* Always use these methods to ensure nr_dirty is correct.
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
static void _unlink_block ( struct block * b )
{
if ( _test_flags ( b , BF_DIRTY ) )
b - > cache - > nr_dirty - - ;
dm_list_del ( & b - > list ) ;
}
static void _link_block ( struct block * b )
{
struct bcache * cache = b - > cache ;
if ( _test_flags ( b , BF_DIRTY ) ) {
dm_list_add ( & cache - > dirty , & b - > list ) ;
cache - > nr_dirty + + ;
} else
dm_list_add ( & cache - > clean , & b - > list ) ;
}
/*
 * Unlink the block and link it again, which re-adds it to the list that
 * matches its current dirty state.
 */
static void _relink(struct block *b)
{
	_unlink_block(b);

	_link_block(b);
}
/*----------------------------------------------------------------
* Low level IO handling
*
* We cannot have two concurrent writes on the same block .
* eg , background writeback , put with dirty , flush ?
*
* To avoid this we introduce some restrictions :
*
* i ) A held block can never be written back .
* ii ) You cannot get a block until writeback has completed .
*
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
2018-02-01 17:52:43 +03:00
static void _complete_io ( void * context , int err )
{
struct block * b = context ;
struct bcache * cache = b - > cache ;
b - > error = err ;
_clear_flags ( b , BF_IO_PENDING ) ;
cache - > nr_io_pending - - ;
/*
* b is on the io_pending list , so we don ' t want to use unlink_block .
* Which would incorrectly adjust nr_dirty .
*/
dm_list_del ( & b - > list ) ;
2018-03-01 19:17:32 +03:00
if ( b - > error ) {
2018-04-26 15:13:27 +03:00
dm_list_add ( & cache - > errored , & b - > list ) ;
2018-02-20 18:33:27 +03:00
} else {
2018-02-01 17:52:43 +03:00
_clear_flags ( b , BF_DIRTY ) ;
_link_block ( b ) ;
}
}
2018-01-30 13:46:08 +03:00
/*
* |b->list| should be valid (either pointing to itself, or on one of the other
* lists).
*/
2018-02-20 18:33:27 +03:00
static void _issue_low_level ( struct block * b , enum dir d )
2018-01-30 13:46:08 +03:00
{
struct bcache * cache = b - > cache ;
sector_t sb = b - > index * cache - > block_sectors ;
sector_t se = sb + cache - > block_sectors ;
if ( _test_flags ( b , BF_IO_PENDING ) )
2018-02-20 18:33:27 +03:00
return ;
2018-01-30 13:46:08 +03:00
2018-02-20 18:33:27 +03:00
b - > io_dir = d ;
2018-01-30 13:46:08 +03:00
_set_flags ( b , BF_IO_PENDING ) ;
2018-05-16 12:09:17 +03:00
cache - > nr_io_pending + + ;
2018-04-27 12:56:13 +03:00
dm_list_move ( & cache - > io_pending , & b - > list ) ;
2018-02-05 19:04:23 +03:00
if ( ! cache - > engine - > issue ( cache - > engine , d , b - > fd , sb , se , b - > data , b ) ) {
2018-04-06 21:11:39 +03:00
/* FIXME: if io_submit() set an errno, return that instead of EIO? */
2018-02-01 17:52:43 +03:00
_complete_io ( b , - EIO ) ;
2018-02-20 18:33:27 +03:00
return ;
2018-02-01 17:52:43 +03:00
}
2018-01-30 13:46:08 +03:00
}
2018-02-20 18:33:27 +03:00
static inline void _issue_read ( struct block * b )
2018-01-30 13:46:08 +03:00
{
2018-02-20 18:33:27 +03:00
_issue_low_level ( b , DIR_READ ) ;
2018-01-30 13:46:08 +03:00
}
2018-02-20 18:33:27 +03:00
static inline void _issue_write ( struct block * b )
2018-01-30 13:46:08 +03:00
{
2018-02-20 18:33:27 +03:00
_issue_low_level ( b , DIR_WRITE ) ;
2018-01-30 13:46:08 +03:00
}
2018-01-30 15:13:48 +03:00
static bool _wait_io ( struct bcache * cache )
2018-01-30 13:46:08 +03:00
{
2018-02-05 19:04:23 +03:00
return cache - > engine - > wait ( cache - > engine , _complete_io ) ;
2018-01-30 13:46:08 +03:00
}
/*----------------------------------------------------------------
* High level IO handling
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
static void _wait_all ( struct bcache * cache )
{
while ( ! dm_list_empty ( & cache - > io_pending ) )
_wait_io ( cache ) ;
}
static void _wait_specific ( struct block * b )
{
while ( _test_flags ( b , BF_IO_PENDING ) )
_wait_io ( b - > cache ) ;
}
static unsigned _writeback ( struct bcache * cache , unsigned count )
{
unsigned actual = 0 ;
struct block * b , * tmp ;
dm_list_iterate_items_gen_safe ( b , tmp , & cache - > dirty , list ) {
if ( actual = = count )
break ;
// We can't writeback anything that's still in use.
if ( ! b - > ref_count ) {
_issue_write ( b ) ;
actual + + ;
}
}
return actual ;
}
/*----------------------------------------------------------------
* High level allocation
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
static struct block * _find_unused_clean_block ( struct bcache * cache )
{
struct block * b ;
dm_list_iterate_items ( b , & cache - > clean ) {
if ( ! b - > ref_count ) {
_unlink_block ( b ) ;
2018-05-30 16:17:26 +03:00
_block_remove ( b ) ;
2018-01-30 13:46:08 +03:00
return b ;
}
}
return NULL ;
}
2018-05-01 15:21:53 +03:00
/*
 * Obtain a block for (fd, i) and register it in the radix tree.
 *
 * A block is taken from the free list if possible.  Otherwise, while the
 * clean list is not empty, an unreferenced clean block is reclaimed; when
 * none is available and can_wait is set, writeback is started (if no IO is
 * pending) and the engine is waited on before trying again.  With can_wait
 * clear the function gives up immediately.
 *
 * Returns NULL if no block could be obtained or the radix tree insert
 * failed.
 */
static struct block *_new_block(struct bcache *cache, int fd, block_address i, bool can_wait)
{
	struct block *b = _alloc_block(cache);

	while (!b) {
		if (dm_list_empty(&cache->clean))
			break;

		b = _find_unused_clean_block(cache);
		if (b)
			break;

		if (!can_wait) {
			log_error(" bcache no new blocks for fd %d index %u ",
				  fd, (uint32_t) i);
			return NULL;
		}

		if (dm_list_empty(&cache->io_pending))
			_writeback(cache, 16); // FIXME: magic number
		_wait_io(cache);
	}

	if (!b)
		return NULL;

	/* Reset the block for its new identity. */
	dm_list_init(&b->list);
	b->flags = 0;
	b->fd = fd;
	b->index = i;
	b->ref_count = 0;
	b->error = 0;

	if (!_block_insert(b)) {
		log_error(" bcache unable to insert block in radix tree (OOM?) ");
		_free_block(b);
		return NULL;
	}

	return b;
}
/*----------------------------------------------------------------
* Block reference counting
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
static void _zero_block ( struct block * b )
{
b - > cache - > write_zeroes + + ;
memset ( b - > data , 0 , b - > cache - > block_sectors < < SECTOR_SHIFT ) ;
_set_flags ( b , BF_DIRTY ) ;
}
static void _hit ( struct block * b , unsigned flags )
{
struct bcache * cache = b - > cache ;
if ( flags & ( GF_ZERO | GF_DIRTY ) )
cache - > write_hits + + ;
else
cache - > read_hits + + ;
_relink ( b ) ;
}
static void _miss ( struct bcache * cache , unsigned flags )
{
if ( flags & ( GF_ZERO | GF_DIRTY ) )
cache - > write_misses + + ;
else
cache - > read_misses + + ;
}
/*
 * Return the block for (fd, i), bringing it into the cache if needed.
 *
 * An existing block is waited for if it has IO pending.  A missing block is
 * allocated and either zeroed (GF_ZERO) or read from the device.  Requests
 * with GF_DIRTY or GF_ZERO leave the block flagged dirty.  On return the
 * block is linked on the list matching its dirty state.
 *
 * Returns NULL if a write-type request targets a block that is already
 * referenced, or if no block could be obtained.  The caller is expected to
 * inspect b->error on the returned block.
 */
static struct block *_lookup_or_read_block(struct bcache *cache,
					   int fd, block_address i,
					   unsigned flags)
{
	const unsigned write_bits = flags & (GF_DIRTY | GF_ZERO);
	struct block *b = _block_lookup(cache, fd, i);

	if (!b) {
		_miss(cache, flags);

		b = _new_block(cache, fd, i, true);
		if (!b)
			return NULL;

		if (flags & GF_ZERO)
			_zero_block(b);
		else {
			_issue_read(b);
			_wait_specific(b);

			/*
			 * Completion put the block on a list; take it off
			 * again so that it can be relinked below.
			 */
			_unlink_block(b);
		}
	} else {
		// FIXME: this is insufficient.  We need to also catch a read
		// lock of a write locked block.  Ref count needs to distinguish.
		if (b->ref_count && write_bits) {
			log_warn(" concurrent write lock attempted ");
			return NULL;
		}

		if (_test_flags(b, BF_IO_PENDING)) {
			_miss(cache, flags);
			_wait_specific(b);
		} else
			_hit(b, flags);

		_unlink_block(b);

		if (flags & GF_ZERO)
			_zero_block(b);
	}

	if (write_bits)
		_set_flags(b, BF_DIRTY);

	_link_block(b);

	return b;
}
static void _preemptive_writeback ( struct bcache * cache )
{
// FIXME: this ignores those blocks that are in the error state. Track
// nr_clean instead?
unsigned nr_available = cache - > nr_cache_blocks - ( cache - > nr_dirty - cache - > nr_io_pending ) ;
if ( nr_available < ( WRITEBACK_LOW_THRESHOLD_PERCENT * cache - > nr_cache_blocks / 100 ) )
_writeback ( cache , ( WRITEBACK_HIGH_THRESHOLD_PERCENT * cache - > nr_cache_blocks / 100 ) - nr_available ) ;
}
/*----------------------------------------------------------------
* Public interface
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
2018-02-05 19:04:23 +03:00
/*
 * Create a block cache.
 *
 * block_sectors   size of each block in sectors; must be non-zero and a
 *                 multiple of the page size.
 * nr_cache_blocks number of blocks to hold; must be non-zero.
 * engine          io engine used for all IO.
 *
 * Returns NULL on invalid arguments or allocation failure.
 *
 * NOTE(review): the engine is destroyed when the radix tree or the free
 * list cannot be set up, but not on the earlier failure paths; callers
 * need to be aware of that difference.
 */
struct bcache *bcache_create(sector_t block_sectors, unsigned nr_cache_blocks,
			     struct io_engine *engine)
{
	struct bcache *cache;
	unsigned max_io = engine->max_io(engine);
	long pgsize = sysconf(_SC_PAGESIZE);

	if (!nr_cache_blocks) {
		log_warn(" bcache must have at least one cache block ");
		return NULL;
	}

	if (!block_sectors) {
		log_warn(" bcache must have a non zero block size ");
		return NULL;
	}

	if (block_sectors & ((pgsize >> SECTOR_SHIFT) - 1)) {
		log_warn(" bcache block size must be a multiple of page size ");
		return NULL;
	}

	cache = malloc(sizeof(*cache));
	if (!cache)
		return NULL;

	cache->block_sectors = block_sectors;
	cache->nr_cache_blocks = nr_cache_blocks;
	if (nr_cache_blocks < max_io)
		cache->max_io = nr_cache_blocks;
	else
		cache->max_io = max_io;
	cache->engine = engine;

	cache->nr_locked = 0;
	cache->nr_dirty = 0;
	cache->nr_io_pending = 0;

	cache->read_hits = 0;
	cache->read_misses = 0;
	cache->write_zeroes = 0;
	cache->write_hits = 0;
	cache->write_misses = 0;
	cache->prefetches = 0;

	dm_list_init(&cache->free);
	dm_list_init(&cache->errored);
	dm_list_init(&cache->dirty);
	dm_list_init(&cache->clean);
	dm_list_init(&cache->io_pending);

	cache->rtree = radix_tree_create(NULL, NULL);
	if (!cache->rtree)
		goto bad;

	if (!_init_free_list(cache, nr_cache_blocks, pgsize))
		goto bad;

	return cache;

bad:
	cache->engine->destroy(cache->engine);
	if (cache->rtree)
		radix_tree_destroy(cache->rtree);
	free(cache);
	return NULL;
}
void bcache_destroy ( struct bcache * cache )
{
if ( cache - > nr_locked )
2018-01-30 15:13:48 +03:00
log_warn ( " some blocks are still locked " ) ;
2018-01-30 13:46:08 +03:00
bcache_flush ( cache ) ;
_wait_all ( cache ) ;
_exit_free_list ( cache ) ;
2018-05-30 16:17:26 +03:00
radix_tree_destroy ( cache - > rtree ) ;
2018-02-05 19:04:23 +03:00
cache - > engine - > destroy ( cache - > engine ) ;
2018-06-08 15:40:53 +03:00
free ( cache ) ;
2018-01-30 13:46:08 +03:00
}
2018-05-03 11:33:55 +03:00
sector_t bcache_block_sectors ( struct bcache * cache )
{
return cache - > block_sectors ;
}
2018-02-01 17:52:43 +03:00
unsigned bcache_nr_cache_blocks ( struct bcache * cache )
{
return cache - > nr_cache_blocks ;
}
2018-02-02 15:06:14 +03:00
unsigned bcache_max_prefetches ( struct bcache * cache )
{
return cache - > max_io ;
}
2018-05-01 15:21:53 +03:00
/*
 * Start reading block i of fd into the cache without waiting for it.
 *
 * Nothing is done if the block is already cached, if max_io IOs are
 * already pending, or if no block can be obtained without waiting.
 */
void bcache_prefetch(struct bcache *cache, int fd, block_address i)
{
	struct block *b;

	if (_block_lookup(cache, fd, i))
		return;

	if (cache->nr_io_pending >= cache->max_io)
		return;

	b = _new_block(cache, fd, i, false);
	if (!b)
		return;

	cache->prefetches++;
	_issue_read(b);
}
2018-05-30 16:17:26 +03:00
//----------------------------------------------------------------
2018-04-27 12:56:13 +03:00
/*
 * Forget a block entirely: take it off its list, remove it from the radix
 * tree and return it to the free list.  The cache argument is not used;
 * the block's own cache pointer is.
 */
static void _recycle_block(struct bcache *cache, struct block *b)
{
	/* Off its current list first, then out of the index. */
	_unlink_block(b);
	_block_remove(b);

	/* Finally make it available for reuse. */
	_free_block(b);
}
2018-05-01 15:21:53 +03:00
/*
 * Get a reference to block i of fd.
 *
 * flags may contain GF_DIRTY and/or GF_ZERO to request a block that will be
 * modified or that should be zeroed instead of read.
 *
 * On success *result is the block, its ref_count has been incremented and
 * true is returned.  On failure *result is always set to NULL and false is
 * returned.  A block whose read failed is recycled before returning.
 */
bool bcache_get(struct bcache *cache, int fd, block_address i,
		unsigned flags, struct block **result)
{
	struct block *b;

	b = _lookup_or_read_block(cache, fd, i, flags);
	if (b) {
		if (b->error) {
			if (b->io_dir == DIR_READ) {
				// Now we know the read failed we can just forget
				// about this block, since there's no dirty data to
				// be written back.
				_recycle_block(cache, b);
			}

			/* Never leave the caller's pointer uninitialised. */
			*result = NULL;
			return false;
		}

		if (!b->ref_count)
			cache->nr_locked++;
		b->ref_count++;

		*result = b;
		return true;
	}

	*result = NULL;

	log_error(" bcache failed to get block %u fd %d ", (uint32_t) i, fd);
	return false;
}
2018-05-30 16:17:26 +03:00
//----------------------------------------------------------------
2018-02-20 00:40:44 +03:00
static void _put_ref ( struct block * b )
2018-01-30 13:46:08 +03:00
{
if ( ! b - > ref_count ) {
2018-01-30 15:13:48 +03:00
log_warn ( " ref count on bcache block already zero " ) ;
2018-01-30 13:46:08 +03:00
return ;
}
b - > ref_count - - ;
if ( ! b - > ref_count )
b - > cache - > nr_locked - - ;
2018-02-20 00:40:44 +03:00
}
void bcache_put ( struct block * b )
{
_put_ref ( b ) ;
2018-01-30 13:46:08 +03:00
if ( _test_flags ( b , BF_DIRTY ) )
_preemptive_writeback ( b - > cache ) ;
}
2018-05-30 16:17:26 +03:00
//----------------------------------------------------------------
2018-02-20 18:33:27 +03:00
bool bcache_flush ( struct bcache * cache )
2018-01-30 13:46:08 +03:00
{
2018-02-20 18:33:27 +03:00
// Only dirty data is on the errored list, since bad read blocks get
// recycled straight away. So we put these back on the dirty list, and
// try and rewrite everything.
dm_list_splice ( & cache - > dirty , & cache - > errored ) ;
2018-01-30 13:46:08 +03:00
while ( ! dm_list_empty ( & cache - > dirty ) ) {
struct block * b = dm_list_item ( _list_pop ( & cache - > dirty ) , struct block ) ;
2018-02-05 19:04:23 +03:00
if ( b - > ref_count | | _test_flags ( b , BF_IO_PENDING ) ) {
2018-01-30 13:46:08 +03:00
// The superblock may well be still locked.
continue ;
2018-02-05 19:04:23 +03:00
}
2018-04-27 12:56:13 +03:00
2018-01-30 13:46:08 +03:00
_issue_write ( b ) ;
}
_wait_all ( cache ) ;
2018-02-20 18:33:27 +03:00
return dm_list_empty ( & cache - > errored ) ;
2018-01-30 13:46:08 +03:00
}
2018-05-30 16:17:26 +03:00
//----------------------------------------------------------------
2018-02-02 10:59:49 +03:00
/*
* You can safely call this with a NULL block .
*/
2018-04-27 12:56:13 +03:00
static bool _invalidate_block ( struct bcache * cache , struct block * b )
2018-02-02 10:59:49 +03:00
{
if ( ! b )
2018-04-27 12:56:13 +03:00
return true ;
2018-02-02 10:59:49 +03:00
if ( _test_flags ( b , BF_IO_PENDING ) )
_wait_specific ( b ) ;
2018-04-27 12:56:13 +03:00
if ( b - > ref_count ) {
2018-02-02 10:59:49 +03:00
log_warn ( " bcache_invalidate: block (%d, %llu) still held " ,
2018-05-01 15:21:53 +03:00
b - > fd , ( unsigned long long ) b - > index ) ;
2018-04-27 12:56:13 +03:00
return false ;
}
2018-02-02 10:59:49 +03:00
2018-04-27 12:56:13 +03:00
if ( _test_flags ( b , BF_DIRTY ) ) {
_issue_write ( b ) ;
_wait_specific ( b ) ;
if ( b - > error )
return false ;
2018-02-02 10:59:49 +03:00
}
2018-04-27 12:56:13 +03:00
_recycle_block ( cache , b ) ;
return true ;
2018-02-02 10:59:49 +03:00
}
2018-05-01 15:21:53 +03:00
/*
 * Invalidate block 'i' of 'fd'.  The result of _block_lookup() is passed
 * directly to _invalidate_block(), whose return value is returned.
 */
bool bcache_invalidate(struct bcache *cache, int fd, block_address i)
{
	struct block *b = _block_lookup(cache, fd, i);

	return _invalidate_block(cache, b);
}
//----------------------------------------------------------------
/*
 * Radix-tree iterator wrapper that carries a result flag.  A visitor
 * receives a pointer to the embedded 'it' member and recovers the
 * enclosing structure with container_of().
 */
struct invalidate_iterator {
	// Cleared by the visitor when a block cannot be invalidated.
	bool success;

	// Embedded iterator handed to radix_tree_iterate().
	struct radix_tree_iterator it;
};
/*
 * Radix-tree visitor: issues a write for the visited block if it is
 * flagged dirty.  'it', 'kb' and 'ke' are unused.  Always returns true.
 */
static bool _writeback_v(struct radix_tree_iterator *it,
			 uint8_t *kb, uint8_t *ke, union radix_value v)
{
	struct block *blk = v.ptr;

	if (!_test_flags(blk, BF_DIRTY))
		return true;

	_issue_write(blk);
	return true;
}
static bool _invalidate_v ( struct radix_tree_iterator * it ,
uint8_t * kb , uint8_t * ke , union radix_value v )
{
struct block * b = v . ptr ;
struct invalidate_iterator * iit = container_of ( it , struct invalidate_iterator , it ) ;
if ( b - > error | | _test_flags ( b , BF_DIRTY ) ) {
log_warn ( " bcache_invalidate: block (%d, %llu) still dirty " ,
b - > fd , ( unsigned long long ) b - > index ) ;
iit - > success = false ;
return true ;
}
if ( b - > ref_count ) {
log_warn ( " bcache_invalidate: block (%d, %llu) still held " ,
b - > fd , ( unsigned long long ) b - > index ) ;
iit - > success = false ;
return true ;
}
_unlink_block ( b ) ;
_free_block ( b ) ;
// We can't remove the block from the radix tree yet because
// we're in the middle of an iteration.
return true ;
2018-02-02 10:59:49 +03:00
}
2018-04-27 12:56:13 +03:00
bool bcache_invalidate_fd ( struct bcache * cache , int fd )
2018-02-02 10:59:49 +03:00
{
2018-05-30 16:17:26 +03:00
union key k ;
struct invalidate_iterator it ;
2018-02-02 10:59:49 +03:00
2018-05-30 16:17:26 +03:00
k . parts . fd = fd ;
2018-02-02 10:59:49 +03:00
2018-05-30 16:17:26 +03:00
it . it . visit = _writeback_v ;
radix_tree_iterate ( cache - > rtree , k . bytes , k . bytes + sizeof ( k . parts . fd ) , & it . it ) ;
2018-02-02 10:59:49 +03:00
2018-05-30 16:17:26 +03:00
_wait_all ( cache ) ;
2018-02-02 10:59:49 +03:00
2018-05-30 16:17:26 +03:00
it . success = true ;
it . it . visit = _invalidate_v ;
radix_tree_iterate ( cache - > rtree , k . bytes , k . bytes + sizeof ( k . parts . fd ) , & it . it ) ;
radix_tree_remove_prefix ( cache - > rtree , k . bytes , k . bytes + sizeof ( k . parts . fd ) ) ;
return it . success ;
2018-02-02 10:59:49 +03:00
}
2018-01-30 13:46:08 +03:00
//----------------------------------------------------------------