2018-01-30 15:13:48 +03:00
/*
 * Copyright (C) 2018 Red Hat, Inc. All rights reserved.
 *
 * This file is part of LVM2.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU Lesser General Public License v.2.1.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
2018-01-30 13:46:08 +03:00
# define _GNU_SOURCE
# include <errno.h>
# include <fcntl.h>
# include <sys/stat.h>
# include <sys/types.h>
# include <stdbool.h>
# include <stdlib.h>
# include <stdio.h>
# include <stdint.h>
# include <libaio.h>
# include <unistd.h>
# include <linux/fs.h>
# include <sys/ioctl.h>
# include <sys/user.h>
# include "bcache.h"
# include "dm-logging.h"
# include "log.h"
/* Shift that converts a sector count into a byte count (1 << 9 == 512 bytes per sector). */
# define SECTOR_SHIFT 9L
//----------------------------------------------------------------
2018-01-30 15:13:48 +03:00
/*
 * Emit a warning that names a failed call and appends the textual
 * description of the current errno value.
 */
static void log_sys_warn(const char *syscall)
{
	const char *reason = strerror(errno);

	log_warn(" %s failed: %s ", syscall, reason);
}
2018-01-30 13:46:08 +03:00
// Assumes the list is not empty.
static inline struct dm_list * _list_pop ( struct dm_list * head )
{
struct dm_list * l ;
l = head - > n ;
dm_list_del ( l ) ;
return l ;
}
//----------------------------------------------------------------
/* Bookkeeping wrapper around a single libaio request. */
struct control_block {
	struct dm_list list;	/* links this entry into a cb_set's free or allocated list */
	void *context;		/* caller supplied cookie, passed back on completion */
	struct iocb cb;		/* the request that is handed to io_submit() */
};
/*
 * A fixed-size pool of control blocks.  Every element of |vec| is on
 * exactly one of the two lists.
 */
struct cb_set {
	struct dm_list free;		/* control blocks available to _cb_alloc() */
	struct dm_list allocated;	/* control blocks currently handed out */
	struct control_block *vec;	/* backing array for all control blocks */
} control_block_set;
static struct cb_set * _cb_set_create ( unsigned nr )
{
int i ;
2018-01-30 15:13:48 +03:00
struct cb_set * cbs = dm_malloc ( sizeof ( * cbs ) ) ;
2018-01-30 13:46:08 +03:00
if ( ! cbs )
return NULL ;
2018-01-30 15:13:48 +03:00
cbs - > vec = dm_malloc ( nr * sizeof ( * cbs - > vec ) ) ;
2018-01-30 13:46:08 +03:00
if ( ! cbs - > vec ) {
2018-01-30 15:13:48 +03:00
dm_free ( cbs ) ;
2018-01-30 13:46:08 +03:00
return NULL ;
}
dm_list_init ( & cbs - > free ) ;
dm_list_init ( & cbs - > allocated ) ;
for ( i = 0 ; i < nr ; i + + )
dm_list_add ( & cbs - > free , & cbs - > vec [ i ] . list ) ;
return cbs ;
}
2018-01-30 15:13:48 +03:00
static void _cb_set_destroy ( struct cb_set * cbs )
2018-01-30 13:46:08 +03:00
{
2018-01-30 15:13:48 +03:00
// We know this is always called after a wait_all. So there should
// never be in flight IO.
2018-01-30 13:46:08 +03:00
if ( ! dm_list_empty ( & cbs - > allocated ) ) {
2018-01-30 15:13:48 +03:00
// bail out
2018-01-30 13:46:08 +03:00
log_error ( " async io still in flight " ) ;
2018-01-30 15:13:48 +03:00
return ;
2018-01-30 13:46:08 +03:00
}
2018-01-30 15:13:48 +03:00
dm_free ( cbs - > vec ) ;
dm_free ( cbs ) ;
2018-01-30 13:46:08 +03:00
}
static struct control_block * _cb_alloc ( struct cb_set * cbs , void * context )
{
struct control_block * cb ;
if ( dm_list_empty ( & cbs - > free ) )
return NULL ;
cb = dm_list_item ( _list_pop ( & cbs - > free ) , struct control_block ) ;
cb - > context = context ;
dm_list_add ( & cbs - > allocated , & cb - > list ) ;
return cb ;
}
static void _cb_free ( struct cb_set * cbs , struct control_block * cb )
{
dm_list_del ( & cb - > list ) ;
dm_list_add_h ( & cbs - > free , & cb - > list ) ;
}
/* Map a pointer to the embedded iocb back to its enclosing control block. */
static struct control_block *_iocb_to_cb(struct iocb *icb)
{
	struct control_block *owner;

	owner = dm_list_struct_base(icb, struct control_block, cb);

	return owner;
}
//----------------------------------------------------------------
// FIXME: write a sync engine too
2018-02-05 19:04:23 +03:00
struct async_engine {
struct io_engine e ;
2018-01-30 13:46:08 +03:00
io_context_t aio_context ;
struct cb_set * cbs ;
} ;
2018-02-05 19:04:23 +03:00
static struct async_engine * _to_async ( struct io_engine * e )
2018-01-30 13:46:08 +03:00
{
2018-02-05 19:04:23 +03:00
return container_of ( e , struct async_engine , e ) ;
2018-01-30 13:46:08 +03:00
}
2018-02-05 19:04:23 +03:00
static void _async_destroy ( struct io_engine * ioe )
2018-01-30 13:46:08 +03:00
{
2018-02-01 17:52:43 +03:00
int r ;
2018-02-05 19:04:23 +03:00
struct async_engine * e = _to_async ( ioe ) ;
2018-02-01 17:52:43 +03:00
2018-01-30 13:46:08 +03:00
_cb_set_destroy ( e - > cbs ) ;
2018-02-01 17:52:43 +03:00
// io_destroy is really slow
r = io_destroy ( e - > aio_context ) ;
if ( r )
log_sys_warn ( " io_destroy " ) ;
2018-01-30 15:13:48 +03:00
dm_free ( e ) ;
2018-01-30 13:46:08 +03:00
}
2018-02-05 19:04:23 +03:00
/*
 * Submit one asynchronous read or write.
 *
 * ioe     - engine to submit through
 * d       - DIR_READ selects a read, anything else a write
 * fd      - file descriptor to operate on
 * sb, se  - first sector and one-past-last sector of the range
 * data    - buffer; must be aligned to PAGE_SIZE
 * context - opaque value handed to the completion callback
 *
 * Returns true if the request was queued, false if the buffer was
 * misaligned, no control block was available or io_submit() failed.
 */
static bool _async_issue(struct io_engine *ioe, enum dir d, int fd,
			 sector_t sb, sector_t se, void *data, void *context)
{
	int r;
	struct iocb *cb_array[1];
	struct control_block *cb;
	struct async_engine *e = _to_async(ioe);

	/* uintptr_t is the integer type meant for inspecting pointer bits. */
	if (((uintptr_t) data) & (PAGE_SIZE - 1)) {
		log_warn(" misaligned data buffer ");
		return false;
	}

	cb = _cb_alloc(e->cbs, context);
	if (!cb) {
		log_warn(" couldn't allocate control block ");
		return false;
	}

	memset(&cb->cb, 0, sizeof(cb->cb));

	cb->cb.aio_fildes = (int) fd;
	cb->cb.u.c.buf = data;
	cb->cb.u.c.offset = sb << SECTOR_SHIFT;
	cb->cb.u.c.nbytes = (se - sb) << SECTOR_SHIFT;
	cb->cb.aio_lio_opcode = (d == DIR_READ) ? IO_CMD_PREAD : IO_CMD_PWRITE;

	cb_array[0] = &cb->cb;
	do {
		r = io_submit(e->aio_context, 1, cb_array);
	} while (r == -EAGAIN);

	if (r < 0) {
		/*
		 * libaio returns the negated errno and does not set errno, so
		 * publish it before log_sys_warn() formats the message.
		 */
		errno = -r;
		log_sys_warn(" io_submit ");
		_cb_free(e->cbs, cb);
		return false;
	}

	return true;
}
2018-02-02 15:06:14 +03:00
/* Event count passed to io_setup(), size of the control block pool, and the value reported by the engine's max_io hook. */
# define MAX_IO 1024
/* Maximum number of completion events gathered by a single io_getevents() call. */
# define MAX_EVENT 64
2018-01-30 13:46:08 +03:00
2018-02-05 19:04:23 +03:00
/*
 * Block until at least one request has completed, then invoke |fn| once
 * per completed request with the request's context and a status code:
 *   0        - the full transfer, or at least one sector of it, succeeded
 *   negative - the error code reported by the kernel
 *   -ENODATA - less than one sector was transferred
 *
 * Returns false if io_getevents() failed, true otherwise.
 */
static bool _async_wait(struct io_engine *ioe, io_complete_fn fn)
{
	int i, r;
	struct io_event event[MAX_EVENT];
	struct control_block *cb;
	struct async_engine *e = _to_async(ioe);

	memset(&event, 0, sizeof(event));
	do {
		r = io_getevents(e->aio_context, 1, MAX_EVENT, event, NULL);
	} while (r == -EINTR);

	if (r < 0) {
		/*
		 * libaio returns the negated errno and does not set errno, so
		 * publish it before log_sys_warn() formats the message.
		 */
		errno = -r;
		log_sys_warn(" io_getevents ");
		return false;
	}

	for (i = 0; i < r; i++) {
		struct io_event *ev = event + i;

		cb = _iocb_to_cb((struct iocb *) ev->obj);

		if (ev->res == cb->cb.u.c.nbytes)
			fn((void *) cb->context, 0);

		else if ((int) ev->res < 0)
			fn(cb->context, (int) ev->res);

		// FIXME: dct added this. a short read is ok?!
		else if (ev->res >= (1 << SECTOR_SHIFT)) {
			/* minimum acceptable read is 1 sector */
			fn((void *) cb->context, 0);

		} else {
			fn(cb->context, -ENODATA);
		}

		_cb_free(e->cbs, cb);
	}

	return true;
}
2018-02-20 18:33:27 +03:00
static unsigned _async_max_io ( struct io_engine * e )
2018-02-08 19:10:31 +03:00
{
2018-02-20 18:33:27 +03:00
return MAX_IO ;
2018-02-08 19:10:31 +03:00
}
2018-02-20 18:33:27 +03:00
struct io_engine * create_async_io_engine ( void )
2018-02-05 19:04:23 +03:00
{
int r ;
struct async_engine * e = dm_malloc ( sizeof ( * e ) ) ;
if ( ! e )
return NULL ;
e - > e . destroy = _async_destroy ;
e - > e . issue = _async_issue ;
e - > e . wait = _async_wait ;
2018-02-08 19:10:31 +03:00
e - > e . max_io = _async_max_io ;
2018-02-05 19:04:23 +03:00
e - > aio_context = 0 ;
2018-02-20 18:33:27 +03:00
r = io_setup ( MAX_IO , & e - > aio_context ) ;
2018-02-05 19:04:23 +03:00
if ( r < 0 ) {
log_warn ( " io_setup failed " ) ;
dm_free ( e ) ;
return NULL ;
}
2018-02-20 18:33:27 +03:00
e - > cbs = _cb_set_create ( MAX_IO ) ;
2018-02-05 19:04:23 +03:00
if ( ! e - > cbs ) {
log_warn ( " couldn't create control block set " ) ;
dm_free ( e ) ;
return NULL ;
}
return & e - > e ;
}
2018-01-30 13:46:08 +03:00
//----------------------------------------------------------------
/* NOTE(review): MIN_BLOCKS is not referenced in the visible part of this file - confirm it is still needed. */
# define MIN_BLOCKS 16
/*
 * Percentages of nr_cache_blocks used by _preemptive_writeback(): writeback
 * starts when the available count falls below LOW, and the number of blocks
 * to write is derived from HIGH.
 */
# define WRITEBACK_LOW_THRESHOLD_PERCENT 33
# define WRITEBACK_HIGH_THRESHOLD_PERCENT 66
//----------------------------------------------------------------
/*
 * Allocate |len| bytes whose address is a multiple of |alignment|.
 * Returns NULL if posix_memalign() reports an error.
 */
static void *_alloc_aligned(size_t len, size_t alignment)
{
	void *mem = NULL;

	if (posix_memalign(&mem, alignment, len) != 0)
		return NULL;

	return mem;
}
//----------------------------------------------------------------
static bool _test_flags ( struct block * b , unsigned bits )
{
return ( b - > flags & bits ) ! = 0 ;
}
static void _set_flags ( struct block * b , unsigned bits )
{
b - > flags | = bits ;
}
static void _clear_flags ( struct block * b , unsigned bits )
{
b - > flags & = ~ bits ;
}
//----------------------------------------------------------------
/* Bits stored in a block's flags field. */
enum block_flags {
	BF_IO_PENDING = 0x1,	/* a read or write has been issued and not yet completed */
	BF_DIRTY = 0x2,		/* the block is kept on the dirty list */
};
struct bcache {
sector_t block_sectors ;
uint64_t nr_data_blocks ;
uint64_t nr_cache_blocks ;
2018-02-02 15:06:14 +03:00
unsigned max_io ;
2018-01-30 13:46:08 +03:00
struct io_engine * engine ;
void * raw_data ;
struct block * raw_blocks ;
/*
* Lists that categorise the blocks .
*/
unsigned nr_locked ;
unsigned nr_dirty ;
unsigned nr_io_pending ;
struct dm_list free ;
struct dm_list errored ;
struct dm_list dirty ;
struct dm_list clean ;
struct dm_list io_pending ;
/*
* Hash table .
*/
unsigned nr_buckets ;
unsigned hash_mask ;
struct dm_list * buckets ;
/*
* Statistics
*/
unsigned read_hits ;
unsigned read_misses ;
unsigned write_zeroes ;
unsigned write_hits ;
unsigned write_misses ;
unsigned prefetches ;
} ;
//----------------------------------------------------------------
/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
# define GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001UL
static unsigned _hash ( struct bcache * cache , int fd , uint64_t index )
{
uint64_t h = ( index < < 10 ) & fd ;
h * = GOLDEN_RATIO_PRIME_64 ;
return h & cache - > hash_mask ;
}
static struct block * _hash_lookup ( struct bcache * cache , int fd , uint64_t index )
{
struct block * b ;
unsigned h = _hash ( cache , fd , index ) ;
dm_list_iterate_items_gen ( b , cache - > buckets + h , hash )
2018-02-06 18:10:44 +03:00
if ( b - > fd = = fd & & b - > index = = index )
2018-01-30 13:46:08 +03:00
return b ;
return NULL ;
}
static void _hash_insert ( struct block * b )
{
unsigned h = _hash ( b - > cache , b - > fd , b - > index ) ;
dm_list_add_h ( b - > cache - > buckets + h , & b - > hash ) ;
}
2018-01-30 15:13:48 +03:00
static inline void _hash_remove ( struct block * b )
2018-01-30 13:46:08 +03:00
{
dm_list_del ( & b - > hash ) ;
}
/*
 * Choose the hash table size for a cache of |nr_blocks| blocks: the
 * smallest power of two that is at least nr_blocks / 4, and never fewer
 * than 8.  The result must be a power of 2 because callers derive a bit
 * mask from it.
 */
static unsigned _calc_nr_buckets(unsigned nr_blocks)
{
	unsigned target = nr_blocks / 4;
	unsigned buckets;

	if (target < 8)
		target = 8;

	for (buckets = 8; buckets < target; buckets <<= 1)
		;

	return buckets;
}
2018-01-30 15:13:48 +03:00
static bool _hash_table_init ( struct bcache * cache , unsigned nr_entries )
2018-01-30 13:46:08 +03:00
{
unsigned i ;
cache - > nr_buckets = _calc_nr_buckets ( nr_entries ) ;
cache - > hash_mask = cache - > nr_buckets - 1 ;
2018-01-30 15:13:48 +03:00
cache - > buckets = dm_malloc ( cache - > nr_buckets * sizeof ( * cache - > buckets ) ) ;
2018-01-30 13:46:08 +03:00
if ( ! cache - > buckets )
2018-01-30 15:13:48 +03:00
return false ;
2018-01-30 13:46:08 +03:00
for ( i = 0 ; i < cache - > nr_buckets ; i + + )
dm_list_init ( cache - > buckets + i ) ;
2018-01-30 15:13:48 +03:00
return true ;
2018-01-30 13:46:08 +03:00
}
static void _hash_table_exit ( struct bcache * cache )
{
2018-01-30 15:13:48 +03:00
dm_free ( cache - > buckets ) ;
2018-01-30 13:46:08 +03:00
}
//----------------------------------------------------------------
2018-01-30 15:13:48 +03:00
static bool _init_free_list ( struct bcache * cache , unsigned count )
2018-01-30 13:46:08 +03:00
{
unsigned i ;
size_t block_size = cache - > block_sectors < < SECTOR_SHIFT ;
unsigned char * data =
( unsigned char * ) _alloc_aligned ( count * block_size , PAGE_SIZE ) ;
/* Allocate the data for each block. We page align the data. */
if ( ! data )
2018-01-30 15:13:48 +03:00
return false ;
2018-01-30 13:46:08 +03:00
cache - > raw_data = data ;
2018-01-30 15:13:48 +03:00
cache - > raw_blocks = dm_malloc ( count * sizeof ( * cache - > raw_blocks ) ) ;
2018-01-30 13:46:08 +03:00
if ( ! cache - > raw_blocks )
2018-01-30 15:13:48 +03:00
dm_free ( cache - > raw_data ) ;
2018-01-30 13:46:08 +03:00
for ( i = 0 ; i < count ; i + + ) {
struct block * b = cache - > raw_blocks + i ;
b - > cache = cache ;
b - > data = data + ( block_size * i ) ;
dm_list_add ( & cache - > free , & b - > list ) ;
}
2018-01-30 15:13:48 +03:00
return true ;
2018-01-30 13:46:08 +03:00
}
static void _exit_free_list ( struct bcache * cache )
{
2018-01-30 15:13:48 +03:00
dm_free ( cache - > raw_data ) ;
dm_free ( cache - > raw_blocks ) ;
2018-01-30 13:46:08 +03:00
}
static struct block * _alloc_block ( struct bcache * cache )
{
2018-02-02 17:34:45 +03:00
if ( dm_list_empty ( & cache - > free ) )
return NULL ;
return dm_list_struct_base ( _list_pop ( & cache - > free ) , struct block , list ) ;
2018-01-30 13:46:08 +03:00
}
/*----------------------------------------------------------------
 * Clean/dirty list management.
 * Always use these methods to ensure nr_dirty_ is correct.
 *--------------------------------------------------------------*/
static void _unlink_block ( struct block * b )
{
if ( _test_flags ( b , BF_DIRTY ) )
b - > cache - > nr_dirty - - ;
dm_list_del ( & b - > list ) ;
}
static void _link_block ( struct block * b )
{
struct bcache * cache = b - > cache ;
if ( _test_flags ( b , BF_DIRTY ) ) {
dm_list_add ( & cache - > dirty , & b - > list ) ;
cache - > nr_dirty + + ;
} else
dm_list_add ( & cache - > clean , & b - > list ) ;
}
/*
 * Unlink a block and link it again, which moves it to the tail of the
 * clean or dirty list according to its current flags.
 */
static void _relink(struct block *b)
{
	_unlink_block(b);
	_link_block(b);
}
/*----------------------------------------------------------------
 * Low level IO handling
 *
 * We cannot have two concurrent writes on the same block.
 * eg, background writeback, put with dirty, flush?
 *
 * To avoid this we introduce some restrictions:
 *
 * i)  A held block can never be written back.
 * ii) You cannot get a block until writeback has completed.
 *
 *--------------------------------------------------------------*/
2018-02-01 17:52:43 +03:00
static void _complete_io ( void * context , int err )
{
struct block * b = context ;
struct bcache * cache = b - > cache ;
b - > error = err ;
_clear_flags ( b , BF_IO_PENDING ) ;
cache - > nr_io_pending - - ;
/*
* b is on the io_pending list , so we don ' t want to use unlink_block .
* Which would incorrectly adjust nr_dirty .
*/
dm_list_del ( & b - > list ) ;
2018-03-01 19:17:32 +03:00
/* Things don't work with this block of code, but work without it. */
if ( b - > error ) {
log_warn ( " bcache io error %d fd %d " , b - > error , b - > fd ) ;
2018-02-01 17:52:43 +03:00
2018-04-26 15:13:27 +03:00
dm_list_add ( & cache - > errored , & b - > list ) ;
2018-02-20 18:33:27 +03:00
} else {
2018-02-01 17:52:43 +03:00
_clear_flags ( b , BF_DIRTY ) ;
_link_block ( b ) ;
}
}
2018-01-30 13:46:08 +03:00
/*
* | b - > list | should be valid ( either pointing to itself , on one of the other
* lists .
*/
2018-02-20 18:33:27 +03:00
static void _issue_low_level ( struct block * b , enum dir d )
2018-01-30 13:46:08 +03:00
{
struct bcache * cache = b - > cache ;
sector_t sb = b - > index * cache - > block_sectors ;
sector_t se = sb + cache - > block_sectors ;
if ( _test_flags ( b , BF_IO_PENDING ) )
2018-02-20 18:33:27 +03:00
return ;
2018-01-30 13:46:08 +03:00
2018-02-20 18:33:27 +03:00
b - > io_dir = d ;
2018-01-30 13:46:08 +03:00
_set_flags ( b , BF_IO_PENDING ) ;
2018-02-05 19:04:23 +03:00
dm_list_add ( & cache - > io_pending , & b - > list ) ;
if ( ! cache - > engine - > issue ( cache - > engine , d , b - > fd , sb , se , b - > data , b ) ) {
2018-04-06 21:11:39 +03:00
/* FIXME: if io_submit() set an errno, return that instead of EIO? */
2018-02-01 17:52:43 +03:00
_complete_io ( b , - EIO ) ;
2018-02-20 18:33:27 +03:00
return ;
2018-02-01 17:52:43 +03:00
}
2018-01-30 13:46:08 +03:00
}
2018-02-20 18:33:27 +03:00
static inline void _issue_read ( struct block * b )
2018-01-30 13:46:08 +03:00
{
2018-02-20 18:33:27 +03:00
_issue_low_level ( b , DIR_READ ) ;
2018-01-30 13:46:08 +03:00
}
2018-02-20 18:33:27 +03:00
static inline void _issue_write ( struct block * b )
2018-01-30 13:46:08 +03:00
{
2018-02-20 18:33:27 +03:00
_issue_low_level ( b , DIR_WRITE ) ;
2018-01-30 13:46:08 +03:00
}
2018-01-30 15:13:48 +03:00
static bool _wait_io ( struct bcache * cache )
2018-01-30 13:46:08 +03:00
{
2018-02-05 19:04:23 +03:00
return cache - > engine - > wait ( cache - > engine , _complete_io ) ;
2018-01-30 13:46:08 +03:00
}
/*----------------------------------------------------------------
 * High level IO handling
 *--------------------------------------------------------------*/
static void _wait_all ( struct bcache * cache )
{
while ( ! dm_list_empty ( & cache - > io_pending ) )
_wait_io ( cache ) ;
}
static void _wait_specific ( struct block * b )
{
while ( _test_flags ( b , BF_IO_PENDING ) )
_wait_io ( b - > cache ) ;
}
static unsigned _writeback ( struct bcache * cache , unsigned count )
{
unsigned actual = 0 ;
struct block * b , * tmp ;
dm_list_iterate_items_gen_safe ( b , tmp , & cache - > dirty , list ) {
if ( actual = = count )
break ;
// We can't writeback anything that's still in use.
if ( ! b - > ref_count ) {
_issue_write ( b ) ;
actual + + ;
}
}
return actual ;
}
/*----------------------------------------------------------------
 * High level allocation
 *--------------------------------------------------------------*/
static struct block * _find_unused_clean_block ( struct bcache * cache )
{
struct block * b ;
dm_list_iterate_items ( b , & cache - > clean ) {
if ( ! b - > ref_count ) {
_unlink_block ( b ) ;
_hash_remove ( b ) ;
return b ;
}
}
return NULL ;
}
2018-02-05 19:04:23 +03:00
/*
 * Obtain a block descriptor for (fd, index) and insert it into the hash
 * table.
 *
 * A block is taken from the free list if possible.  Otherwise, while the
 * clean list is not empty, an unused clean block is recycled; if none is
 * unused and |can_wait| is set, writeback is kicked off (when no IO is
 * pending) and the function waits for IO before retrying.  Without
 * |can_wait| it gives up immediately.
 *
 * Returns the initialised block, or NULL if none could be obtained.
 */
static struct block *_new_block(struct bcache *cache, int fd, block_address index, bool can_wait)
{
	struct block *b = _alloc_block(cache);

	while (!b && !dm_list_empty(&cache->clean)) {
		b = _find_unused_clean_block(cache);
		if (b)
			break;

		if (!can_wait) {
			log_error(" bcache no new blocks for fd %d index %u ",
				  fd, (uint32_t) index);
			return NULL;
		}

		if (dm_list_empty(&cache->io_pending))
			_writeback(cache, 16); // FIXME: magic number
		_wait_io(cache);
	}

	if (!b) {
		log_error(" bcache no new blocks for fd %d index %u "
			  " clean %u free %u dirty %u pending %u nr_data_blocks %u nr_cache_blocks %u ",
			  fd, (uint32_t) index,
			  dm_list_size(&cache->clean),
			  dm_list_size(&cache->free),
			  dm_list_size(&cache->dirty),
			  dm_list_size(&cache->io_pending),
			  (uint32_t) cache->nr_data_blocks,
			  (uint32_t) cache->nr_cache_blocks);
		return NULL;
	}

	/* Reset the descriptor for its new identity before publishing it. */
	dm_list_init(&b->list);
	dm_list_init(&b->hash);
	b->flags = 0;
	b->fd = fd;
	b->index = index;
	b->ref_count = 0;
	b->error = 0;

	_hash_insert(b);

	return b;
}
/*----------------------------------------------------------------
 * Block reference counting
 *--------------------------------------------------------------*/
static void _zero_block ( struct block * b )
{
b - > cache - > write_zeroes + + ;
memset ( b - > data , 0 , b - > cache - > block_sectors < < SECTOR_SHIFT ) ;
_set_flags ( b , BF_DIRTY ) ;
}
static void _hit ( struct block * b , unsigned flags )
{
struct bcache * cache = b - > cache ;
if ( flags & ( GF_ZERO | GF_DIRTY ) )
cache - > write_hits + + ;
else
cache - > read_hits + + ;
_relink ( b ) ;
}
static void _miss ( struct bcache * cache , unsigned flags )
{
if ( flags & ( GF_ZERO | GF_DIRTY ) )
cache - > write_misses + + ;
else
cache - > read_misses + + ;
}
/*
 * Return the block caching (fd, index), bringing it into the cache first
 * if necessary.
 *
 * flags - GF_ZERO zero-fills the block instead of reading it; GF_ZERO or
 *         GF_DIRTY marks the returned block dirty.
 *
 * The returned block is linked on the clean or dirty list.  NULL is
 * returned if a write-type request hits a block that is already held, or
 * if no block could be allocated.
 */
static struct block *_lookup_or_read_block(struct bcache *cache,
					   int fd, block_address index,
					   unsigned flags)
{
	const unsigned write_mask = GF_DIRTY | GF_ZERO;
	struct block *b = _hash_lookup(cache, fd, index);

	if (!b) {
		_miss(cache, flags);

		b = _new_block(cache, fd, index, true);
		if (!b)
			return NULL;

		if (flags & GF_ZERO)
			_zero_block(b);
		else {
			_issue_read(b);
			_wait_specific(b);

			// The completion handler linked the block; take it
			// off again so it can be relinked below.
			_unlink_block(b);
		}
	} else {
		// FIXME: this is insufficient.  We need to also catch a read
		// lock of a write locked block.  Ref count needs to distinguish.
		if (b->ref_count && (flags & write_mask)) {
			log_warn(" concurrent write lock attempted ");
			return NULL;
		}

		if (_test_flags(b, BF_IO_PENDING)) {
			_miss(cache, flags);
			_wait_specific(b);
		} else
			_hit(b, flags);

		_unlink_block(b);

		if (flags & GF_ZERO)
			_zero_block(b);
	}

	if (flags & write_mask)
		_set_flags(b, BF_DIRTY);

	_link_block(b);

	return b;
}
static void _preemptive_writeback ( struct bcache * cache )
{
// FIXME: this ignores those blocks that are in the error state. Track
// nr_clean instead?
unsigned nr_available = cache - > nr_cache_blocks - ( cache - > nr_dirty - cache - > nr_io_pending ) ;
if ( nr_available < ( WRITEBACK_LOW_THRESHOLD_PERCENT * cache - > nr_cache_blocks / 100 ) )
_writeback ( cache , ( WRITEBACK_HIGH_THRESHOLD_PERCENT * cache - > nr_cache_blocks / 100 ) - nr_available ) ;
}
/*----------------------------------------------------------------
 * Public interface
 *--------------------------------------------------------------*/
2018-02-05 19:04:23 +03:00
/*
 * Create a block cache.
 *
 * block_sectors   - block size in sectors; must be non-zero and a multiple
 *                   of the page size
 * nr_cache_blocks - number of blocks to hold; must be non-zero
 * engine          - io engine used for all IO
 *
 * Returns the new cache or NULL on failure.
 *
 * NOTE(review): the engine is destroyed when hash table or block
 * allocation fails, but not when parameter validation or the cache
 * allocation itself fails - callers cannot tell which happened.
 */
struct bcache *bcache_create(sector_t block_sectors, unsigned nr_cache_blocks,
			     struct io_engine *engine)
{
	struct bcache *cache;
	unsigned max_io = engine->max_io(engine);

	if (!nr_cache_blocks) {
		log_warn(" bcache must have at least one cache block ");
		return NULL;
	}

	if (!block_sectors) {
		log_warn(" bcache must have a non zero block size ");
		return NULL;
	}

	if (block_sectors & ((PAGE_SIZE >> SECTOR_SHIFT) - 1)) {
		log_warn(" bcache block size must be a multiple of page size ");
		return NULL;
	}

	cache = dm_malloc(sizeof(*cache));
	if (!cache)
		return NULL;

	cache->block_sectors = block_sectors;
	/* Never assigned elsewhere, yet printed by _new_block()'s error log. */
	cache->nr_data_blocks = 0;
	cache->nr_cache_blocks = nr_cache_blocks;
	cache->max_io = nr_cache_blocks < max_io ? nr_cache_blocks : max_io;
	cache->engine = engine;
	cache->nr_locked = 0;
	cache->nr_dirty = 0;
	cache->nr_io_pending = 0;

	/* Give every pointer a defined value before anything can fail. */
	cache->raw_data = NULL;
	cache->raw_blocks = NULL;
	cache->buckets = NULL;

	dm_list_init(&cache->free);
	dm_list_init(&cache->errored);
	dm_list_init(&cache->dirty);
	dm_list_init(&cache->clean);
	dm_list_init(&cache->io_pending);

	if (!_hash_table_init(cache, nr_cache_blocks)) {
		cache->engine->destroy(cache->engine);
		dm_free(cache);
		return NULL;
	}

	cache->read_hits = 0;
	cache->read_misses = 0;
	cache->write_zeroes = 0;
	cache->write_hits = 0;
	cache->write_misses = 0;
	cache->prefetches = 0;

	if (!_init_free_list(cache, nr_cache_blocks)) {
		cache->engine->destroy(cache->engine);
		_hash_table_exit(cache);
		dm_free(cache);
		return NULL;
	}

	return cache;
}
void bcache_destroy ( struct bcache * cache )
{
if ( cache - > nr_locked )
2018-01-30 15:13:48 +03:00
log_warn ( " some blocks are still locked " ) ;
2018-01-30 13:46:08 +03:00
bcache_flush ( cache ) ;
_wait_all ( cache ) ;
_exit_free_list ( cache ) ;
_hash_table_exit ( cache ) ;
2018-02-05 19:04:23 +03:00
cache - > engine - > destroy ( cache - > engine ) ;
2018-01-30 15:13:48 +03:00
dm_free ( cache ) ;
2018-01-30 13:46:08 +03:00
}
2018-02-01 17:52:43 +03:00
unsigned bcache_nr_cache_blocks ( struct bcache * cache )
{
return cache - > nr_cache_blocks ;
}
2018-02-02 15:06:14 +03:00
unsigned bcache_max_prefetches ( struct bcache * cache )
{
return cache - > max_io ;
}
2018-01-30 13:46:08 +03:00
/*
 * Start reading (fd, index) into the cache without waiting for the data.
 * Nothing happens if the block is already cached, if the in-flight IO
 * limit has been reached, or if no block can be obtained without waiting.
 */
void bcache_prefetch(struct bcache *cache, int fd, block_address index)
{
	struct block *b;

	if (_hash_lookup(cache, fd, index))
		return;

	if (cache->nr_io_pending >= cache->max_io)
		return;

	b = _new_block(cache, fd, index, false);
	if (!b)
		return;

	cache->prefetches++;
	_issue_read(b);
}
/*
 * Obtain a reference to the block caching (fd, index).
 *
 * flags  - GF_* flags forwarded to the lookup
 * result - receives the block on success, NULL on failure
 * error  - optional (may be NULL); receives the block's IO error, or
 *          -BCACHE_NO_BLOCK if no block could be obtained
 *
 * Returns true and takes a reference on success.  Returns false if the
 * block could not be obtained or its IO failed.
 */
bool bcache_get(struct bcache *cache, int fd, block_address index,
		unsigned flags, struct block **result, int *error)
{
	struct block *b;

	b = _lookup_or_read_block(cache, fd, index, flags);
	if (!b) {
		*result = NULL;
		if (error)
			*error = -BCACHE_NO_BLOCK;

		log_error(" bcache failed to get block %u fd %d ", (uint32_t) index, fd);
		return false;
	}

	if (b->error) {
		int err = b->error;

		if (b->io_dir == DIR_READ) {
			// Now we know the read failed we can just forget
			// about this block, since there's no dirty data to
			// be written back.
			//
			// The lookup left the block linked on the clean or
			// dirty list, so take it off before adding it to the
			// free list; adding it while still linked would
			// corrupt both lists.
			_unlink_block(b);
			_hash_remove(b);
			dm_list_add(&cache->free, &b->list);
		}

		*result = NULL;
		/* Callers are allowed to pass a NULL error pointer. */
		if (error)
			*error = err;

		return false;
	}

	if (!b->ref_count)
		cache->nr_locked++;
	b->ref_count++;

	*result = b;
	return true;
}
2018-02-20 00:40:44 +03:00
static void _put_ref ( struct block * b )
2018-01-30 13:46:08 +03:00
{
if ( ! b - > ref_count ) {
2018-01-30 15:13:48 +03:00
log_warn ( " ref count on bcache block already zero " ) ;
2018-01-30 13:46:08 +03:00
return ;
}
b - > ref_count - - ;
if ( ! b - > ref_count )
b - > cache - > nr_locked - - ;
2018-02-20 00:40:44 +03:00
}
void bcache_put ( struct block * b )
{
_put_ref ( b ) ;
2018-01-30 13:46:08 +03:00
if ( _test_flags ( b , BF_DIRTY ) )
_preemptive_writeback ( b - > cache ) ;
}
2018-02-20 18:33:27 +03:00
bool bcache_flush ( struct bcache * cache )
2018-01-30 13:46:08 +03:00
{
2018-02-20 18:33:27 +03:00
// Only dirty data is on the errored list, since bad read blocks get
// recycled straight away. So we put these back on the dirty list, and
// try and rewrite everything.
dm_list_splice ( & cache - > dirty , & cache - > errored ) ;
2018-01-30 13:46:08 +03:00
while ( ! dm_list_empty ( & cache - > dirty ) ) {
struct block * b = dm_list_item ( _list_pop ( & cache - > dirty ) , struct block ) ;
2018-02-05 19:04:23 +03:00
if ( b - > ref_count | | _test_flags ( b , BF_IO_PENDING ) ) {
2018-01-30 13:46:08 +03:00
// The superblock may well be still locked.
continue ;
2018-02-05 19:04:23 +03:00
}
2018-02-20 00:40:44 +03:00
2018-01-30 13:46:08 +03:00
_issue_write ( b ) ;
}
_wait_all ( cache ) ;
2018-02-20 18:33:27 +03:00
return dm_list_empty ( & cache - > errored ) ;
2018-01-30 13:46:08 +03:00
}
2018-02-02 10:59:49 +03:00
static void _recycle_block ( struct bcache * cache , struct block * b )
{
_unlink_block ( b ) ;
_hash_remove ( b ) ;
dm_list_add ( & cache - > free , & b - > list ) ;
}
/*
* You can safely call this with a NULL block .
*/
static void _invalidate_block ( struct bcache * cache , struct block * b )
{
if ( ! b )
return ;
if ( _test_flags ( b , BF_IO_PENDING ) )
_wait_specific ( b ) ;
if ( b - > ref_count )
log_warn ( " bcache_invalidate: block (%d, %llu) still held " ,
b - > fd , ( unsigned long long ) index ) ;
else {
if ( _test_flags ( b , BF_DIRTY ) ) {
_issue_write ( b ) ;
_wait_specific ( b ) ;
}
_recycle_block ( cache , b ) ;
}
}
/* Drop the block caching (fd, index), if there is one. */
void bcache_invalidate(struct bcache *cache, int fd, block_address index)
{
	struct block *victim = _hash_lookup(cache, fd, index);

	/* A NULL lookup result is accepted by _invalidate_block(). */
	_invalidate_block(cache, victim);
}
// FIXME: switch to a trie, or maybe 1 hash table per fd? To save iterating
// through the whole cache.
void bcache_invalidate_fd ( struct bcache * cache , int fd )
{
struct block * b , * tmp ;
// Start writing back any dirty blocks on this fd.
dm_list_iterate_items_safe ( b , tmp , & cache - > dirty )
if ( b - > fd = = fd )
_issue_write ( b ) ;
_wait_all ( cache ) ;
// Everything should be in the clean list now.
dm_list_iterate_items_safe ( b , tmp , & cache - > clean )
if ( b - > fd = = fd )
_invalidate_block ( cache , b ) ;
// Except they could be in the errored list :)
dm_list_iterate_items_safe ( b , tmp , & cache - > errored )
if ( b - > fd = = fd )
_recycle_block ( cache , b ) ;
}
2018-02-05 19:56:56 +03:00
/*
 * Convert the byte range [start, start + len) into a half-open range of
 * block indexes: *bb is the block containing |start|, *be is one past the
 * block containing the last byte.
 */
static void byte_range_to_block_range(struct bcache *cache, off_t start, size_t len,
				      block_address *bb, block_address *be)
{
	block_address block_size = cache->block_sectors << SECTOR_SHIFT;
	block_address first = start / block_size;
	block_address end = (start + len + block_size - 1) / block_size;

	*bb = first;
	*be = end;
}
/* Prefetch every block that overlaps the byte range [start, start + len). */
void bcache_prefetch_bytes(struct bcache *cache, int fd, off_t start, size_t len)
{
	block_address cur, end;

	byte_range_to_block_range(cache, start, len, &cur, &end);

	for (; cur < end; cur++)
		bcache_prefetch(cache, fd, cur);
}
/* Return the smaller of two offsets; |lhs| wins when they are equal. */
static off_t _min(off_t lhs, off_t rhs)
{
	return (rhs < lhs) ? rhs : lhs;
}
bool bcache_read_bytes ( struct bcache * cache , int fd , off_t start , size_t len , void * data )
{
struct block * b ;
block_address bb , be , i ;
unsigned char * udata = data ;
off_t block_size = cache - > block_sectors < < SECTOR_SHIFT ;
2018-02-08 22:44:54 +03:00
int errors = 0 ;
2018-02-05 19:56:56 +03:00
byte_range_to_block_range ( cache , start , len , & bb , & be ) ;
for ( i = bb ; i < be ; i + + )
bcache_prefetch ( cache , fd , i ) ;
for ( i = bb ; i < be ; i + + ) {
2018-04-06 21:11:39 +03:00
if ( ! bcache_get ( cache , fd , i , 0 , & b , NULL ) ) {
log_error ( " bcache_read failed to get block %u fd %d bb %u be %u " ,
( uint32_t ) i , fd , ( uint32_t ) bb , ( uint32_t ) be ) ;
2018-02-08 22:44:54 +03:00
errors + + ;
continue ;
}
2018-02-05 19:56:56 +03:00
if ( i = = bb ) {
off_t block_offset = start % block_size ;
size_t blen = _min ( block_size - block_offset , len ) ;
memcpy ( udata , ( ( unsigned char * ) b - > data ) + block_offset , blen ) ;
len - = blen ;
udata + = blen ;
} else {
size_t blen = _min ( block_size , len ) ;
memcpy ( udata , b - > data , blen ) ;
len - = blen ;
udata + = blen ;
}
2018-02-08 22:44:54 +03:00
bcache_put ( b ) ;
2018-02-05 19:56:56 +03:00
}
2018-02-08 22:44:54 +03:00
return errors ? false : true ;
2018-02-05 19:56:56 +03:00
}
2018-02-20 00:40:44 +03:00
bool bcache_write_bytes ( struct bcache * cache , int fd , off_t start , size_t len , void * data )
{
struct block * b ;
block_address bb , be , i ;
unsigned char * udata = data ;
off_t block_size = cache - > block_sectors < < SECTOR_SHIFT ;
int errors = 0 ;
byte_range_to_block_range ( cache , start , len , & bb , & be ) ;
for ( i = bb ; i < be ; i + + )
bcache_prefetch ( cache , fd , i ) ;
for ( i = bb ; i < be ; i + + ) {
2018-04-06 21:11:39 +03:00
if ( ! bcache_get ( cache , fd , i , 0 , & b , NULL ) ) {
2018-02-27 21:37:25 +03:00
log_error ( " bcache_write failed to get block %u fd %d bb %u be %u " ,
( uint32_t ) i , fd , ( uint32_t ) bb , ( uint32_t ) be ) ;
2018-02-20 00:40:44 +03:00
errors + + ;
break ;
}
if ( i = = bb ) {
off_t block_offset = start % block_size ;
size_t blen = _min ( block_size - block_offset , len ) ;
memcpy ( ( ( unsigned char * ) b - > data ) + block_offset , udata , blen ) ;
len - = blen ;
udata + = blen ;
} else {
size_t blen = _min ( block_size , len ) ;
memcpy ( b - > data , udata , blen ) ;
len - = blen ;
udata + = blen ;
}
_set_flags ( b , BF_DIRTY ) ;
_unlink_block ( b ) ;
_link_block ( b ) ;
_put_ref ( b ) ;
}
if ( ! bcache_flush ( cache ) )
errors + + ;
return errors ? false : true ;
}
/* Size of the on-stack zero buffer used by bcache_write_zeros(). */
#define ZERO_BUF_LEN 4096

/*
 * Write |len| zero bytes at byte offset |start| of |fd|, in pieces of at
 * most ZERO_BUF_LEN bytes.  Returns false as soon as one piece fails.
 */
bool bcache_write_zeros(struct bcache *cache, int fd, off_t start, size_t len)
{
	char zerobuf[ZERO_BUF_LEN];
	size_t chunk;
	size_t done;

	memset(zerobuf, 0, sizeof(zerobuf));

	/* Small requests (including an empty one) take a single write. */
	if (len <= ZERO_BUF_LEN)
		return bcache_write_bytes(cache, fd, start, len, zerobuf);

	for (done = 0; len; done += chunk, len -= chunk) {
		chunk = (len < ZERO_BUF_LEN) ? len : ZERO_BUF_LEN;

		if (!bcache_write_bytes(cache, fd, start + done, chunk, zerobuf))
			return false;
	}

	return true;
}
2018-01-30 13:46:08 +03:00
//----------------------------------------------------------------