/*
 * Code for working with individual keys, and sorted sets of keys within a
 * btree node
 *
 * Copyright 2012 Google, Inc.
 */

#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__

#include "util.h"
#include "bset.h"

#include <linux/console.h>
#include <linux/random.h>
#include <linux/prefetch.h>

#ifdef CONFIG_BCACHE_DEBUG

void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
{
	struct bkey *k, *next;

	for (k = i->start; k < bset_bkey_last(i); k = next) {
		next = bkey_next(k);

		printk(KERN_ERR "block %u key %u/%u: ", set,
		       (unsigned) ((u64 *) k - i->d), i->keys);

		if (b->ops->key_dump)
			b->ops->key_dump(b, k);
		else
			printk("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k));

		if (next < bset_bkey_last(i) &&
		    bkey_cmp(k, b->ops->is_extents ?
			     &START_KEY(next) : next) > 0)
			printk(KERN_ERR "Key skipped backwards\n");
	}
}

void bch_dump_bucket(struct btree_keys *b)
{
	unsigned i;

	console_lock();
	for (i = 0; i <= b->nsets; i++)
		bch_dump_bset(b, b->set[i].data,
			      bset_sector_offset(b, b->set[i].data));
	console_unlock();
}

int __bch_count_data(struct btree_keys *b)
{
	unsigned ret = 0;
	struct btree_iter iter;
	struct bkey *k;

	if (b->ops->is_extents)
		for_each_key(b, k, &iter)
			ret += KEY_SIZE(k);
	return ret;
}

void __bch_check_keys(struct btree_keys *b, const char *fmt, ...)
{
	va_list args;
	struct bkey *k, *p = NULL;
	struct btree_iter iter;
	const char *err;

	for_each_key(b, k, &iter) {
		if (b->ops->is_extents) {
			err = "Keys out of order";
			if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0)
				goto bug;

			if (bch_ptr_invalid(b, k))
				continue;

			err = "Overlapping keys";
			if (p && bkey_cmp(p, &START_KEY(k)) > 0)
				goto bug;
		} else {
			if (bch_ptr_bad(b, k))
				continue;

			err = "Duplicate keys";
			if (p && !bkey_cmp(p, k))
				goto bug;
		}
		p = k;
	}
#if 0
	err = "Key larger than btree node key";
	if (p && bkey_cmp(p, &b->key) > 0)
		goto bug;
#endif
	return;
bug:
	bch_dump_bucket(b);

	va_start(args, fmt);
	vprintk(fmt, args);
	va_end(args);

	panic("bch_check_keys error: %s:\n", err);
}

static void bch_btree_iter_next_check(struct btree_iter *iter)
{
	struct bkey *k = iter->data->k, *next = bkey_next(k);

	if (next < iter->data->end &&
	    bkey_cmp(k, iter->b->ops->is_extents ?
		     &START_KEY(next) : next) > 0) {
		bch_dump_bucket(iter->b);
		panic("Key skipped backwards\n");
	}
}

#else

static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}

#endif

/* Keylists */

int __bch_keylist_realloc(struct keylist *l, unsigned u64s)
{
	size_t oldsize = bch_keylist_nkeys(l);
	size_t newsize = oldsize + u64s;
	uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p;
	uint64_t *new_keys;

	newsize = roundup_pow_of_two(newsize);

	if (newsize <= KEYLIST_INLINE ||
	    roundup_pow_of_two(oldsize) == newsize)
		return 0;

	new_keys = krealloc(old_keys, sizeof(uint64_t) * newsize, GFP_NOIO);

	if (!new_keys)
		return -ENOMEM;

	if (!old_keys)
		memcpy(new_keys, l->inline_keys, sizeof(uint64_t) * oldsize);

	l->keys_p = new_keys;
	l->top_p = new_keys + oldsize;

	return 0;
}

struct bkey *bch_keylist_pop(struct keylist *l)
{
	struct bkey *k = l->keys;

	if (k == l->top)
		return NULL;

	while (bkey_next(k) != l->top)
		k = bkey_next(k);

	return l->top = k;
}

void bch_keylist_pop_front(struct keylist *l)
{
	l->top_p -= bkey_u64s(l->keys);

	memmove(l->keys,
		bkey_next(l->keys),
		bch_keylist_bytes(l));
}

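/*
 * Usage sketch (not built, and hedged): bch_keylist_init() and
 * bch_keylist_add() are assumed to be the inline helpers from bset.h, and
 * the KEY() values are made up for illustration.  A keylist starts out
 * backed by l->inline_keys and is grown on demand with
 * __bch_keylist_realloc(); bch_keylist_pop_front() then drops the first
 * key and shifts the rest down.
 */
#if 0
static void example_keylist(void)
{
	struct keylist keys;

	bch_keylist_init(&keys);		/* backed by keys.inline_keys */
	bch_keylist_add(&keys, &KEY(1, 8, 8));	/* append at keys.top */
	bch_keylist_add(&keys, &KEY(1, 16, 8));

	bch_keylist_pop_front(&keys);		/* drops KEY(1, 8, 8) */
}
#endif
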
/* Key/pointer manipulation */
void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src,
			      unsigned i)
{
	BUG_ON(i > KEY_PTRS(src));

	/* Only copy the header, key, and one pointer. */
	memcpy(dest, src, 2 * sizeof(uint64_t));
	dest->ptr[0] = src->ptr[i];
	SET_KEY_PTRS(dest, 1);
	/* We didn't copy the checksum so clear that bit. */
	SET_KEY_CSUM(dest, 0);
}

bool __bch_cut_front(const struct bkey *where, struct bkey *k)
{
	unsigned i, len = 0;

	if (bkey_cmp(where, &START_KEY(k)) <= 0)
		return false;

	if (bkey_cmp(where, k) < 0)
		len = KEY_OFFSET(k) - KEY_OFFSET(where);
	else
		bkey_copy_key(k, where);

	for (i = 0; i < KEY_PTRS(k); i++)
		SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + KEY_SIZE(k) - len);

	BUG_ON(len > KEY_SIZE(k));
	SET_KEY_SIZE(k, len);
	return true;
}

bool __bch_cut_back(const struct bkey *where, struct bkey *k)
{
	unsigned len = 0;

	if (bkey_cmp(where, k) >= 0)
		return false;

	BUG_ON(KEY_INODE(where) != KEY_INODE(k));

	if (bkey_cmp(where, &START_KEY(k)) > 0)
		len = KEY_OFFSET(where) - KEY_START(k);

	bkey_copy_key(k, where);

	BUG_ON(len > KEY_SIZE(k));
	SET_KEY_SIZE(k, len);
	return true;
}

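/*
 * Worked example (a sketch, not built; the KEY() initializer and the
 * values are only for illustration).  An extent key ends at KEY_OFFSET()
 * and covers KEY_SIZE() sectors, so KEY(1, 20, 10) covers sectors 10..20
 * of inode 1.  Cutting the front at offset 15 leaves 15..20 (offset stays
 * 20, size becomes 5); cutting the back at offset 15 leaves 10..15
 * (offset becomes 15, size becomes 5).
 */
#if 0
static void example_cut_extent(void)
{
	BKEY_PADDED(key) e;

	e.key = KEY(1, 20, 10);
	__bch_cut_front(&KEY(1, 15, 0), &e.key);
	BUG_ON(KEY_OFFSET(&e.key) != 20 || KEY_SIZE(&e.key) != 5);

	e.key = KEY(1, 20, 10);
	__bch_cut_back(&KEY(1, 15, 0), &e.key);
	BUG_ON(KEY_OFFSET(&e.key) != 15 || KEY_SIZE(&e.key) != 5);
}
#endif
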
/* Auxiliary search trees */

/* 32 bits total: */
#define BKEY_MID_BITS		3
#define BKEY_EXPONENT_BITS	7
#define BKEY_MANTISSA_BITS	(32 - BKEY_MID_BITS - BKEY_EXPONENT_BITS)
#define BKEY_MANTISSA_MASK	((1 << BKEY_MANTISSA_BITS) - 1)

struct bkey_float {
	unsigned	exponent:BKEY_EXPONENT_BITS;
	unsigned	m:BKEY_MID_BITS;
	unsigned	mantissa:BKEY_MANTISSA_BITS;
} __packed;

/*
 * BSET_CACHELINE was originally intended to match the hardware cacheline size -
 * it used to be 64, but I realized the lookup code would touch slightly less
 * memory if it was 128.
 *
 * It defines the number of bytes (in struct bset) per struct bkey_float in
 * the auxiliary search tree - when we're done searching the bset_float tree we
 * have this many bytes left that we do a linear search over.
 *
 * Since (after level 5) every level of the bset_tree is on a new cacheline,
 * we're touching one fewer cacheline in the bset tree in exchange for one more
 * cacheline in the linear search - but the linear search might stop before it
 * gets to the second cacheline.
 */

#define BSET_CACHELINE		128

/* Space required for the btree node keys */
static inline size_t btree_keys_bytes(struct btree_keys *b)
{
	return PAGE_SIZE << b->page_order;
}

static inline size_t btree_keys_cachelines(struct btree_keys *b)
{
	return btree_keys_bytes(b) / BSET_CACHELINE;
}

/* Space required for the auxiliary search trees */
static inline size_t bset_tree_bytes(struct btree_keys *b)
{
	return btree_keys_cachelines(b) * sizeof(struct bkey_float);
}

/* Space required for the prev pointers */
static inline size_t bset_prev_bytes(struct btree_keys *b)
{
	return btree_keys_cachelines(b) * sizeof(uint8_t);
}

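/*
 * Rough sizing sketch (assuming 4k pages and the 4 byte struct bkey_float
 * above): for a page_order 0 node, btree_keys_bytes() is 4096, so
 * btree_keys_cachelines() is 4096 / 128 = 32, bset_tree_bytes() is
 * 32 * 4 = 128 bytes and bset_prev_bytes() is 32 bytes - both well under
 * PAGE_SIZE, so bch_btree_keys_alloc() below takes the kmalloc() path.
 */
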
/* Memory allocation */

void bch_btree_keys_free(struct btree_keys *b)
{
	struct bset_tree *t = b->set;

	if (bset_prev_bytes(b) < PAGE_SIZE)
		kfree(t->prev);
	else
		free_pages((unsigned long) t->prev,
			   get_order(bset_prev_bytes(b)));

	if (bset_tree_bytes(b) < PAGE_SIZE)
		kfree(t->tree);
	else
		free_pages((unsigned long) t->tree,
			   get_order(bset_tree_bytes(b)));

	free_pages((unsigned long) t->data, b->page_order);

	t->prev = NULL;
	t->tree = NULL;
	t->data = NULL;
}
EXPORT_SYMBOL(bch_btree_keys_free);

int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp)
{
	struct bset_tree *t = b->set;

	BUG_ON(t->data);

	b->page_order = page_order;

	t->data = (void *) __get_free_pages(gfp, b->page_order);
	if (!t->data)
		goto err;

	t->tree = bset_tree_bytes(b) < PAGE_SIZE
		? kmalloc(bset_tree_bytes(b), gfp)
		: (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
	if (!t->tree)
		goto err;

	t->prev = bset_prev_bytes(b) < PAGE_SIZE
		? kmalloc(bset_prev_bytes(b), gfp)
		: (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
	if (!t->prev)
		goto err;

	return 0;
err:
	bch_btree_keys_free(b);
	return -ENOMEM;
}
EXPORT_SYMBOL(bch_btree_keys_alloc);

void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
			 bool *expensive_debug_checks)
{
	unsigned i;

	b->ops = ops;
	b->expensive_debug_checks = expensive_debug_checks;
	b->nsets = 0;
	b->last_set_unwritten = 0;

	/* XXX: shouldn't be needed */
	for (i = 0; i < MAX_BSETS; i++)
		b->set[i].size = 0;
	/*
	 * Second loop starts at 1 because b->set[0].data is the memory we
	 * allocated
	 */
	for (i = 1; i < MAX_BSETS; i++)
		b->set[i].data = NULL;
}
EXPORT_SYMBOL(bch_btree_keys_init);

/* Binary tree stuff for auxiliary search trees */

static unsigned inorder_next(unsigned j, unsigned size)
{
	if (j * 2 + 1 < size) {
		j = j * 2 + 1;

		while (j * 2 < size)
			j *= 2;
	} else
		j >>= ffz(j) + 1;

	return j;
}

static unsigned inorder_prev(unsigned j, unsigned size)
{
	if (j * 2 < size) {
		j = j * 2;

		while (j * 2 + 1 < size)
			j = j * 2 + 1;
	} else
		j >>= ffs(j);

	return j;
}

/* I have no idea why this code works... and I'm the one who wrote it
 *
 * However, I do know what it does:
 * Given a binary tree constructed in an array (i.e. how you normally implement
 * a heap), it converts a node in the tree - referenced by array index - to the
 * index it would have if you did an inorder traversal.
 *
 * Also tested for every j, size up to size somewhere around 6 million.
 *
 * The binary tree starts at array index 1, not 0
 * extra is a function of size:
 *	extra = (size - rounddown_pow_of_two(size - 1)) << 1;
 */
static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra)
{
	unsigned b = fls(j);
	unsigned shift = fls(size - 1) - b;

	j  ^= 1U << (b - 1);
	j <<= 1;
	j  |= 1;
	j <<= shift;

	if (j > extra)
		j -= (j - extra) >> 1;

	return j;
}

static unsigned to_inorder(unsigned j, struct bset_tree *t)
{
	return __to_inorder(j, t->size, t->extra);
}

static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra)
{
	unsigned shift;

	if (j > extra)
		j += j - extra;

	shift = ffs(j);

	j >>= shift;
	j  |= roundup_pow_of_two(size) >> shift;

	return j;
}

static unsigned inorder_to_tree(unsigned j, struct bset_tree *t)
{
	return __inorder_to_tree(j, t->size, t->extra);
}

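/*
 * Worked example: with size == 8 the tree holds nodes 1..7 and is
 * perfectly balanced, so extra == (8 - rounddown_pow_of_two(7)) << 1 == 8,
 * and __to_inorder() maps array index -> inorder position as
 *	1 -> 4, 2 -> 2, 3 -> 6, 4 -> 1, 5 -> 3, 6 -> 5, 7 -> 7
 * with __inorder_to_tree() being the exact inverse.
 */
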
#if 0
void inorder_test(void)
{
	unsigned long done = 0;
	ktime_t start = ktime_get();

	for (unsigned size = 2;
	     size < 65536000;
	     size++) {
		unsigned extra = (size - rounddown_pow_of_two(size - 1)) << 1;
		unsigned i = 1, j = rounddown_pow_of_two(size - 1);

		if (!(size % 4096))
			printk(KERN_NOTICE "loop %u, %llu per us\n", size,
			       done / ktime_us_delta(ktime_get(), start));

		while (1) {
			if (__inorder_to_tree(i, size, extra) != j)
				panic("size %10u j %10u i %10u", size, j, i);

			if (__to_inorder(j, size, extra) != i)
				panic("size %10u j %10u i %10u", size, j, i);

			if (j == rounddown_pow_of_two(size) - 1)
				break;

			BUG_ON(inorder_prev(inorder_next(j, size), size) != j);

			j = inorder_next(j, size);
			i++;
		}

		done += size - 1;
	}
}
#endif

/*
 * Cacheline/offset <-> bkey pointer arithmetic:
 *
 * t->tree is a binary search tree in an array; each node corresponds to a key
 * in one cacheline in t->set (BSET_CACHELINE bytes).
 *
 * This means we don't have to store the full index of the key that a node in
 * the binary tree points to; to_inorder() gives us the cacheline, and then
 * bkey_float->m gives us the offset within that cacheline, in units of 8 bytes.
 *
 * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
 * make this work.
 *
 * To construct the bfloat for an arbitrary key we need to know what the key
 * immediately preceding it is: we have to check if the two keys differ in the
 * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size
 * of the previous key so we can walk backwards to it from t->tree[j]'s key.
 */

static struct bkey *cacheline_to_bkey(struct bset_tree *t, unsigned cacheline,
				      unsigned offset)
{
	return ((void *) t->data) + cacheline * BSET_CACHELINE + offset * 8;
}

static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k)
{
	return ((void *) k - (void *) t->data) / BSET_CACHELINE;
}

static unsigned bkey_to_cacheline_offset(struct bset_tree *t,
					 unsigned cacheline,
					 struct bkey *k)
{
	return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0);
}

static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j)
{
	return cacheline_to_bkey(t, to_inorder(j, t), t->tree[j].m);
}

static struct bkey *tree_to_prev_bkey(struct bset_tree *t, unsigned j)
{
	return (void *) (((uint64_t *) tree_to_bkey(t, j)) - t->prev[j]);
}

/*
 * For the write set - the one we're currently inserting keys into - we don't
 * maintain a full search tree, we just keep a simple lookup table in t->prev.
 */
static struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline)
{
	return cacheline_to_bkey(t, cacheline, t->prev[cacheline]);
}

static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
{
	low >>= shift;
	low  |= (high << 1) << (63U - shift);
	return low;
}

static inline unsigned bfloat_mantissa(const struct bkey *k,
				       struct bkey_float *f)
{
	const uint64_t *p = &k->low - (f->exponent >> 6);
	return shrd128(p[-1], p[0], f->exponent & 63) & BKEY_MANTISSA_MASK;
}

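/*
 * shrd128() is a 128 bit funnel shift: it returns the 64 bit window of
 * high:low starting at bit shift (0..63).  Splitting the high word's shift
 * into (high << 1) << (63 - shift) avoids the undefined shift by 64 when
 * shift == 0.  Quick sketch with made up values: shrd128(0x1, 0x0, 4)
 * == 1ULL << 60, i.e. bit 64 of the 128 bit value lands at bit 60 of the
 * result.
 */
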
static void make_bfloat(struct bset_tree *t, unsigned j)
{
	struct bkey_float *f = &t->tree[j];
	struct bkey *m = tree_to_bkey(t, j);
	struct bkey *p = tree_to_prev_bkey(t, j);

	struct bkey *l = is_power_of_2(j)
		? t->data->start
		: tree_to_prev_bkey(t, j >> ffs(j));

	struct bkey *r = is_power_of_2(j + 1)
		? bset_bkey_idx(t->data, t->data->keys - bkey_u64s(&t->end))
		: tree_to_bkey(t, j >> (ffz(j) + 1));

	BUG_ON(m < l || m > r);
	BUG_ON(bkey_next(p) != m);

	if (KEY_INODE(l) != KEY_INODE(r))
		f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64;
	else
		f->exponent = fls64(r->low ^ l->low);

	f->exponent = max_t(int, f->exponent - BKEY_MANTISSA_BITS, 0);

	/*
	 * Setting f->exponent = 127 flags this node as failed, and causes the
	 * lookup code to fall back to comparing against the original key.
	 */
	if (bfloat_mantissa(m, f) != bfloat_mantissa(p, f))
		f->mantissa = bfloat_mantissa(m, f) - 1;
	else
		f->exponent = 127;
}

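/*
 * Worked example (illustrative numbers only): suppose l and r share an
 * inode and their low words first differ at bit 40, so
 * fls64(r->low ^ l->low) == 41.  With BKEY_MANTISSA_BITS == 22 that gives
 * f->exponent == 41 - 22 == 19, and bfloat_mantissa() extracts bits
 * 19..40 of the key - the 22 bit window ending at the highest bit that
 * can differ inside this node's range.  f->mantissa is then that value
 * minus one, which is what the sign bit trick in bset_search_tree()
 * relies on.
 */
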
static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t)
{
	if (t != b->set) {
		unsigned j = roundup(t[-1].size,
				     64 / sizeof(struct bkey_float));

		t->tree = t[-1].tree + j;
		t->prev = t[-1].prev + j;
	}

	while (t < b->set + MAX_BSETS)
		t++->size = 0;
}

static void bch_bset_build_unwritten_tree(struct btree_keys *b)
{
	struct bset_tree *t = bset_tree_last(b);

	BUG_ON(b->last_set_unwritten);
	b->last_set_unwritten = 1;

	bset_alloc_tree(b, t);

	if (t->tree != b->set->tree + btree_keys_cachelines(b)) {
		t->prev[0] = bkey_to_cacheline_offset(t, 0, t->data->start);
		t->size = 1;
	}
}

void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic)
{
	if (i != b->set->data) {
		b->set[++b->nsets].data = i;
		i->seq = b->set->data->seq;
	} else
		get_random_bytes(&i->seq, sizeof(uint64_t));

	i->magic	= magic;
	i->version	= 0;
	i->keys		= 0;

	bch_bset_build_unwritten_tree(b);
}
EXPORT_SYMBOL(bch_bset_init_next);

void bch_bset_build_written_tree(struct btree_keys *b)
{
	struct bset_tree *t = bset_tree_last(b);
	struct bkey *prev = NULL, *k = t->data->start;
	unsigned j, cacheline = 1;

	b->last_set_unwritten = 0;

	bset_alloc_tree(b, t);

	t->size = min_t(unsigned,
			bkey_to_cacheline(t, bset_bkey_last(t->data)),
			b->set->tree + btree_keys_cachelines(b) - t->tree);

	if (t->size < 2) {
		t->size = 0;
		return;
	}

	t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;

	/* First we figure out where the first key in each cacheline is */
	for (j = inorder_next(0, t->size);
	     j;
	     j = inorder_next(j, t->size)) {
		while (bkey_to_cacheline(t, k) < cacheline)
			prev = k, k = bkey_next(k);

		t->prev[j] = bkey_u64s(prev);
		t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k);
	}

	while (bkey_next(k) != bset_bkey_last(t->data))
		k = bkey_next(k);

	t->end = *k;

	/* Then we build the tree */
	for (j = inorder_next(0, t->size);
	     j;
	     j = inorder_next(j, t->size))
		make_bfloat(t, j);
}
EXPORT_SYMBOL(bch_bset_build_written_tree);

/* Insert */

void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k)
{
	struct bset_tree *t;
	unsigned inorder, j = 1;

	for (t = b->set; t <= bset_tree_last(b); t++)
		if (k < bset_bkey_last(t->data))
			goto found_set;

	BUG();
found_set:
	if (!t->size || !bset_written(b, t))
		return;

	inorder = bkey_to_cacheline(t, k);

	if (k == t->data->start)
		goto fix_left;

	if (bkey_next(k) == bset_bkey_last(t->data)) {
		t->end = *k;
		goto fix_right;
	}

	j = inorder_to_tree(inorder, t);

	if (j &&
	    j < t->size &&
	    k == tree_to_bkey(t, j))
fix_left:	do {
			make_bfloat(t, j);
			j = j * 2;
		} while (j < t->size);

	j = inorder_to_tree(inorder + 1, t);

	if (j &&
	    j < t->size &&
	    k == tree_to_prev_bkey(t, j))
fix_right:	do {
			make_bfloat(t, j);
			j = j * 2 + 1;
		} while (j < t->size);
}
EXPORT_SYMBOL(bch_bset_fix_invalidated_key);

static void bch_bset_fix_lookup_table(struct btree_keys *b,
				      struct bset_tree *t,
				      struct bkey *k)
{
	unsigned shift = bkey_u64s(k);
	unsigned j = bkey_to_cacheline(t, k);

	/* We're getting called from btree_split() or btree_gc, just bail out */
	if (!t->size)
		return;

	/*
	 * k is the key we just inserted; we need to find the entry in the
	 * lookup table for the first key that is strictly greater than k:
	 * it's either k's cacheline or the next one
	 */
	while (j < t->size &&
	       table_to_bkey(t, j) <= k)
		j++;

	/*
	 * Adjust all the lookup table entries, and find a new key for any that
	 * have gotten too big
	 */
	for (; j < t->size; j++) {
		t->prev[j] += shift;

		if (t->prev[j] > 7) {
			k = table_to_bkey(t, j - 1);

			while (k < cacheline_to_bkey(t, j, 0))
				k = bkey_next(k);

			t->prev[j] = bkey_to_cacheline_offset(t, j, k);
		}
	}

	if (t->size == b->set->tree + btree_keys_cachelines(b) - t->tree)
		return;

	/* Possibly add a new entry to the end of the lookup table */
	for (k = table_to_bkey(t, t->size - 1);
	     k != bset_bkey_last(t->data);
	     k = bkey_next(k))
		if (t->size == bkey_to_cacheline(t, k)) {
			t->prev[t->size] =
				bkey_to_cacheline_offset(t, t->size, k);
			t->size++;
		}
}

/*
 * Tries to merge l and r: l should be lower than r
 * Returns true if we were able to merge. If we did merge, l will be the merged
 * key, r will be untouched.
 */
bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r)
{
	if (!b->ops->key_merge)
		return false;

	/*
	 * Generic header checks
	 * Assumes left and right are in order
	 * Left and right must be exactly aligned
	 */
	if (!bch_bkey_equal_header(l, r) ||
	    bkey_cmp(l, &START_KEY(r)))
		return false;

	return b->ops->key_merge(b, l, r);
}
EXPORT_SYMBOL(bch_bkey_try_merge);

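/*
 * Merge example (illustrative; whether the merge actually happens is up to
 * ops->key_merge, which may still refuse for its own reasons such as key
 * size limits): two extents with identical headers where l covers 10..15
 * and r covers 15..20 satisfy bkey_cmp(l, &START_KEY(r)) == 0 above, and a
 * successful merge rewrites l to cover 10..20 while leaving r untouched.
 */
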
void bch_bset_insert(struct btree_keys *b, struct bkey *where,
		     struct bkey *insert)
{
	struct bset_tree *t = bset_tree_last(b);

	BUG_ON(!b->last_set_unwritten);
	BUG_ON(bset_byte_offset(b, t->data) +
	       __set_bytes(t->data, t->data->keys + bkey_u64s(insert)) >
	       PAGE_SIZE << b->page_order);

	memmove((uint64_t *) where + bkey_u64s(insert),
		where,
		(void *) bset_bkey_last(t->data) - (void *) where);

	t->data->keys += bkey_u64s(insert);
	bkey_copy(where, insert);
	bch_bset_fix_lookup_table(b, t, where);
}
EXPORT_SYMBOL(bch_bset_insert);

unsigned bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
			      struct bkey *replace_key)
{
	unsigned status = BTREE_INSERT_STATUS_NO_INSERT;
	struct bset *i = bset_tree_last(b)->data;
	struct bkey *m, *prev = NULL;
	struct btree_iter iter;

	BUG_ON(b->ops->is_extents && !KEY_SIZE(k));

	m = bch_btree_iter_init(b, &iter, b->ops->is_extents
				? PRECEDING_KEY(&START_KEY(k))
				: PRECEDING_KEY(k));

	if (b->ops->insert_fixup(b, k, &iter, replace_key))
		return status;

	status = BTREE_INSERT_STATUS_INSERT;

	while (m != bset_bkey_last(i) &&
	       bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0)
		prev = m, m = bkey_next(m);

	/* prev is in the tree, if we merge we're done */
	status = BTREE_INSERT_STATUS_BACK_MERGE;
	if (prev &&
	    bch_bkey_try_merge(b, prev, k))
		goto merged;
#if 0
	status = BTREE_INSERT_STATUS_OVERWROTE;
	if (m != bset_bkey_last(i) &&
	    KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
		goto copy;
#endif
	status = BTREE_INSERT_STATUS_FRONT_MERGE;
	if (m != bset_bkey_last(i) &&
	    bch_bkey_try_merge(b, k, m))
		goto copy;

	bch_bset_insert(b, m, k);
copy:	bkey_copy(m, k);
merged:
	return status;
}
EXPORT_SYMBOL(bch_btree_insert_key);

/* Lookup */
struct bset_search_iter {
	struct bkey *l, *r;
};

static struct bset_search_iter bset_search_write_set(struct bset_tree *t,
						      const struct bkey *search)
{
	unsigned li = 0, ri = t->size;

	while (li + 1 != ri) {
		unsigned m = (li + ri) >> 1;

		if (bkey_cmp(table_to_bkey(t, m), search) > 0)
			ri = m;
		else
			li = m;
	}

	return (struct bset_search_iter) {
		table_to_bkey(t, li),
		ri < t->size ? table_to_bkey(t, ri) : bset_bkey_last(t->data)
	};
}

static struct bset_search_iter bset_search_tree(struct bset_tree *t,
						const struct bkey *search)
{
	struct bkey *l, *r;
	struct bkey_float *f;
	unsigned inorder, j, n = 1;

	do {
		unsigned p = n << 4;
		p &= ((int) (p - t->size)) >> 31;

		prefetch(&t->tree[p]);

		j = n;
		f = &t->tree[j];

		/*
		 * n = (f->mantissa > bfloat_mantissa())
		 *	? j * 2
		 *	: j * 2 + 1;
		 *
		 * We need to subtract 1 from f->mantissa for the sign bit trick
		 * to work - that's done in make_bfloat()
		 */
		if (likely(f->exponent != 127))
			n = j * 2 + (((unsigned)
				      (f->mantissa -
				       bfloat_mantissa(search, f))) >> 31);
		else
			n = (bkey_cmp(tree_to_bkey(t, j), search) > 0)
				? j * 2
				: j * 2 + 1;
	} while (n < t->size);

	inorder = to_inorder(j, t);

	/*
	 * n would have been the node we recursed to - the low bit tells us if
	 * we recursed left or recursed right.
	 */
	if (n & 1) {
		l = cacheline_to_bkey(t, inorder, f->m);

		if (++inorder != t->size) {
			f = &t->tree[inorder_next(j, t->size)];
			r = cacheline_to_bkey(t, inorder, f->m);
		} else
			r = bset_bkey_last(t->data);
	} else {
		r = cacheline_to_bkey(t, inorder, f->m);

		if (--inorder) {
			f = &t->tree[inorder_prev(j, t->size)];
			l = cacheline_to_bkey(t, inorder, f->m);
		} else
			l = t->data->start;
	}

	return (struct bset_search_iter) {l, r};
}

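/*
 * A note on the sign bit trick above: because make_bfloat() stores the
 * original mantissa minus one, (unsigned) (f->mantissa -
 * bfloat_mantissa(search, f)) >> 31 is 1 exactly when f->mantissa is less
 * than the search key's mantissa.  For example, 5 - 7 underflows to
 * 0xfffffffe so we recurse right (j * 2 + 1), while 5 - 3 == 2 keeps the
 * top bit clear and we recurse left (j * 2); the stored minus-one makes an
 * exact match go right, i.e. it behaves like a >= comparison against the
 * original mantissa.
 */
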
struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t,
			       const struct bkey *search)
{
	struct bset_search_iter i;

	/*
	 * First, we search for a cacheline, then lastly we do a linear search
	 * within that cacheline.
	 *
	 * To search for the cacheline, there's three different possibilities:
	 *  * The set is too small to have a search tree, so we just do a linear
	 *    search over the whole set.
	 *  * The set is the one we're currently inserting into; keeping a full
	 *    auxiliary search tree up to date would be too expensive, so we
	 *    use a much simpler lookup table to do a binary search -
	 *    bset_search_write_set().
	 *  * Or we use the auxiliary search tree we constructed earlier -
	 *    bset_search_tree()
	 */
	if (unlikely(!t->size)) {
		i.l = t->data->start;
		i.r = bset_bkey_last(t->data);
	} else if (bset_written(b, t)) {
		/*
		 * Each node in the auxiliary search tree covers a certain range
		 * of bits, and keys above and below the set it covers might
		 * differ outside those bits - so we have to special case the
		 * start and end - handle that here:
		 */
		if (unlikely(bkey_cmp(search, &t->end) >= 0))
			return bset_bkey_last(t->data);

		if (unlikely(bkey_cmp(search, t->data->start) < 0))
			return t->data->start;

		i = bset_search_tree(t, search);
	} else {
		BUG_ON(!b->nsets &&
		       t->size < bkey_to_cacheline(t, bset_bkey_last(t->data)));

		i = bset_search_write_set(t, search);
	}

	if (btree_keys_expensive_checks(b)) {
		BUG_ON(bset_written(b, t) &&
		       i.l != t->data->start &&
		       bkey_cmp(tree_to_prev_bkey(t,
			  inorder_to_tree(bkey_to_cacheline(t, i.l), t)),
				search) > 0);

		BUG_ON(i.r != bset_bkey_last(t->data) &&
		       bkey_cmp(i.r, search) <= 0);
	}

	while (likely(i.l != i.r) &&
	       bkey_cmp(i.l, search) <= 0)
		i.l = bkey_next(i.l);

	return i.l;
}
EXPORT_SYMBOL(__bch_bset_search);

/* Btree iterator */
typedef bool (btree_iter_cmp_fn)(struct btree_iter_set,
				 struct btree_iter_set);

static inline bool btree_iter_cmp(struct btree_iter_set l,
				  struct btree_iter_set r)
{
	return bkey_cmp(l.k, r.k) > 0;
}

static inline bool btree_iter_end(struct btree_iter *iter)
{
	return !iter->used;
}

void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
			 struct bkey *end)
{
	if (k != end)
		BUG_ON(!heap_add(iter,
				 ((struct btree_iter_set) { k, end }),
				 btree_iter_cmp));
}

static struct bkey *__bch_btree_iter_init(struct btree_keys *b,
					  struct btree_iter *iter,
					  struct bkey *search,
					  struct bset_tree *start)
{
	struct bkey *ret = NULL;
	iter->size = ARRAY_SIZE(iter->data);
	iter->used = 0;

#ifdef CONFIG_BCACHE_DEBUG
	iter->b = b;
#endif

	for (; start <= bset_tree_last(b); start++) {
		ret = bch_bset_search(b, start, search);
		bch_btree_iter_push(iter, ret, bset_bkey_last(start->data));
	}

	return ret;
}

struct bkey *bch_btree_iter_init(struct btree_keys *b,
				 struct btree_iter *iter,
				 struct bkey *search)
{
	return __bch_btree_iter_init(b, iter, search, b->set);
}
EXPORT_SYMBOL(bch_btree_iter_init);

static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
						 btree_iter_cmp_fn *cmp)
{
	struct btree_iter_set unused;
	struct bkey *ret = NULL;

	if (!btree_iter_end(iter)) {
		bch_btree_iter_next_check(iter);

		ret = iter->data->k;
		iter->data->k = bkey_next(iter->data->k);

		if (iter->data->k > iter->data->end) {
			WARN_ONCE(1, "bset was corrupt!\n");
			iter->data->k = iter->data->end;
		}

		if (iter->data->k == iter->data->end)
			heap_pop(iter, unused, cmp);
		else
			heap_sift(iter, 0, cmp);
	}

	return ret;
}

struct bkey *bch_btree_iter_next(struct btree_iter *iter)
{
	return __bch_btree_iter_next(iter, btree_iter_cmp);
}
EXPORT_SYMBOL(bch_btree_iter_next);

struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
					struct btree_keys *b, ptr_filter_fn fn)
{
	struct bkey *ret;

	do {
		ret = bch_btree_iter_next(iter);
	} while (ret && fn(b, ret));

	return ret;
}

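/*
 * Usage sketch (not built): walking every non-stale key in a node with the
 * iterator.  for_each_key_filter() is assumed to be the bset.h wrapper
 * around bch_btree_iter_init()/bch_btree_iter_next_filter().
 */
#if 0
static void example_walk(struct btree_keys *b)
{
	struct btree_iter iter;
	struct bkey *k;

	for_each_key_filter(b, k, &iter, bch_ptr_bad)
		pr_debug("%llu:%llu", KEY_INODE(k), KEY_OFFSET(k));
}
#endif
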
/* Mergesort */
void bch_bset_sort_state_free(struct bset_sort_state *state)
{
	if (state->pool)
		mempool_destroy(state->pool);
}

int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order)
{
	spin_lock_init(&state->time.lock);

	state->page_order = page_order;
	state->crit_factor = int_sqrt(1 << page_order);

	state->pool = mempool_create_page_pool(1, page_order);
	if (!state->pool)
		return -ENOMEM;

	return 0;
}
EXPORT_SYMBOL(bch_bset_sort_state_init);

static void btree_mergesort(struct btree_keys *b, struct bset *out,
			    struct btree_iter *iter,
			    bool fixup, bool remove_stale)
{
	int i;
	struct bkey *k, *last = NULL;
	BKEY_PADDED(k) tmp;
	bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale
		? bch_ptr_bad
		: bch_ptr_invalid;

	/* Heapify the iterator, using our comparison function */
	for (i = iter->used / 2 - 1; i >= 0; --i)
		heap_sift(iter, i, b->ops->sort_cmp);

	while (!btree_iter_end(iter)) {
		if (b->ops->sort_fixup && fixup)
			k = b->ops->sort_fixup(iter, &tmp.k);
		else
			k = NULL;

		if (!k)
			k = __bch_btree_iter_next(iter, b->ops->sort_cmp);

		if (bad(b, k))
			continue;

		if (!last) {
			last = out->start;
			bkey_copy(last, k);
		} else if (!bch_bkey_try_merge(b, last, k)) {
			last = bkey_next(last);
			bkey_copy(last, k);
		}
	}

	out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0;

	pr_debug("sorted %i keys", out->keys);
}

static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
			 unsigned start, unsigned order, bool fixup,
			 struct bset_sort_state *state)
{
	uint64_t start_time;
	bool used_mempool = false;
	struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT,
						     order);
	if (!out) {
		struct page *outp;

		BUG_ON(order > state->page_order);

		outp = mempool_alloc(state->pool, GFP_NOIO);
		out = page_address(outp);
		used_mempool = true;
		order = state->page_order;
	}

	start_time = local_clock();

	btree_mergesort(b, out, iter, fixup, false);
	b->nsets = start;

	if (!start && order == b->page_order) {
		/*
		 * Our temporary buffer is the same size as the btree node's
		 * buffer, we can just swap buffers instead of doing a big
		 * memcpy()
		 */
		out->magic	= b->set->data->magic;
		out->seq	= b->set->data->seq;
		out->version	= b->set->data->version;
		swap(out, b->set->data);
	} else {
		b->set[start].data->keys = out->keys;
		memcpy(b->set[start].data->start, out->start,
		       (void *) bset_bkey_last(out) - (void *) out->start);
	}

	if (used_mempool)
		mempool_free(virt_to_page(out), state->pool);
	else
		free_pages((unsigned long) out, order);

	bch_bset_build_written_tree(b);

	if (!start)
		bch_time_stats_update(&state->time, start_time);
}

void bch_btree_sort_partial(struct btree_keys *b, unsigned start,
			    struct bset_sort_state *state)
{
	size_t order = b->page_order, keys = 0;
	struct btree_iter iter;
	int oldsize = bch_count_data(b);

	__bch_btree_iter_init(b, &iter, NULL, &b->set[start]);

	if (start) {
		unsigned i;

		for (i = start; i <= b->nsets; i++)
			keys += b->set[i].data->keys;

		order = get_order(__set_bytes(b->set->data, keys));
	}

	__btree_sort(b, &iter, start, order, false, state);

	EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize);
}
EXPORT_SYMBOL(bch_btree_sort_partial);

void bch_btree_sort_and_fix_extents(struct btree_keys *b,
				    struct btree_iter *iter,
				    struct bset_sort_state *state)
{
	__btree_sort(b, iter, 0, b->page_order, true, state);
}

void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,
			 struct bset_sort_state *state)
{
	uint64_t start_time = local_clock();
	struct btree_iter iter;

	bch_btree_iter_init(b, &iter, NULL);

	btree_mergesort(b, new->set->data, &iter, false, true);

	bch_time_stats_update(&state->time, start_time);

	new->set->size = 0; // XXX: why?
}

#define SORT_CRIT	(4096 / sizeof(uint64_t))

void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state)
{
	unsigned crit = SORT_CRIT;
	int i;

	/* Don't sort if nothing to do */
	if (!b->nsets)
		goto out;

	for (i = b->nsets - 1; i >= 0; --i) {
		crit *= state->crit_factor;

		if (b->set[i].data->keys < crit) {
			bch_btree_sort_partial(b, i, state);
			return;
		}
	}

	/* Sort if we'd overflow */
	if (b->nsets + 1 == MAX_BSETS) {
		bch_btree_sort(b, state);
		return;
	}

out:
	bch_bset_build_written_tree(b);
}
EXPORT_SYMBOL(bch_btree_sort_lazy);

void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats)
{
	unsigned i;

	for (i = 0; i <= b->nsets; i++) {
		struct bset_tree *t = &b->set[i];
		size_t bytes = t->data->keys * sizeof(uint64_t);
		size_t j;

		if (bset_written(b, t)) {
			stats->sets_written++;
			stats->bytes_written += bytes;

			stats->floats += t->size - 1;

			for (j = 1; j < t->size; j++)
				if (t->tree[j].exponent == 127)
					stats->failed++;
		} else {
			stats->sets_unwritten++;
			stats->bytes_unwritten += bytes;
		}
	}
}