/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
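/*
 * Like memcpy(), but also unblesses @src, which must have been blessed
 * for writing to disk (see __dm_bless_for_disk()).
 */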
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}
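
/*
 * Inserts @elt at @index in an array of @nr_elts elements, shifting any
 * later elements up to make room.
 */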
static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
			 unsigned index, void *elt)
	__dm_written_to_disk(elt)
{
	if (index < nr_elts)
		memmove(base + (elt_size * (index + 1)),
			base + (elt_size * index),
			(nr_elts - index) * elt_size);

	memcpy_disk(base + (elt_size * index), elt, elt_size);
}

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
{
	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

	while (hi - lo > 1) {
		int mid = lo + ((hi - lo) / 2);
		uint64_t mid_key = le64_to_cpu(n->keys[mid]);

		if (mid_key == key)
			return mid;

		if (mid_key < key)
			lo = mid;
		else
			hi = mid;
	}

	return want_hi ? hi : lo;
}

int lower_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}

static int upper_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 1);
}
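
/*
 * Increments the reference counts of a node's children: the child blocks
 * themselves for an internal node, or each value via the value type's
 * inc method for a leaf.
 */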
void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
		  struct dm_btree_value_type *vt)
{
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
		dm_tm_with_runs(tm, value_ptr(n, 0), nr_entries, dm_tm_inc_range);

	else if (vt->inc)
		vt->inc(vt->context, value_ptr(n, 0), nr_entries);
}
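
/*
 * Inserts a new key/value pair at @index in @node, shifting the existing
 * entries at and above @index up by one.
 */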
static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
		     uint64_t key, void *value)
	__dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	uint32_t max_entries = le32_to_cpu(node->header.max_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= max_entries ||
	    nr_entries >= max_entries) {
		DMERR("too many entries in btree node for insert");
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n).  This works more nicely for repeated
 * insert/remove loops than (2n + 1).
 */
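/*
 * For example, assuming 4096-byte blocks and a 32-byte node header, an
 * 8-byte value gives elt_size = 16, total = 254 and so max_entries = 252.
 */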
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
	uint32_t total, n;
	size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

	block_size -= sizeof(struct node_header);
	total = block_size / elt_size;
	n = total / 3;		/* rounds down */

	return 3 * n;
}

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct btree_node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);
	unlock_block(info, b);

	return 0;
}
EXPORT_SYMBOL_GPL(dm_btree_empty);

/*----------------------------------------------------------------*/

/*
 * Deletion uses a recursive algorithm; since we have limited stack space
 * we explicitly manage our own stack on the heap.
 */
#define MAX_SPINE_DEPTH 64

struct frame {
	struct dm_block *b;
	struct btree_node *n;
	unsigned level;
	unsigned nr_children;
	unsigned current_child;
};

struct del_stack {
	struct dm_btree_info *info;
	struct dm_transaction_manager *tm;
	int top;
	struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
	if (s->top < 0) {
		DMERR("btree deletion stack empty");
		return -EINVAL;
	}

	*f = s->spine + s->top;

	return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}

static void prefetch_children(struct del_stack *s, struct frame *f)
{
	unsigned i;
	struct dm_block_manager *bm = dm_tm_get_bm(s->tm);

	for (i = 0; i < f->nr_children; i++)
		dm_bm_prefetch(bm, value64(f->n, i));
}

static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
	return f->level < (info->levels - 1);
}
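
/*
 * Pushes a new frame for block @b onto the deletion stack, read-locking the
 * block.  If the block is shared we simply decrement its reference count
 * and leave its children alone.
 */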
static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children.
		 */
		dm_tm_dec(s->tm, b);

	else {
		uint32_t flags;
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
			prefetch_children(s, f);
	}

	return 0;
}

static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}

static void unlock_all_frames(struct del_stack *s)
{
	struct frame *f;

	while (unprocessed_frames(s)) {
		f = s->spine + s->top--;
		dm_tm_unlock(s->tm, f->b);
	}
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
	int r;
	struct del_stack *s;

	/*
	 * dm_btree_del() is called via an ioctl, as such should be
	 * considered an FS op.  We can't recurse back into the FS, so we
	 * allocate GFP_NOFS.
	 */
	s = kmalloc(sizeof(*s), GFP_NOFS);
	if (!s)
		return -ENOMEM;

	s->info = info;
	s->tm = info->tm;
	s->top = -1;

	r = push_frame(s, root, 0);
	if (r)
		goto out;

	while (unprocessed_frames(s)) {
		uint32_t flags;
		struct frame *f;
		dm_block_t b;

		r = top_frame(s, &f);
		if (r)
			goto out;

		if (f->current_child >= f->nr_children) {
			pop_frame(s);
			continue;
		}

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level);
			if (r)
				goto out;

		} else if (is_internal_level(info, f)) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level + 1);
			if (r)
				goto out;

		} else {
			if (info->value_type.dec)
				info->value_type.dec(info->value_type.context,
						     value_ptr(f->n, 0), f->nr_children);
			pop_frame(s);
		}
	}
out:
	if (r) {
		/* cleanup all frames of del_stack */
		unlock_all_frames(s);
	}
	kfree(s);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);

/*----------------------------------------------------------------*/
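
/*
 * Walks from @block down to a leaf, using @search_fn to pick an entry at
 * each level.  On success the key found is returned in @result_key and,
 * if @v is non-NULL, the value is copied into it.
 */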
static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct btree_node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	if (v)
		memcpy(v, value_ptr(ro_node(s), i), value_size);

	return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value_le)
{
	unsigned level, last_level = info->levels - 1;
	int r = -ENODATA;
	uint64_t rkey;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		size_t size;
		void *value_p;

		if (level == last_level) {
			value_p = value_le;
			size = info->value_type.size;

		} else {
			value_p = &internal_value_le;
			size = sizeof(uint64_t);
		}

		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, &rkey,
				     value_p, size);

		if (!r) {
			if (rkey != keys[level]) {
				exit_ro_spine(&spine);
				return -ENODATA;
			}
		} else {
			exit_ro_spine(&spine);
			return r;
		}

		root = le64_to_cpu(internal_value_le);
	}
	exit_ro_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);
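
/*
 * Finds the entry with the lowest key that is greater than or equal to
 * @key in the (single level) btree rooted at @root, recursing into child
 * nodes as needed.
 */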
static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root,
				       uint64_t key, uint64_t *rkey, void *value_le)
{
	int r, i;
	uint32_t flags, nr_entries;
	struct dm_block *node;
	struct btree_node *n;

	r = bn_read_lock(info, root, &node);
	if (r)
		return r;

	n = dm_block_data(node);
	flags = le32_to_cpu(n->header.flags);
	nr_entries = le32_to_cpu(n->header.nr_entries);

	if (flags & INTERNAL_NODE) {
		i = lower_bound(n, key);
		if (i < 0) {
			/*
			 * avoid early -ENODATA return when all entries are
			 * higher than the search @key.
			 */
			i = 0;
		}
		if (i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		if (r == -ENODATA && i < (nr_entries - 1)) {
			i++;
			r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		}

	} else {
		i = upper_bound(n, key);
		if (i < 0 || i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		*rkey = le64_to_cpu(n->keys[i]);
		memcpy(value_le, value_ptr(n, i), info->value_type.size);
	}
out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
			 uint64_t *keys, uint64_t *rkey, void *value_le)
{
	unsigned level;
	int r = -ENODATA;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels - 1u; level++) {
		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, rkey,
				     &internal_value_le, sizeof(uint64_t));
		if (r)
			goto out;

		if (*rkey != keys[level]) {
			r = -ENODATA;
			goto out;
		}

		root = le64_to_cpu(internal_value_le);
	}

	r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le);
out:
	exit_ro_spine(&spine);
	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup_next);

/*----------------------------------------------------------------*/

/*
 * Copies entries from one region of a btree node to another.  The regions
 * must not overlap.
 */
static void copy_entries(struct btree_node *dest, unsigned dest_offset,
			 struct btree_node *src, unsigned src_offset,
			 unsigned count)
{
	size_t value_size = le32_to_cpu(dest->header.value_size);
	memcpy(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t));
	memcpy(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size);
}

/*
 * Moves entries from one region of a btree node to another.  The regions
 * may overlap.
 */
static void move_entries(struct btree_node *dest, unsigned dest_offset,
			 struct btree_node *src, unsigned src_offset,
			 unsigned count)
{
	size_t value_size = le32_to_cpu(dest->header.value_size);
	memmove(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t));
	memmove(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size);
}

/*
 * Erases the first 'count' entries of a btree node, shifting following
 * entries down into their place.
 */
static void shift_down(struct btree_node *n, unsigned count)
{
	move_entries(n, 0, n, count, le32_to_cpu(n->header.nr_entries) - count);
}

/*
 * Moves entries in a btree node up 'count' places, making space for
 * new entries at the start of the node.
 */
static void shift_up(struct btree_node *n, unsigned count)
{
	move_entries(n, count, n, 0, le32_to_cpu(n->header.nr_entries));
}

/*
 * Redistributes entries between two btree nodes to make them
 * have similar numbers of entries.
 */
static void redistribute2(struct btree_node *left, struct btree_node *right)
{
	unsigned nr_left = le32_to_cpu(left->header.nr_entries);
	unsigned nr_right = le32_to_cpu(right->header.nr_entries);
	unsigned total = nr_left + nr_right;
	unsigned target_left = total / 2;
	unsigned target_right = total - target_left;

	if (nr_left < target_left) {
		unsigned delta = target_left - nr_left;
		copy_entries(left, nr_left, right, 0, delta);
		shift_down(right, delta);
	} else if (nr_left > target_left) {
		unsigned delta = nr_left - target_left;
		if (nr_right)
			shift_up(right, delta);
		copy_entries(right, 0, left, target_left, delta);
	}

	left->header.nr_entries = cpu_to_le32(target_left);
	right->header.nr_entries = cpu_to_le32(target_right);
}

/*
 * Redistribute entries between three nodes.  Assumes the central
 * node is empty.
 */
static void redistribute3(struct btree_node *left, struct btree_node *center,
			  struct btree_node *right)
{
	unsigned nr_left = le32_to_cpu(left->header.nr_entries);
	unsigned nr_center = le32_to_cpu(center->header.nr_entries);
	unsigned nr_right = le32_to_cpu(right->header.nr_entries);
	unsigned total, target_left, target_center, target_right;

	BUG_ON(nr_center);

	total = nr_left + nr_right;
	target_left = total / 3;
	target_center = (total - target_left) / 2;
	target_right = (total - target_left - target_center);

	if (nr_left < target_left) {
		unsigned left_short = target_left - nr_left;
		copy_entries(left, nr_left, right, 0, left_short);
		copy_entries(center, 0, right, left_short, target_center);
		shift_down(right, nr_right - target_right);

	} else if (nr_left < (target_left + target_center)) {
		unsigned left_to_center = nr_left - target_left;
		copy_entries(center, 0, left, target_left, left_to_center);
		copy_entries(center, left_to_center, right, 0, target_center - left_to_center);
		shift_down(right, nr_right - target_right);

	} else {
		unsigned right_short = target_right - nr_right;
		shift_up(right, right_short);
		copy_entries(right, 0, left, nr_left - right_short, right_short);
		copy_entries(center, 0, left, target_left, nr_left - target_left);
	}

	left->header.nr_entries = cpu_to_le32(target_left);
	center->header.nr_entries = cpu_to_le32(target_center);
	right->header.nr_entries = cpu_to_le32(target_right);
}

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *	  +--------+
 *	  | Parent |
 *	  +--------+
 *	      |
 *	      v
 *	+----------+
 *	| A ++++++ |
 *	+----------+
 *
 * After:
 *		+--------+
 *		| Parent |
 *		+--------+
 *		   |	|
 *		   v	+------+
 *	  +---------+	       |
 *	  | A* +++  |	       v
 *	  +---------+	+-------+
 *			| B +++ |
 *			+-------+
 *
 * Where A* is a shadow of A.
 */
static int split_one_into_two(struct shadow_spine *s, unsigned parent_index,
			      struct dm_btree_value_type *vt, uint64_t key)
{
	int r;
	struct dm_block *left, *right, *parent;
	struct btree_node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(0);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	redistribute2(ln, rn);

	/* patch up the parent */
	parent = shadow_parent(s);
	pn = dm_block_data(parent);

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);
	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r) {
		unlock_block(s->info, right);
		return r;
	}

	/* patch up the spine */
	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}

/*
 * We often need to modify a sibling node.  This function shadows a particular
 * child of the given parent node, making sure to update the parent to point
 * to the new shadow.
 */
static int shadow_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
			struct btree_node *parent, unsigned index,
			struct dm_block **result)
{
	int r, inc;
	dm_block_t root;
	struct btree_node *node;

	root = value64(parent, index);

	r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
			       result, &inc);
	if (r)
		return r;

	node = dm_block_data(*result);

	if (inc)
		inc_children(info->tm, node, vt);

	*((__le64 *) value_ptr(parent, index)) =
		cpu_to_le64(dm_block_location(*result));

	return 0;
}

/*
 * Splits two nodes into three.  This is more work, but results in fuller
 * nodes, so saves metadata space.
 */
static int split_two_into_three(struct shadow_spine *s, unsigned parent_index,
				struct dm_btree_value_type *vt, uint64_t key)
{
	int r;
	unsigned middle_index;
	struct dm_block *left, *middle, *right, *parent;
	struct btree_node *ln, *rn, *mn, *pn;
	__le64 location;

	parent = shadow_parent(s);
	pn = dm_block_data(parent);

	if (parent_index == 0) {
		middle_index = 1;
		left = shadow_current(s);
		r = shadow_child(s->info, vt, pn, parent_index + 1, &right);
		if (r)
			return r;
	} else {
		middle_index = parent_index;
		right = shadow_current(s);
		r = shadow_child(s->info, vt, pn, parent_index - 1, &left);
		if (r)
			return r;
	}

	r = new_block(s->info, &middle);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	mn = dm_block_data(middle);
	rn = dm_block_data(right);

	mn->header.nr_entries = cpu_to_le32(0);
	mn->header.flags = ln->header.flags;
	mn->header.max_entries = ln->header.max_entries;
	mn->header.value_size = ln->header.value_size;

	redistribute3(ln, mn, rn);

	/* patch up the parent */
	pn->keys[middle_index] = rn->keys[0];
	location = cpu_to_le64(dm_block_location(middle));
	__dm_bless_for_disk(&location);
	r = insert_at(sizeof(__le64), pn, middle_index,
		      le64_to_cpu(mn->keys[0]), &location);
	if (r) {
		if (shadow_current(s) != left)
			unlock_block(s->info, left);

		unlock_block(s->info, middle);

		if (shadow_current(s) != right)
			unlock_block(s->info, right);

		return r;
	}

	/* patch up the spine */
	if (key < le64_to_cpu(mn->keys[0])) {
		unlock_block(s->info, middle);
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, left);
		unlock_block(s->info, right);
		s->nodes[1] = middle;
	} else {
		unlock_block(s->info, left);
		unlock_block(s->info, middle);
		s->nodes[1] = right;
	}

	return 0;
}

/*----------------------------------------------------------------*/

/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *	+----------+
 *	| A ++++++ |
 *	+----------+
 *
 * After:
 *	+------------+
 *	| A (shadow) |
 *	+------------+
 *	    |	  |
 *	+---+	  +---+
 *	|	      |
 *	v	      v
 * +-------+	 +-------+
 * | B +++ |	 | C +++ |
 * +-------+	 +-------+
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct btree_node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	pn = dm_block_data(new_parent);
	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;

	/* create & init the left block */
	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;

	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;
	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);

	/* create & init the right block */
	r = new_block(s->info, &right);
	if (r < 0) {
		unlock_block(s->info, left);
		return r;
	}

	rn = dm_block_data(right);
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
	       nr_right * size);

	/* new_parent should just point to l and r now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

	unlock_block(s->info, left);
	unlock_block(s->info, right);
	return 0;
}

/*----------------------------------------------------------------*/

/*
 * Redistributes a node's entries with its left sibling.
 */
static int rebalance_left(struct shadow_spine *s, struct dm_btree_value_type *vt,
			  unsigned parent_index, uint64_t key)
{
	int r;
	struct dm_block *sib;
	struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s));

	r = shadow_child(s->info, vt, parent, parent_index - 1, &sib);
	if (r)
		return r;

	left = dm_block_data(sib);
	right = dm_block_data(shadow_current(s));
	redistribute2(left, right);
	*key_ptr(parent, parent_index) = right->keys[0];

	if (key < le64_to_cpu(right->keys[0])) {
		unlock_block(s->info, s->nodes[1]);
		s->nodes[1] = sib;
	} else {
		unlock_block(s->info, sib);
	}

	return 0;
}

/*
 * Redistributes a node's entries with its right sibling.
 */
static int rebalance_right(struct shadow_spine *s, struct dm_btree_value_type *vt,
			   unsigned parent_index, uint64_t key)
{
	int r;
	struct dm_block *sib;
	struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s));

	r = shadow_child(s->info, vt, parent, parent_index + 1, &sib);
	if (r)
		return r;

	left = dm_block_data(shadow_current(s));
	right = dm_block_data(sib);
	redistribute2(left, right);
	*key_ptr(parent, parent_index + 1) = right->keys[0];

	if (key < le64_to_cpu(right->keys[0])) {
		unlock_block(s->info, sib);
	} else {
		unlock_block(s->info, s->nodes[1]);
		s->nodes[1] = sib;
	}

	return 0;
}

/*
 * Returns the number of spare entries in a node.
 */
static int get_node_free_space(struct dm_btree_info *info, dm_block_t b, unsigned *space)
{
	int r;
	unsigned nr_entries;
	struct dm_block *block;
	struct btree_node *node;

	r = bn_read_lock(info, b, &block);
	if (r)
		return r;

	node = dm_block_data(block);
	nr_entries = le32_to_cpu(node->header.nr_entries);
	*space = le32_to_cpu(node->header.max_entries) - nr_entries;

	unlock_block(info, block);
	return 0;
}

/*
 * Make space in a node, either by moving some entries to a sibling,
 * or creating a new sibling node.  SPACE_THRESHOLD defines the minimum
 * number of free entries that must be in the sibling to make the move
 * worthwhile.  If the siblings are shared (eg, part of a snapshot),
 * then they are not touched, since this would break sharing and so
 * consume more space than we save.
 */
#define SPACE_THRESHOLD 8
static int rebalance_or_split(struct shadow_spine *s, struct dm_btree_value_type *vt,
			      unsigned parent_index, uint64_t key)
{
	int r;
	struct btree_node *parent = dm_block_data(shadow_parent(s));
	unsigned nr_parent = le32_to_cpu(parent->header.nr_entries);
	unsigned free_space;
	int left_shared = 0, right_shared = 0;

	/* Should we move entries to the left sibling? */
	if (parent_index > 0) {
		dm_block_t left_b = value64(parent, parent_index - 1);

		r = dm_tm_block_is_shared(s->info->tm, left_b, &left_shared);
		if (r)
			return r;

		if (!left_shared) {
			r = get_node_free_space(s->info, left_b, &free_space);
			if (r)
				return r;

			if (free_space >= SPACE_THRESHOLD)
				return rebalance_left(s, vt, parent_index, key);
		}
	}

	/* Should we move entries to the right sibling? */
	if (parent_index < (nr_parent - 1)) {
		dm_block_t right_b = value64(parent, parent_index + 1);

		r = dm_tm_block_is_shared(s->info->tm, right_b, &right_shared);
		if (r)
			return r;

		if (!right_shared) {
			r = get_node_free_space(s->info, right_b, &free_space);
			if (r)
				return r;

			if (free_space >= SPACE_THRESHOLD)
				return rebalance_right(s, vt, parent_index, key);
		}
	}

	/*
	 * We need to split the node, normally we split two nodes
	 * into three.  But when inserting a sequence that is either
	 * monotonically increasing or decreasing it's better to split
	 * a single node into two.
	 */
	if (left_shared || right_shared || (nr_parent <= 2) ||
	    (parent_index == 0) || (parent_index + 1 == nr_parent)) {
		return split_one_into_two(s, parent_index, vt, key);
	} else {
		return split_two_into_three(s, parent_index, vt, key);
	}
}

/*
 * Does the node contain a particular key?
 */
static bool contains_key(struct btree_node *node, uint64_t key)
{
	int i = lower_bound(node, key);

	if (i >= 0 && le64_to_cpu(node->keys[i]) == key)
		return true;

	return false;
}

/*
 * In general we preemptively make sure there's a free entry in every
 * node on the spine when doing an insert.  But we can avoid that with
 * leaf nodes if we know it's an overwrite.
 */
static bool has_space_for_insert(struct btree_node *node, uint64_t key)
{
	if (node->header.nr_entries == node->header.max_entries) {
		if (le32_to_cpu(node->header.flags) & LEAF_NODE) {
			/* we don't need space if it's an overwrite */
			return contains_key(node, key);
		}

		return false;
	}

	return true;
}
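
/*
 * Walks the spine from @root down to the leaf for @key, shadowing each node
 * and making space (by rebalancing or splitting) wherever a node is full.
 * On success, *index is the slot in the leaf that either already holds @key
 * or where it should be inserted.
 */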
static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct btree_node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unness. */
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		if (!has_space_for_insert(node, key)) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = rebalance_or_split(s, vt, i, key);

			if (r < 0)
				return r;

			/* making space can cause the current node to change */
			node = dm_block_data(shadow_current(s));
		}

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}
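
/*
 * Like btree_insert_raw(), but for overwrites only: shadows the spine down
 * to the leaf that already contains @key, returning -EINVAL if it doesn't.
 */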
static int __btree_get_overwrite_leaf(struct shadow_spine *s, dm_block_t root,
				      uint64_t key, int *index)
{
	int r, i = -1;
	struct btree_node *node;

	*index = 0;
	for (;;) {
		r = shadow_step(s, root, &s->info->value_type);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && i >= 0) {
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));
		i = lower_bound(node, key);

		BUG_ON(i < 0);
		BUG_ON(i >= le32_to_cpu(node->header.nr_entries));

		if (le32_to_cpu(node->header.flags) & LEAF_NODE) {
			if (key != le64_to_cpu(node->keys[i]))
				return -EINVAL;
			break;
		}

		root = value64(node, i);
	}

	*index = i;
	return 0;
}

int btree_get_overwrite_leaf(struct dm_btree_info *info, dm_block_t root,
			     uint64_t key, int *index,
			     dm_block_t *new_root, struct dm_block **leaf)
{
	int r;
	struct shadow_spine spine;

	BUG_ON(info->levels > 1);
	init_shadow_spine(&spine, info);
	r = __btree_get_overwrite_leaf(&spine, root, key, index);
	if (!r) {
		*new_root = shadow_root(&spine);
		*leaf = shadow_current(&spine);

		/*
		 * Decrement the count so exit_shadow_spine() doesn't
		 * unlock the leaf.
		 */
		spine.count--;
	}
	exit_shadow_spine(&spine);

	return r;
}
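
/*
 * Returns true if there is no entry for keys[level] at @index, i.e. a new
 * entry must be inserted rather than an existing one overwritten.
 */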
static bool need_insert(struct btree_node *node, uint64_t *keys,
			unsigned level, unsigned index)
{
	return ((index >= le32_to_cpu(node->header.nr_entries)) ||
		(le64_to_cpu(node->keys[index]) != keys[level]));
}

static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
	__dm_written_to_disk(value)
{
	int r;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_type;

	init_le64_type(info->tm, &le64_type);
	init_shadow_spine(&spine, info);

	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));

		if (need_insert(n, keys, level, index)) {
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));

	if (need_insert(n, keys, level, index)) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			goto bad_unblessed;
	} else {
		if (inserted)
			*inserted = 0;

		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index), 1);
		}
		memcpy_disk(value_ptr(n, index),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);

/*----------------------------------------------------------------*/
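
/*
 * Walks one level of the btree, recording either its highest or lowest key
 * in *result_key and, if @next_block is non-NULL, the block of the next
 * btree level to search.
 */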
static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
		    uint64_t *result_key, dm_block_t *next_block)
{
	int i, r;
	uint32_t flags;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		flags = le32_to_cpu(ro_node(s)->header.flags);
		i = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (!i)
			return -ENODATA;
		else
			i--;

		if (find_highest)
			*result_key = le64_to_cpu(ro_node(s)->keys[i]);
		else
			*result_key = le64_to_cpu(ro_node(s)->keys[0]);

		if (next_block || flags & INTERNAL_NODE) {
			if (find_highest)
				block = value64(ro_node(s), i);
			else
				block = value64(ro_node(s), 0);
		}

	} while (flags & INTERNAL_NODE);

	if (next_block)
		*next_block = block;
	return 0;
}

static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
			     bool find_highest, uint64_t *result_keys)
{
	int r = 0, count = 0, level;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = find_key(&spine, root, find_highest, result_keys + level,
			     level == info->levels - 1 ? NULL : &root);
		if (r == -ENODATA) {
			r = 0;
			break;

		} else if (r)
			break;

		count++;
	}
	exit_ro_spine(&spine);

	return r ? r : count;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			      uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, true, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);

int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
			     uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, false, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);

/*----------------------------------------------------------------*/

/*
 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
 * space.  Also this only works for single level trees.
 */
static int walk_node(struct dm_btree_info *info, dm_block_t block,
		     int (*fn)(void *context, uint64_t *keys, void *leaf),
		     void *context)
{
	int r;
	unsigned i, nr;
	struct dm_block *node;
	struct btree_node *n;
	uint64_t keys;

	r = bn_read_lock(info, block, &node);
	if (r)
		return r;

	n = dm_block_data(node);

	nr = le32_to_cpu(n->header.nr_entries);
	for (i = 0; i < nr; i++) {
		if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
			r = walk_node(info, value64(n, i), fn, context);
			if (r)
				goto out;
		} else {
			keys = le64_to_cpu(*key_ptr(n, i));
			r = fn(context, &keys, value_ptr(n, i));
			if (r)
				goto out;
		}
	}

out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
		  int (*fn)(void *context, uint64_t *keys, void *leaf),
		  void *context)
{
	BUG_ON(info->levels > 1);
	return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);

/*----------------------------------------------------------------*/
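
/*
 * Prefetches the blocks referenced by every value in the node at the top of
 * the cursor's stack.  Only valid when the values are 64-bit block pointers.
 */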
static void prefetch_values(struct dm_btree_cursor *c)
{
	unsigned i, nr;
	__le64 value_le;
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);
	struct dm_block_manager *bm = dm_tm_get_bm(c->info->tm);

	BUG_ON(c->info->value_type.size != sizeof(value_le));

	nr = le32_to_cpu(bn->header.nr_entries);
	for (i = 0; i < nr; i++) {
		memcpy(&value_le, value_ptr(bn, i), sizeof(value_le));
		dm_bm_prefetch(bm, le64_to_cpu(value_le));
	}
}

static bool leaf_node(struct dm_btree_cursor *c)
{
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);

	return le32_to_cpu(bn->header.flags) & LEAF_NODE;
}
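
/*
 * Read-locks block @b and pushes it onto the cursor's node stack,
 * prefetching child blocks where appropriate.
 */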
static int push_node(struct dm_btree_cursor *c, dm_block_t b)
{
	int r;
	struct cursor_node *n = c->nodes + c->depth;

	if (c->depth >= DM_BTREE_CURSOR_MAX_DEPTH - 1) {
		DMERR("couldn't push cursor node, stack depth too high");
		return -EINVAL;
	}

	r = bn_read_lock(c->info, b, &n->b);
	if (r)
		return r;

	n->index = 0;
	c->depth++;

	if (c->prefetch_leaves || !leaf_node(c))
		prefetch_values(c);

	return 0;
}

static void pop_node(struct dm_btree_cursor *c)
{
	c->depth--;
	unlock_block(c->info, c->nodes[c->depth].b);
}
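
/*
 * Advances the index in the current node, popping exhausted nodes off the
 * stack until one with a further entry is found.
 */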
static int inc_or_backtrack(struct dm_btree_cursor *c)
{
	struct cursor_node *n;
	struct btree_node *bn;

	for (;;) {
		if (!c->depth)
			return -ENODATA;

		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		n->index++;
		if (n->index < le32_to_cpu(bn->header.nr_entries))
			break;

		pop_node(c);
	}

	return 0;
}
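
/*
 * Descends from the current node, following the entry at each node's index,
 * until a leaf is reached.
 */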
static int find_leaf(struct dm_btree_cursor *c)
{
	int r = 0;
	struct cursor_node *n;
	struct btree_node *bn;
	__le64 value_le;

	for (;;) {
		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & LEAF_NODE)
			break;

		memcpy(&value_le, value_ptr(bn, n->index), sizeof(value_le));
		r = push_node(c, le64_to_cpu(value_le));
		if (r) {
			DMERR("push_node failed");
			break;
		}
	}

	if (!r && (le32_to_cpu(bn->header.nr_entries) == 0))
		return -ENODATA;

	return r;
}

int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
			  bool prefetch_leaves, struct dm_btree_cursor *c)
{
	int r;

	c->info = info;
	c->root = root;
	c->depth = 0;
	c->prefetch_leaves = prefetch_leaves;

	r = push_node(c, root);
	if (r)
		return r;

	return find_leaf(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_begin);

void dm_btree_cursor_end(struct dm_btree_cursor *c)
{
	while (c->depth)
		pop_node(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_end);

int dm_btree_cursor_next(struct dm_btree_cursor *c)
{
	int r = inc_or_backtrack(c);
	if (!r) {
		r = find_leaf(c);
		if (r)
			DMERR("find_leaf failed");
	}

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_next);

int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count)
{
	int r = 0;

	while (count-- && !r)
		r = dm_btree_cursor_next(c);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_skip);

int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le)
{
	if (c->depth) {
		struct cursor_node *n = c->nodes + c->depth - 1;
		struct btree_node *bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & INTERNAL_NODE)
			return -EINVAL;

		*key = le64_to_cpu(*key_ptr(bn, n->index));
		memcpy(value_le, value_ptr(bn, n->index), c->info->value_type.size);
		return 0;

	} else
		return -ENODATA;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_get_value);