2009-10-06 22:31:11 +04:00
# ifdef __KERNEL__
# include <linux / string.h>
# include <linux / slab.h>
# include <linux / bug.h>
# include <linux / kernel.h>
# ifndef dprintk
# define dprintk(args...)
# endif
# else
# include <string.h>
# include <stdio.h>
# include <stdlib.h>
# include <assert.h>
# define BUG_ON(x) assert(!(x))
# define dprintk(args...) /* printf(args) */
# define kmalloc(x, f) malloc(x)
# define kfree(x) free(x)
# endif
# include "crush.h"
# include "hash.h"
/*
* Implement the core CRUSH mapping algorithm .
*/
/**
* crush_find_rule - find a crush_rule id for a given ruleset , type , and size .
* @ map : the crush_map
* @ ruleset : the storage ruleset id ( user defined )
* @ type : storage ruleset type ( user defined )
* @ size : output set size
*/
int crush_find_rule ( struct crush_map * map , int ruleset , int type , int size )
{
int i ;
for ( i = 0 ; i < map - > max_rules ; i + + ) {
if ( map - > rules [ i ] & &
map - > rules [ i ] - > mask . ruleset = = ruleset & &
map - > rules [ i ] - > mask . type = = type & &
map - > rules [ i ] - > mask . min_size < = size & &
map - > rules [ i ] - > mask . max_size > = size )
return i ;
}
return - 1 ;
}
/*
* bucket choose methods
*
* For each bucket algorithm , we have a " choose " method that , given a
* crush input @ x and replica position ( usually , position in output set ) @ r ,
* will produce an item in the bucket .
*/
/*
* Choose based on a random permutation of the bucket .
*
* We used to use some prime number arithmetic to do this , but it
* wasn ' t very random , and had some other bad behaviors . Instead , we
* calculate an actual random permutation of the bucket members .
* Since this is expensive , we optimize for the r = 0 case , which
* captures the vast majority of calls .
*/
static int bucket_perm_choose ( struct crush_bucket * bucket ,
int x , int r )
{
unsigned pr = r % bucket - > size ;
unsigned i , s ;
/* start a new permutation if @x has changed */
if ( bucket - > perm_x ! = x | | bucket - > perm_n = = 0 ) {
dprintk ( " bucket %d new x=%d \n " , bucket - > id , x ) ;
bucket - > perm_x = x ;
/* optimize common r=0 case */
if ( pr = = 0 ) {
2009-11-08 07:18:22 +03:00
s = crush_hash32_3 ( bucket - > hash , x , bucket - > id , 0 ) %
2009-10-06 22:31:11 +04:00
bucket - > size ;
bucket - > perm [ 0 ] = s ;
bucket - > perm_n = 0xffff ; /* magic value, see below */
goto out ;
}
for ( i = 0 ; i < bucket - > size ; i + + )
bucket - > perm [ i ] = i ;
bucket - > perm_n = 0 ;
} else if ( bucket - > perm_n = = 0xffff ) {
/* clean up after the r=0 case above */
for ( i = 1 ; i < bucket - > size ; i + + )
bucket - > perm [ i ] = i ;
bucket - > perm [ bucket - > perm [ 0 ] ] = 0 ;
bucket - > perm_n = 1 ;
}
/* calculate permutation up to pr */
for ( i = 0 ; i < bucket - > perm_n ; i + + )
dprintk ( " perm_choose have %d: %d \n " , i , bucket - > perm [ i ] ) ;
while ( bucket - > perm_n < = pr ) {
unsigned p = bucket - > perm_n ;
/* no point in swapping the final entry */
if ( p < bucket - > size - 1 ) {
2009-11-08 07:18:22 +03:00
i = crush_hash32_3 ( bucket - > hash , x , bucket - > id , p ) %
2009-10-06 22:31:11 +04:00
( bucket - > size - p ) ;
if ( i ) {
unsigned t = bucket - > perm [ p + i ] ;
bucket - > perm [ p + i ] = bucket - > perm [ p ] ;
bucket - > perm [ p ] = t ;
}
dprintk ( " perm_choose swap %d with %d \n " , p , p + i ) ;
}
bucket - > perm_n + + ;
}
for ( i = 0 ; i < bucket - > size ; i + + )
dprintk ( " perm_choose %d: %d \n " , i , bucket - > perm [ i ] ) ;
s = bucket - > perm [ pr ] ;
out :
dprintk ( " perm_choose %d sz=%d x=%d r=%d (%d) s=%d \n " , bucket - > id ,
bucket - > size , x , r , pr , s ) ;
return bucket - > items [ s ] ;
}
/* uniform */
static int bucket_uniform_choose ( struct crush_bucket_uniform * bucket ,
int x , int r )
{
return bucket_perm_choose ( & bucket - > h , x , r ) ;
}
/* list */
static int bucket_list_choose ( struct crush_bucket_list * bucket ,
int x , int r )
{
int i ;
for ( i = bucket - > h . size - 1 ; i > = 0 ; i - - ) {
2009-11-08 07:18:22 +03:00
__u64 w = crush_hash32_4 ( bucket - > h . hash , x , bucket - > h . items [ i ] ,
r , bucket - > h . id ) ;
2009-10-06 22:31:11 +04:00
w & = 0xffff ;
dprintk ( " list_choose i=%d x=%d r=%d item %d weight %x "
" sw %x rand %llx " ,
i , x , r , bucket - > h . items [ i ] , bucket - > item_weights [ i ] ,
bucket - > sum_weights [ i ] , w ) ;
w * = bucket - > sum_weights [ i ] ;
w = w > > 16 ;
/*dprintk(" scaled %llx\n", w);*/
if ( w < bucket - > item_weights [ i ] )
return bucket - > h . items [ i ] ;
}
BUG_ON ( 1 ) ;
return 0 ;
}
/* (binary) tree */
static int height ( int n )
{
int h = 0 ;
while ( ( n & 1 ) = = 0 ) {
h + + ;
n = n > > 1 ;
}
return h ;
}
static int left ( int x )
{
int h = height ( x ) ;
return x - ( 1 < < ( h - 1 ) ) ;
}
static int right ( int x )
{
int h = height ( x ) ;
return x + ( 1 < < ( h - 1 ) ) ;
}
static int terminal ( int x )
{
return x & 1 ;
}
static int bucket_tree_choose ( struct crush_bucket_tree * bucket ,
int x , int r )
{
int n , l ;
__u32 w ;
__u64 t ;
/* start at root */
n = bucket - > num_nodes > > 1 ;
while ( ! terminal ( n ) ) {
/* pick point in [0, w) */
w = bucket - > node_weights [ n ] ;
2009-11-08 07:18:22 +03:00
t = ( __u64 ) crush_hash32_4 ( bucket - > h . hash , x , n , r ,
bucket - > h . id ) * ( __u64 ) w ;
2009-10-06 22:31:11 +04:00
t = t > > 32 ;
/* descend to the left or right? */
l = left ( n ) ;
if ( t < bucket - > node_weights [ l ] )
n = l ;
else
n = right ( n ) ;
}
return bucket - > h . items [ n > > 1 ] ;
}
/* straw */
static int bucket_straw_choose ( struct crush_bucket_straw * bucket ,
int x , int r )
{
int i ;
int high = 0 ;
__u64 high_draw = 0 ;
__u64 draw ;
for ( i = 0 ; i < bucket - > h . size ; i + + ) {
2009-11-08 07:18:22 +03:00
draw = crush_hash32_3 ( bucket - > h . hash , x , bucket - > h . items [ i ] , r ) ;
2009-10-06 22:31:11 +04:00
draw & = 0xffff ;
draw * = bucket - > straws [ i ] ;
if ( i = = 0 | | draw > high_draw ) {
high = i ;
high_draw = draw ;
}
}
return bucket - > h . items [ high ] ;
}
static int crush_bucket_choose ( struct crush_bucket * in , int x , int r )
{
dprintk ( " choose %d x=%d r=%d \n " , in - > id , x , r ) ;
switch ( in - > alg ) {
case CRUSH_BUCKET_UNIFORM :
return bucket_uniform_choose ( ( struct crush_bucket_uniform * ) in ,
x , r ) ;
case CRUSH_BUCKET_LIST :
return bucket_list_choose ( ( struct crush_bucket_list * ) in ,
x , r ) ;
case CRUSH_BUCKET_TREE :
return bucket_tree_choose ( ( struct crush_bucket_tree * ) in ,
x , r ) ;
case CRUSH_BUCKET_STRAW :
return bucket_straw_choose ( ( struct crush_bucket_straw * ) in ,
x , r ) ;
default :
BUG_ON ( 1 ) ;
2009-11-02 04:53:24 +03:00
return in - > items [ 0 ] ;
2009-10-06 22:31:11 +04:00
}
}
/*
* true if device is marked " out " ( failed , fully offloaded )
* of the cluster
*/
static int is_out ( struct crush_map * map , __u32 * weight , int item , int x )
{
if ( weight [ item ] > = 0x1000 )
return 0 ;
if ( weight [ item ] = = 0 )
return 1 ;
2009-11-08 07:18:22 +03:00
if ( ( crush_hash32_2 ( CRUSH_HASH_RJENKINS1 , x , item ) & 0xffff )
< weight [ item ] )
2009-10-06 22:31:11 +04:00
return 0 ;
return 1 ;
}
/**
* crush_choose - choose numrep distinct items of given type
* @ map : the crush_map
* @ bucket : the bucket we are choose an item from
* @ x : crush input value
* @ numrep : the number of items to choose
* @ type : the type of item to choose
* @ out : pointer to output vector
* @ outpos : our position in that vector
* @ firstn : true if choosing " first n " items , false if choosing " indep "
* @ recurse_to_leaf : true if we want one device under each item of given type
* @ out2 : second output vector for leaf items ( if @ recurse_to_leaf )
*/
static int crush_choose ( struct crush_map * map ,
struct crush_bucket * bucket ,
__u32 * weight ,
int x , int numrep , int type ,
int * out , int outpos ,
int firstn , int recurse_to_leaf ,
int * out2 )
{
int rep ;
int ftotal , flocal ;
int retry_descent , retry_bucket , skip_rep ;
struct crush_bucket * in = bucket ;
int r ;
int i ;
2009-10-07 21:59:34 +04:00
int item = 0 ;
2009-10-06 22:31:11 +04:00
int itemtype ;
int collide , reject ;
const int orig_tries = 5 ; /* attempts before we fall back to search */
dprintk ( " choose bucket %d x %d outpos %d \n " , bucket - > id , x , outpos ) ;
for ( rep = outpos ; rep < numrep ; rep + + ) {
/* keep trying until we get a non-out, non-colliding item */
ftotal = 0 ;
skip_rep = 0 ;
do {
retry_descent = 0 ;
in = bucket ; /* initial bucket */
/* choose through intervening buckets */
flocal = 0 ;
do {
2009-10-07 21:59:34 +04:00
collide = 0 ;
2009-10-06 22:31:11 +04:00
retry_bucket = 0 ;
r = rep ;
if ( in - > alg = = CRUSH_BUCKET_UNIFORM ) {
/* be careful */
if ( firstn | | numrep > = in - > size )
/* r' = r + f_total */
r + = ftotal ;
else if ( in - > size % numrep = = 0 )
/* r'=r+(n+1)*f_local */
r + = ( numrep + 1 ) *
( flocal + ftotal ) ;
else
/* r' = r + n*f_local */
r + = numrep * ( flocal + ftotal ) ;
} else {
if ( firstn )
/* r' = r + f_total */
r + = ftotal ;
else
/* r' = r + n*f_local */
r + = numrep * ( flocal + ftotal ) ;
}
/* bucket choose */
2009-10-07 21:59:34 +04:00
if ( in - > size = = 0 ) {
reject = 1 ;
goto reject ;
}
2009-10-06 22:31:11 +04:00
if ( flocal > = ( in - > size > > 1 ) & &
flocal > orig_tries )
item = bucket_perm_choose ( in , x , r ) ;
else
item = crush_bucket_choose ( in , x , r ) ;
BUG_ON ( item > = map - > max_devices ) ;
/* desired type? */
if ( item < 0 )
itemtype = map - > buckets [ - 1 - item ] - > type ;
else
itemtype = 0 ;
dprintk ( " item %d type %d \n " , item , itemtype ) ;
/* keep going? */
if ( itemtype ! = type ) {
BUG_ON ( item > = 0 | |
( - 1 - item ) > = map - > max_buckets ) ;
in = map - > buckets [ - 1 - item ] ;
continue ;
}
/* collision? */
for ( i = 0 ; i < outpos ; i + + ) {
if ( out [ i ] = = item ) {
collide = 1 ;
break ;
}
}
if ( recurse_to_leaf & &
item < 0 & &
crush_choose ( map , map - > buckets [ - 1 - item ] ,
weight ,
x , outpos + 1 , 0 ,
out2 , outpos ,
firstn , 0 , NULL ) < = outpos ) {
reject = 1 ;
} else {
/* out? */
if ( itemtype = = 0 )
reject = is_out ( map , weight ,
item , x ) ;
else
reject = 0 ;
}
2009-10-07 21:59:34 +04:00
reject :
2009-10-06 22:31:11 +04:00
if ( reject | | collide ) {
ftotal + + ;
flocal + + ;
if ( collide & & flocal < 3 )
/* retry locally a few times */
retry_bucket = 1 ;
else if ( flocal < in - > size + orig_tries )
/* exhaustive bucket search */
retry_bucket = 1 ;
else if ( ftotal < 20 )
/* then retry descent */
retry_descent = 1 ;
else
/* else give up */
skip_rep = 1 ;
dprintk ( " reject %d collide %d "
" ftotal %d flocal %d \n " ,
reject , collide , ftotal ,
flocal ) ;
}
} while ( retry_bucket ) ;
} while ( retry_descent ) ;
if ( skip_rep ) {
dprintk ( " skip rep \n " ) ;
continue ;
}
dprintk ( " choose got %d \n " , item ) ;
out [ outpos ] = item ;
outpos + + ;
}
dprintk ( " choose returns %d \n " , outpos ) ;
return outpos ;
}
/**
* crush_do_rule - calculate a mapping with the given input and rule
* @ map : the crush_map
* @ ruleno : the rule id
* @ x : hash input
* @ result : pointer to result vector
* @ result_max : maximum result size
* @ force : force initial replica choice ; - 1 for none
*/
int crush_do_rule ( struct crush_map * map ,
int ruleno , int x , int * result , int result_max ,
int force , __u32 * weight )
{
int result_len ;
int force_context [ CRUSH_MAX_DEPTH ] ;
int force_pos = - 1 ;
int a [ CRUSH_MAX_SET ] ;
int b [ CRUSH_MAX_SET ] ;
int c [ CRUSH_MAX_SET ] ;
int recurse_to_leaf ;
int * w ;
int wsize = 0 ;
int * o ;
int osize ;
int * tmp ;
struct crush_rule * rule ;
int step ;
int i , j ;
int numrep ;
int firstn ;
int rc = - 1 ;
BUG_ON ( ruleno > = map - > max_rules ) ;
rule = map - > rules [ ruleno ] ;
result_len = 0 ;
w = a ;
o = b ;
/*
* determine hierarchical context of force , if any . note
* that this may or may not correspond to the specific types
* referenced by the crush rule .
*/
if ( force > = 0 ) {
if ( force > = map - > max_devices | |
map - > device_parents [ force ] = = 0 ) {
/*dprintk("CRUSH: forcefed device dne\n");*/
rc = - 1 ; /* force fed device dne */
goto out ;
}
if ( ! is_out ( map , weight , force , x ) ) {
while ( 1 ) {
force_context [ + + force_pos ] = force ;
if ( force > = 0 )
force = map - > device_parents [ force ] ;
else
force = map - > bucket_parents [ - 1 - force ] ;
if ( force = = 0 )
break ;
}
}
}
for ( step = 0 ; step < rule - > len ; step + + ) {
firstn = 0 ;
switch ( rule - > steps [ step ] . op ) {
case CRUSH_RULE_TAKE :
w [ 0 ] = rule - > steps [ step ] . arg1 ;
if ( force_pos > = 0 ) {
BUG_ON ( force_context [ force_pos ] ! = w [ 0 ] ) ;
force_pos - - ;
}
wsize = 1 ;
break ;
case CRUSH_RULE_CHOOSE_LEAF_FIRSTN :
case CRUSH_RULE_CHOOSE_FIRSTN :
firstn = 1 ;
case CRUSH_RULE_CHOOSE_LEAF_INDEP :
case CRUSH_RULE_CHOOSE_INDEP :
BUG_ON ( wsize = = 0 ) ;
recurse_to_leaf =
rule - > steps [ step ] . op = =
CRUSH_RULE_CHOOSE_LEAF_FIRSTN | |
rule - > steps [ step ] . op = =
CRUSH_RULE_CHOOSE_LEAF_INDEP ;
/* reset output */
osize = 0 ;
for ( i = 0 ; i < wsize ; i + + ) {
/*
* see CRUSH_N , CRUSH_N_MINUS macros .
* basically , numrep < = 0 means relative to
* the provided result_max
*/
numrep = rule - > steps [ step ] . arg1 ;
if ( numrep < = 0 ) {
numrep + = result_max ;
if ( numrep < = 0 )
continue ;
}
j = 0 ;
if ( osize = = 0 & & force_pos > = 0 ) {
/* skip any intermediate types */
while ( force_pos & &
force_context [ force_pos ] < 0 & &
rule - > steps [ step ] . arg2 ! =
map - > buckets [ - 1 -
force_context [ force_pos ] ] - > type )
force_pos - - ;
o [ osize ] = force_context [ force_pos ] ;
if ( recurse_to_leaf )
c [ osize ] = force_context [ 0 ] ;
j + + ;
force_pos - - ;
}
osize + = crush_choose ( map ,
map - > buckets [ - 1 - w [ i ] ] ,
weight ,
x , numrep ,
rule - > steps [ step ] . arg2 ,
o + osize , j ,
firstn ,
recurse_to_leaf , c + osize ) ;
}
if ( recurse_to_leaf )
/* copy final _leaf_ values to output set */
memcpy ( o , c , osize * sizeof ( * o ) ) ;
/* swap t and w arrays */
tmp = o ;
o = w ;
w = tmp ;
wsize = osize ;
break ;
case CRUSH_RULE_EMIT :
for ( i = 0 ; i < wsize & & result_len < result_max ; i + + ) {
result [ result_len ] = w [ i ] ;
result_len + + ;
}
wsize = 0 ;
break ;
default :
BUG_ON ( 1 ) ;
}
}
rc = result_len ;
out :
return rc ;
}