2015-06-12 13:21:07 +03:00
/*
* Ceph - scalable distributed file system
*
* Copyright ( C ) 2015 Intel Corporation All Rights Reserved
*
* This is free software ; you can redistribute it and / or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1 , as published by the Free Software
* Foundation . See file COPYING .
*
*/
2009-10-06 22:31:11 +04:00
# ifdef __KERNEL__
# include <linux / string.h>
# include <linux / slab.h>
# include <linux / bug.h>
# include <linux / kernel.h>
2015-06-12 13:21:07 +03:00
# include <linux / crush / crush.h>
# include <linux / crush / hash.h>
2016-10-28 14:23:24 +03:00
# include <linux / crush / mapper.h>
2009-10-06 22:31:11 +04:00
# else
2015-06-12 13:21:07 +03:00
# include "crush_compat.h"
# include "crush.h"
# include "hash.h"
2016-10-28 14:23:24 +03:00
# include "mapper.h"
2009-10-06 22:31:11 +04:00
# endif
2015-04-14 16:54:52 +03:00
# include "crush_ln_table.h"
2009-10-06 22:31:11 +04:00
2015-06-12 13:21:07 +03:00
# define dprintk(args...) /* printf(args) */
2009-10-06 22:31:11 +04:00
/*
* Implement the core CRUSH mapping algorithm .
*/
/**
* crush_find_rule - find a crush_rule id for a given ruleset , type , and size .
* @ map : the crush_map
* @ ruleset : the storage ruleset id ( user defined )
* @ type : storage ruleset type ( user defined )
* @ size : output set size
*/
2012-05-08 02:38:35 +04:00
int crush_find_rule ( const struct crush_map * map , int ruleset , int type , int size )
2009-10-06 22:31:11 +04:00
{
2012-05-08 02:38:35 +04:00
__u32 i ;
2009-10-06 22:31:11 +04:00
for ( i = 0 ; i < map - > max_rules ; i + + ) {
if ( map - > rules [ i ] & &
map - > rules [ i ] - > mask . ruleset = = ruleset & &
map - > rules [ i ] - > mask . type = = type & &
map - > rules [ i ] - > mask . min_size < = size & &
map - > rules [ i ] - > mask . max_size > = size )
return i ;
}
return - 1 ;
}
/*
* bucket choose methods
*
* For each bucket algorithm , we have a " choose " method that , given a
* crush input @ x and replica position ( usually , position in output set ) @ r ,
* will produce an item in the bucket .
*/
/*
* Choose based on a random permutation of the bucket .
*
* We used to use some prime number arithmetic to do this , but it
* wasn ' t very random , and had some other bad behaviors . Instead , we
* calculate an actual random permutation of the bucket members .
* Since this is expensive , we optimize for the r = 0 case , which
* captures the vast majority of calls .
*/
2017-01-31 17:55:06 +03:00
static int bucket_perm_choose ( const struct crush_bucket * bucket ,
struct crush_work_bucket * work ,
2009-10-06 22:31:11 +04:00
int x , int r )
{
2012-04-15 09:58:06 +04:00
unsigned int pr = r % bucket - > size ;
unsigned int i , s ;
2009-10-06 22:31:11 +04:00
/* start a new permutation if @x has changed */
2017-01-31 17:55:06 +03:00
if ( work - > perm_x ! = ( __u32 ) x | | work - > perm_n = = 0 ) {
2009-10-06 22:31:11 +04:00
dprintk ( " bucket %d new x=%d \n " , bucket - > id , x ) ;
2017-01-31 17:55:06 +03:00
work - > perm_x = x ;
2009-10-06 22:31:11 +04:00
/* optimize common r=0 case */
if ( pr = = 0 ) {
2009-11-08 07:18:22 +03:00
s = crush_hash32_3 ( bucket - > hash , x , bucket - > id , 0 ) %
2009-10-06 22:31:11 +04:00
bucket - > size ;
2017-01-31 17:55:06 +03:00
work - > perm [ 0 ] = s ;
work - > perm_n = 0xffff ; /* magic value, see below */
2009-10-06 22:31:11 +04:00
goto out ;
}
for ( i = 0 ; i < bucket - > size ; i + + )
2017-01-31 17:55:06 +03:00
work - > perm [ i ] = i ;
work - > perm_n = 0 ;
} else if ( work - > perm_n = = 0xffff ) {
2009-10-06 22:31:11 +04:00
/* clean up after the r=0 case above */
for ( i = 1 ; i < bucket - > size ; i + + )
2017-01-31 17:55:06 +03:00
work - > perm [ i ] = i ;
work - > perm [ work - > perm [ 0 ] ] = 0 ;
work - > perm_n = 1 ;
2009-10-06 22:31:11 +04:00
}
/* calculate permutation up to pr */
2017-01-31 17:55:06 +03:00
for ( i = 0 ; i < work - > perm_n ; i + + )
2017-02-16 17:38:05 +03:00
dprintk ( " perm_choose have %d: %d \n " , i , work - > perm [ i ] ) ;
2017-01-31 17:55:06 +03:00
while ( work - > perm_n < = pr ) {
unsigned int p = work - > perm_n ;
2009-10-06 22:31:11 +04:00
/* no point in swapping the final entry */
if ( p < bucket - > size - 1 ) {
2009-11-08 07:18:22 +03:00
i = crush_hash32_3 ( bucket - > hash , x , bucket - > id , p ) %
2009-10-06 22:31:11 +04:00
( bucket - > size - p ) ;
if ( i ) {
2017-01-31 17:55:06 +03:00
unsigned int t = work - > perm [ p + i ] ;
work - > perm [ p + i ] = work - > perm [ p ] ;
work - > perm [ p ] = t ;
2009-10-06 22:31:11 +04:00
}
dprintk ( " perm_choose swap %d with %d \n " , p , p + i ) ;
}
2017-01-31 17:55:06 +03:00
work - > perm_n + + ;
2009-10-06 22:31:11 +04:00
}
for ( i = 0 ; i < bucket - > size ; i + + )
2017-02-16 17:38:05 +03:00
dprintk ( " perm_choose %d: %d \n " , i , work - > perm [ i ] ) ;
2009-10-06 22:31:11 +04:00
2017-01-31 17:55:06 +03:00
s = work - > perm [ pr ] ;
2009-10-06 22:31:11 +04:00
out :
dprintk ( " perm_choose %d sz=%d x=%d r=%d (%d) s=%d \n " , bucket - > id ,
bucket - > size , x , r , pr , s ) ;
return bucket - > items [ s ] ;
}
/* uniform */
2017-01-31 17:55:06 +03:00
static int bucket_uniform_choose ( const struct crush_bucket_uniform * bucket ,
struct crush_work_bucket * work , int x , int r )
2009-10-06 22:31:11 +04:00
{
2017-01-31 17:55:06 +03:00
return bucket_perm_choose ( & bucket - > h , work , x , r ) ;
2009-10-06 22:31:11 +04:00
}
/* list */
2017-01-31 17:55:06 +03:00
static int bucket_list_choose ( const struct crush_bucket_list * bucket ,
2009-10-06 22:31:11 +04:00
int x , int r )
{
int i ;
for ( i = bucket - > h . size - 1 ; i > = 0 ; i - - ) {
2015-06-12 13:21:07 +03:00
__u64 w = crush_hash32_4 ( bucket - > h . hash , x , bucket - > h . items [ i ] ,
2009-11-08 07:18:22 +03:00
r , bucket - > h . id ) ;
2009-10-06 22:31:11 +04:00
w & = 0xffff ;
dprintk ( " list_choose i=%d x=%d r=%d item %d weight %x "
" sw %x rand %llx " ,
i , x , r , bucket - > h . items [ i ] , bucket - > item_weights [ i ] ,
bucket - > sum_weights [ i ] , w ) ;
w * = bucket - > sum_weights [ i ] ;
w = w > > 16 ;
/*dprintk(" scaled %llx\n", w);*/
2017-01-31 17:55:06 +03:00
if ( w < bucket - > item_weights [ i ] ) {
2009-10-06 22:31:11 +04:00
return bucket - > h . items [ i ] ;
2017-01-31 17:55:06 +03:00
}
2009-10-06 22:31:11 +04:00
}
2012-05-08 02:35:24 +04:00
dprintk ( " bad list sums for bucket %d \n " , bucket - > h . id ) ;
return bucket - > h . items [ 0 ] ;
2009-10-06 22:31:11 +04:00
}
/* (binary) tree */
/*
 * Number of trailing zero bits in @n, i.e. how many times @n can be
 * halved before it becomes odd.  Does not terminate for n == 0.
 */
static int height(int n)
{
	int level;

	for (level = 0; !(n & 1); level++)
		n >>= 1;

	return level;
}
/*
 * Node number @x minus half of its lowest set bit.  Only meaningful for
 * even @x (height >= 1), which is how bucket_tree_choose() calls it.
 */
static int left(int x)
{
	return x - (1 << (height(x) - 1));
}
/*
 * Node number @x plus half of its lowest set bit.  Only meaningful for
 * even @x (height >= 1), which is how bucket_tree_choose() calls it.
 */
static int right(int x)
{
	return x + (1 << (height(x) - 1));
}
/* Nonzero when node number @x is odd; the tree descent stops there. */
static int terminal(int x)
{
	const int low_bit = x & 1;

	return low_bit;
}
2017-01-31 17:55:06 +03:00
static int bucket_tree_choose ( const struct crush_bucket_tree * bucket ,
2009-10-06 22:31:11 +04:00
int x , int r )
{
2013-12-24 23:19:24 +04:00
int n ;
2009-10-06 22:31:11 +04:00
__u32 w ;
__u64 t ;
/* start at root */
n = bucket - > num_nodes > > 1 ;
while ( ! terminal ( n ) ) {
2013-12-24 23:19:24 +04:00
int l ;
2009-10-06 22:31:11 +04:00
/* pick point in [0, w) */
w = bucket - > node_weights [ n ] ;
2009-11-08 07:18:22 +03:00
t = ( __u64 ) crush_hash32_4 ( bucket - > h . hash , x , n , r ,
bucket - > h . id ) * ( __u64 ) w ;
2009-10-06 22:31:11 +04:00
t = t > > 32 ;
/* descend to the left or right? */
l = left ( n ) ;
if ( t < bucket - > node_weights [ l ] )
n = l ;
else
n = right ( n ) ;
}
return bucket - > h . items [ n > > 1 ] ;
}
/* straw */
2017-01-31 17:55:06 +03:00
static int bucket_straw_choose ( const struct crush_bucket_straw * bucket ,
2009-10-06 22:31:11 +04:00
int x , int r )
{
2012-05-08 02:38:35 +04:00
__u32 i ;
2009-10-06 22:31:11 +04:00
int high = 0 ;
__u64 high_draw = 0 ;
__u64 draw ;
for ( i = 0 ; i < bucket - > h . size ; i + + ) {
2009-11-08 07:18:22 +03:00
draw = crush_hash32_3 ( bucket - > h . hash , x , bucket - > h . items [ i ] , r ) ;
2009-10-06 22:31:11 +04:00
draw & = 0xffff ;
draw * = bucket - > straws [ i ] ;
if ( i = = 0 | | draw > high_draw ) {
high = i ;
high_draw = draw ;
}
}
return bucket - > h . items [ high ] ;
}
2015-06-12 13:21:07 +03:00
/* compute 2^44*log2(input+1) */
static __u64 crush_ln ( unsigned int xin )
2015-04-14 16:54:52 +03:00
{
2016-09-27 13:35:55 +03:00
unsigned int x = xin ;
2015-06-12 13:21:07 +03:00
int iexpon , index1 , index2 ;
__u64 RH , LH , LL , xl64 , result ;
2015-04-14 16:54:52 +03:00
2015-06-12 13:21:07 +03:00
x + + ;
2015-04-14 16:54:52 +03:00
2015-06-12 13:21:07 +03:00
/* normalize input */
iexpon = 15 ;
2016-09-27 13:30:09 +03:00
/*
* figure out number of bits we need to shift and
* do it in one step instead of iteratively
*/
if ( ! ( x & 0x18000 ) ) {
int bits = __builtin_clz ( x & 0x1FFFF ) - 16 ;
x < < = bits ;
iexpon = 15 - bits ;
2015-06-12 13:21:07 +03:00
}
2015-04-14 16:54:52 +03:00
2015-06-12 13:21:07 +03:00
index1 = ( x > > 8 ) < < 1 ;
/* RH ~ 2^56/index1 */
RH = __RH_LH_tbl [ index1 - 256 ] ;
/* LH ~ 2^48 * log2(index1/256) */
LH = __RH_LH_tbl [ index1 + 1 - 256 ] ;
2015-04-14 16:54:52 +03:00
2015-06-12 13:21:07 +03:00
/* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */
xl64 = ( __s64 ) x * RH ;
xl64 > > = 48 ;
2015-04-14 16:54:52 +03:00
2015-06-12 13:21:07 +03:00
result = iexpon ;
result < < = ( 12 + 32 ) ;
2015-04-14 16:54:52 +03:00
2016-09-27 13:35:55 +03:00
index2 = xl64 & 0xff ;
2015-06-12 13:21:07 +03:00
/* LL ~ 2^48*log2(1.0+index2/2^15) */
LL = __LL_tbl [ index2 ] ;
2015-04-14 16:54:52 +03:00
2015-06-12 13:21:07 +03:00
LH = LH + LL ;
2015-04-14 16:54:52 +03:00
2015-06-12 13:21:07 +03:00
LH > > = ( 48 - 12 - 32 ) ;
result + = LH ;
2015-04-14 16:54:52 +03:00
2015-06-12 13:21:07 +03:00
return result ;
2015-04-14 16:54:52 +03:00
}
/*
* straw2
*
* for reference , see :
*
* http : //en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables
*
*/
2017-06-22 20:44:05 +03:00
static __u32 * get_choose_arg_weights ( const struct crush_bucket_straw2 * bucket ,
const struct crush_choose_arg * arg ,
int position )
{
if ( ! arg | | ! arg - > weight_set | | arg - > weight_set_size = = 0 )
return bucket - > item_weights ;
if ( position > = arg - > weight_set_size )
position = arg - > weight_set_size - 1 ;
return arg - > weight_set [ position ] . weights ;
}
static __s32 * get_choose_arg_ids ( const struct crush_bucket_straw2 * bucket ,
const struct crush_choose_arg * arg )
{
if ( ! arg | | ! arg - > ids )
return bucket - > h . items ;
return arg - > ids ;
}
2017-01-31 17:55:06 +03:00
static int bucket_straw2_choose ( const struct crush_bucket_straw2 * bucket ,
2017-06-22 20:44:05 +03:00
int x , int r ,
const struct crush_choose_arg * arg ,
int position )
2015-04-14 16:54:52 +03:00
{
2015-06-12 13:21:07 +03:00
unsigned int i , high = 0 ;
unsigned int u ;
2015-04-14 16:54:52 +03:00
__s64 ln , draw , high_draw = 0 ;
2017-06-22 20:44:05 +03:00
__u32 * weights = get_choose_arg_weights ( bucket , arg , position ) ;
__s32 * ids = get_choose_arg_ids ( bucket , arg ) ;
2015-04-14 16:54:52 +03:00
for ( i = 0 ; i < bucket - > h . size ; i + + ) {
2017-06-22 20:44:05 +03:00
dprintk ( " weight 0x%x item %d \n " , weights [ i ] , ids [ i ] ) ;
if ( weights [ i ] ) {
u = crush_hash32_3 ( bucket - > h . hash , x , ids [ i ] , r ) ;
2015-04-14 16:54:52 +03:00
u & = 0xffff ;
/*
* for some reason slightly less than 0x10000 produces
* a slightly more accurate distribution . . . probably a
* rounding effect .
*
* the natural log lookup table maps [ 0 , 0xffff ]
* ( corresponding to real numbers [ 1 / 0x10000 , 1 ] to
* [ 0 , 0xffffffffffff ] ( corresponding to real numbers
* [ - 11.090355 , 0 ] ) .
*/
ln = crush_ln ( u ) - 0x1000000000000ll ;
/*
* divide by 16.16 fixed - point weight . note
* that the ln value is negative , so a larger
* weight means a larger ( less negative ) value
* for draw .
*/
2017-06-22 20:44:05 +03:00
draw = div64_s64 ( ln , weights [ i ] ) ;
2015-04-14 16:54:52 +03:00
} else {
draw = S64_MIN ;
}
if ( i = = 0 | | draw > high_draw ) {
high = i ;
high_draw = draw ;
}
}
2017-01-31 17:55:06 +03:00
2015-04-14 16:54:52 +03:00
return bucket - > h . items [ high ] ;
}
2017-01-31 17:55:06 +03:00
static int crush_bucket_choose ( const struct crush_bucket * in ,
struct crush_work_bucket * work ,
2017-06-22 20:44:05 +03:00
int x , int r ,
const struct crush_choose_arg * arg ,
int position )
2009-10-06 22:31:11 +04:00
{
2010-06-24 23:58:14 +04:00
dprintk ( " crush_bucket_choose %d x=%d r=%d \n " , in - > id , x , r ) ;
2012-05-08 02:35:24 +04:00
BUG_ON ( in - > size = = 0 ) ;
2009-10-06 22:31:11 +04:00
switch ( in - > alg ) {
case CRUSH_BUCKET_UNIFORM :
2017-01-31 17:55:06 +03:00
return bucket_uniform_choose (
( const struct crush_bucket_uniform * ) in ,
work , x , r ) ;
2009-10-06 22:31:11 +04:00
case CRUSH_BUCKET_LIST :
2017-01-31 17:55:06 +03:00
return bucket_list_choose ( ( const struct crush_bucket_list * ) in ,
2009-10-06 22:31:11 +04:00
x , r ) ;
case CRUSH_BUCKET_TREE :
2017-01-31 17:55:06 +03:00
return bucket_tree_choose ( ( const struct crush_bucket_tree * ) in ,
2009-10-06 22:31:11 +04:00
x , r ) ;
case CRUSH_BUCKET_STRAW :
2017-01-31 17:55:06 +03:00
return bucket_straw_choose (
( const struct crush_bucket_straw * ) in ,
x , r ) ;
2015-04-14 16:54:52 +03:00
case CRUSH_BUCKET_STRAW2 :
2017-01-31 17:55:06 +03:00
return bucket_straw2_choose (
( const struct crush_bucket_straw2 * ) in ,
2017-06-22 20:44:05 +03:00
x , r , arg , position ) ;
2009-10-06 22:31:11 +04:00
default :
2012-05-08 02:35:24 +04:00
dprintk ( " unknown bucket %d alg %d \n " , in - > id , in - > alg ) ;
2009-12-02 01:12:07 +03:00
return in - > items [ 0 ] ;
2009-10-06 22:31:11 +04:00
}
}
/*
* true if device is marked " out " ( failed , fully offloaded )
* of the cluster
*/
2013-12-24 23:19:24 +04:00
static int is_out ( const struct crush_map * map ,
const __u32 * weight , int weight_max ,
int item , int x )
2009-10-06 22:31:11 +04:00
{
2013-12-24 23:19:24 +04:00
if ( item > = weight_max )
return 1 ;
2010-07-05 20:44:17 +04:00
if ( weight [ item ] > = 0x10000 )
2009-10-06 22:31:11 +04:00
return 0 ;
if ( weight [ item ] = = 0 )
return 1 ;
2009-11-08 07:18:22 +03:00
if ( ( crush_hash32_2 ( CRUSH_HASH_RJENKINS1 , x , item ) & 0xffff )
< weight [ item ] )
2009-10-06 22:31:11 +04:00
return 0 ;
return 1 ;
}
/**
2013-12-24 23:19:25 +04:00
* crush_choose_firstn - choose numrep distinct items of given type
2009-10-06 22:31:11 +04:00
* @ map : the crush_map
* @ bucket : the bucket we are choose an item from
* @ x : crush input value
* @ numrep : the number of items to choose
* @ type : the type of item to choose
* @ out : pointer to output vector
* @ outpos : our position in that vector
2015-04-14 16:04:23 +03:00
* @ out_size : size of the out vector
2013-12-24 23:19:27 +04:00
* @ tries : number of attempts to make
* @ recurse_tries : number of attempts to have recursive chooseleaf make
2014-03-19 18:58:36 +04:00
* @ local_retries : localized retries
* @ local_fallback_retries : localized fallback retries
2013-12-24 23:19:27 +04:00
* @ recurse_to_leaf : true if we want one device under each item of given type ( chooseleaf instead of choose )
2016-01-31 16:36:07 +03:00
* @ stable : stable mode starts rep = 0 in the recursive call for all replicas
2014-03-19 18:58:37 +04:00
* @ vary_r : pass r to recursive calls
2009-10-06 22:31:11 +04:00
* @ out2 : second output vector for leaf items ( if @ recurse_to_leaf )
2014-03-19 18:58:37 +04:00
* @ parent_r : r value passed from the parent
2009-10-06 22:31:11 +04:00
*/
2013-12-24 23:19:25 +04:00
static int crush_choose_firstn ( const struct crush_map * map ,
2017-01-31 17:55:06 +03:00
struct crush_work * work ,
const struct crush_bucket * bucket ,
2013-12-24 23:19:25 +04:00
const __u32 * weight , int weight_max ,
int x , int numrep , int type ,
int * out , int outpos ,
2015-04-14 16:04:23 +03:00
int out_size ,
2013-12-24 23:19:27 +04:00
unsigned int tries ,
unsigned int recurse_tries ,
2014-03-19 18:58:36 +04:00
unsigned int local_retries ,
unsigned int local_fallback_retries ,
2013-12-24 23:19:25 +04:00
int recurse_to_leaf ,
2014-03-19 18:58:37 +04:00
unsigned int vary_r ,
2016-01-31 16:36:07 +03:00
unsigned int stable ,
2014-03-19 18:58:37 +04:00
int * out2 ,
2017-06-22 20:44:05 +03:00
int parent_r ,
const struct crush_choose_arg * choose_args )
2009-10-06 22:31:11 +04:00
{
int rep ;
2012-05-08 02:38:35 +04:00
unsigned int ftotal , flocal ;
2009-10-06 22:31:11 +04:00
int retry_descent , retry_bucket , skip_rep ;
2017-01-31 17:55:06 +03:00
const struct crush_bucket * in = bucket ;
2009-10-06 22:31:11 +04:00
int r ;
int i ;
2009-10-07 21:59:34 +04:00
int item = 0 ;
2009-10-06 22:31:11 +04:00
int itemtype ;
int collide , reject ;
2015-04-14 16:04:23 +03:00
int count = out_size ;
2010-06-24 23:58:14 +04:00
2016-01-31 16:36:07 +03:00
dprintk ( " CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d \n " ,
2014-03-19 18:58:37 +04:00
recurse_to_leaf ? " _LEAF " : " " ,
bucket - > id , x , outpos , numrep ,
tries , recurse_tries , local_retries , local_fallback_retries ,
2016-01-31 16:36:07 +03:00
parent_r , stable ) ;
2009-10-06 22:31:11 +04:00
2016-01-31 16:36:07 +03:00
for ( rep = stable ? 0 : outpos ; rep < numrep & & count > 0 ; rep + + ) {
2009-10-06 22:31:11 +04:00
/* keep trying until we get a non-out, non-colliding item */
ftotal = 0 ;
skip_rep = 0 ;
do {
retry_descent = 0 ;
in = bucket ; /* initial bucket */
/* choose through intervening buckets */
flocal = 0 ;
do {
2009-10-07 21:59:34 +04:00
collide = 0 ;
2009-10-06 22:31:11 +04:00
retry_bucket = 0 ;
2014-03-19 18:58:37 +04:00
r = rep + parent_r ;
2013-12-24 23:19:25 +04:00
/* r' = r + f_total */
r + = ftotal ;
2009-10-06 22:31:11 +04:00
/* bucket choose */
2009-10-07 21:59:34 +04:00
if ( in - > size = = 0 ) {
reject = 1 ;
goto reject ;
}
2014-03-19 18:58:36 +04:00
if ( local_fallback_retries > 0 & &
2012-07-31 05:15:23 +04:00
flocal > = ( in - > size > > 1 ) & &
2014-03-19 18:58:36 +04:00
flocal > local_fallback_retries )
2017-01-31 17:55:06 +03:00
item = bucket_perm_choose (
in , work - > work [ - 1 - in - > id ] ,
x , r ) ;
2009-10-06 22:31:11 +04:00
else
2017-01-31 17:55:06 +03:00
item = crush_bucket_choose (
in , work - > work [ - 1 - in - > id ] ,
2017-06-22 20:44:05 +03:00
x , r ,
( choose_args ?
& choose_args [ - 1 - in - > id ] : 0 ) ,
outpos ) ;
2012-05-08 02:35:24 +04:00
if ( item > = map - > max_devices ) {
dprintk ( " bad item %d \n " , item ) ;
skip_rep = 1 ;
break ;
}
2009-10-06 22:31:11 +04:00
/* desired type? */
if ( item < 0 )
itemtype = map - > buckets [ - 1 - item ] - > type ;
else
itemtype = 0 ;
dprintk ( " item %d type %d \n " , item , itemtype ) ;
/* keep going? */
if ( itemtype ! = type ) {
2012-05-08 02:35:24 +04:00
if ( item > = 0 | |
( - 1 - item ) > = map - > max_buckets ) {
dprintk ( " bad item type %d \n " , type ) ;
skip_rep = 1 ;
break ;
}
2009-10-06 22:31:11 +04:00
in = map - > buckets [ - 1 - item ] ;
2010-06-24 23:55:48 +04:00
retry_bucket = 1 ;
2009-10-06 22:31:11 +04:00
continue ;
}
/* collision? */
for ( i = 0 ; i < outpos ; i + + ) {
if ( out [ i ] = = item ) {
collide = 1 ;
break ;
}
}
2010-06-24 23:58:14 +04:00
reject = 0 ;
2013-01-16 06:49:09 +04:00
if ( ! collide & & recurse_to_leaf ) {
2010-06-24 23:58:14 +04:00
if ( item < 0 ) {
2014-03-19 18:58:37 +04:00
int sub_r ;
if ( vary_r )
sub_r = r > > ( vary_r - 1 ) ;
else
sub_r = 0 ;
2017-01-31 17:55:06 +03:00
if ( crush_choose_firstn (
map ,
work ,
map - > buckets [ - 1 - item ] ,
weight , weight_max ,
x , stable ? 1 : outpos + 1 , 0 ,
out2 , outpos , count ,
recurse_tries , 0 ,
local_retries ,
local_fallback_retries ,
0 ,
vary_r ,
stable ,
NULL ,
2017-06-22 20:44:05 +03:00
sub_r ,
choose_args ) < = outpos )
2010-06-24 23:58:14 +04:00
/* didn't get leaf */
reject = 1 ;
} else {
/* we already have a leaf! */
out2 [ outpos ] = item ;
}
}
2017-02-16 17:21:15 +03:00
if ( ! reject & & ! collide ) {
2009-10-06 22:31:11 +04:00
/* out? */
if ( itemtype = = 0 )
reject = is_out ( map , weight ,
2013-12-24 23:19:24 +04:00
weight_max ,
2009-10-06 22:31:11 +04:00
item , x ) ;
}
2009-10-07 21:59:34 +04:00
reject :
2009-10-06 22:31:11 +04:00
if ( reject | | collide ) {
ftotal + + ;
flocal + + ;
2014-03-19 18:58:36 +04:00
if ( collide & & flocal < = local_retries )
2009-10-06 22:31:11 +04:00
/* retry locally a few times */
retry_bucket = 1 ;
2014-03-19 18:58:36 +04:00
else if ( local_fallback_retries > 0 & &
flocal < = in - > size + local_fallback_retries )
2009-10-06 22:31:11 +04:00
/* exhaustive bucket search */
retry_bucket = 1 ;
2014-03-19 18:58:36 +04:00
else if ( ftotal < tries )
2009-10-06 22:31:11 +04:00
/* then retry descent */
retry_descent = 1 ;
else
/* else give up */
skip_rep = 1 ;
dprintk ( " reject %d collide %d "
2012-05-08 02:38:35 +04:00
" ftotal %u flocal %u \n " ,
2009-10-06 22:31:11 +04:00
reject , collide , ftotal ,
flocal ) ;
}
} while ( retry_bucket ) ;
} while ( retry_descent ) ;
if ( skip_rep ) {
2013-12-24 23:19:25 +04:00
dprintk ( " skip rep \n " ) ;
continue ;
2009-10-06 22:31:11 +04:00
}
2010-06-24 23:58:14 +04:00
dprintk ( " CHOOSE got %d \n " , item ) ;
2009-10-06 22:31:11 +04:00
out [ outpos ] = item ;
outpos + + ;
2015-04-14 16:04:23 +03:00
count - - ;
2015-06-12 13:21:07 +03:00
# ifndef __KERNEL__
if ( map - > choose_tries & & ftotal < = map - > choose_total_tries )
map - > choose_tries [ ftotal ] + + ;
# endif
2009-10-06 22:31:11 +04:00
}
2010-06-24 23:58:14 +04:00
dprintk ( " CHOOSE returns %d \n " , outpos ) ;
2009-10-06 22:31:11 +04:00
return outpos ;
}
2013-12-24 23:19:25 +04:00
/**
2013-12-24 23:19:25 +04:00
* crush_choose_indep : alternative breadth - first positionally stable mapping
2013-12-24 23:19:25 +04:00
*
*/
static void crush_choose_indep ( const struct crush_map * map ,
2017-01-31 17:55:06 +03:00
struct crush_work * work ,
const struct crush_bucket * bucket ,
2013-12-24 23:19:26 +04:00
const __u32 * weight , int weight_max ,
2013-12-24 23:19:25 +04:00
int x , int left , int numrep , int type ,
2013-12-24 23:19:26 +04:00
int * out , int outpos ,
2013-12-24 23:19:27 +04:00
unsigned int tries ,
unsigned int recurse_tries ,
2013-12-24 23:19:26 +04:00
int recurse_to_leaf ,
2013-12-24 23:19:25 +04:00
int * out2 ,
2017-06-22 20:44:05 +03:00
int parent_r ,
const struct crush_choose_arg * choose_args )
2013-12-24 23:19:25 +04:00
{
2017-01-31 17:55:06 +03:00
const struct crush_bucket * in = bucket ;
2013-12-24 23:19:25 +04:00
int endpos = outpos + left ;
2013-12-24 23:19:25 +04:00
int rep ;
unsigned int ftotal ;
int r ;
int i ;
int item = 0 ;
int itemtype ;
int collide ;
dprintk ( " CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d \n " , recurse_to_leaf ? " _LEAF " : " " ,
bucket - > id , x , outpos , numrep ) ;
/* initially my result is undefined */
2013-12-24 23:19:25 +04:00
for ( rep = outpos ; rep < endpos ; rep + + ) {
2013-12-24 23:19:25 +04:00
out [ rep ] = CRUSH_ITEM_UNDEF ;
if ( out2 )
out2 [ rep ] = CRUSH_ITEM_UNDEF ;
}
2013-12-24 23:19:27 +04:00
for ( ftotal = 0 ; left > 0 & & ftotal < tries ; ftotal + + ) {
2015-06-12 13:21:07 +03:00
# ifdef DEBUG_INDEP
if ( out2 & & ftotal ) {
dprintk ( " %u %d a: " , ftotal , left ) ;
for ( rep = outpos ; rep < endpos ; rep + + ) {
dprintk ( " %d " , out [ rep ] ) ;
}
dprintk ( " \n " ) ;
dprintk ( " %u %d b: " , ftotal , left ) ;
for ( rep = outpos ; rep < endpos ; rep + + ) {
dprintk ( " %d " , out2 [ rep ] ) ;
}
dprintk ( " \n " ) ;
}
# endif
2013-12-24 23:19:25 +04:00
for ( rep = outpos ; rep < endpos ; rep + + ) {
2013-12-24 23:19:25 +04:00
if ( out [ rep ] ! = CRUSH_ITEM_UNDEF )
continue ;
in = bucket ; /* initial bucket */
/* choose through intervening buckets */
for ( ; ; ) {
2013-12-24 23:19:25 +04:00
/* note: we base the choice on the position
* even in the nested call . that means that
* if the first layer chooses the same bucket
* in a different position , we will tend to
* choose a different item in that bucket .
* this will involve more devices in data
* movement and tend to distribute the load .
*/
2013-12-24 23:19:25 +04:00
r = rep + parent_r ;
2013-12-24 23:19:25 +04:00
/* be careful */
if ( in - > alg = = CRUSH_BUCKET_UNIFORM & &
in - > size % numrep = = 0 )
/* r'=r+(n+1)*f_total */
r + = ( numrep + 1 ) * ftotal ;
else
/* r' = r + n*f_total */
r + = numrep * ftotal ;
/* bucket choose */
if ( in - > size = = 0 ) {
dprintk ( " empty bucket \n " ) ;
break ;
}
2017-01-31 17:55:06 +03:00
item = crush_bucket_choose (
in , work - > work [ - 1 - in - > id ] ,
2017-06-22 20:44:05 +03:00
x , r ,
( choose_args ?
& choose_args [ - 1 - in - > id ] : 0 ) ,
outpos ) ;
2013-12-24 23:19:25 +04:00
if ( item > = map - > max_devices ) {
dprintk ( " bad item %d \n " , item ) ;
out [ rep ] = CRUSH_ITEM_NONE ;
if ( out2 )
out2 [ rep ] = CRUSH_ITEM_NONE ;
left - - ;
break ;
}
/* desired type? */
if ( item < 0 )
itemtype = map - > buckets [ - 1 - item ] - > type ;
else
itemtype = 0 ;
dprintk ( " item %d type %d \n " , item , itemtype ) ;
/* keep going? */
if ( itemtype ! = type ) {
if ( item > = 0 | |
( - 1 - item ) > = map - > max_buckets ) {
dprintk ( " bad item type %d \n " , type ) ;
out [ rep ] = CRUSH_ITEM_NONE ;
if ( out2 )
out2 [ rep ] =
CRUSH_ITEM_NONE ;
left - - ;
break ;
}
in = map - > buckets [ - 1 - item ] ;
continue ;
}
/* collision? */
collide = 0 ;
2013-12-24 23:19:25 +04:00
for ( i = outpos ; i < endpos ; i + + ) {
2013-12-24 23:19:25 +04:00
if ( out [ i ] = = item ) {
collide = 1 ;
break ;
}
}
if ( collide )
break ;
if ( recurse_to_leaf ) {
if ( item < 0 ) {
2017-01-31 17:55:06 +03:00
crush_choose_indep (
map ,
work ,
map - > buckets [ - 1 - item ] ,
weight , weight_max ,
x , 1 , numrep , 0 ,
out2 , rep ,
recurse_tries , 0 ,
2017-06-22 20:44:05 +03:00
0 , NULL , r ,
choose_args ) ;
2013-12-24 23:19:25 +04:00
if ( out2 [ rep ] = = CRUSH_ITEM_NONE ) {
/* placed nothing; no leaf */
break ;
}
} else {
/* we already have a leaf! */
out2 [ rep ] = item ;
}
}
/* out? */
if ( itemtype = = 0 & &
is_out ( map , weight , weight_max , item , x ) )
break ;
/* yay! */
out [ rep ] = item ;
left - - ;
break ;
}
}
}
2013-12-24 23:19:25 +04:00
for ( rep = outpos ; rep < endpos ; rep + + ) {
2013-12-24 23:19:25 +04:00
if ( out [ rep ] = = CRUSH_ITEM_UNDEF ) {
out [ rep ] = CRUSH_ITEM_NONE ;
}
if ( out2 & & out2 [ rep ] = = CRUSH_ITEM_UNDEF ) {
out2 [ rep ] = CRUSH_ITEM_NONE ;
}
}
2015-06-12 13:21:07 +03:00
# ifndef __KERNEL__
if ( map - > choose_tries & & ftotal < = map - > choose_total_tries )
map - > choose_tries [ ftotal ] + + ;
# endif
# ifdef DEBUG_INDEP
if ( out2 ) {
dprintk ( " %u %d a: " , ftotal , left ) ;
for ( rep = outpos ; rep < endpos ; rep + + ) {
dprintk ( " %d " , out [ rep ] ) ;
}
dprintk ( " \n " ) ;
dprintk ( " %u %d b: " , ftotal , left ) ;
for ( rep = outpos ; rep < endpos ; rep + + ) {
dprintk ( " %d " , out2 [ rep ] ) ;
}
dprintk ( " \n " ) ;
}
# endif
2013-12-24 23:19:25 +04:00
}
2017-01-31 17:55:06 +03:00
/*
* This takes a chunk of memory and sets it up to be a shiny new
* working area for a CRUSH placement computation . It must be called
* on any newly allocated memory before passing it in to
* crush_do_rule . It may be used repeatedly after that , so long as the
* map has not changed . If the map / has / changed , you must make sure
* the working size is no smaller than what was allocated and re - run
* crush_init_workspace .
*
* If you do retain the working space between calls to crush , make it
* thread - local .
*/
void crush_init_workspace ( const struct crush_map * map , void * v )
{
struct crush_work * w = v ;
__s32 b ;
/*
* We work by moving through the available space and setting
* values and pointers as we go .
*
* It ' s a bit like Forth ' s use of the ' allot ' word since we
* set the pointer first and then reserve the space for it to
* point to by incrementing the point .
*/
v + = sizeof ( struct crush_work * ) ;
w - > work = v ;
v + = map - > max_buckets * sizeof ( struct crush_work_bucket * ) ;
for ( b = 0 ; b < map - > max_buckets ; + + b ) {
if ( ! map - > buckets [ b ] )
continue ;
w - > work [ b ] = v ;
switch ( map - > buckets [ b ] - > alg ) {
default :
v + = sizeof ( struct crush_work_bucket ) ;
break ;
}
w - > work [ b ] - > perm_x = 0 ;
w - > work [ b ] - > perm_n = 0 ;
w - > work [ b ] - > perm = v ;
v + = map - > buckets [ b ] - > size * sizeof ( __u32 ) ;
}
BUG_ON ( v - ( void * ) w ! = map - > working_size ) ;
}
2009-10-06 22:31:11 +04:00
/**
* crush_do_rule - calculate a mapping with the given input and rule
* @ map : the crush_map
* @ ruleno : the rule id
* @ x : hash input
* @ result : pointer to result vector
* @ result_max : maximum result size
2013-12-24 23:19:24 +04:00
* @ weight : weight vector ( for map leaves )
* @ weight_max : size of weight vector
2017-01-31 17:55:06 +03:00
* @ cwin : pointer to at least crush_work_size ( ) bytes of memory
2017-06-22 20:44:05 +03:00
* @ choose_args : weights and ids for each known bucket
2009-10-06 22:31:11 +04:00
*/
2012-05-08 02:38:35 +04:00
int crush_do_rule ( const struct crush_map * map ,
2009-10-06 22:31:11 +04:00
int ruleno , int x , int * result , int result_max ,
2013-12-24 23:19:24 +04:00
const __u32 * weight , int weight_max ,
2017-06-22 20:44:05 +03:00
void * cwin , const struct crush_choose_arg * choose_args )
2009-10-06 22:31:11 +04:00
{
int result_len ;
2017-01-31 17:55:06 +03:00
struct crush_work * cw = cwin ;
2017-01-31 17:55:06 +03:00
int * a = cwin + map - > working_size ;
int * b = a + result_max ;
int * c = b + result_max ;
int * w = a ;
int * o = b ;
2009-10-06 22:31:11 +04:00
int recurse_to_leaf ;
int wsize = 0 ;
int osize ;
int * tmp ;
2017-01-31 17:55:06 +03:00
const struct crush_rule * rule ;
2012-05-08 02:38:35 +04:00
__u32 step ;
2009-10-06 22:31:11 +04:00
int i , j ;
int numrep ;
2015-04-14 16:04:23 +03:00
int out_size ;
2014-03-19 18:58:36 +04:00
/*
* the original choose_total_tries value was off by one ( it
* counted " retries " and not " tries " ) . add one .
*/
int choose_tries = map - > choose_total_tries + 1 ;
2013-12-24 23:19:26 +04:00
int choose_leaf_tries = 0 ;
2014-03-19 18:58:36 +04:00
/*
* the local tries values were counted as " retries " , though ,
* and need no adjustment
*/
int choose_local_retries = map - > choose_local_tries ;
int choose_local_fallback_retries = map - > choose_local_fallback_tries ;
2009-10-06 22:31:11 +04:00
2014-03-19 18:58:37 +04:00
int vary_r = map - > chooseleaf_vary_r ;
2016-01-31 16:36:07 +03:00
int stable = map - > chooseleaf_stable ;
2014-03-19 18:58:37 +04:00
2012-05-08 02:35:24 +04:00
if ( ( __u32 ) ruleno > = map - > max_rules ) {
dprintk ( " bad ruleno %d \n " , ruleno ) ;
return 0 ;
}
2009-10-06 22:31:11 +04:00
rule = map - > rules [ ruleno ] ;
result_len = 0 ;
for ( step = 0 ; step < rule - > len ; step + + ) {
2013-12-24 23:19:24 +04:00
int firstn = 0 ;
2017-01-31 17:55:06 +03:00
const struct crush_rule_step * curstep = & rule - > steps [ step ] ;
2012-05-08 02:35:48 +04:00
switch ( curstep - > op ) {
2009-10-06 22:31:11 +04:00
case CRUSH_RULE_TAKE :
2015-06-12 11:20:03 +03:00
if ( ( curstep - > arg1 > = 0 & &
curstep - > arg1 < map - > max_devices ) | |
2016-01-31 16:36:05 +03:00
( - 1 - curstep - > arg1 > = 0 & &
- 1 - curstep - > arg1 < map - > max_buckets & &
2015-06-12 11:20:03 +03:00
map - > buckets [ - 1 - curstep - > arg1 ] ) ) {
w [ 0 ] = curstep - > arg1 ;
wsize = 1 ;
} else {
dprintk ( " bad take value %d \n " , curstep - > arg1 ) ;
}
2009-10-06 22:31:11 +04:00
break ;
2013-12-24 23:19:26 +04:00
case CRUSH_RULE_SET_CHOOSE_TRIES :
if ( curstep - > arg1 > 0 )
choose_tries = curstep - > arg1 ;
break ;
2013-12-24 23:19:26 +04:00
case CRUSH_RULE_SET_CHOOSELEAF_TRIES :
2013-12-24 23:19:26 +04:00
if ( curstep - > arg1 > 0 )
choose_leaf_tries = curstep - > arg1 ;
break ;
2013-12-24 23:19:27 +04:00
case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES :
2014-03-19 18:58:37 +04:00
if ( curstep - > arg1 > = 0 )
2014-03-19 18:58:36 +04:00
choose_local_retries = curstep - > arg1 ;
2013-12-24 23:19:27 +04:00
break ;
case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES :
2014-03-19 18:58:37 +04:00
if ( curstep - > arg1 > = 0 )
2014-03-19 18:58:36 +04:00
choose_local_fallback_retries = curstep - > arg1 ;
2013-12-24 23:19:27 +04:00
break ;
2014-03-19 18:58:37 +04:00
case CRUSH_RULE_SET_CHOOSELEAF_VARY_R :
if ( curstep - > arg1 > = 0 )
vary_r = curstep - > arg1 ;
break ;
2013-12-24 23:19:27 +04:00
2016-01-31 16:36:07 +03:00
case CRUSH_RULE_SET_CHOOSELEAF_STABLE :
if ( curstep - > arg1 > = 0 )
stable = curstep - > arg1 ;
break ;
2013-12-24 23:19:26 +04:00
case CRUSH_RULE_CHOOSELEAF_FIRSTN :
2009-10-06 22:31:11 +04:00
case CRUSH_RULE_CHOOSE_FIRSTN :
firstn = 1 ;
2012-05-08 02:35:48 +04:00
/* fall through */
2013-12-24 23:19:26 +04:00
case CRUSH_RULE_CHOOSELEAF_INDEP :
2009-10-06 22:31:11 +04:00
case CRUSH_RULE_CHOOSE_INDEP :
2012-05-08 02:35:24 +04:00
if ( wsize = = 0 )
break ;
2009-10-06 22:31:11 +04:00
recurse_to_leaf =
2012-05-08 02:35:48 +04:00
curstep - > op = =
2013-12-24 23:19:26 +04:00
CRUSH_RULE_CHOOSELEAF_FIRSTN | |
2012-05-08 02:35:48 +04:00
curstep - > op = =
2013-12-24 23:19:26 +04:00
CRUSH_RULE_CHOOSELEAF_INDEP ;
2009-10-06 22:31:11 +04:00
/* reset output */
osize = 0 ;
for ( i = 0 ; i < wsize ; i + + ) {
2016-01-31 16:35:59 +03:00
int bno ;
2009-10-06 22:31:11 +04:00
/*
* see CRUSH_N , CRUSH_N_MINUS macros .
* basically , numrep < = 0 means relative to
* the provided result_max
*/
2012-05-08 02:35:48 +04:00
numrep = curstep - > arg1 ;
2009-10-06 22:31:11 +04:00
if ( numrep < = 0 ) {
numrep + = result_max ;
if ( numrep < = 0 )
continue ;
}
j = 0 ;
2016-01-31 16:35:59 +03:00
/* make sure bucket id is valid */
bno = - 1 - w [ i ] ;
if ( bno < 0 | | bno > = map - > max_buckets ) {
/* w[i] is probably CRUSH_ITEM_NONE */
dprintk ( " bad w[i] %d \n " , w [ i ] ) ;
continue ;
}
2013-12-24 23:19:25 +04:00
if ( firstn ) {
2013-12-24 23:19:26 +04:00
int recurse_tries ;
if ( choose_leaf_tries )
recurse_tries =
choose_leaf_tries ;
else if ( map - > chooseleaf_descend_once )
recurse_tries = 1 ;
else
recurse_tries = choose_tries ;
2013-12-24 23:19:25 +04:00
osize + = crush_choose_firstn (
map ,
2017-01-31 17:55:06 +03:00
cw ,
2016-01-31 16:35:59 +03:00
map - > buckets [ bno ] ,
2013-12-24 23:19:25 +04:00
weight , weight_max ,
x , numrep ,
curstep - > arg2 ,
o + osize , j ,
2015-04-14 16:04:23 +03:00
result_max - osize ,
2013-12-24 23:19:26 +04:00
choose_tries ,
2013-12-24 23:19:26 +04:00
recurse_tries ,
2014-03-19 18:58:36 +04:00
choose_local_retries ,
choose_local_fallback_retries ,
2013-12-24 23:19:25 +04:00
recurse_to_leaf ,
2014-03-19 18:58:37 +04:00
vary_r ,
2016-01-31 16:36:07 +03:00
stable ,
2014-03-19 18:58:37 +04:00
c + osize ,
2017-06-22 20:44:05 +03:00
0 ,
choose_args ) ;
2013-12-24 23:19:25 +04:00
} else {
2015-04-14 16:04:23 +03:00
out_size = ( ( numrep < ( result_max - osize ) ) ?
2015-06-12 13:21:07 +03:00
numrep : ( result_max - osize ) ) ;
2013-12-24 23:19:25 +04:00
crush_choose_indep (
map ,
2017-01-31 17:55:06 +03:00
cw ,
2016-01-31 16:35:59 +03:00
map - > buckets [ bno ] ,
2013-12-24 23:19:25 +04:00
weight , weight_max ,
2015-04-14 16:04:23 +03:00
x , out_size , numrep ,
2013-12-24 23:19:25 +04:00
curstep - > arg2 ,
o + osize , j ,
2013-12-24 23:19:26 +04:00
choose_tries ,
2013-12-24 23:19:26 +04:00
choose_leaf_tries ?
choose_leaf_tries : 1 ,
2013-12-24 23:19:25 +04:00
recurse_to_leaf ,
2013-12-24 23:19:25 +04:00
c + osize ,
2017-06-22 20:44:05 +03:00
0 ,
choose_args ) ;
2015-04-14 16:04:23 +03:00
osize + = out_size ;
2013-12-24 23:19:25 +04:00
}
2009-10-06 22:31:11 +04:00
}
if ( recurse_to_leaf )
/* copy final _leaf_ values to output set */
memcpy ( o , c , osize * sizeof ( * o ) ) ;
2013-12-24 23:19:24 +04:00
/* swap o and w arrays */
2009-10-06 22:31:11 +04:00
tmp = o ;
o = w ;
w = tmp ;
wsize = osize ;
break ;
case CRUSH_RULE_EMIT :
for ( i = 0 ; i < wsize & & result_len < result_max ; i + + ) {
result [ result_len ] = w [ i ] ;
result_len + + ;
}
wsize = 0 ;
break ;
default :
2012-05-08 02:35:24 +04:00
dprintk ( " unknown op %d at step %d \n " ,
curstep - > op , step ) ;
break ;
2009-10-06 22:31:11 +04:00
}
}
2017-01-31 17:55:06 +03:00
2011-12-07 21:10:26 +04:00
return result_len ;
2009-10-06 22:31:11 +04:00
}