2010-04-06 15:14:15 -07:00
# include <linux/ceph/ceph_debug.h>
2009-10-06 11:31:09 -07:00
# include <linux/bug.h>
# include <linux/err.h>
# include <linux/random.h>
# include <linux/slab.h>
# include <linux/types.h>
2010-04-06 15:14:15 -07:00
# include <linux/ceph/mdsmap.h>
# include <linux/ceph/messenger.h>
# include <linux/ceph/decode.h>
2009-10-06 11:31:09 -07:00
# include "super.h"
/*
* choose a random mds that is " up " ( i . e . has a state > 0 ) , or - 1.
*/
int ceph_mdsmap_get_random_mds ( struct ceph_mdsmap * m )
{
int n = 0 ;
int i ;
2013-04-09 16:49:11 -05:00
/* special case for one mds */
2017-03-28 17:04:13 +08:00
if ( 1 = = m - > m_num_mds & & m - > m_info [ 0 ] . state > 0 )
2013-04-09 16:49:11 -05:00
return 0 ;
2009-10-06 11:31:09 -07:00
/* count */
2017-03-28 17:04:13 +08:00
for ( i = 0 ; i < m - > m_num_mds ; i + + )
2009-10-06 11:31:09 -07:00
if ( m - > m_info [ i ] . state > 0 )
n + + ;
if ( n = = 0 )
return - 1 ;
/* pick */
2013-04-09 16:49:11 -05:00
n = prandom_u32 ( ) % n ;
2009-10-06 11:31:09 -07:00
i = 0 ;
for ( i = 0 ; n > 0 ; i + + , n - - )
while ( m - > m_info [ i ] . state < = 0 )
i + + ;
return i ;
}
2016-11-10 16:02:06 +08:00
/*
 * Skip one fixed-size value of @type at the cursor *p.  Jumps to @bad,
 * leaving the cursor untouched, if fewer than sizeof(type) bytes remain
 * before @end.
 */
#define __decode_and_drop_type(p, end, type, bad)		\
	do {							\
		if ((end) < *(p) + sizeof(type))		\
			goto bad;				\
		*(p) += sizeof(type);				\
	} while (0)
/*
 * Skip an encoded set: a u32 element count followed by that many
 * fixed-size elements of @type.  Jumps to @bad if the count or the
 * elements do not fit before @end.
 */
#define __decode_and_drop_set(p, end, type, bad)		\
	do {							\
		u32 __count;					\
		size_t __bytes;					\
		ceph_decode_32_safe(p, end, __count, bad);	\
		__bytes = sizeof(type) * __count;		\
		ceph_decode_need(p, end, __bytes, bad);		\
		*(p) += __bytes;				\
	} while (0)
/*
 * Skip an encoded map: a u32 entry count followed by that many
 * fixed-size (@ktype, @vtype) pairs.  Jumps to @bad if the count or the
 * entries do not fit before @end.
 */
#define __decode_and_drop_map(p, end, ktype, vtype, bad)		\
	do {								\
		u32 __count;						\
		size_t __bytes;						\
		ceph_decode_32_safe(p, end, __count, bad);		\
		__bytes = (sizeof(ktype) + sizeof(vtype)) * __count;	\
		ceph_decode_need(p, end, __bytes, bad);			\
		*(p) += __bytes;					\
	} while (0)
static int __decode_and_drop_compat_set ( void * * p , void * end )
{
int i ;
/* compat, ro_compat, incompat*/
for ( i = 0 ; i < 3 ; i + + ) {
u32 n ;
ceph_decode_need ( p , end , sizeof ( u64 ) + sizeof ( u32 ) , bad ) ;
/* mask */
* p + = sizeof ( u64 ) ;
/* names (map<u64, string>) */
n = ceph_decode_32 ( p ) ;
while ( n - - > 0 ) {
u32 len ;
ceph_decode_need ( p , end , sizeof ( u64 ) + sizeof ( u32 ) ,
bad ) ;
* p + = sizeof ( u64 ) ;
len = ceph_decode_32 ( p ) ;
ceph_decode_need ( p , end , len , bad ) ;
* p + = len ;
}
}
return 0 ;
bad :
return - 1 ;
}
2009-10-06 11:31:09 -07:00
/*
* Decode an MDS map
*
* Ignore any fields we don ' t care about ( there are quite a few of
* them ) .
*/
struct ceph_mdsmap * ceph_mdsmap_decode ( void * * p , void * end )
{
struct ceph_mdsmap * m ;
2009-12-14 15:13:47 -08:00
const void * start = * p ;
2009-10-06 11:31:09 -07:00
int i , j , n ;
int err = - EINVAL ;
2016-03-31 15:53:01 +08:00
u8 mdsmap_v , mdsmap_cv ;
2016-11-10 16:02:06 +08:00
u16 mdsmap_ev ;
2009-10-06 11:31:09 -07:00
m = kzalloc ( sizeof ( * m ) , GFP_NOFS ) ;
if ( m = = NULL )
return ERR_PTR ( - ENOMEM ) ;
2016-03-31 15:53:01 +08:00
ceph_decode_need ( p , end , 1 + 1 , bad ) ;
mdsmap_v = ceph_decode_8 ( p ) ;
mdsmap_cv = ceph_decode_8 ( p ) ;
if ( mdsmap_v > = 4 ) {
u32 mdsmap_len ;
ceph_decode_32_safe ( p , end , mdsmap_len , bad ) ;
if ( end < * p + mdsmap_len )
goto bad ;
end = * p + mdsmap_len ;
2013-02-23 10:41:09 -08:00
}
2009-10-06 11:31:09 -07:00
ceph_decode_need ( p , end , 8 * sizeof ( u32 ) + sizeof ( u64 ) , bad ) ;
2009-10-14 09:59:09 -07:00
m - > m_epoch = ceph_decode_32 ( p ) ;
m - > m_client_epoch = ceph_decode_32 ( p ) ;
m - > m_last_failure = ceph_decode_32 ( p ) ;
m - > m_root = ceph_decode_32 ( p ) ;
m - > m_session_timeout = ceph_decode_32 ( p ) ;
m - > m_session_autoclose = ceph_decode_32 ( p ) ;
m - > m_max_file_size = ceph_decode_64 ( p ) ;
m - > m_max_mds = ceph_decode_32 ( p ) ;
2017-03-28 17:04:13 +08:00
m - > m_num_mds = m - > m_max_mds ;
2009-10-06 11:31:09 -07:00
2017-03-28 17:04:13 +08:00
m - > m_info = kcalloc ( m - > m_num_mds , sizeof ( * m - > m_info ) , GFP_NOFS ) ;
2009-10-06 11:31:09 -07:00
if ( m - > m_info = = NULL )
2016-11-10 16:02:06 +08:00
goto nomem ;
2009-10-06 11:31:09 -07:00
/* pick out active nodes from mds_info (state > 0) */
2009-10-14 09:59:09 -07:00
n = ceph_decode_32 ( p ) ;
2009-10-06 11:31:09 -07:00
for ( i = 0 ; i < n ; i + + ) {
2009-11-19 15:31:50 -08:00
u64 global_id ;
2009-10-06 11:31:09 -07:00
u32 namelen ;
s32 mds , inc , state ;
u64 state_seq ;
2016-03-31 15:53:01 +08:00
u8 info_v ;
void * info_end = NULL ;
2009-10-06 11:31:09 -07:00
struct ceph_entity_addr addr ;
u32 num_export_targets ;
void * pexport_targets = NULL ;
2010-06-17 14:19:01 -07:00
struct ceph_timespec laggy_since ;
2013-05-29 06:46:56 -05:00
struct ceph_mds_info * info ;
2009-10-06 11:31:09 -07:00
2016-03-31 15:53:01 +08:00
ceph_decode_need ( p , end , sizeof ( u64 ) + 1 , bad ) ;
2009-11-19 15:31:50 -08:00
global_id = ceph_decode_64 ( p ) ;
2016-03-31 15:53:01 +08:00
info_v = ceph_decode_8 ( p ) ;
if ( info_v > = 4 ) {
u32 info_len ;
u8 info_cv ;
ceph_decode_need ( p , end , 1 + sizeof ( u32 ) , bad ) ;
info_cv = ceph_decode_8 ( p ) ;
info_len = ceph_decode_32 ( p ) ;
info_end = * p + info_len ;
if ( info_end > end )
goto bad ;
}
ceph_decode_need ( p , end , sizeof ( u64 ) + sizeof ( u32 ) , bad ) ;
2009-11-19 15:31:50 -08:00
* p + = sizeof ( u64 ) ;
2009-10-14 09:59:09 -07:00
namelen = ceph_decode_32 ( p ) ; /* skip mds name */
2009-10-06 11:31:09 -07:00
* p + = namelen ;
ceph_decode_need ( p , end ,
2009-10-07 16:38:19 -07:00
4 * sizeof ( u32 ) + sizeof ( u64 ) +
2009-10-06 11:31:09 -07:00
sizeof ( addr ) + sizeof ( struct ceph_timespec ) ,
bad ) ;
2009-10-14 09:59:09 -07:00
mds = ceph_decode_32 ( p ) ;
inc = ceph_decode_32 ( p ) ;
state = ceph_decode_32 ( p ) ;
state_seq = ceph_decode_64 ( p ) ;
2009-11-19 15:31:50 -08:00
ceph_decode_copy ( p , & addr , sizeof ( addr ) ) ;
ceph_decode_addr ( & addr ) ;
2010-06-17 14:19:01 -07:00
ceph_decode_copy ( p , & laggy_since , sizeof ( laggy_since ) ) ;
2009-10-06 11:31:09 -07:00
* p + = sizeof ( u32 ) ;
ceph_decode_32_safe ( p , end , namelen , bad ) ;
2009-10-07 16:38:19 -07:00
* p + = namelen ;
2016-03-31 15:53:01 +08:00
if ( info_v > = 2 ) {
2009-10-06 11:31:09 -07:00
ceph_decode_32_safe ( p , end , num_export_targets , bad ) ;
pexport_targets = * p ;
2009-10-07 16:38:19 -07:00
* p + = num_export_targets * sizeof ( u32 ) ;
2009-10-06 11:31:09 -07:00
} else {
num_export_targets = 0 ;
}
2016-03-31 15:53:01 +08:00
if ( info_end & & * p ! = info_end ) {
if ( * p > info_end )
goto bad ;
* p = info_end ;
}
2009-11-19 15:31:50 -08:00
dout ( " mdsmap_decode %d/%d %lld mds%d.%d %s %s \n " ,
2010-04-06 15:14:15 -07:00
i + 1 , n , global_id , mds , inc ,
ceph_pr_addr ( & addr . in_addr ) ,
2009-10-06 11:31:09 -07:00
ceph_mds_state_name ( state ) ) ;
2013-05-29 06:46:56 -05:00
2017-03-28 17:04:13 +08:00
if ( mds < 0 | | state < = 0 )
2013-05-29 06:46:56 -05:00
continue ;
2017-03-28 17:04:13 +08:00
if ( mds > = m - > m_num_mds ) {
int new_num = max ( mds + 1 , m - > m_num_mds * 2 ) ;
void * new_m_info = krealloc ( m - > m_info ,
new_num * sizeof ( * m - > m_info ) ,
GFP_NOFS | __GFP_ZERO ) ;
if ( ! new_m_info )
goto nomem ;
m - > m_info = new_m_info ;
m - > m_num_mds = new_num ;
}
2013-05-29 06:46:56 -05:00
info = & m - > m_info [ mds ] ;
info - > global_id = global_id ;
info - > state = state ;
info - > addr = addr ;
info - > laggy = ( laggy_since . tv_sec ! = 0 | |
laggy_since . tv_nsec ! = 0 ) ;
info - > num_export_targets = num_export_targets ;
if ( num_export_targets ) {
info - > export_targets = kcalloc ( num_export_targets ,
sizeof ( u32 ) , GFP_NOFS ) ;
if ( info - > export_targets = = NULL )
2016-11-10 16:02:06 +08:00
goto nomem ;
2013-05-29 06:46:56 -05:00
for ( j = 0 ; j < num_export_targets ; j + + )
info - > export_targets [ j ] =
ceph_decode_32 ( & pexport_targets ) ;
} else {
info - > export_targets = NULL ;
2009-10-06 11:31:09 -07:00
}
}
2017-03-28 17:04:13 +08:00
if ( m - > m_num_mds > m - > m_max_mds ) {
/* find max up mds */
for ( i = m - > m_num_mds ; i > = m - > m_max_mds ; i - - ) {
if ( i = = 0 | | m - > m_info [ i - 1 ] . state > 0 )
break ;
}
m - > m_num_mds = i ;
}
2009-10-06 11:31:09 -07:00
/* pg_pools */
ceph_decode_32_safe ( p , end , n , bad ) ;
m - > m_num_data_pg_pools = n ;
2013-02-23 10:41:09 -08:00
m - > m_data_pg_pools = kcalloc ( n , sizeof ( u64 ) , GFP_NOFS ) ;
2009-10-06 11:31:09 -07:00
if ( ! m - > m_data_pg_pools )
2016-11-10 16:02:06 +08:00
goto nomem ;
2013-02-23 10:41:09 -08:00
ceph_decode_need ( p , end , sizeof ( u64 ) * ( n + 1 ) , bad ) ;
2009-10-06 11:31:09 -07:00
for ( i = 0 ; i < n ; i + + )
2013-02-23 10:41:09 -08:00
m - > m_data_pg_pools [ i ] = ceph_decode_64 ( p ) ;
m - > m_cas_pg_pool = ceph_decode_64 ( p ) ;
2016-11-10 16:02:06 +08:00
m - > m_enabled = m - > m_epoch > 1 ;
mdsmap_ev = 1 ;
if ( mdsmap_v > = 2 ) {
ceph_decode_16_safe ( p , end , mdsmap_ev , bad_ext ) ;
}
if ( mdsmap_ev > = 3 ) {
if ( __decode_and_drop_compat_set ( p , end ) < 0 )
goto bad_ext ;
}
/* metadata_pool */
if ( mdsmap_ev < 5 ) {
__decode_and_drop_type ( p , end , u32 , bad_ext ) ;
} else {
__decode_and_drop_type ( p , end , u64 , bad_ext ) ;
}
2009-10-06 11:31:09 -07:00
2016-11-10 16:02:06 +08:00
/* created + modified + tableserver */
__decode_and_drop_type ( p , end , struct ceph_timespec , bad_ext ) ;
__decode_and_drop_type ( p , end , struct ceph_timespec , bad_ext ) ;
__decode_and_drop_type ( p , end , u32 , bad_ext ) ;
/* in */
{
int num_laggy = 0 ;
ceph_decode_32_safe ( p , end , n , bad_ext ) ;
ceph_decode_need ( p , end , sizeof ( u32 ) * n , bad_ext ) ;
for ( i = 0 ; i < n ; i + + ) {
s32 mds = ceph_decode_32 ( p ) ;
2017-03-28 17:04:13 +08:00
if ( mds > = 0 & & mds < m - > m_num_mds ) {
2016-11-10 16:02:06 +08:00
if ( m - > m_info [ mds ] . laggy )
num_laggy + + ;
}
}
m - > m_num_laggy = num_laggy ;
2017-03-28 17:04:13 +08:00
if ( n > m - > m_num_mds ) {
void * new_m_info = krealloc ( m - > m_info ,
n * sizeof ( * m - > m_info ) ,
GFP_NOFS | __GFP_ZERO ) ;
if ( ! new_m_info )
goto nomem ;
m - > m_info = new_m_info ;
}
m - > m_num_mds = n ;
2016-11-10 16:02:06 +08:00
}
/* inc */
__decode_and_drop_map ( p , end , u32 , u32 , bad_ext ) ;
/* up */
__decode_and_drop_map ( p , end , u32 , u64 , bad_ext ) ;
/* failed */
__decode_and_drop_set ( p , end , u32 , bad_ext ) ;
/* stopped */
__decode_and_drop_set ( p , end , u32 , bad_ext ) ;
if ( mdsmap_ev > = 4 ) {
/* last_failure_osd_epoch */
__decode_and_drop_type ( p , end , u32 , bad_ext ) ;
}
if ( mdsmap_ev > = 6 ) {
/* ever_allowed_snaps */
__decode_and_drop_type ( p , end , u8 , bad_ext ) ;
/* explicitly_allowed_snaps */
__decode_and_drop_type ( p , end , u8 , bad_ext ) ;
}
if ( mdsmap_ev > = 7 ) {
/* inline_data_enabled */
__decode_and_drop_type ( p , end , u8 , bad_ext ) ;
}
if ( mdsmap_ev > = 8 ) {
u32 name_len ;
/* enabled */
ceph_decode_8_safe ( p , end , m - > m_enabled , bad_ext ) ;
ceph_decode_32_safe ( p , end , name_len , bad_ext ) ;
ceph_decode_need ( p , end , name_len , bad_ext ) ;
* p + = name_len ;
}
/* damaged */
if ( mdsmap_ev > = 9 ) {
size_t need ;
ceph_decode_32_safe ( p , end , n , bad_ext ) ;
need = sizeof ( u32 ) * n ;
ceph_decode_need ( p , end , need , bad_ext ) ;
* p + = need ;
m - > m_damaged = n > 0 ;
} else {
m - > m_damaged = false ;
}
bad_ext :
2016-03-31 15:53:01 +08:00
* p = end ;
2009-10-06 11:31:09 -07:00
dout ( " mdsmap_decode success epoch %u \n " , m - > m_epoch ) ;
return m ;
2016-11-10 16:02:06 +08:00
nomem :
2009-10-06 11:31:09 -07:00
err = - ENOMEM ;
2016-11-10 16:02:06 +08:00
goto out_err ;
2009-10-06 11:31:09 -07:00
bad :
pr_err ( " corrupt mdsmap \n " ) ;
2009-12-14 15:13:47 -08:00
print_hex_dump ( KERN_DEBUG , " mdsmap: " ,
DUMP_PREFIX_OFFSET , 16 , 1 ,
start , end - start , true ) ;
2016-11-10 16:02:06 +08:00
out_err :
2009-10-06 11:31:09 -07:00
ceph_mdsmap_destroy ( m ) ;
2013-05-28 16:59:00 +02:00
return ERR_PTR ( err ) ;
2009-10-06 11:31:09 -07:00
}
void ceph_mdsmap_destroy ( struct ceph_mdsmap * m )
{
int i ;
2017-03-28 17:04:13 +08:00
for ( i = 0 ; i < m - > m_num_mds ; i + + )
2009-10-06 11:31:09 -07:00
kfree ( m - > m_info [ i ] . export_targets ) ;
kfree ( m - > m_info ) ;
kfree ( m - > m_data_pg_pools ) ;
kfree ( m ) ;
}
2016-11-10 16:02:06 +08:00
bool ceph_mdsmap_is_cluster_available ( struct ceph_mdsmap * m )
{
int i , nr_active = 0 ;
if ( ! m - > m_enabled )
return false ;
if ( m - > m_damaged )
return false ;
if ( m - > m_num_laggy > 0 )
return false ;
2017-03-28 17:04:13 +08:00
for ( i = 0 ; i < m - > m_num_mds ; i + + ) {
2016-11-10 16:02:06 +08:00
if ( m - > m_info [ i ] . state = = CEPH_MDS_STATE_ACTIVE )
nr_active + + ;
}
return nr_active > 0 ;
}