/*
 * Copyright (C) 2015, SUSE
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 */


#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/dlm.h>
#include <linux/sched.h>
#include <linux/raid/md_p.h>
#include "md.h"
#include "bitmap.h"
#include "md-cluster.h"

#define LVB_SIZE	64
#define NEW_DEV_TIMEOUT 5000

struct dlm_lock_resource {
	dlm_lockspace_t *ls;
	struct dlm_lksb lksb;
	char *name; /* lock name. */
	uint32_t flags; /* flags to pass to dlm_lock() */
	wait_queue_head_t sync_locking; /* wait queue for synchronized locking */
	bool sync_locking_done;
	void (*bast)(void *arg, int mode); /* blocking AST function pointer */
	struct mddev *mddev; /* pointing back to mddev. */
	int mode;
};

struct suspend_info {
	int slot;
	sector_t lo;
	sector_t hi;
	struct list_head list;
};

struct resync_info {
	__le64 lo;
	__le64 hi;
};

/* md_cluster_info flags */
#define		MD_CLUSTER_WAITING_FOR_NEWDISK		1
#define		MD_CLUSTER_SUSPEND_READ_BALANCING	2
#define		MD_CLUSTER_BEGIN_JOIN_CLUSTER		3

/* Lock the send communication. This is done through
 * bit manipulation as opposed to a mutex in order to
 * accommodate lock and hold. See next comment.
 */
#define		MD_CLUSTER_SEND_LOCK			4
/* Some cluster operations (such as adding a disk) must lock the
 * communication channel so as to perform extra operations
 * (e.g. updating metadata) while no other operation is allowed on
 * the MD. The token needs to be locked and held until the operation
 * completes with md_update_sb(), which eventually releases
 * the lock.
 */
#define		MD_CLUSTER_SEND_LOCKED_ALREADY		5
/* We should receive messages only after the node has joined the cluster
 * and set up all the related info such as bitmap and personality */
#define		MD_CLUSTER_ALREADY_IN_CLUSTER		6
#define		MD_CLUSTER_PENDING_RECV_EVENT		7

struct md_cluster_info {
	/* dlm lock space and resources for clustered raid. */
	dlm_lockspace_t *lockspace;
	int slot_number;
	struct completion completion;
	struct mutex recv_mutex;
	struct dlm_lock_resource *bitmap_lockres;
	struct dlm_lock_resource **other_bitmap_lockres;
	struct dlm_lock_resource *resync_lockres;
	struct list_head suspend_list;
	spinlock_t suspend_lock;
	struct md_thread *recovery_thread;
	unsigned long recovery_map;
	/* communication lock resources */
	struct dlm_lock_resource *ack_lockres;
	struct dlm_lock_resource *message_lockres;
	struct dlm_lock_resource *token_lockres;
	struct dlm_lock_resource *no_new_dev_lockres;
	struct md_thread *recv_thread;
	struct completion newdisk_completion;
	wait_queue_head_t wait;
	unsigned long state;
	/* record the region in RESYNCING message */
	sector_t sync_low;
	sector_t sync_hi;
};

enum msg_type {
	METADATA_UPDATED = 0,
	RESYNCING,
	NEWDISK,
	REMOVE,
	RE_ADD,
	BITMAP_NEEDS_SYNC,
};

struct cluster_msg {
	__le32 type;
	__le32 slot;
	/* TODO: Unionize this for smaller footprint */
	__le64 low;
	__le64 high;
	char uuid[16];
	__le32 raid_slot;
};

static void sync_ast(void *arg)
{
	struct dlm_lock_resource *res;

	res = arg;
	res->sync_locking_done = true;
	wake_up(&res->sync_locking);
}

static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
{
	int ret = 0;

	ret = dlm_lock(res->ls, mode, &res->lksb,
			res->flags, res->name, strlen(res->name),
			0, sync_ast, res, res->bast);
	if (ret)
		return ret;
	wait_event(res->sync_locking, res->sync_locking_done);
	res->sync_locking_done = false;
	if (res->lksb.sb_status == 0)
		res->mode = mode;
	return res->lksb.sb_status;
}

static int dlm_unlock_sync(struct dlm_lock_resource *res)
{
	return dlm_lock_sync(res, DLM_LOCK_NL);
}
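
/*
 * Note that dlm_unlock_sync() above is a downconvert to DLM_LOCK_NL rather
 * than a real dlm_unlock(): since lockres_init() sets DLM_LKF_CONVERT, the
 * lock stays attached to the resource in NL mode, so a later dlm_lock_sync()
 * is simply a conversion back up. A minimal sketch of the resulting caller
 * pattern:
 *
 *	ret = dlm_lock_sync(res, DLM_LOCK_EX);	// NL -> EX convert
 *	if (!ret) {
 *		... critical section ...
 *		dlm_unlock_sync(res);		// EX -> NL convert
 *	}
 */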

/*
 * A variation of dlm_lock_sync() that allows the lock request to be
 * interrupted
 */
static int dlm_lock_sync_interruptible(struct dlm_lock_resource *res, int mode,
	struct mddev *mddev)
{
	int ret = 0;

	ret = dlm_lock(res->ls, mode, &res->lksb,
			res->flags, res->name, strlen(res->name),
			0, sync_ast, res, res->bast);
	if (ret)
		return ret;

	wait_event(res->sync_locking, res->sync_locking_done
				      || kthread_should_stop()
				      || test_bit(MD_CLOSING, &mddev->flags));
	if (!res->sync_locking_done) {
		/*
		 * the convert queue contains the lock request when request is
		 * interrupted, and sync_ast could still be run, so need to
		 * cancel the request and reset completion
		 */
		ret = dlm_unlock(res->ls, res->lksb.sb_lkid, DLM_LKF_CANCEL,
			&res->lksb, res);
		res->sync_locking_done = false;
		if (unlikely(ret != 0))
			pr_info("failed to cancel previous lock request "
				 "%s return %d\n", res->name, ret);
		return -EPERM;
	} else
		res->sync_locking_done = false;
	if (res->lksb.sb_status == 0)
		res->mode = mode;
	return res->lksb.sb_status;
}

static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
		char *name, void (*bastfn)(void *arg, int mode), int with_lvb)
{
	struct dlm_lock_resource *res = NULL;
	int ret, namelen;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
	if (!res)
		return NULL;
	init_waitqueue_head(&res->sync_locking);
	res->sync_locking_done = false;
	res->ls = cinfo->lockspace;
	res->mddev = mddev;
	res->mode = DLM_LOCK_IV;
	namelen = strlen(name);
	res->name = kzalloc(namelen + 1, GFP_KERNEL);
	if (!res->name) {
		pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name);
		goto out_err;
	}
	strlcpy(res->name, name, namelen + 1);
	if (with_lvb) {
		res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL);
		if (!res->lksb.sb_lvbptr) {
			pr_err("md-cluster: Unable to allocate LVB for resource %s\n", name);
			goto out_err;
		}
		res->flags = DLM_LKF_VALBLK;
	}

	if (bastfn)
		res->bast = bastfn;
	res->flags |= DLM_LKF_EXPEDITE;
	ret = dlm_lock_sync(res, DLM_LOCK_NL);
	if (ret) {
		pr_err("md-cluster: Unable to lock NL on new lock resource %s\n", name);
		goto out_err;
	}
	res->flags &= ~DLM_LKF_EXPEDITE;
	res->flags |= DLM_LKF_CONVERT;
	return res;
out_err:
	kfree(res->lksb.sb_lvbptr);
	kfree(res->name);
	kfree(res);
	return NULL;
}
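
/*
 * Illustrative sketch only (the lock name "foo" below is hypothetical): a
 * caller creates a named resource once and pairs it with lockres_free():
 *
 *	struct dlm_lock_resource *res;
 *
 *	res = lockres_init(mddev, "foo", NULL, 1);	// 1 => allocate an LVB
 *	if (!res)
 *		return -ENOMEM;
 *	...
 *	lockres_free(res);
 */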

static void lockres_free(struct dlm_lock_resource *res)
{
	int ret = 0;

	if (!res)
		return;

	/*
	 * use FORCEUNLOCK flag, so we can unlock even if the lock is on the
	 * waiting or convert queue
	 */
	ret = dlm_unlock(res->ls, res->lksb.sb_lkid, DLM_LKF_FORCEUNLOCK,
		&res->lksb, res);
	if (unlikely(ret != 0))
		pr_err("failed to unlock %s return %d\n", res->name, ret);
	else
		wait_event(res->sync_locking, res->sync_locking_done);

	kfree(res->name);
	kfree(res->lksb.sb_lvbptr);
	kfree(res);
}

static void add_resync_info(struct dlm_lock_resource *lockres,
			    sector_t lo, sector_t hi)
{
	struct resync_info *ri;

	ri = (struct resync_info *)lockres->lksb.sb_lvbptr;
	ri->lo = cpu_to_le64(lo);
	ri->hi = cpu_to_le64(hi);
}

static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres)
{
	struct resync_info ri;
	struct suspend_info *s = NULL;
	sector_t hi = 0;

	dlm_lock_sync(lockres, DLM_LOCK_CR);
	memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
	hi = le64_to_cpu(ri.hi);
	if (hi > 0) {
		s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
		if (!s)
			goto out;
		s->hi = hi;
		s->lo = le64_to_cpu(ri.lo);
	}
	dlm_unlock_sync(lockres);
out:
	return s;
}
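
/*
 * The resync window travels in the bitmap lock's LVB: add_resync_info()
 * stores {lo, hi} as little-endian 64-bit values at the start of the
 * 64-byte LVB, and read_resync_info() takes a CR lock so the DLM refreshes
 * the LVB before it is copied out. hi == 0 is the "no resync in progress"
 * marker, in which case no suspend_info is allocated.
 */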

static void recover_bitmaps(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct dlm_lock_resource *bm_lockres;
	char str[64];
	int slot, ret;
	struct suspend_info *s, *tmp;
	sector_t lo, hi;

	while (cinfo->recovery_map) {
		slot = fls64((u64)cinfo->recovery_map) - 1;

		/* Clear suspend_area associated with the bitmap */
		spin_lock_irq(&cinfo->suspend_lock);
		list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
			if (slot == s->slot) {
				list_del(&s->list);
				kfree(s);
			}
		spin_unlock_irq(&cinfo->suspend_lock);

		snprintf(str, 64, "bitmap%04d", slot);
		bm_lockres = lockres_init(mddev, str, NULL, 1);
		if (!bm_lockres) {
			pr_err("md-cluster: Cannot initialize bitmaps\n");
			goto clear_bit;
		}

		ret = dlm_lock_sync_interruptible(bm_lockres, DLM_LOCK_PW, mddev);
		if (ret) {
			pr_err("md-cluster: Could not DLM lock %s: %d\n",
					str, ret);
			goto clear_bit;
		}
		ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi, true);
		if (ret) {
			pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
			goto clear_bit;
		}
		if (hi > 0) {
			if (lo < mddev->recovery_cp)
				mddev->recovery_cp = lo;
			/* wake up thread to continue resync in case resync
			 * is not finished */
			if (mddev->recovery_cp != MaxSector) {
				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
				md_wakeup_thread(mddev->thread);
			}
		}
clear_bit:
		lockres_free(bm_lockres);
		clear_bit(slot, &cinfo->recovery_map);
	}
}

static void recover_prep(void *arg)
{
	struct mddev *mddev = arg;
	struct md_cluster_info *cinfo = mddev->cluster_info;
	set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}

static void __recover_slot(struct mddev *mddev, int slot)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	set_bit(slot, &cinfo->recovery_map);
	if (!cinfo->recovery_thread) {
		cinfo->recovery_thread = md_register_thread(recover_bitmaps,
				mddev, "recover");
		if (!cinfo->recovery_thread) {
			pr_warn("md-cluster: Could not create recovery thread\n");
			return;
		}
	}
	md_wakeup_thread(cinfo->recovery_thread);
}

static void recover_slot(void *arg, struct dlm_slot *slot)
{
	struct mddev *mddev = arg;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
			mddev->bitmap_info.cluster_name,
			slot->nodeid, slot->slot,
			cinfo->slot_number);
	/* deduct one since dlm slot numbers start at one while
	 * cluster-md's slot numbers begin at 0 */
	__recover_slot(mddev, slot->slot - 1);
}

static void recover_done(void *arg, struct dlm_slot *slots,
		int num_slots, int our_slot,
		uint32_t generation)
{
	struct mddev *mddev = arg;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	cinfo->slot_number = our_slot;
	/* completion only needs to be completed when this node joins
	 * the cluster; it doesn't need to run on another node's failure */
	if (test_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state)) {
		complete(&cinfo->completion);
		clear_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
	}
	clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}

/* these ops are called when a node joins the cluster, and they perform
 * lock recovery if a node failure occurs */
static const struct dlm_lockspace_ops md_ls_ops = {
	.recover_prep = recover_prep,
	.recover_slot = recover_slot,
	.recover_done = recover_done,
};
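
/*
 * These callbacks are wired into the DLM when the lockspace is created in
 * join() below:
 *
 *	ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name,
 *				DLM_LSFL_FS, LVB_SIZE,
 *				&md_ls_ops, mddev, &ops_rv, &cinfo->lockspace);
 *
 * so recover_prep()/recover_slot()/recover_done() run from DLM recovery
 * context whenever cluster membership changes.
 */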

/*
 * The BAST function for the ack lock resource
 * This function wakes up the receive thread in
 * order to receive and process the message.
 */
static void ack_bast(void *arg, int mode)
{
	struct dlm_lock_resource *res = arg;
	struct md_cluster_info *cinfo = res->mddev->cluster_info;

	if (mode == DLM_LOCK_EX) {
		if (test_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state))
			md_wakeup_thread(cinfo->recv_thread);
		else
			set_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state);
	}
}

static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
{
	struct suspend_info *s, *tmp;

	list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
		if (slot == s->slot) {
			list_del(&s->list);
			kfree(s);
			break;
		}
}

static void remove_suspend_info(struct mddev *mddev, int slot)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	spin_lock_irq(&cinfo->suspend_lock);
	__remove_suspend_info(cinfo, slot);
	spin_unlock_irq(&cinfo->suspend_lock);
	mddev->pers->quiesce(mddev, 2);
}

static void process_suspend_info(struct mddev *mddev,
		int slot, sector_t lo, sector_t hi)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct suspend_info *s;

	if (!hi) {
		remove_suspend_info(mddev, slot);
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		return;
	}

	/*
	 * The bitmaps are not the same on different nodes; if
	 * RESYNCING is happening on one node, then the node which
	 * received the RESYNCING message would probably resync the
	 * region [lo, hi] again, so we can cut the resync time a lot
	 * by making sure the bitmaps on the different nodes match up
	 * well.
	 *
	 * sync_low/hi records the region which arrived in the
	 * previous RESYNCING message.
	 *
	 * Call bitmap_sync_with_cluster to clear NEEDED_MASK and set
	 * RESYNC_MASK, since the resync thread is running on another
	 * node, so we don't need to resync the same section again.
	 */
	bitmap_sync_with_cluster(mddev, cinfo->sync_low,
					cinfo->sync_hi,
					lo, hi);
	cinfo->sync_low = lo;
	cinfo->sync_hi = hi;

	s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
	if (!s)
		return;
	s->slot = slot;
	s->lo = lo;
	s->hi = hi;
	mddev->pers->quiesce(mddev, 1);
	mddev->pers->quiesce(mddev, 0);
	spin_lock_irq(&cinfo->suspend_lock);
	/* Remove existing entry (if exists) before adding */
	__remove_suspend_info(cinfo, slot);
	list_add(&s->list, &cinfo->suspend_list);
	spin_unlock_irq(&cinfo->suspend_lock);
	mddev->pers->quiesce(mddev, 2);
}

static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
{
	char disk_uuid[64];
	struct md_cluster_info *cinfo = mddev->cluster_info;
	char event_name[] = "EVENT=ADD_DEVICE";
	char raid_slot[16];
	char *envp[] = {event_name, disk_uuid, raid_slot, NULL};
	int len;

	len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
	sprintf(disk_uuid + len, "%pU", cmsg->uuid);
	snprintf(raid_slot, 16, "RAID_DISK=%d", le32_to_cpu(cmsg->raid_slot));
	pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
	init_completion(&cinfo->newdisk_completion);
	set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
	kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp);
	wait_for_completion_timeout(&cinfo->newdisk_completion,
			NEW_DEV_TIMEOUT);
	clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
}

static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	mddev->good_device_nr = le32_to_cpu(msg->raid_slot);
	set_bit(MD_RELOAD_SB, &mddev->flags);
	dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
	md_wakeup_thread(mddev->thread);
}

static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
{
	struct md_rdev *rdev;

	rcu_read_lock();
	rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot));
	if (rdev) {
		set_bit(ClusterRemove, &rdev->flags);
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}
	else
		pr_warn("%s: %d Could not find disk(%d) to REMOVE\n",
			__func__, __LINE__, le32_to_cpu(msg->raid_slot));
	rcu_read_unlock();
}

static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
{
	struct md_rdev *rdev;

	rcu_read_lock();
	rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot));
	if (rdev && test_bit(Faulty, &rdev->flags))
		clear_bit(Faulty, &rdev->flags);
	else
		pr_warn("%s: %d Could not find disk(%d) which is faulty",
			__func__, __LINE__, le32_to_cpu(msg->raid_slot));
	rcu_read_unlock();
}

static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
{
	int ret = 0;

	if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot),
		"node %d received its own msg\n", le32_to_cpu(msg->slot)))
		return -1;
	switch (le32_to_cpu(msg->type)) {
	case METADATA_UPDATED:
		process_metadata_update(mddev, msg);
		break;
	case RESYNCING:
		process_suspend_info(mddev, le32_to_cpu(msg->slot),
				     le64_to_cpu(msg->low),
				     le64_to_cpu(msg->high));
		break;
	case NEWDISK:
		process_add_new_disk(mddev, msg);
		break;
	case REMOVE:
		process_remove_disk(mddev, msg);
		break;
	case RE_ADD:
		process_readd_disk(mddev, msg);
		break;
	case BITMAP_NEEDS_SYNC:
		__recover_slot(mddev, le32_to_cpu(msg->slot));
		break;
	default:
		ret = -1;
		pr_warn("%s:%d Received unknown message from %d\n",
			__func__, __LINE__, msg->slot);
	}
	return ret;
}

/*
 * thread for receiving message
 */
static void recv_daemon(struct md_thread *thread)
{
	struct md_cluster_info *cinfo = thread->mddev->cluster_info;
	struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres;
	struct dlm_lock_resource *message_lockres = cinfo->message_lockres;
	struct cluster_msg msg;
	int ret;

	mutex_lock(&cinfo->recv_mutex);
	/*get CR on Message*/
	if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
		pr_err("md/raid1:failed to get CR on MESSAGE\n");
		mutex_unlock(&cinfo->recv_mutex);
		return;
	}

	/* read lvb and wake up thread to process this message_lockres */
	memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg));
	ret = process_recvd_msg(thread->mddev, &msg);
	if (ret)
		goto out;

	/*release CR on ack_lockres*/
	ret = dlm_unlock_sync(ack_lockres);
	if (unlikely(ret != 0))
		pr_info("unlock ack failed return %d\n", ret);
	/*up-convert to PR on message_lockres*/
	ret = dlm_lock_sync(message_lockres, DLM_LOCK_PR);
	if (unlikely(ret != 0))
		pr_info("lock PR on msg failed return %d\n", ret);
	/*get CR on ack_lockres again*/
	ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
	if (unlikely(ret != 0))
		pr_info("lock CR on ack failed return %d\n", ret);
out:
	/*release CR on message_lockres*/
	ret = dlm_unlock_sync(message_lockres);
	if (unlikely(ret != 0))
		pr_info("unlock msg failed return %d\n", ret);
	mutex_unlock(&cinfo->recv_mutex);
}

/* lock_token()
 * Takes the lock on the TOKEN lock resource so no other
 * node can communicate while the operation is underway.
 */
static int lock_token(struct md_cluster_info *cinfo)
{
	int error;

	error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
	if (error)
		pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
				__func__, __LINE__, error);

	/* Lock the receive sequence */
	mutex_lock(&cinfo->recv_mutex);
	return error;
}

/* lock_comm()
 * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel.
 */
static int lock_comm(struct md_cluster_info *cinfo)
{
	wait_event(cinfo->wait,
		   !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state));

	return lock_token(cinfo);
}

static void unlock_comm(struct md_cluster_info *cinfo)
{
	WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX);
	mutex_unlock(&cinfo->recv_mutex);
	dlm_unlock_sync(cinfo->token_lockres);
	clear_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state);
	wake_up(&cinfo->wait);
}
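
/*
 * lock_comm() and unlock_comm() must always be used as a pair around a
 * message exchange: MD_CLUSTER_SEND_LOCK serializes senders on this node,
 * while the EX lock on TOKEN serializes senders across the cluster. The
 * canonical pattern is what sendmsg() below does:
 *
 *	lock_comm(cinfo);
 *	ret = __sendmsg(cinfo, &cmsg);
 *	unlock_comm(cinfo);
 */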

/* __sendmsg()
 * This function performs the actual sending of the message. This function is
 * usually called after performing the encompassing operation
 * The function:
 * 1. Grabs the message lockresource in EX mode
 * 2. Copies the message to the message LVB
 * 3. Downconverts message lockresource to CW
 * 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes
 *    and the other nodes read the message. The thread will wait here until all other
 *    nodes have released ack lock resource.
 * 5. Downconvert ack lockresource to CR
 */
static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
{
	int error;
	int slot = cinfo->slot_number - 1;

	cmsg->slot = cpu_to_le32(slot);
	/*get EX on Message*/
	error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX);
	if (error) {
		pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error);
		goto failed_message;
	}

	memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
			sizeof(struct cluster_msg));
	/*down-convert EX to CW on Message*/
	error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CW);
	if (error) {
		pr_err("md-cluster: failed to convert EX to CW on MESSAGE(%d)\n",
				error);
		goto failed_ack;
	}

	/*up-convert CR to EX on Ack*/
	error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_EX);
	if (error) {
		pr_err("md-cluster: failed to convert CR to EX on ACK(%d)\n",
				error);
		goto failed_ack;
	}

	/*down-convert EX to CR on Ack*/
	error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR);
	if (error) {
		pr_err("md-cluster: failed to convert EX to CR on ACK(%d)\n",
				error);
		goto failed_ack;
	}

failed_ack:
	error = dlm_unlock_sync(cinfo->message_lockres);
	if (unlikely(error != 0)) {
		pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n",
			error);
		/* in case the message can't be released due to some reason */
		goto failed_ack;
	}
failed_message:
	return error;
}

static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
{
	int ret;

	lock_comm(cinfo);
	ret = __sendmsg(cinfo, cmsg);
	unlock_comm(cinfo);
	return ret;
}
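
/*
 * A typical sender builds a struct cluster_msg and broadcasts it with
 * sendmsg(); e.g. remove_disk() further below does:
 *
 *	struct cluster_msg cmsg = {0};
 *
 *	cmsg.type = cpu_to_le32(REMOVE);
 *	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
 *	err = sendmsg(cinfo, &cmsg);
 *
 * All multi-byte fields are little-endian on the wire, so senders use
 * cpu_to_le32/64 and receivers le32/64_to_cpu (see process_recvd_msg()).
 */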

static int gather_all_resync_info(struct mddev *mddev, int total_slots)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	int i, ret = 0;
	struct dlm_lock_resource *bm_lockres;
	struct suspend_info *s;
	char str[64];
	sector_t lo, hi;

	for (i = 0; i < total_slots; i++) {
		memset(str, '\0', 64);
		snprintf(str, 64, "bitmap%04d", i);
		bm_lockres = lockres_init(mddev, str, NULL, 1);
		if (!bm_lockres)
			return -ENOMEM;
		if (i == (cinfo->slot_number - 1)) {
			lockres_free(bm_lockres);
			continue;
		}

		bm_lockres->flags |= DLM_LKF_NOQUEUE;
		ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
		if (ret == -EAGAIN) {
			memset(bm_lockres->lksb.sb_lvbptr, '\0', LVB_SIZE);
			s = read_resync_info(mddev, bm_lockres);
			if (s) {
				pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n",
						__func__, __LINE__,
						(unsigned long long) s->lo,
						(unsigned long long) s->hi, i);
				spin_lock_irq(&cinfo->suspend_lock);
				s->slot = i;
				list_add(&s->list, &cinfo->suspend_list);
				spin_unlock_irq(&cinfo->suspend_lock);
			}
			ret = 0;
			lockres_free(bm_lockres);
			continue;
		}
		if (ret) {
			lockres_free(bm_lockres);
			goto out;
		}

		/* Read the disk bitmap sb and check if it needs recovery */
		ret = bitmap_copy_from_slot(mddev, i, &lo, &hi, false);
		if (ret) {
			pr_warn("md-cluster: Could not gather bitmaps from slot %d", i);
			lockres_free(bm_lockres);
			continue;
		}
		if ((hi > 0) && (lo < mddev->recovery_cp)) {
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			mddev->recovery_cp = lo;
			md_check_recovery(mddev);
		}

		lockres_free(bm_lockres);
	}
out:
	return ret;
}

static int join(struct mddev *mddev, int nodes)
{
	struct md_cluster_info *cinfo;
	int ret, ops_rv;
	char str[64];

	cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL);
	if (!cinfo)
		return -ENOMEM;

	INIT_LIST_HEAD(&cinfo->suspend_list);
	spin_lock_init(&cinfo->suspend_lock);
	init_completion(&cinfo->completion);
	set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
	init_waitqueue_head(&cinfo->wait);
	mutex_init(&cinfo->recv_mutex);

	mddev->cluster_info = cinfo;

	memset(str, 0, 64);
	sprintf(str, "%pU", mddev->uuid);
	ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name,
				DLM_LSFL_FS, LVB_SIZE,
				&md_ls_ops, mddev, &ops_rv, &cinfo->lockspace);
	if (ret)
		goto err;
	wait_for_completion(&cinfo->completion);
	if (nodes < cinfo->slot_number) {
		pr_err("md-cluster: Slot allotted(%d) is greater than available slots(%d).",
			cinfo->slot_number, nodes);
		ret = -ERANGE;
		goto err;
	}
	/* Initiate the communication resources */
	ret = -ENOMEM;
	cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
	if (!cinfo->recv_thread) {
		pr_err("md-cluster: cannot allocate memory for recv_thread!\n");
		goto err;
	}
	cinfo->message_lockres = lockres_init(mddev, "message", NULL, 1);
	if (!cinfo->message_lockres)
		goto err;
	cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0);
	if (!cinfo->token_lockres)
		goto err;
	cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0);
	if (!cinfo->no_new_dev_lockres)
		goto err;

	ret = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
	if (ret) {
		ret = -EAGAIN;
		pr_err("md-cluster: can't join cluster to avoid lock issue\n");
		goto err;
	}
	cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0);
	if (!cinfo->ack_lockres) {
		ret = -ENOMEM;
		goto err;
	}
	/* get sync CR lock on ACK. */
	if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR))
		pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n",
				ret);
	dlm_unlock_sync(cinfo->token_lockres);
	/* get sync CR lock on no-new-dev. */
	if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR))
		pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret);


	pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number);
	snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1);
	cinfo->bitmap_lockres = lockres_init(mddev, str, NULL, 1);
	if (!cinfo->bitmap_lockres) {
		ret = -ENOMEM;
		goto err;
	}
	if (dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW)) {
		pr_err("Failed to get bitmap lock\n");
		ret = -EINVAL;
		goto err;
	}

	cinfo->resync_lockres = lockres_init(mddev, "resync", NULL, 0);
	if (!cinfo->resync_lockres) {
		ret = -ENOMEM;
		goto err;
	}

	return 0;
err:
	md_unregister_thread(&cinfo->recovery_thread);
	md_unregister_thread(&cinfo->recv_thread);
	lockres_free(cinfo->message_lockres);
	lockres_free(cinfo->token_lockres);
	lockres_free(cinfo->ack_lockres);
	lockres_free(cinfo->no_new_dev_lockres);
	lockres_free(cinfo->resync_lockres);
	lockres_free(cinfo->bitmap_lockres);
	if (cinfo->lockspace)
		dlm_release_lockspace(cinfo->lockspace, 2);
	mddev->cluster_info = NULL;
	kfree(cinfo);
	return ret;
}

static void load_bitmaps(struct mddev *mddev, int total_slots)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	/* load all the node's bitmap info for resync */
	if (gather_all_resync_info(mddev, total_slots))
		pr_err("md-cluster: failed to gather all resync infos\n");
	set_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state);
	/* wake up recv thread in case something needs to be handled */
	if (test_and_clear_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state))
		md_wakeup_thread(cinfo->recv_thread);
}

static void resync_bitmap(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct cluster_msg cmsg = {0};
	int err;

	cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
	err = sendmsg(cinfo, &cmsg);
	if (err)
		pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n",
			__func__, __LINE__, err);
}

static void unlock_all_bitmaps(struct mddev *mddev);
static int leave(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	if (!cinfo)
		return 0;

	/* BITMAP_NEEDS_SYNC message should be sent when a node
	 * is leaving the cluster with a dirty bitmap; also we
	 * can only deliver it when the dlm connection is available */
	if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector)
		resync_bitmap(mddev);

	md_unregister_thread(&cinfo->recovery_thread);
	md_unregister_thread(&cinfo->recv_thread);
	lockres_free(cinfo->message_lockres);
	lockres_free(cinfo->token_lockres);
	lockres_free(cinfo->ack_lockres);
	lockres_free(cinfo->no_new_dev_lockres);
	lockres_free(cinfo->resync_lockres);
	lockres_free(cinfo->bitmap_lockres);
	unlock_all_bitmaps(mddev);
	dlm_release_lockspace(cinfo->lockspace, 2);
	return 0;
}

/* slot_number(): Returns the MD slot number to use
 * DLM starts the slot numbers from 1, whereas cluster-md
 * wants the number to be from zero, so we deduct one
 */
static int slot_number(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	return cinfo->slot_number - 1;
}

/*
 * Check if the communication is already locked, else lock the communication
 * channel.
 * If it is already locked, token is in EX mode, and hence lock_token()
 * should not be called.
 */
static int metadata_update_start(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	wait_event(cinfo->wait,
		   !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state) ||
		   test_and_clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state));

	/* If token is already locked, return 0 */
	if (cinfo->token_lockres->mode == DLM_LOCK_EX)
		return 0;

	return lock_token(cinfo);
}

static int metadata_update_finish(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct cluster_msg cmsg;
	struct md_rdev *rdev;
	int ret = 0;
	int raid_slot = -1;

	memset(&cmsg, 0, sizeof(cmsg));
	cmsg.type = cpu_to_le32(METADATA_UPDATED);
	/* Pick up a good active device number to send.
	 */
	rdev_for_each(rdev, mddev)
		if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) {
			raid_slot = rdev->desc_nr;
			break;
		}
	if (raid_slot >= 0) {
		cmsg.raid_slot = cpu_to_le32(raid_slot);
		ret = __sendmsg(cinfo, &cmsg);
	} else
		pr_warn("md-cluster: No good device id found to send\n");
	clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
	unlock_comm(cinfo);
	return ret;
}

static void metadata_update_cancel(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
	unlock_comm(cinfo);
}

static int resync_start(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	return dlm_lock_sync_interruptible(cinfo->resync_lockres, DLM_LOCK_EX, mddev);
}

static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct resync_info ri;
	struct cluster_msg cmsg = {0};

	/* do not send zero again, if we have sent before */
	if (hi == 0) {
		memcpy(&ri, cinfo->bitmap_lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
		if (le64_to_cpu(ri.hi) == 0)
			return 0;
	}

	add_resync_info(cinfo->bitmap_lockres, lo, hi);
	/* Re-acquire the lock to refresh LVB */
	dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
	cmsg.type = cpu_to_le32(RESYNCING);
	cmsg.low = cpu_to_le64(lo);
	cmsg.high = cpu_to_le64(hi);

	return sendmsg(cinfo, &cmsg);
}

static int resync_finish(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	dlm_unlock_sync(cinfo->resync_lockres);
	return resync_info_update(mddev, 0, 0);
}

static int area_resyncing(struct mddev *mddev, int direction,
		sector_t lo, sector_t hi)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	int ret = 0;
	struct suspend_info *s;

	if ((direction == READ) &&
		test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state))
		return 1;

	spin_lock_irq(&cinfo->suspend_lock);
	if (list_empty(&cinfo->suspend_list))
		goto out;
	list_for_each_entry(s, &cinfo->suspend_list, list)
		if (hi > s->lo && lo < s->hi) {
			ret = 1;
			break;
		}
out:
	spin_unlock_irq(&cinfo->suspend_lock);
	return ret;
}
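
/*
 * Sketch of how a personality is expected to consult area_resyncing()
 * before issuing I/O (assuming the md_cluster_ops indirection used by the
 * md core; the exact call site lives in the personalities, e.g. raid1):
 *
 *	if (mddev_is_clustered(mddev) &&
 *	    md_cluster_ops->area_resyncing(mddev, WRITE, lo, hi))
 *		... wait until the region is no longer suspended ...
 */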

/* add_new_disk() - initiates a disk add
 * However, if this fails before writing md_update_sb(),
 * add_new_disk_cancel() must be called to release token lock
 */
static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct cluster_msg cmsg;
	int ret = 0;
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	char *uuid = sb->device_uuid;

	memset(&cmsg, 0, sizeof(cmsg));
	cmsg.type = cpu_to_le32(NEWDISK);
	memcpy(cmsg.uuid, uuid, 16);
	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
	lock_comm(cinfo);
	ret = __sendmsg(cinfo, &cmsg);
	if (ret)
		return ret;
	cinfo->no_new_dev_lockres->flags |= DLM_LKF_NOQUEUE;
	ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_EX);
	cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE;
	/* Some node does not "see" the device */
	if (ret == -EAGAIN)
		ret = -ENOENT;
	if (ret)
		unlock_comm(cinfo);
	else {
		dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
		/* Since MD_CHANGE_DEVS will be set in add_bound_rdev which
		 * will run soon after add_new_disk, the below path will be
		 * invoked:
		 *   md_wakeup_thread(mddev->thread)
		 *	-> conf->thread (raid1d)
		 *	-> md_check_recovery -> md_update_sb
		 *	-> metadata_update_start/finish
		 * MD_CLUSTER_SEND_LOCKED_ALREADY will be cleared eventually.
		 *
		 * For other failure cases, metadata_update_cancel and
		 * add_new_disk_cancel also clear the bit below as well.
		 * */
		set_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
		wake_up(&cinfo->wait);
	}
	return ret;
}

static void add_new_disk_cancel(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
	unlock_comm(cinfo);
}

static int new_disk_ack(struct mddev *mddev, bool ack)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	if (!test_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state)) {
		pr_warn("md-cluster(%s): Spurious cluster confirmation\n", mdname(mddev));
		return -EINVAL;
	}

	if (ack)
		dlm_unlock_sync(cinfo->no_new_dev_lockres);
	complete(&cinfo->newdisk_completion);
	return 0;
}

static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct cluster_msg cmsg = {0};
	struct md_cluster_info *cinfo = mddev->cluster_info;
	cmsg.type = cpu_to_le32(REMOVE);
	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
	return sendmsg(cinfo, &cmsg);
}

static int lock_all_bitmaps(struct mddev *mddev)
{
	int slot, my_slot, ret, held = 1, i = 0;
	char str[64];
	struct md_cluster_info *cinfo = mddev->cluster_info;

	cinfo->other_bitmap_lockres = kzalloc((mddev->bitmap_info.nodes - 1) *
					     sizeof(struct dlm_lock_resource *),
					     GFP_KERNEL);
	if (!cinfo->other_bitmap_lockres) {
		pr_err("md: can't alloc mem for other bitmap locks\n");
		return 0;
	}

	my_slot = slot_number(mddev);
	for (slot = 0; slot < mddev->bitmap_info.nodes; slot++) {
		if (slot == my_slot)
			continue;

		memset(str, '\0', 64);
		snprintf(str, 64, "bitmap%04d", slot);
		cinfo->other_bitmap_lockres[i] = lockres_init(mddev, str, NULL, 1);
		if (!cinfo->other_bitmap_lockres[i])
			return -ENOMEM;

		cinfo->other_bitmap_lockres[i]->flags |= DLM_LKF_NOQUEUE;
		ret = dlm_lock_sync(cinfo->other_bitmap_lockres[i], DLM_LOCK_PW);
		if (ret)
			held = -1;
		i++;
	}

	return held;
}

static void unlock_all_bitmaps(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	int i;

	/* release other nodes' bitmap locks if they exist */
	if (cinfo->other_bitmap_lockres) {
		for (i = 0; i < mddev->bitmap_info.nodes - 1; i++) {
			if (cinfo->other_bitmap_lockres[i]) {
				lockres_free(cinfo->other_bitmap_lockres[i]);
			}
		}
		kfree(cinfo->other_bitmap_lockres);
	}
}

static int gather_bitmaps(struct md_rdev *rdev)
{
	int sn, err;
	sector_t lo, hi;
	struct cluster_msg cmsg = {0};
	struct mddev *mddev = rdev->mddev;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	cmsg.type = cpu_to_le32(RE_ADD);
	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
	err = sendmsg(cinfo, &cmsg);
	if (err)
		goto out;

	for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) {
		if (sn == (cinfo->slot_number - 1))
			continue;
		err = bitmap_copy_from_slot(mddev, sn, &lo, &hi, false);
		if (err) {
			pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn);
			goto out;
		}
		if ((hi > 0) && (lo < mddev->recovery_cp))
			mddev->recovery_cp = lo;
	}
out:
	return err;
}

static struct md_cluster_operations cluster_ops = {
	.join   = join,
	.leave  = leave,
	.slot_number = slot_number,
	.resync_start = resync_start,
	.resync_finish = resync_finish,
	.resync_info_update = resync_info_update,
	.metadata_update_start = metadata_update_start,
	.metadata_update_finish = metadata_update_finish,
	.metadata_update_cancel = metadata_update_cancel,
	.area_resyncing = area_resyncing,
	.add_new_disk = add_new_disk,
	.add_new_disk_cancel = add_new_disk_cancel,
	.new_disk_ack = new_disk_ack,
	.remove_disk = remove_disk,
	.load_bitmaps = load_bitmaps,
	.gather_bitmaps = gather_bitmaps,
	.lock_all_bitmaps = lock_all_bitmaps,
	.unlock_all_bitmaps = unlock_all_bitmaps,
};
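
/*
 * The md core never calls these handlers directly; it goes through the
 * md_cluster_operations pointer filled in by
 * register_md_cluster_operations() below, which allows this module to be
 * loaded on demand when an array with a clustered bitmap is assembled.
 */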

static int __init cluster_init(void)
{
	pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n");
	pr_info("Registering Cluster MD functions\n");
	register_md_cluster_operations(&cluster_ops, THIS_MODULE);
	return 0;
}

static void cluster_exit(void)
{
	unregister_md_cluster_operations();
}

module_init(cluster_init);
module_exit(cluster_exit);
MODULE_AUTHOR("SUSE");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Clustering support for MD");