2015-11-23 08:18:16 +03:00
/*
ctdb ip takeover code
Copyright ( C ) Ronnie Sahlberg 2007
Copyright ( C ) Andrew Tridgell 2007
Copyright ( C ) Martin Schwenke 2011
This program is free software ; you can redistribute it and / or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation ; either version 3 of the License , or
( at your option ) any later version .
This program is distributed in the hope that it will be useful ,
but WITHOUT ANY WARRANTY ; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
GNU General Public License for more details .
You should have received a copy of the GNU General Public License
along with this program ; if not , see < http : //www.gnu.org/licenses/>.
*/
# include "replace.h"
# include "system/network.h"
# include "lib/util/debug.h"
# include "common/logging.h"
2017-09-04 09:00:48 +03:00
# include "protocol/protocol_util.h"
2015-11-23 08:18:16 +03:00
# include "server/ipalloc_private.h"
/*
* This is the length of the longtest common prefix between the IPs .
* It is calculated by XOR - ing the 2 IPs together and counting the
* number of leading zeroes . The implementation means that all
* addresses end up being 128 bits long .
*
* FIXME ? Should we consider IPv4 and IPv6 separately given that the
* 12 bytes of 0 prefix padding will hurt the algorithm if there are
* lots of nodes and IP addresses ?
*/
static uint32_t ip_distance ( ctdb_sock_addr * ip1 , ctdb_sock_addr * ip2 )
{
uint32_t ip1_k [ IP_KEYLEN ] ;
uint32_t * t ;
int i ;
uint32_t x ;
uint32_t distance = 0 ;
memcpy ( ip1_k , ip_key ( ip1 ) , sizeof ( ip1_k ) ) ;
t = ip_key ( ip2 ) ;
for ( i = 0 ; i < IP_KEYLEN ; i + + ) {
x = ip1_k [ i ] ^ t [ i ] ;
if ( x = = 0 ) {
distance + = 32 ;
} else {
/* Count number of leading zeroes.
* FIXME ? This could be optimised . . .
*/
while ( ( x & ( 1 < < 31 ) ) = = 0 ) {
x < < = 1 ;
distance + = 1 ;
}
}
}
return distance ;
}
/* Calculate the IP distance for the given IP relative to IPs on the
given node . The ips argument is generally the all_ips variable
used in the main part of the algorithm .
*/
static uint32_t ip_distance_2_sum ( ctdb_sock_addr * ip ,
struct public_ip_list * ips ,
int pnn )
{
struct public_ip_list * t ;
uint32_t d ;
uint32_t sum = 0 ;
for ( t = ips ; t ! = NULL ; t = t - > next ) {
if ( t - > pnn ! = pnn ) {
continue ;
}
/* Optimisation: We never calculate the distance
* between an address and itself . This allows us to
* calculate the effect of removing an address from a
* node by simply calculating the distance between
* that address and all of the exitsing addresses .
* Moreover , we assume that we ' re only ever dealing
* with addresses from all_ips so we can identify an
* address via a pointer rather than doing a more
* expensive address comparison . */
if ( & ( t - > addr ) = = ip ) {
continue ;
}
d = ip_distance ( ip , & ( t - > addr ) ) ;
sum + = d * d ; /* Cheaper than pulling in math.h :-) */
}
return sum ;
}
/* Return the LCP2 imbalance metric for addresses currently assigned
to the given node .
*/
static uint32_t lcp2_imbalance ( struct public_ip_list * all_ips , int pnn )
{
struct public_ip_list * t ;
uint32_t imbalance = 0 ;
for ( t = all_ips ; t ! = NULL ; t = t - > next ) {
if ( t - > pnn ! = pnn ) {
continue ;
}
/* Pass the rest of the IPs rather than the whole
all_ips input list .
*/
imbalance + = ip_distance_2_sum ( & ( t - > addr ) , t - > next , pnn ) ;
}
return imbalance ;
}
static bool lcp2_init ( struct ipalloc_state * ipalloc_state ,
uint32_t * * lcp2_imbalances ,
bool * * rebalance_candidates )
{
int i , numnodes ;
struct public_ip_list * t ;
numnodes = ipalloc_state - > num ;
* rebalance_candidates = talloc_array ( ipalloc_state , bool , numnodes ) ;
if ( * rebalance_candidates = = NULL ) {
DEBUG ( DEBUG_ERR , ( __location__ " out of memory \n " ) ) ;
return false ;
}
* lcp2_imbalances = talloc_array ( ipalloc_state , uint32_t , numnodes ) ;
if ( * lcp2_imbalances = = NULL ) {
DEBUG ( DEBUG_ERR , ( __location__ " out of memory \n " ) ) ;
return false ;
}
for ( i = 0 ; i < numnodes ; i + + ) {
( * lcp2_imbalances ) [ i ] =
lcp2_imbalance ( ipalloc_state - > all_ips , i ) ;
/* First step: assume all nodes are candidates */
( * rebalance_candidates ) [ i ] = true ;
}
/* 2nd step: if a node has IPs assigned then it must have been
* healthy before , so we remove it from consideration . This
* is overkill but is all we have because we don ' t maintain
* state between takeover runs . An alternative would be to
* keep state and invalidate it every time the recovery master
* changes .
*/
for ( t = ipalloc_state - > all_ips ; t ! = NULL ; t = t - > next ) {
if ( t - > pnn ! = - 1 ) {
( * rebalance_candidates ) [ t - > pnn ] = false ;
}
}
/* 3rd step: if a node is forced to re-balance then
we allow failback onto the node */
if ( ipalloc_state - > force_rebalance_nodes = = NULL ) {
return true ;
}
for ( i = 0 ;
i < talloc_array_length ( ipalloc_state - > force_rebalance_nodes ) ;
i + + ) {
uint32_t pnn = ipalloc_state - > force_rebalance_nodes [ i ] ;
if ( pnn > = numnodes ) {
DEBUG ( DEBUG_ERR ,
( __location__ " unknown node %u \n " , pnn ) ) ;
continue ;
}
DEBUG ( DEBUG_NOTICE ,
( " Forcing rebalancing of IPs to node %u \n " , pnn ) ) ;
( * rebalance_candidates ) [ pnn ] = true ;
}
return true ;
}
/* Allocate any unassigned addresses using the LCP2 algorithm to find
* the IP / node combination that will cost the least .
*/
static void lcp2_allocate_unassigned ( struct ipalloc_state * ipalloc_state ,
uint32_t * lcp2_imbalances )
{
struct public_ip_list * t ;
int dstnode , numnodes ;
int minnode ;
2016-02-05 13:39:46 +03:00
uint32_t mindsum , dstdsum , dstimbl ;
uint32_t minimbl = 0 ;
2015-11-23 08:18:16 +03:00
struct public_ip_list * minip ;
bool should_loop = true ;
bool have_unassigned = true ;
numnodes = ipalloc_state - > num ;
while ( have_unassigned & & should_loop ) {
should_loop = false ;
DEBUG ( DEBUG_DEBUG , ( " ---------------------------------------- \n " ) ) ;
DEBUG ( DEBUG_DEBUG , ( " CONSIDERING MOVES (UNASSIGNED) \n " ) ) ;
minnode = - 1 ;
mindsum = 0 ;
minip = NULL ;
/* loop over each unassigned ip. */
for ( t = ipalloc_state - > all_ips ; t ! = NULL ; t = t - > next ) {
if ( t - > pnn ! = - 1 ) {
continue ;
}
for ( dstnode = 0 ; dstnode < numnodes ; dstnode + + ) {
/* only check nodes that can actually takeover this ip */
if ( ! can_node_takeover_ip ( ipalloc_state ,
dstnode ,
t ) ) {
/* no it couldnt so skip to the next node */
continue ;
}
dstdsum = ip_distance_2_sum ( & ( t - > addr ) ,
ipalloc_state - > all_ips ,
dstnode ) ;
dstimbl = lcp2_imbalances [ dstnode ] + dstdsum ;
DEBUG ( DEBUG_DEBUG ,
( " %s -> %d [+%d] \n " ,
ctdb_sock_addr_to_string ( ipalloc_state ,
2017-09-11 08:00:10 +03:00
& ( t - > addr ) ,
false ) ,
2015-11-23 08:18:16 +03:00
dstnode ,
dstimbl - lcp2_imbalances [ dstnode ] ) ) ;
if ( ( minnode = = - 1 ) | | ( dstdsum < mindsum ) ) {
minnode = dstnode ;
minimbl = dstimbl ;
mindsum = dstdsum ;
minip = t ;
should_loop = true ;
}
}
}
DEBUG ( DEBUG_DEBUG , ( " ---------------------------------------- \n " ) ) ;
/* If we found one then assign it to the given node. */
if ( minnode ! = - 1 ) {
minip - > pnn = minnode ;
lcp2_imbalances [ minnode ] = minimbl ;
DEBUG ( DEBUG_INFO , ( " %s -> %d [+%d] \n " ,
ctdb_sock_addr_to_string (
ipalloc_state ,
2017-09-11 08:00:10 +03:00
& ( minip - > addr ) , false ) ,
2015-11-23 08:18:16 +03:00
minnode ,
mindsum ) ) ;
}
/* There might be a better way but at least this is clear. */
have_unassigned = false ;
for ( t = ipalloc_state - > all_ips ; t ! = NULL ; t = t - > next ) {
if ( t - > pnn = = - 1 ) {
have_unassigned = true ;
}
}
}
/* We know if we have an unassigned addresses so we might as
* well optimise .
*/
if ( have_unassigned ) {
for ( t = ipalloc_state - > all_ips ; t ! = NULL ; t = t - > next ) {
if ( t - > pnn = = - 1 ) {
DEBUG ( DEBUG_WARNING ,
( " Failed to find node to cover ip %s \n " ,
ctdb_sock_addr_to_string ( ipalloc_state ,
2017-09-11 08:00:10 +03:00
& t - > addr ,
false ) ) ) ;
2015-11-23 08:18:16 +03:00
}
}
}
}
/* LCP2 algorithm for rebalancing the cluster. Given a candidate node
* to move IPs from , determines the best IP / destination node
* combination to move from the source node .
*/
static bool lcp2_failback_candidate ( struct ipalloc_state * ipalloc_state ,
int srcnode ,
uint32_t * lcp2_imbalances ,
bool * rebalance_candidates )
{
int dstnode , mindstnode , numnodes ;
uint32_t srcimbl , srcdsum , dstimbl , dstdsum ;
uint32_t minsrcimbl , mindstimbl ;
struct public_ip_list * minip ;
struct public_ip_list * t ;
/* Find an IP and destination node that best reduces imbalance. */
srcimbl = 0 ;
minip = NULL ;
minsrcimbl = 0 ;
mindstnode = - 1 ;
mindstimbl = 0 ;
numnodes = ipalloc_state - > num ;
DEBUG ( DEBUG_DEBUG , ( " ---------------------------------------- \n " ) ) ;
DEBUG ( DEBUG_DEBUG , ( " CONSIDERING MOVES FROM %d [%d] \n " ,
srcnode , lcp2_imbalances [ srcnode ] ) ) ;
for ( t = ipalloc_state - > all_ips ; t ! = NULL ; t = t - > next ) {
/* Only consider addresses on srcnode. */
if ( t - > pnn ! = srcnode ) {
continue ;
}
/* What is this IP address costing the source node? */
srcdsum = ip_distance_2_sum ( & ( t - > addr ) ,
ipalloc_state - > all_ips ,
srcnode ) ;
srcimbl = lcp2_imbalances [ srcnode ] - srcdsum ;
/* Consider this IP address would cost each potential
* destination node . Destination nodes are limited to
* those that are newly healthy , since we don ' t want
* to do gratuitous failover of IPs just to make minor
* balance improvements .
*/
for ( dstnode = 0 ; dstnode < numnodes ; dstnode + + ) {
if ( ! rebalance_candidates [ dstnode ] ) {
continue ;
}
/* only check nodes that can actually takeover this ip */
if ( ! can_node_takeover_ip ( ipalloc_state , dstnode ,
t ) ) {
/* no it couldnt so skip to the next node */
continue ;
}
dstdsum = ip_distance_2_sum ( & ( t - > addr ) ,
ipalloc_state - > all_ips ,
dstnode ) ;
dstimbl = lcp2_imbalances [ dstnode ] + dstdsum ;
DEBUG ( DEBUG_DEBUG , ( " %d [%d] -> %s -> %d [+%d] \n " ,
srcnode , - srcdsum ,
ctdb_sock_addr_to_string (
2017-09-11 08:00:10 +03:00
ipalloc_state ,
& ( t - > addr ) , false ) ,
2015-11-23 08:18:16 +03:00
dstnode , dstdsum ) ) ;
if ( ( dstimbl < lcp2_imbalances [ srcnode ] ) & &
( dstdsum < srcdsum ) & & \
( ( mindstnode = = - 1 ) | | \
( ( srcimbl + dstimbl ) < ( minsrcimbl + mindstimbl ) ) ) ) {
minip = t ;
minsrcimbl = srcimbl ;
mindstnode = dstnode ;
mindstimbl = dstimbl ;
}
}
}
DEBUG ( DEBUG_DEBUG , ( " ---------------------------------------- \n " ) ) ;
if ( mindstnode ! = - 1 ) {
/* We found a move that makes things better... */
DEBUG ( DEBUG_INFO ,
( " %d [%d] -> %s -> %d [+%d] \n " ,
srcnode , minsrcimbl - lcp2_imbalances [ srcnode ] ,
2017-09-11 08:00:10 +03:00
ctdb_sock_addr_to_string ( ipalloc_state ,
& ( minip - > addr ) , false ) ,
2015-11-23 08:18:16 +03:00
mindstnode , mindstimbl - lcp2_imbalances [ mindstnode ] ) ) ;
lcp2_imbalances [ srcnode ] = minsrcimbl ;
lcp2_imbalances [ mindstnode ] = mindstimbl ;
minip - > pnn = mindstnode ;
return true ;
}
return false ;
}
struct lcp2_imbalance_pnn {
uint32_t imbalance ;
int pnn ;
} ;
static int lcp2_cmp_imbalance_pnn ( const void * a , const void * b )
{
const struct lcp2_imbalance_pnn * lipa = ( const struct lcp2_imbalance_pnn * ) a ;
const struct lcp2_imbalance_pnn * lipb = ( const struct lcp2_imbalance_pnn * ) b ;
if ( lipa - > imbalance > lipb - > imbalance ) {
return - 1 ;
} else if ( lipa - > imbalance = = lipb - > imbalance ) {
return 0 ;
} else {
return 1 ;
}
}
/* LCP2 algorithm for rebalancing the cluster. This finds the source
* node with the highest LCP2 imbalance , and then determines the best
* IP / destination node combination to move from the source node .
*/
static void lcp2_failback ( struct ipalloc_state * ipalloc_state ,
uint32_t * lcp2_imbalances ,
bool * rebalance_candidates )
{
int i , numnodes ;
struct lcp2_imbalance_pnn * lips ;
bool again ;
numnodes = ipalloc_state - > num ;
try_again :
/* Put the imbalances and nodes into an array, sort them and
* iterate through candidates . Usually the 1 st one will be
* used , so this doesn ' t cost much . . .
*/
DEBUG ( DEBUG_DEBUG , ( " +++++++++++++++++++++++++++++++++++++++++ \n " ) ) ;
DEBUG ( DEBUG_DEBUG , ( " Selecting most imbalanced node from: \n " ) ) ;
lips = talloc_array ( ipalloc_state , struct lcp2_imbalance_pnn , numnodes ) ;
for ( i = 0 ; i < numnodes ; i + + ) {
lips [ i ] . imbalance = lcp2_imbalances [ i ] ;
lips [ i ] . pnn = i ;
DEBUG ( DEBUG_DEBUG , ( " %d [%d] \n " , i , lcp2_imbalances [ i ] ) ) ;
}
qsort ( lips , numnodes , sizeof ( struct lcp2_imbalance_pnn ) ,
lcp2_cmp_imbalance_pnn ) ;
again = false ;
for ( i = 0 ; i < numnodes ; i + + ) {
/* This means that all nodes had 0 or 1 addresses, so
* can ' t be imbalanced .
*/
if ( lips [ i ] . imbalance = = 0 ) {
break ;
}
if ( lcp2_failback_candidate ( ipalloc_state ,
lips [ i ] . pnn ,
lcp2_imbalances ,
rebalance_candidates ) ) {
again = true ;
break ;
}
}
talloc_free ( lips ) ;
if ( again ) {
goto try_again ;
}
}
bool ipalloc_lcp2 ( struct ipalloc_state * ipalloc_state )
{
uint32_t * lcp2_imbalances ;
bool * rebalance_candidates ;
int numnodes , num_rebalance_candidates , i ;
bool ret = true ;
unassign_unsuitable_ips ( ipalloc_state ) ;
if ( ! lcp2_init ( ipalloc_state ,
& lcp2_imbalances , & rebalance_candidates ) ) {
ret = false ;
goto finished ;
}
lcp2_allocate_unassigned ( ipalloc_state , lcp2_imbalances ) ;
/* If we don't want IPs to fail back then don't rebalance IPs. */
2016-06-21 09:34:44 +03:00
if ( ipalloc_state - > no_ip_failback ) {
2015-11-23 08:18:16 +03:00
goto finished ;
}
/* It is only worth continuing if we have suitable target
* nodes to transfer IPs to . This check is much cheaper than
* continuing on . . .
*/
numnodes = ipalloc_state - > num ;
num_rebalance_candidates = 0 ;
for ( i = 0 ; i < numnodes ; i + + ) {
if ( rebalance_candidates [ i ] ) {
num_rebalance_candidates + + ;
}
}
if ( num_rebalance_candidates = = 0 ) {
goto finished ;
}
/* Now, try to make sure the ip adresses are evenly distributed
across the nodes .
*/
lcp2_failback ( ipalloc_state , lcp2_imbalances , rebalance_candidates ) ;
finished :
return ret ;
}