/*
   ctdb ip takeover code

   Copyright (C) Ronnie Sahlberg  2007
   Copyright (C) Andrew Tridgell  2007
   Copyright (C) Martin Schwenke  2011

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/

#include "replace.h"
#include "system/network.h"

#include "lib/util/debug.h"
#include "common/logging.h"

#include "protocol/protocol_api.h"

#include "server/ipalloc_private.h"

/*
 * This is the length of the longest common prefix between the IPs.
 * It is calculated by XOR-ing the 2 IPs together and counting the
 * number of leading zeroes.  The implementation means that all
 * addresses end up being 128 bits long.
 *
 * FIXME? Should we consider IPv4 and IPv6 separately given that the
 * 12 bytes of 0 prefix padding will hurt the algorithm if there are
 * lots of nodes and IP addresses?
 */
static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
{
	uint32_t ip1_k[IP_KEYLEN];
	uint32_t *t;
	int i;
	uint32_t x;

	uint32_t distance = 0;

	/* The first key is copied into a local array before the
	 * second key is requested.
	 * NOTE(review): presumably ip_key() hands back storage that
	 * a later call overwrites -- confirm against its definition.
	 */
	memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
	t = ip_key(ip2);
	/* NOTE(review): source text appears to have been lost from
	 * this point on.  The rest of this loop header, the remainder
	 * of ip_distance() and the opening of the next function (the
	 * one called as ip_distance_2_sum() elsewhere in this file)
	 * are not present: the code below uses d, sum, ip and pnn
	 * without any visible declaration.  Restore the missing text
	 * from the upstream source before relying on this section.
	 */
	for (i=0; inext) {
		/* Only entries assigned to the node of interest count. */
		if (t->pnn != pnn) {
			continue;
		}

		/* Optimisation: We never calculate the distance
		 * between an address and itself.  This allows us to
		 * calculate the effect of removing an address from a
		 * node by simply calculating the distance between
		 * that address and all of the existing addresses.
		 * Moreover, we assume that we're only ever dealing
		 * with addresses from all_ips so we can identify an
		 * address via a pointer rather than doing a more
		 * expensive address comparison.
		 */
		if (&(t->addr) == ip) {
			continue;
		}

		d = ip_distance(ip, &(t->addr));
		sum += d * d;  /* Cheaper than pulling in math.h :-) */
	}

	return sum;
}

/* Return the LCP2 imbalance metric for addresses currently assigned
 * to the given node.
 *
 * all_ips: head of the list of public IPs to examine
 * pnn:     node whose assigned addresses are measured
 *
 * The result is the sum, over every list entry assigned to pnn, of
 * ip_distance_2_sum() computed against the entries that follow it in
 * the list.
 */
static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
{
	struct public_ip_list *t;

	uint32_t imbalance = 0;

	for (t = all_ips; t != NULL; t = t->next) {
		if (t->pnn != pnn) {
			continue;
		}
		/* Pass the rest of the IPs rather than the whole
		 * all_ips input list, so only later entries are
		 * compared against this one.
		 */
		imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
	}

	return imbalance;
}

/* Allocate and populate the per-node working arrays used by the LCP2
 * algorithm.
 *
 * ipalloc_state:        allocation state; also used as the talloc
 *                       context for both arrays
 * lcp2_imbalances:      out: array of ipalloc_state->num uint32_t
 * rebalance_candidates: out: array of ipalloc_state->num bool, true
 *                       for nodes that IPs may be failed back onto
 *
 * Returns false if either allocation fails, true otherwise.
 */
static bool lcp2_init(struct ipalloc_state *ipalloc_state,
		      uint32_t **lcp2_imbalances,
		      bool **rebalance_candidates)
{
	int i, numnodes;
	struct public_ip_list *t;

	numnodes = ipalloc_state->num;

	*rebalance_candidates = talloc_array(ipalloc_state, bool, numnodes);
	if (*rebalance_candidates == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
		return false;
	}
	*lcp2_imbalances = talloc_array(ipalloc_state, uint32_t, numnodes);
	if (*lcp2_imbalances == NULL) {
		/* The candidates array allocated above is not freed
		 * here; it remains attached to ipalloc_state.
		 */
		DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
		return false;
	}

	/* NOTE(review): source text appears to have been lost inside
	 * this loop header.  Presumably the body also stored the
	 * result of lcp2_imbalance() in (*lcp2_imbalances)[i] --
	 * confirm against the upstream source.
	 */
	for (i=0; iall_ips, i);
		/* First step: assume all nodes are candidates */
		(*rebalance_candidates)[i] = true;
	}

	/* 2nd step: if a node has IPs assigned then it must have been
	 * healthy before, so we remove it from consideration.  This
	 * is overkill but is all we have because we don't maintain
	 * state between takeover runs.  An alternative would be to
	 * keep state and invalidate it every time the recovery master
	 * changes.
	 */
	for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
		if (t->pnn != -1) {
			/* NOTE(review): t->pnn is used as an index
			 * without a range check against numnodes.
			 */
			(*rebalance_candidates)[t->pnn] = false;
		}
	}

	/* 3rd step: if a node is forced to re-balance then
	   we allow failback onto the node */
	if (ipalloc_state->force_rebalance_nodes == NULL) {
		return true;
	}
	for (i = 0;
	     i < talloc_array_length(ipalloc_state->force_rebalance_nodes);
	     i++) {
		uint32_t pnn = ipalloc_state->force_rebalance_nodes[i];

		/* Ignore entries that do not name a known node.
		 * NOTE(review): unlike the messages above, the format
		 * string below has no space after __location__.
		 */
		if (pnn >= numnodes) {
			DEBUG(DEBUG_ERR,
			      (__location__ "unknown node %u\n", pnn));
			continue;
		}

		DEBUG(DEBUG_NOTICE,
		      ("Forcing rebalancing of IPs to node %u\n", pnn));
		(*rebalance_candidates)[pnn] = true;
	}

	return true;
}

/* Allocate any unassigned addresses using the LCP2 algorithm to find
 * the IP/node combination that will cost the least.
 *
 * ipalloc_state:   allocation state holding the IP list (all_ips)
 *                  and the node count (num)
 * lcp2_imbalances: per-node imbalance array; the entry of each node
 *                  that receives an address is updated
 *
 * Each pass assigns at most one address (pnn == -1 marks an
 * unassigned one).  Passes stop when nothing is unassigned or when a
 * pass finds no node able to take any remaining address; a warning
 * is logged for every address still unassigned at that point.
 */
static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state,
				     uint32_t *lcp2_imbalances)
{
	struct public_ip_list *t;
	int dstnode, numnodes;

	int minnode;
	uint32_t mindsum, dstdsum, dstimbl;
	uint32_t minimbl = 0;
	struct public_ip_list *minip;

	bool should_loop = true;
	bool have_unassigned = true;

	numnodes = ipalloc_state->num;

	while (have_unassigned && should_loop) {
		/* Set again below only if a candidate move is found. */
		should_loop = false;

		DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
		DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));

		minnode = -1;
		mindsum = 0;
		minip = NULL;

		/* loop over each unassigned ip. */
		for (t = ipalloc_state->all_ips; t != NULL ; t = t->next) {
			if (t->pnn != -1) {
				continue;
			}

			for (dstnode = 0; dstnode < numnodes; dstnode++) {
				/* only check nodes that can actually takeover this ip */
				if (!can_node_takeover_ip(ipalloc_state,
							  dstnode,
							  t)) {
					/* no it couldnt so skip to the next node */
					continue;
				}

				/* Cost of adding this address to dstnode
				 * and the imbalance dstnode would then have.
				 */
				dstdsum = ip_distance_2_sum(&(t->addr),
							    ipalloc_state->all_ips,
							    dstnode);
				dstimbl = lcp2_imbalances[dstnode] + dstdsum;
				DEBUG(DEBUG_DEBUG,
				      (" %s -> %d [+%d]\n",
				       ctdb_sock_addr_to_string(ipalloc_state,
								&(t->addr)),
				       dstnode,
				       dstimbl - lcp2_imbalances[dstnode]));

				/* Keep the first candidate seen, or any
				 * later one with a strictly smaller cost.
				 */
				if ((minnode == -1) || (dstdsum < mindsum)) {
					minnode = dstnode;
					minimbl = dstimbl;
					mindsum = dstdsum;
					minip = t;
					should_loop = true;
				}
			}
		}

		DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));

		/* If we found one then assign it to the given node. */
		if (minnode != -1) {
			minip->pnn = minnode;
			lcp2_imbalances[minnode] = minimbl;
			DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
					  ctdb_sock_addr_to_string(
						  ipalloc_state,
						  &(minip->addr)),
					  minnode,
					  mindsum));
		}

		/* There might be a better way but at least this is clear. */
		have_unassigned = false;
		for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
			if (t->pnn == -1) {
				have_unassigned = true;
			}
		}
	}

	/* We know if we have any unassigned addresses so we might as
	 * well optimise.
	 */
	if (have_unassigned) {
		for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
			if (t->pnn == -1) {
				DEBUG(DEBUG_WARNING,
				      ("Failed to find node to cover ip %s\n",
				       ctdb_sock_addr_to_string(ipalloc_state,
								&t->addr)));
			}
		}
	}
}

/* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
 * to move IPs from, determines the best IP/destination node
 * combination to move from the source node.
*/
/* ipalloc_state:        allocation state holding the IP list and node count
 * srcnode:              node that an address may be moved away from
 * lcp2_imbalances:      per-node imbalance array, updated when a move is made
 * rebalance_candidates: per-node flags; only flagged nodes may receive
 *                       an address
 *
 * Returns true if an address was moved (its pnn and both affected
 * imbalance entries are updated), false if no move qualified.
 */
static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state,
				    int srcnode,
				    uint32_t *lcp2_imbalances,
				    bool *rebalance_candidates)
{
	const int node_count = ipalloc_state->num;
	struct public_ip_list *best_ip = NULL;
	struct public_ip_list *cur;
	int best_dst = -1;
	uint32_t best_src_imbl = 0;
	uint32_t best_dst_imbl = 0;

	DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
	DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
			   srcnode, lcp2_imbalances[srcnode]));

	/* Search for the address/destination pair that best reduces
	 * the imbalance.
	 */
	for (cur = ipalloc_state->all_ips; cur != NULL; cur = cur->next) {
		uint32_t src_cost;
		uint32_t src_after;
		int dst;

		/* Addresses hosted elsewhere are of no interest. */
		if (cur->pnn != srcnode) {
			continue;
		}

		/* Cost of this address to the source node, and the
		 * source node's imbalance once it is gone.
		 */
		src_cost = ip_distance_2_sum(&(cur->addr),
					     ipalloc_state->all_ips,
					     srcnode);
		src_after = lcp2_imbalances[srcnode] - src_cost;

		/* Price the address on every eligible destination.
		 * Only newly healthy nodes are eligible, so that IPs
		 * are not failed over gratuitously for the sake of a
		 * minor balance improvement.
		 */
		for (dst = 0; dst < node_count; dst++) {
			uint32_t dst_cost;
			uint32_t dst_after;

			/* Skip nodes that are not rebalance targets
			 * or cannot host this address at all.
			 */
			if (!rebalance_candidates[dst] ||
			    !can_node_takeover_ip(ipalloc_state, dst, cur)) {
				continue;
			}

			dst_cost = ip_distance_2_sum(&(cur->addr),
						     ipalloc_state->all_ips,
						     dst);
			dst_after = lcp2_imbalances[dst] + dst_cost;
			DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
					   srcnode, -src_cost,
					   ctdb_sock_addr_to_string(
						   ipalloc_state,
						   &(cur->addr)),
					   dst, dst_cost));

			/* The destination must end up less imbalanced
			 * than the source currently is...
			 */
			if (dst_after >= lcp2_imbalances[srcnode]) {
				continue;
			}
			/* ...the address must cost less there than it
			 * does here...
			 */
			if (dst_cost >= src_cost) {
				continue;
			}
			/* ...and the move must beat the best one found
			 * so far, if there is one.
			 */
			if ((best_dst != -1) &&
			    ((src_after + dst_after) >=
			     (best_src_imbl + best_dst_imbl))) {
				continue;
			}

			best_ip = cur;
			best_src_imbl = src_after;
			best_dst = dst;
			best_dst_imbl = dst_after;
		}
	}
	DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));

	if (best_dst == -1) {
		return false;
	}

	/* A move that improves matters was found: log it, then apply it. */
	DEBUG(DEBUG_INFO,
	      ("%d [%d] -> %s -> %d [+%d]\n",
	       srcnode, best_src_imbl - lcp2_imbalances[srcnode],
	       ctdb_sock_addr_to_string(ipalloc_state, &(best_ip->addr)),
	       best_dst, best_dst_imbl - lcp2_imbalances[best_dst]));

	lcp2_imbalances[srcnode] = best_src_imbl;
	lcp2_imbalances[best_dst] = best_dst_imbl;
	best_ip->pnn = best_dst;

	return true;
}

/* A node number paired with its imbalance, so that nodes can be
 * sorted by imbalance without losing track of which is which.
 */
struct lcp2_imbalance_pnn {
	uint32_t imbalance;
	int pnn;
};

/* qsort() comparison callback for struct lcp2_imbalance_pnn: orders
 * entries by descending imbalance.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
	const struct lcp2_imbalance_pnn *lhs = a;
	const struct lcp2_imbalance_pnn *rhs = b;

	if (lhs->imbalance == rhs->imbalance) {
		return 0;
	}

	return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
}

/* LCP2 algorithm for rebalancing the cluster.  This finds the source
 * node with the highest LCP2 imbalance, and then determines the best
 * IP/destination node combination to move from the source node.
*/ static void lcp2_failback(struct ipalloc_state *ipalloc_state, uint32_t *lcp2_imbalances, bool *rebalance_candidates) { int i, numnodes; struct lcp2_imbalance_pnn * lips; bool again; numnodes = ipalloc_state->num; try_again: /* Put the imbalances and nodes into an array, sort them and * iterate through candidates. Usually the 1st one will be * used, so this doesn't cost much... */ DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n")); DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n")); lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes); for (i = 0; i < numnodes; i++) { lips[i].imbalance = lcp2_imbalances[i]; lips[i].pnn = i; DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i])); } qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn), lcp2_cmp_imbalance_pnn); again = false; for (i = 0; i < numnodes; i++) { /* This means that all nodes had 0 or 1 addresses, so * can't be imbalanced. */ if (lips[i].imbalance == 0) { break; } if (lcp2_failback_candidate(ipalloc_state, lips[i].pnn, lcp2_imbalances, rebalance_candidates)) { again = true; break; } } talloc_free(lips); if (again) { goto try_again; } } bool ipalloc_lcp2(struct ipalloc_state *ipalloc_state) { uint32_t *lcp2_imbalances; bool *rebalance_candidates; int numnodes, num_rebalance_candidates, i; bool ret = true; unassign_unsuitable_ips(ipalloc_state); if (!lcp2_init(ipalloc_state, &lcp2_imbalances, &rebalance_candidates)) { ret = false; goto finished; } lcp2_allocate_unassigned(ipalloc_state, lcp2_imbalances); /* If we don't want IPs to fail back then don't rebalance IPs. */ if (ipalloc_state->no_ip_failback) { goto finished; } /* It is only worth continuing if we have suitable target * nodes to transfer IPs to. This check is much cheaper than * continuing on... */ numnodes = ipalloc_state->num; num_rebalance_candidates = 0; for (i=0; i