From 7705d669a7dedff4a3cbcb9d2105b9f951fd8e18 Mon Sep 17 00:00:00 2001 From: Pavel Czerny Date: Fri, 15 May 2020 16:15:52 +0200 Subject: [PATCH 1/2] F #3859: Monitord HA --- include/InformationManager.h | 6 ++ include/RaftManager.h | 35 ++++++++ share/etc/oned.conf | 4 +- share/hooks/raft/vip.sh | 87 +++++++++++++------ src/cli/one_helper/onezone_helper.rb | 2 + src/im/InformationManager.cc | 67 ++++++++------ .../remotes/common.d/monitord-client.rb | 3 +- src/monitor/include/HostMonitorManager.h | 7 ++ src/monitor/include/OneMonitorDriver.h | 5 ++ src/monitor/include/OpenNebulaMessages.h | 5 +- src/monitor/src/monitor/HostMonitorManager.cc | 26 ++++++ src/monitor/src/monitor/OneMonitorDriver.cc | 11 +++ src/monitor/src/protocol/Message.cc | 1 + src/raft/RaftManager.cc | 7 ++ 14 files changed, 208 insertions(+), 58 deletions(-) diff --git a/include/InformationManager.h b/include/InformationManager.h index f04f13711c..051ae7a178 100644 --- a/include/InformationManager.h +++ b/include/InformationManager.h @@ -20,6 +20,7 @@ #include "DriverManager.h" #include "ActionManager.h" #include "OpenNebulaMessages.h" +#include "RaftManager.h" class HostPool; class Host; @@ -94,6 +95,11 @@ public: */ void delete_host(int hid); + /** + * Set raft status, send info to monitor daemon + */ + void raft_status(RaftManager::State raft); + protected: /** * Received undefined message -> print error diff --git a/include/RaftManager.h b/include/RaftManager.h index 8fa746543e..ce722cb0f5 100644 --- a/include/RaftManager.h +++ b/include/RaftManager.h @@ -134,6 +134,41 @@ public: */ void follower(unsigned int term); + static std::string state_to_str(State _state) + { + string st; + + switch (_state) + { + case SOLO: + st = "SOLO"; + break; + case CANDIDATE: + st = "CANDIDATE"; + break; + case FOLLOWER: + st = "FOLLOWER"; + break; + case LEADER: + st = "LEADER"; + break; + } + return st; + } + + State get_state() + { + State _state; + + pthread_mutex_lock(&mutex); + + _state = state; + + pthread_mutex_unlock(&mutex); + + return _state; + } + unsigned int get_term() { unsigned int _term; diff --git a/share/etc/oned.conf b/share/etc/oned.conf index 81a0baa83e..9ab09a6bb0 100644 --- a/share/etc/oned.conf +++ b/share/etc/oned.conf @@ -185,13 +185,13 @@ RAFT = [ # Executed when a server transits from follower->leader # RAFT_LEADER_HOOK = [ # COMMAND = "raft/vip.sh", -# ARGUMENTS = "leader " +# ARGUMENTS = "leader interface ip_cidr [interface ip_cidr ...]" # ] # Executed when a server transits from leader->follower # RAFT_FOLLOWER_HOOK = [ # COMMAND = "raft/vip.sh", -# ARGUMENTS = "follower " +# ARGUMENTS = "follower interface ip_cidr [interface ip_cidr ...]" # ] #******************************************************************************* diff --git a/share/hooks/raft/vip.sh b/share/hooks/raft/vip.sh index cb73724272..ff67669c83 100755 --- a/share/hooks/raft/vip.sh +++ b/share/hooks/raft/vip.sh @@ -1,9 +1,11 @@ -#!/bin/bash -e +#!/bin/bash -ACTION="$1" -INTERFACE="$2" -IFADDR="$3" -IP="${IFADDR%%/*}" +# Setup virtual IP +# usage: +# vip.sh action interface ip [interface ip ...] +# Where action is one of: +# leader - New raft leader, set virtual IPs +# follower - unset virtual IPs # # functions @@ -37,21 +39,65 @@ is_systemd_unit_startable() return 1 } +# (Un)set the virtual IP +function virtualip() { + INTERFACE="$1" + IFADDR="$2" + IP="${IFADDR%%/*}" + + if [ -z "$INTERFACE" ]; then + echo "Missing interface." >&2 + exit 1 + fi + + if [ -z "$IFADDR" ]; then + echo "Missing IP." >&2 + exit 1 + fi + + ### + + case $ACTION in + leader) + sudo -n ip address add $IFADDR dev $INTERFACE + + for i in $(seq 5); do + sudo -n arping -c 1 -U -I $INTERFACE ${IP} + sleep 1 + sudo -n arping -c 1 -A -I $INTERFACE ${IP} + sleep 1 + done + ;; + + follower) + if ip address show dev $INTERFACE | grep -qi " ${IP}/"; then + sudo -n ip address del $IFADDR dev $INTERFACE + fi + ;; + + *) + echo "Unknown action '$ACTION'" >&2 + exit 1 + ;; + esac +} + # # main # -if [ -z "$INTERFACE" ]; then - echo "Missing interface." >&2 - exit 1 -fi +ACTION="$1" +shift -if [ -z "$IFADDR" ]; then - echo "Missing IP." >&2 - exit 1 -fi +# Process all parameters in the form of interface:IP +while [[ $# -gt 0 ]] +do + virtualip $1 $2 + shift + shift +done -### +# Start or stop OpenNebula services if which systemctl &>/dev/null && [ -d /etc/systemd ]; then IS_SYSTEMD=yes @@ -61,15 +107,6 @@ fi case $ACTION in leader) - sudo -n ip address add $IFADDR dev $INTERFACE - - for i in $(seq 5); do - sudo -n arping -c 1 -U -I $INTERFACE ${IP} - sleep 1 - sudo -n arping -c 1 -A -I $INTERFACE ${IP} - sleep 1 - done - if [ "${IS_SYSTEMD}" = 'yes' ]; then if systemctl is-enabled opennebula-flow >/dev/null 2>&1; then sudo -n systemctl start opennebula-flow @@ -107,10 +144,6 @@ leader) ;; follower) - if ip address show dev $INTERFACE | grep -qi " ${IP}/"; then - sudo -n ip address del $IFADDR dev $INTERFACE - fi - if [ "${IS_SYSTEMD}" = 'yes' ]; then if systemctl is-enabled opennebula-flow >/dev/null 2>&1 || systemctl is-active opennebula-flow >/dev/null 2>&1; diff --git a/src/cli/one_helper/onezone_helper.rb b/src/cli/one_helper/onezone_helper.rb index 2fde82e52a..68a4229e1f 100644 --- a/src/cli/one_helper/onezone_helper.rb +++ b/src/cli/one_helper/onezone_helper.rb @@ -36,6 +36,8 @@ class Replicator :service => 'opennebula' }, { :name => 'ec2_driver.default', :service => 'opennebula' }, + { :name => 'monitord.conf', + :service => 'opennebula' }, { :name => 'econe.conf', :service => 'opennebula-econe' }, { :name => 'oneflow-server.conf', diff --git a/src/im/InformationManager.cc b/src/im/InformationManager.cc index b0fa230c95..ff68cedc7f 100644 --- a/src/im/InformationManager.cc +++ b/src/im/InformationManager.cc @@ -60,29 +60,10 @@ int InformationManager::start() NebulaLog::info("InM", "Information Manager stopped."); }); - // Send the list of hosts to the driver + auto rftm = Nebula::instance().get_raftm(); + raft_status(rftm->get_state()); - auto * imd = get_driver("monitord"); - - if (!imd) - { - NebulaLog::error("InM", "Could not find information driver 'monitor'"); - - return rc; - } - - string xml_hosts; - - hpool->dump(xml_hosts, "", 0, -1, false); - - Message msg; - - msg.type(OpenNebulaMessages::HOST_LIST); - msg.payload(xml_hosts); - - imd->write(msg); - - return rc; + return 0; } /* -------------------------------------------------------------------------- */ @@ -94,7 +75,7 @@ void InformationManager::stop_monitor(int hid, const string& name, const string& if (!imd) { - NebulaLog::error("InM", "Could not find information driver 'monitor'"); + NebulaLog::error("InM", "Could not find information driver 'monitord'"); return; } @@ -127,7 +108,7 @@ int InformationManager::start_monitor(Host * host, bool update_remotes) if (!imd) { - host->error("Cannot find driver: 'monitor'"); + host->error("Cannot find driver: 'monitord'"); return -1; } @@ -187,6 +168,43 @@ void InformationManager::delete_host(int hid) /* -------------------------------------------------------------------------- */ /* -------------------------------------------------------------------------- */ +void InformationManager::raft_status(RaftManager::State state) +{ + auto imd = get_driver("monitord"); + + if (!imd) + { + NebulaLog::error("InM", "Could not find information driver 'monitord'"); + + return; + } + + if (state == RaftManager::SOLO || state == RaftManager::LEADER) + { + // Send host pool to Monitor Daemon + string xml_hosts; + + hpool->dump(xml_hosts, "", 0, -1, false); + + Message msg; + + msg.type(OpenNebulaMessages::HOST_LIST); + msg.payload(xml_hosts); + + imd->write(msg); + } + + Message msg; + + msg.type(OpenNebulaMessages::RAFT_STATUS); + msg.payload(RaftManager::state_to_str(state)); + + imd->write(msg); +} + +/* -------------------------------------------------------------------------- */ +/* -------------------------------------------------------------------------- */ + void InformationManager::_undefined(unique_ptr> msg) { NebulaLog::warn("InM", "Received undefined message: " + msg->payload() + @@ -555,4 +573,3 @@ void InformationManager::_vm_state(unique_ptr> msg) /* -------------------------------------------------------------------------- */ /* -------------------------------------------------------------------------- */ - diff --git a/src/im_mad/remotes/common.d/monitord-client.rb b/src/im_mad/remotes/common.d/monitord-client.rb index b697cf0386..4438305657 100644 --- a/src/im_mad/remotes/common.d/monitord-client.rb +++ b/src/im_mad/remotes/common.d/monitord-client.rb @@ -44,7 +44,7 @@ class MonitorClient MESSAGE_TYPES.each do |mt| define_method("#{mt}_udp".downcase.to_sym) do |rc, payload| msg = "#{mt} #{MESSAGE_STATUS[rc]} #{@hostid} #{pack(payload)}" - @socket_udp.send(msg, 0) + @socket_udp.send(msg, 0, @host, @port) end end @@ -75,7 +75,6 @@ class MonitorClient @port = addr[1] @socket_udp = UDPSocket.new(@family) - @socket_udp.connect(@host, @port) @pubkey = @opts[:pubkey] diff --git a/src/monitor/include/HostMonitorManager.h b/src/monitor/include/HostMonitorManager.h index c5a86d78ae..d289022f31 100644 --- a/src/monitor/include/HostMonitorManager.h +++ b/src/monitor/include/HostMonitorManager.h @@ -81,6 +81,12 @@ public: */ void stop_host_monitor(int oid); + /** + * Raft status changed + * @param state SOLO, CANDIDATE, FOLLOWER, LEADER + */ + void raft_status(const string& state); + /** * Updates the information of the given host. If it does not exist it is * added to the pool @@ -158,6 +164,7 @@ private: */ int monitor_interval_host; + bool is_leader; /** * Time in seconds to expire a monitoring action (5 minutes) */ diff --git a/src/monitor/include/OneMonitorDriver.h b/src/monitor/include/OneMonitorDriver.h index 8437f80ebc..da3ce9baa9 100644 --- a/src/monitor/include/OneMonitorDriver.h +++ b/src/monitor/include/OneMonitorDriver.h @@ -85,6 +85,11 @@ private: */ static void _stop_monitor(message_t msg); + /** + * Raft status changed + */ + static void _raft_status(message_t msg); + private: static HostMonitorManager * hm; }; diff --git a/src/monitor/include/OpenNebulaMessages.h b/src/monitor/include/OpenNebulaMessages.h index 7fb4aeb247..2b638c8dbd 100644 --- a/src/monitor/include/OpenNebulaMessages.h +++ b/src/monitor/include/OpenNebulaMessages.h @@ -30,11 +30,12 @@ enum class OpenNebulaMessages : unsigned short int HOST_LIST, UPDATE_HOST, DEL_HOST, - START_MONITOR, // not used - STOP_MONITOR, // not used + START_MONITOR, + STOP_MONITOR, HOST_STATE, VM_STATE, HOST_SYSTEM, + RAFT_STATUS, ENUM_MAX }; diff --git a/src/monitor/src/monitor/HostMonitorManager.cc b/src/monitor/src/monitor/HostMonitorManager.cc index 4785e19a74..cc993fe3ee 100644 --- a/src/monitor/src/monitor/HostMonitorManager.cc +++ b/src/monitor/src/monitor/HostMonitorManager.cc @@ -50,6 +50,7 @@ HostMonitorManager::HostMonitorManager( , threads(_threads) , timer_period(timer_period) , monitor_interval_host(monitor_interval_host) + , is_leader(false) { oned_driver = new OneMonitorDriver(this); udp_driver = new UDPMonitorDriver(addr, port); @@ -220,6 +221,16 @@ void HostMonitorManager::stop_host_monitor(int oid) /* -------------------------------------------------------------------------- */ /* -------------------------------------------------------------------------- */ +void HostMonitorManager::raft_status(const string& state) +{ + NebulaLog::info("HMM", "Raft status: " + state); + + is_leader = state == "LEADER" || state == "SOLO"; +} + +/* -------------------------------------------------------------------------- */ +/* -------------------------------------------------------------------------- */ + void HostMonitorManager::monitor_host(int oid, bool result, const Template &tmpl) { auto host = hpool->get(oid); @@ -403,6 +414,11 @@ void HostMonitorManager::timer_action() hpool->clean_expired_monitoring(); vmpool->clean_expired_monitoring(); + if (!is_leader) + { + return; + } + set discovered_hosts; time_t now = time(nullptr); time_t target_time = now - monitor_interval_host; @@ -458,6 +474,11 @@ void HostMonitorManager::timer_action() void HostMonitorManager::start_host_monitor(const HostRPCPool::HostBaseLock& host) { + if (!is_leader) + { + return; + } + auto driver = driver_manager->get_driver(host->im_mad()); if (!driver) @@ -482,6 +503,11 @@ void HostMonitorManager::start_host_monitor(const HostRPCPool::HostBaseLock& hos void HostMonitorManager::stop_host_monitor(const HostRPCPool::HostBaseLock& host) { + if (!is_leader) + { + return; + } + auto driver = driver_manager->get_driver(host->im_mad()); if (!driver) diff --git a/src/monitor/src/monitor/OneMonitorDriver.cc b/src/monitor/src/monitor/OneMonitorDriver.cc index 586420b76c..6ead71a72d 100644 --- a/src/monitor/src/monitor/OneMonitorDriver.cc +++ b/src/monitor/src/monitor/OneMonitorDriver.cc @@ -42,6 +42,9 @@ OneMonitorDriver::OneMonitorDriver(HostMonitorManager * _hm) register_action(OpenNebulaMessages::STOP_MONITOR, &OneMonitorDriver::_stop_monitor); + + register_action(OpenNebulaMessages::RAFT_STATUS, + &OneMonitorDriver::_raft_status); } /* -------------------------------------------------------------------------- */ @@ -153,3 +156,11 @@ void OneMonitorDriver::_stop_monitor(message_t msg) { hm->stop_host_monitor(msg->oid()); } + +/* -------------------------------------------------------------------------- */ +/* -------------------------------------------------------------------------- */ + +void OneMonitorDriver::_raft_status(message_t msg) +{ + hm->raft_status(msg->payload()); +} diff --git a/src/monitor/src/protocol/Message.cc b/src/monitor/src/protocol/Message.cc index 51df3985d1..3c85f43f92 100644 --- a/src/monitor/src/protocol/Message.cc +++ b/src/monitor/src/protocol/Message.cc @@ -53,6 +53,7 @@ const EString Message::_type_str({ {"HOST_STATE", OpenNebulaMessages::HOST_STATE}, {"VM_STATE", OpenNebulaMessages::VM_STATE}, {"HOST_SYSTEM", OpenNebulaMessages::HOST_SYSTEM}, + {"RAFT_STATUS", OpenNebulaMessages::RAFT_STATUS}, }); /* ************************************************************************** */ diff --git a/src/raft/RaftManager.cc b/src/raft/RaftManager.cc index 797a2e422f..64a4341ad3 100644 --- a/src/raft/RaftManager.cc +++ b/src/raft/RaftManager.cc @@ -22,6 +22,7 @@ #include "LogDB.h" #include "AclManager.h" #include "Nebula.h" +#include "InformationManager.h" #include @@ -475,6 +476,9 @@ void RaftManager::leader() aclm->reload_rules(); + auto im = nd.get_im(); + im->raft_status(state); + if ( nd.is_federation_master() ) { frm->start_replica_threads(); @@ -542,6 +546,9 @@ void RaftManager::follower(unsigned int _term) commit = lapplied; leader_id = -1; + auto im = nd.get_im(); + im->raft_status(state); + NebulaLog::log("RCM", Log::INFO, "oned is set to follower mode"); next.clear(); From 7db3994f0e94849d4074cce9f14eae62e16e14f8 Mon Sep 17 00:00:00 2001 From: "Ruben S. Montero" Date: Mon, 25 May 2020 15:02:47 +0200 Subject: [PATCH 2/2] More leader check --- src/monitor/src/monitor/HostMonitorManager.cc | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/monitor/src/monitor/HostMonitorManager.cc b/src/monitor/src/monitor/HostMonitorManager.cc index cc993fe3ee..0a199de8ee 100644 --- a/src/monitor/src/monitor/HostMonitorManager.cc +++ b/src/monitor/src/monitor/HostMonitorManager.cc @@ -233,6 +233,11 @@ void HostMonitorManager::raft_status(const string& state) void HostMonitorManager::monitor_host(int oid, bool result, const Template &tmpl) { + if (!is_leader) + { + return; + } + auto host = hpool->get(oid); if (!host.valid()) @@ -297,6 +302,11 @@ void HostMonitorManager::monitor_host(int oid, bool result, const Template &tmpl void HostMonitorManager::update_last_monitor(int oid) { + if (!is_leader) + { + return; + } + auto host = hpool->get(oid); if (!host.valid()) @@ -321,6 +331,11 @@ void HostMonitorManager::monitor_vm(int oid, const string& uuid, const Template &tmpl) { + if (!is_leader) + { + return; + } + if (oid < 0) { // Wild VM, check if it is imported to OpenNebula @@ -357,6 +372,11 @@ void HostMonitorManager::monitor_vm(int oid, void HostMonitorManager::start_monitor_failure(int oid) { + if (!is_leader) + { + return; + } + NebulaLog::error("HMM", "Unable to monitor host id: " + to_string(oid)); auto host = hpool->get(oid); @@ -375,6 +395,11 @@ void HostMonitorManager::start_monitor_failure(int oid) void HostMonitorManager::start_monitor_success(int oid) { + if (!is_leader) + { + return; + } + NebulaLog::debug("HMM", "Start monitor success, host: " + to_string(oid)); auto host = hpool->get(oid);