1
0
mirror of https://github.com/OpenNebula/one.git synced 2024-12-25 23:21:29 +03:00
one/share/hooks/ft/host_error.rb

274 lines
7.5 KiB
Ruby
Raw Normal View History

2011-07-11 20:42:22 +04:00
#!/usr/bin/env ruby
2011-01-17 17:27:10 +03:00
# -------------------------------------------------------------------------- #
2017-05-25 17:07:35 +03:00
# Copyright 2002-2017, OpenNebula Project, OpenNebula Systems #
2011-01-17 17:27:10 +03:00
# #
# Licensed under the Apache License, Version 2.0 (the "License"); you may #
# not use this file except in compliance with the License. You may obtain #
# a copy of the License at #
# #
# http://www.apache.org/licenses/LICENSE-2.0 #
# #
# Unless required by applicable law or agreed to in writing, software #
# distributed under the License is distributed on an "AS IS" BASIS, #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
# See the License for the specific language governing permissions and #
# limitations under the License. #
#--------------------------------------------------------------------------- #
##############################################################################
# Script to implement host failure tolerance
#   One of the following modes must be chosen
#           -m resched VMs to another host. (Only for images in shared storage!)
#           -r recreate VMs running in the host. State will be lost.
#           -d delete VMs running in the host
#
#   Additional flags
#           -f resubmit suspended and powered off VMs (only for recreate)
#           -p <n> avoid resubmission if host comes back after n monitoring
#              cycles. 0 to disable it. Default is 2.
#           -u disables fencing. Fencing is enabled by default. Don't disable it
#              unless you are very sure about what you're doing
##############################################################################
2011-01-17 17:27:10 +03:00
# Installation layout: when ONE_LOCATION is set every path lives under that
# prefix (self-contained install); otherwise the system-wide paths are used.
ONE_LOCATION = ENV["ONE_LOCATION"]

if ONE_LOCATION
    RUBY_LIB_LOCATION = "#{ONE_LOCATION}/lib/ruby"
    VMDIR             = "#{ONE_LOCATION}/var"
    CONFIG_FILE       = "#{ONE_LOCATION}/var/config"
    LOG_FILE          = "#{ONE_LOCATION}/var/host_error.log"
else
    RUBY_LIB_LOCATION = "/usr/lib/one/ruby"
    VMDIR             = "/var/lib/one"
    CONFIG_FILE       = "/var/lib/one/config"
    LOG_FILE          = "/var/log/one/host_error.log"
end

# Fencing script, looked up in the same directory as this hook.
FENCE_HOST = "#{File.dirname(__FILE__)}/fence_host.sh"
2011-01-17 17:27:10 +03:00
$: << RUBY_LIB_LOCATION
2013-05-29 12:42:15 +04:00
require 'opennebula'
include OpenNebula
2012-10-19 15:17:27 +04:00
require 'getoptlong'
require 'base64'
require 'open3'
################################################################################
# Arguments
################################################################################
2011-01-17 17:27:10 +03:00
# ID of the failed host: the first command-line argument.
HOST_ID = ARGV[0]

# Nothing can be done without a host ID. Report on stderr (the log lines are
# tagged with HOST_ID, so the log file is not used here) and abort.
if HOST_ID.nil?
    warn "host_error: missing host ID argument"
    exit(-1)
end
################################################################################
# Methods
################################################################################
# Appends +msg+ to LOG_FILE, writing one log entry per line of the message.
# Each entry is prefixed with the current time, the host ID and +level+.
#
# msg   - message to log; converted with to_s, may span several lines.
#         An empty message writes nothing.
# level - severity tag placed in the entry ("I" by default).
def log(msg, level="I")
    File.open(LOG_FILE, 'a') do |f|
        # each_line replaces the deprecated block form of String#lines
        msg.to_s.each_line do |l|
            f.puts "[#{Time.now}][HOST #{HOST_ID}][#{level}] #{l}"
        end
    end
end
# Logs +msg+ through #log, tagged with the error level "E".
def log_error(msg)
    log(msg, 'E')
end
# Logs the reason for aborting and terminates the hook with exit status -1.
#
# msg - optional description of the failure, logged as an error before the
#       generic exit notice. Callers that already logged the cause can omit it.
def exit_error(msg = nil)
    log_error(msg) if msg
    log_error("Exiting due to previous error.")
    exit(-1)
end
# Builds an XPath condition that matches any of the given STATE values,
# e.g. states_xpath(3, 5) => "STATE=3 or STATE=5".
def states_xpath(*arr)
    conditions = arr.map { |state| "STATE=#{state}" }
    conditions.join(' or ')
end
################################################################################
# Options
################################################################################
mode    = nil   # **must** be set to something other than nil using the options
force   = false # Only affects recreate: also resubmit suspended and poweroff VMs
repeat  = 2     # By default, wait for 2 monitoring cycles
fencing = true  # Fence the host unless --no-fencing is given

opts = GetoptLong.new(
    ['--migrate',    '-m', GetoptLong::NO_ARGUMENT],
    ['--delete',     '-d', GetoptLong::NO_ARGUMENT],
    ['--recreate',   '-r', GetoptLong::NO_ARGUMENT],
    ['--force',      '-f', GetoptLong::NO_ARGUMENT],
    ['--pause',      '-p', GetoptLong::REQUIRED_ARGUMENT],
    ['--no-fencing', '-u', GetoptLong::NO_ARGUMENT]
)

begin
    opts.each do |opt, arg|
        case opt
        when '--migrate'
            mode = :migrate
        when '--delete'
            mode = :delete
        when '--recreate'
            mode = :recreate
        when '--force'
            force = true
        when '--pause'
            # Integer() raises on malformed input, whereas String#to_i would
            # silently yield 0 and thereby disable the wait altogether.
            repeat = Integer(arg, 10)
        when '--no-fencing'
            fencing = false
        end
    end
rescue StandardError => e
    # Covers GetoptLong parse errors and an invalid --pause value
    log_error e.to_s
    exit_error
end

if mode.nil?
    log_error "Exiting. A mode must be supplied."
    exit_error
end
################################################################################
# Main
################################################################################
log "Hook launched"

# Connect to OpenNebula
begin
    client = Client.new
rescue StandardError => e
    log_error e.to_s
    exit_error
end

# Read the monitoring interval from the OpenNebula configuration
sys  = OpenNebula::System.new(client)
conf = sys.get_configuration

# A failed call comes back as an error object; report its message instead of
# tripping over it when indexing below.
if OpenNebula.is_error?(conf)
    log_error "Could not get MONITORING_INTERVAL"
    log_error conf.message
    exit_error
end

begin
    # Fall back to 60 when the attribute is not present
    MONITORING_INTERVAL = conf['MONITORING_INTERVAL'] || 60
rescue StandardError => e
    log_error "Could not get MONITORING_INTERVAL"
    log_error e.to_s
    exit_error
end
# Load the failed host so its name and state can be inspected
host = OpenNebula::Host.new_with_id(HOST_ID, client)

rc = host.info
if OpenNebula.is_error?(rc)
    log_error "Could not get host info"
    exit_error
end

log "hostname: #{host.name}"

if repeat > 0
    log "Wait #{repeat} cycles."

    # Give the host the configured number of monitoring intervals to recover
    period = MONITORING_INTERVAL.to_i * repeat
    log "Sleeping #{period} seconds."
    sleep(period)

    # Refresh the host to see the state it is in after the wait
    rc = host.info
    if OpenNebula.is_error?(rc)
        log_error "Could not get host info"
        exit_error
    end

    # Any state other than 3 or 5 means the host came back: stop here to
    # avoid duplicated VMs.
    # NOTE(review): 3 and 5 are presumably the ERROR / MONITORING_ERROR host
    # states -- confirm against the OCA host state list.
    unless [3, 5].include?(host.state)
        log "Exiting. Host came back after waiting."
        exit 0
    end
end
# Do fencing: run FENCE_HOST with the Base64-encoded host XML as its only
# argument. A non-zero exit status aborts the hook.
if fencing
    host64 = Base64.strict_encode64(host.to_xml)

    log "Fencing enabled"

    begin
        # capture2e drains the merged stdout/stderr while the script runs and
        # closes the pipes afterwards. Waiting for the process before reading
        # (as with a bare popen2e) can block forever once the script fills
        # the pipe buffer.
        output, status = Open3.capture2e(FENCE_HOST, host64)

        raise "#{output}\nFencing error" unless status.success?

        log output
        log "Fencing success"
    rescue StandardError => e
        log_error e.to_s
        exit_error
    end
else
    log "WARNING: Fencing disabled"
end
2011-01-17 17:27:10 +03:00
# Fetch the pool of all VMs
vms = VirtualMachinePool.new(client)
rc  = vms.info_all

if OpenNebula.is_error?(rc)
    # exit_error is defined without parameters, so the reason is logged
    # separately before aborting.
    log_error "Could not get vm pool"
    exit_error
end
2011-01-17 17:27:10 +03:00
# STATE=3: ACTIVE (LCM unknown)
# STATE=5: SUSPENDED
# STATE=8: POWEROFF
2011-01-17 17:27:10 +03:00
# Recreate without --force only touches VMs in state 3; every other
# combination also includes states 5 and 8.
state =
    if force || mode != :recreate
        log "states: 3, 5, 8"
        states_xpath(3, 5, 8)
    else
        log "states: 3"
        states_xpath(3)
    end
2011-01-17 17:27:10 +03:00
# Select VMs in the wanted states whose *last* history record is on this host.
# A bare last() inside "and" is converted to a boolean and is always true, so
# the position has to be compared explicitly.
xpath = "/VM_POOL/VM[#{state}]/HISTORY_RECORDS/HISTORY[HOSTNAME=\"#{host.name}\" and position()=last()]"
vm_ids_array = vms.retrieve_elements("#{xpath}/../../ID")
# Apply the selected mode to every VM found on the failed host
if vm_ids_array
    log "vms: #{vm_ids_array}"

    vm_ids_array.each do |vm_id|
        vm = OpenNebula::VirtualMachine.new_with_id(vm_id, client)
        rc = vm.info

        if OpenNebula.is_error?(rc)
            log_error "Could not get info of VM #{vm_id}"
            next
        end

        rc = case mode
             when :recreate
                 log "recreate #{vm_id}"
                 vm.delete(true)
             when :delete
                 log "delete #{vm_id}"
                 vm.delete
             when :migrate
                 log "resched #{vm_id}"
                 vm.resched
             else
                 log_error "unknown mode '#{mode}'"
                 exit_error
             end

        # Record failed actions instead of dropping them silently; the
        # remaining VMs are still processed.
        if OpenNebula.is_error?(rc)
            log_error "Could not #{mode} VM #{vm_id}: #{rc.message}"
        end
    end
else
    log "No VMs found."
end

log "Hook finished"
exit 0