From f5d6f11730ab79aec01631e16d916d07d8546af3 Mon Sep 17 00:00:00 2001 From: "Ruben S. Montero" Date: Mon, 18 Sep 2023 15:41:22 +0200 Subject: [PATCH] F #6030: Cancel backup in pre-backup phase * New Cancel module in the backup_qcow2.rb script. This module is a helper to cancel ongoing backup operations in the prebackup phase. * Checks libvirt for active backup domain operations to stop them. Remove @ongoing flag * Trap SIGTERM in backup_qcow2, prebackup and prebackup_live scripts * Cleaner backup func in one_vmm_exec.rb (reuse post array) co-authored-by: Michal Opala --- src/datastore_mad/remotes/rsync/backup_cancel | 7 +- src/tm_mad/lib/backup_qcow2.rb | 104 ++++++++++++++++-- src/tm_mad/qcow2/prebackup | 36 ++++++ src/tm_mad/qcow2/prebackup_live | 36 ++++++ src/vmm_mad/exec/one_vmm_exec.rb | 47 ++++---- 5 files changed, 194 insertions(+), 36 deletions(-) diff --git a/src/datastore_mad/remotes/rsync/backup_cancel b/src/datastore_mad/remotes/rsync/backup_cancel index 0251070ba7..965927a3cb 100755 --- a/src/datastore_mad/remotes/rsync/backup_cancel +++ b/src/datastore_mad/remotes/rsync/backup_cancel @@ -64,12 +64,11 @@ vm_host = dir[0] vm_dir = Pathname.new(dir[1]).cleanpath.to_s begin - # Kill the backup script. + # Kill the pre/backup/_live scripts. script = <<~EOS set -x -e -o pipefail; shopt -qs failglob (ps --no-headers -o pid,cmd -C ruby \ - | awk '$0 !~ "prebackup(_live)? .*#{vm_uuid} " { print }' \ - | awk '$0 ~ "backup .*#{vm_uuid} " { print $1 } END { print "\\0" }' || :) \\ + | awk '$0 ~ "(pre)?backup(_live)? .*#{vm_uuid} " { print $1 } END { print "\\0" }' || :) \\ | (read -d '' PIDS [[ -n "$PIDS" ]] || exit 0 # empty [[ -z "${PIDS//[[:space:][:digit:]]/}" ]] || exit -1 # !integers @@ -78,7 +77,7 @@ begin rc = LocalCommand.run '/bin/bash -s', nil, script - raise StandardError, "Unable to stop rsync backup actions: #{rc.stderr}" \ + raise StandardError, "Unable to stop rsync backup actions: #{rc.stderr}" \ if rc.code != 0 # Kill the rsync process. 
This step is a failsafe in case the TERM signal handler diff --git a/src/tm_mad/lib/backup_qcow2.rb b/src/tm_mad/lib/backup_qcow2.rb index 2e1d5d5356..f78ef6ebed 100755 --- a/src/tm_mad/lib/backup_qcow2.rb +++ b/src/tm_mad/lib/backup_qcow2.rb @@ -37,6 +37,7 @@ require_relative 'kvm' # # BDRV_MAX_REQUEST is the limit for the sieze of qemu-io operations #------------------------------------------------------------------------------- +CMD_ARGV = [$PROGRAM_NAME] + ARGV LOG_FILE = nil QEMU_IO_OPEN = '-t none -i native -o driver=qcow2' IO_ASYNC = false @@ -100,6 +101,69 @@ module Command end +#--------------------------------------------------------------------------- +# Helper module to kill running processes +#--------------------------------------------------------------------------- +module Cancel + + extend Command + + def self.find_task(select = /#{$PROGRAM_NAME}/) + out = cmd('ps', '--no-headers -o pid,cmd -C ruby') + + pids = out.lines.each_with_object([]) do |line, acc| + line.strip! + next if line.empty? + + pid, command = line.split(' ', 2) + next unless command.match?(select) + + acc << pid.to_i + end - [Process.pid] + + raise StandardError, 'Too many tasks found, ambiguous result' if pids.size > 1 + + pids.first + end + + def self.find_subtasks(ppid, reject = / (blockcommit|snapshot-delete) /) + begin + out = cmd('ps', "--no-headers -o pid,cmd --ppid '#{ppid}'") + rescue StandardError + return [] + end + + out.lines.each_with_object([]) do |line, acc| + line.strip! + next if line.empty? + + pid, command = line.split(' ', 2) + next if command.match?(reject) + + acc << pid.to_i + end - [Process.pid] + end + + def self.running?(vxml) + ppid = find_task(/#{$PROGRAM_NAME}.*#{vxml}/) + !ppid.nil? + end + + def self.killall(vxml, signal = :TERM) + ppid = find_task(/#{$PROGRAM_NAME}.*#{vxml}/) + + raise StandardError, 'Parent task not running' if ppid.nil? 
+ + pids = find_subtasks(ppid) + + pids.each do |pid| + log("[KIL]: sending #{signal} to pid=#{pid}") + Process.kill(signal, pid) + end + end + +end + #------------------------------------------------------------------------------- # Setup an NBD server to pull changes, an optional map can be provided #------------------------------------------------------------------------------- @@ -339,7 +403,7 @@ class KVMDomain include TransferManager::KVM include Command - attr_reader :parent_id, :backup_id, :checkpoint + attr_reader :parent_id, :backup_id, :checkpoint, :tmp_dir, :bck_dir #--------------------------------------------------------------------------- # @param vm[REXML::Document] OpenNebula XML VM information @@ -383,7 +447,6 @@ class KVMDomain @socket = "#{opts[:vm_dir]}/backup.socket" # State variables for domain operations - @ongoing = false @frozen = nil end @@ -835,19 +898,25 @@ class KVMDomain opts[:checkpointxml] = check_path if checkpoint cmd("#{virsh} backup-begin", @dom, opts) - - @ongoing = true end #--------------------------------------------------------------------------- # Stop an ongoing Backup operation on the domain #--------------------------------------------------------------------------- def stop_backup - return unless @ongoing + out = cmd("#{virsh} domjobinfo", @dom, {}) + + # Parse domjobinfo's output. + job = out.lines.each_with_object({}) do |item, acc| + key, value = item.split(':', 2) + acc[key.strip] = value.strip unless value.nil? + end + + # Check if there is an ongoing backup operation. + return unless job.key?('Job type') && job['Job type'] != 'None' + return unless job['Operation'] == 'Backup' cmd("#{virsh} domjobabort", @dom, {}) - ensure - @ongoing = false end end @@ -890,6 +959,27 @@ begin exit(0) end + #--------------------------------------------------------------------------- + # Cancel logic. 
When SIGTERM is received it kills all subtasks and + # terminates current backup operation + #--------------------------------------------------------------------------- + pipe_r, pipe_w = IO.pipe + + Thread.new do + loop do + rs, _ws, _es = IO.select([pipe_r]) + break if rs[0] == pipe_r + end + + Cancel.killall(vxml) if Cancel.running?(vxml) + + exit(-1) + end + + Signal.trap(:TERM) do + pipe_w.write 'W' + end + #--------------------------------------------------------------------------- # Backup operation # - (live - full) Creates a snapshot to copy the disks via qemu-convert diff --git a/src/tm_mad/qcow2/prebackup b/src/tm_mad/qcow2/prebackup index a43892a0ac..2716e1b62c 100755 --- a/src/tm_mad/qcow2/prebackup +++ b/src/tm_mad/qcow2/prebackup @@ -75,6 +75,42 @@ bck_dir = "#{rdir}/backup" qcow2_util = '/var/tmp/one/tm/lib/backup_qcow2.rb' qcow2_cmd = "#{qcow2_util} -d \"#{disks}\" -x #{bck_dir}/vm.xml -p #{rdir}" +pipe_r, pipe_w = IO.pipe + +Thread.new do + loop do + rs, _ws, _es = IO.select([pipe_r]) + break if rs[0] == pipe_r + end + + script = <<~EOS + set -x -e -o pipefail; shopt -qs failglob + (ps --no-headers -o pid,cmd -C ruby \ + | awk '$0 ~ "#{qcow2_util} .* -p #{rdir}" { print $1 } END { print "\\0" }' || :) \\ + | (read -d '' PIDS + [[ -n "$PIDS" ]] || exit 0 # empty + [[ -z "${PIDS//[[:space:][:digit:]]/}" ]] || exit -1 # !integers + kill -s TERM $PIDS) + EOS + + TransferManager::Action.ssh 'prebackup_cancel', + :host => rhost, + :cmds => script, + :nostdout => true, + :nostderr => false + + STDERR.puts "Prebackup cancelled: #{bck_dir}" + STDERR.flush + + # Suppress "`read': stream closed in another thread (IOError)". 
+ STDOUT.reopen IO::NULL + STDERR.reopen IO::NULL +end + +Signal.trap(:TERM) do + pipe_w.write 'W' +end + ds = TransferManager::Datastore.new(:vm_xml => vm_xml) cmd = ds.cmd_confinement(qcow2_cmd, rdir) diff --git a/src/tm_mad/qcow2/prebackup_live b/src/tm_mad/qcow2/prebackup_live index 439a3e5b59..3a1066c47e 100755 --- a/src/tm_mad/qcow2/prebackup_live +++ b/src/tm_mad/qcow2/prebackup_live @@ -81,6 +81,42 @@ bck_dir = "#{rdir}/backup" qcow2_util = '/var/tmp/one/tm/lib/backup_qcow2.rb' qcow2_cmd = "#{qcow2_util} -l -d \"#{disks}\" -x #{bck_dir}/vm.xml -p #{rdir}" +pipe_r, pipe_w = IO.pipe + +Thread.new do + loop do + rs, _ws, _es = IO.select([pipe_r]) + break if rs[0] == pipe_r + end + + script = <<~EOS + set -x -e -o pipefail; shopt -qs failglob + (ps --no-headers -o pid,cmd -C ruby \ + | awk '$0 ~ "#{qcow2_util} .* -p #{rdir}" { print $1 } END { print "\\0" }' || :) \\ + | (read -d '' PIDS + [[ -n "$PIDS" ]] || exit 0 # empty + [[ -z "${PIDS//[[:space:][:digit:]]/}" ]] || exit -1 # !integers + kill -s TERM $PIDS) + EOS + + TransferManager::Action.ssh 'prebackup_live_cancel', + :host => rhost, + :cmds => script, + :nostdout => true, + :nostderr => false + + STDERR.puts "Live prebackup cancelled: #{bck_dir}" + STDERR.flush + + # Suppress "`read': stream closed in another thread (IOError)". 
+ STDOUT.reopen IO::NULL + STDERR.reopen IO::NULL +end + +Signal.trap(:TERM) do + pipe_w.write 'W' +end + ds = TransferManager::Datastore.new(:vm_xml => vm_xml) cmd = ds.cmd_confinement(qcow2_cmd, rdir) diff --git a/src/vmm_mad/exec/one_vmm_exec.rb b/src/vmm_mad/exec/one_vmm_exec.rb index bc2083637e..ff1fc3ebd6 100755 --- a/src/vmm_mad/exec/one_vmm_exec.rb +++ b/src/vmm_mad/exec/one_vmm_exec.rb @@ -1335,31 +1335,7 @@ class ExecDriver < VirtualMachineDriver vm_xml = xml_data.elements['/VMM_DRIVER_ACTION_DATA/VM'] - # Backup operation steps - # TODO: failover steps - steps = [ - # Generate backup files for VM disks - { - :driver => :tm, - :action => pre_name, - :parameters => pre_tm, - :stdin => vm_xml - }, - # Upload backup files to repo - { - :driver => :ds, - :action => :backup, - :parameters => ds_command, - :stdin => xml_data.elements['DATASTORE'].to_s, - :fail_actions => [ - { - :driver => :tm, - :action => post_name, - :parameters => post_tm, - :stdin => vm_xml - } - ] - }, + cleanup_steps = [ # Cleanup backup and tmp files { :driver => :tm, @@ -1369,6 +1345,27 @@ class ExecDriver < VirtualMachineDriver } ] + # Backup operation steps + # TODO: failover steps + steps = [ + # Generate backup files for VM disks + { + :driver => :tm, + :action => pre_name, + :parameters => pre_tm, + :stdin => vm_xml, + :fail_actions => cleanup_steps + }, + # Upload backup files to repo + { + :driver => :ds, + :action => :backup, + :parameters => ds_command, + :stdin => xml_data.elements['DATASTORE'].to_s, + :fail_actions => cleanup_steps + } + ] + cleanup_steps + action.run(steps) end