mirror of
git://git.proxmox.com/git/qemu-server.git
synced 2025-01-25 06:03:52 +03:00
vm_resume: correctly honor $nocheck
for both vm_mon_cmd calls.

under certain circumstances, the following sequence of events can otherwise fail when live-migrating under load:

S...source node
T...target node

0: migration is complete, handover from S to T starts
1: S: logically move VM config file from S to T via rename()
2: S: rename returns, config file is (visibly) moved on S
3: S: trigger resume on T via mtunnel
4a: T: call vm_resume while config file move is not yet visible on T
4b: T: call vm_resume while config file move is already visible on T

4a instead of 4b means vm_mon_cmd will die in check_running unless vm_mon_cmd_nocheck is used.

under heavy pmxcfs load and a slow cluster/corosync network, there can be a few seconds of delay between 1 and 2, with a subsequent race ending in 4a instead of 4b.

this issue was reported to occur on bulk migrations.

Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
This commit is contained in:
parent
e6afd9e13b
commit
3e24733bdf
@ -5766,8 +5766,8 @@ sub vm_resume {
|
||||
my ($vmid, $skiplock, $nocheck) = @_;
|
||||
|
||||
PVE::QemuConfig->lock_config($vmid, sub {
|
||||
|
||||
my $res = vm_mon_cmd($vmid, 'query-status');
|
||||
my $vm_mon_cmd = $nocheck ? \&vm_mon_cmd_nocheck : \&vm_mon_cmd;
|
||||
my $res = $vm_mon_cmd->($vmid, 'query-status');
|
||||
my $resume_cmd = 'cont';
|
||||
|
||||
if ($res->{status} && $res->{status} eq 'suspended') {
|
||||
@ -5780,12 +5780,9 @@ sub vm_resume {
|
||||
|
||||
PVE::QemuConfig->check_lock($conf)
|
||||
if !($skiplock || PVE::QemuConfig->has_lock($conf, 'backup'));
|
||||
|
||||
vm_mon_cmd($vmid, $resume_cmd);
|
||||
|
||||
} else {
|
||||
vm_mon_cmd_nocheck($vmid, $resume_cmd);
|
||||
}
|
||||
|
||||
$vm_mon_cmd->($vmid, $resume_cmd);
|
||||
});
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user