replication: improve stale volume detection, allow sync from parent snapshot
We pass a list of storages to scan for stale volumes to prepare_local_job(), so we make sure that we only activate/scan related storages. Snapshot rollback may remove local replication snapshots. In that case we still have the $conf->{parent} snapshot on both sides, so we can use that as the base snapshot.
This commit is contained in:
parent
4ce4ae7047
commit
91ee6a2fec
160
PVE/CLI/pvesr.pm
160
PVE/CLI/pvesr.pm
@ -25,6 +25,28 @@ sub setup_environment {
|
||||
PVE::RPCEnvironment->setup_default_cli_env();
|
||||
}
|
||||
|
||||
# fixme: get from plugin??
|
||||
my $replicatable_storage_types = {
|
||||
zfspool => 1,
|
||||
};
|
||||
|
||||
my $check_wanted_volid = sub {
|
||||
my ($storecfg, $vmid, $volid, $local_node) = @_;
|
||||
|
||||
my ($storeid, $volname) = PVE::Storage::parse_volume_id($volid);
|
||||
my $scfg = PVE::Storage::storage_check_enabled($storecfg, $storeid, $local_node);
|
||||
die "storage '$storeid' is not replicatable\n"
|
||||
if !$replicatable_storage_types->{$scfg->{type}};
|
||||
|
||||
my ($vtype, undef, $ownervm) = PVE::Storage::parse_volname($storecfg, $volid);
|
||||
die "volume '$volid' has wrong vtype ($vtype != 'images')\n"
|
||||
if $vtype ne 'images';
|
||||
die "volume '$volid' has wrong owner\n"
|
||||
if !$ownervm || $vmid != $ownervm;
|
||||
|
||||
return $storeid;
|
||||
};
|
||||
|
||||
__PACKAGE__->register_method ({
|
||||
name => 'prepare_local_job',
|
||||
path => 'prepare_local_job',
|
||||
@ -36,6 +58,11 @@ __PACKAGE__->register_method ({
|
||||
id => get_standard_option('pve-replication-id'),
|
||||
'extra-args' => get_standard_option('extra-args', {
|
||||
description => "The list of volume IDs to consider." }),
|
||||
scan => {
|
||||
description => "List of storage IDs to scan for stale volumes.",
|
||||
type => 'string', format => 'pve-storage-id-list',
|
||||
optional => 1,
|
||||
},
|
||||
force => {
|
||||
description => "Allow to remove all existion volumes (empty volume list).",
|
||||
type => 'boolean',
|
||||
@ -48,74 +75,87 @@ __PACKAGE__->register_method ({
|
||||
minimum => 0,
|
||||
optional => 1,
|
||||
},
|
||||
parent_snapname => get_standard_option('pve-snapshot-name', {
|
||||
optional => 1,
|
||||
}),
|
||||
},
|
||||
},
|
||||
returns => { type => 'null' },
|
||||
code => sub {
|
||||
my ($param) = @_;
|
||||
|
||||
my ($vmid, undef, $jobid) = PVE::ReplicationConfig::parse_replication_job_id($param->{id});
|
||||
my $last_sync = $param->{last_sync} // 0;
|
||||
|
||||
my $local_node = PVE::INotify::nodename();
|
||||
|
||||
my $vms = PVE::Cluster::get_vmlist();
|
||||
die "guest '$vmid' is on local node\n"
|
||||
if $vms->{ids}->{$vmid} && $vms->{ids}->{$vmid}->{node} eq $local_node;
|
||||
|
||||
my $storecfg = PVE::Storage::config();
|
||||
|
||||
my $dl = PVE::Storage::vdisk_list($storecfg, undef, $vmid);
|
||||
|
||||
my $volids = [];
|
||||
|
||||
die "no volumes specified\n"
|
||||
if !$param->{force} && !scalar(@{$param->{'extra-args'}});
|
||||
|
||||
foreach my $volid (@{$param->{'extra-args'}}) {
|
||||
|
||||
my ($storeid, $volname) = PVE::Storage::parse_volume_id($volid);
|
||||
my $scfg = PVE::Storage::storage_check_enabled($storecfg, $storeid, $local_node);
|
||||
die "storage '$storeid' is a shared storage\n" if $scfg->{shared};
|
||||
|
||||
my ($vtype, undef, $ownervm) = PVE::Storage::parse_volname($storecfg, $volid);
|
||||
die "volume '$volid' has wrong vtype ($vtype != 'images')\n"
|
||||
if $vtype ne 'images';
|
||||
die "volume '$volid' has wrong owner\n"
|
||||
if !$ownervm || $vmid != $ownervm;
|
||||
|
||||
my $found = 0;
|
||||
foreach my $info (@{$dl->{$storeid}}) {
|
||||
if ($info->{volid} eq $volid) {
|
||||
$found = 1;
|
||||
last;
|
||||
}
|
||||
}
|
||||
|
||||
push @$volids, $volid if $found;
|
||||
}
|
||||
|
||||
$volids = [ sort @$volids ];
|
||||
|
||||
my $logfunc = sub {
|
||||
my ($msg) = @_;
|
||||
print STDERR "$msg\n";
|
||||
};
|
||||
|
||||
# remove stale volumes
|
||||
foreach my $storeid (keys %$dl) {
|
||||
my $local_node = PVE::INotify::nodename();
|
||||
|
||||
die "no volumes specified\n"
|
||||
if !$param->{force} && !scalar(@{$param->{'extra-args'}});
|
||||
|
||||
my ($vmid, undef, $jobid) = PVE::ReplicationConfig::parse_replication_job_id($param->{id});
|
||||
|
||||
my $vms = PVE::Cluster::get_vmlist();
|
||||
die "guest '$vmid' is on local node\n"
|
||||
if $vms->{ids}->{$vmid} && $vms->{ids}->{$vmid}->{node} eq $local_node;
|
||||
|
||||
my $last_sync = $param->{last_sync} // 0;
|
||||
my $parent_snapname = $param->{parent_snapname};
|
||||
|
||||
my $storecfg = PVE::Storage::config();
|
||||
|
||||
# compute list of storages we want to scan
|
||||
my $storage_hash = {};
|
||||
foreach my $storeid (PVE::Tools::split_list($param->{scan})) {
|
||||
my $scfg = PVE::Storage::storage_check_enabled($storecfg, $storeid, $local_node, 1);
|
||||
next if !$scfg || $scfg->{shared};
|
||||
foreach my $info (@{$dl->{$storeid}}) {
|
||||
my $volid = $info->{volid};
|
||||
next if grep { $_ eq $volid } @$volids;
|
||||
$logfunc->("$jobid: delete stale volume '$volid'");
|
||||
PVE::Storage::vdisk_free($storecfg, $volid);
|
||||
}
|
||||
next if !$scfg; # simply ignore unavailable storages here
|
||||
die "storage '$storeid' is not replicatable\n" if !$replicatable_storage_types->{$scfg->{type}};
|
||||
$storage_hash->{$storeid} = 1;
|
||||
}
|
||||
|
||||
my $last_snapshots = PVE::Replication::prepare(
|
||||
$storecfg, $volids, $jobid, $last_sync, undef, $logfunc);
|
||||
my $wanted_volids = {};
|
||||
foreach my $volid (@{$param->{'extra-args'}}) {
|
||||
my $storeid = $check_wanted_volid->($storecfg, $vmid, $volid, $local_node);
|
||||
$wanted_volids->{$volid} = 1;
|
||||
$storage_hash->{$storeid} = 1;
|
||||
}
|
||||
my $storage_list = [ sort keys %$storage_hash ];
|
||||
|
||||
# activate all used storage
|
||||
my $cache = {};
|
||||
PVE::Storage::activate_storage_list($storecfg, $storage_list, $cache);
|
||||
|
||||
my $snapname = PVE::ReplicationState::replication_snapshot_name($jobid, $last_sync);
|
||||
|
||||
# find replication snapshots
|
||||
my $last_snapshots = {};
|
||||
foreach my $storeid (@$storage_list) {
|
||||
my $scfg = PVE::Storage::storage_config($storecfg, $storeid);
|
||||
my $plugin = PVE::Storage::Plugin->lookup($scfg->{type});
|
||||
my $volids = $plugin->list_images($storeid, $scfg, $vmid, undef, $cache);
|
||||
foreach my $volid (@$volids) {
|
||||
my ($storeid, $volname) = parse_volume_id($volid);
|
||||
my $list = $plugin->volume_snapshot_list($scfg, $storeid, $volname); # fixme: pass $cache
|
||||
my $found_replication_snapshots = 0;
|
||||
foreach my $snap (@$list) {
|
||||
if ($snap eq $snapname || (defined($parent_snapname) && ($snap eq $parent_snapname))) {
|
||||
$last_snapshots->{$volid}->{$snap} = 1 if $wanted_volids->{$volid};
|
||||
} elsif ($snap =~ m/^__replication_/) {
|
||||
$found_replication_snapshots = 1;
|
||||
if ($wanted_volids->{$volid}) {
|
||||
$logfunc->("$jobid: delete stale replication snapshot '$snap' on $volid");
|
||||
PVE::Storage::volume_snapshot_delete($storecfg, $volid, $snap);
|
||||
}
|
||||
}
|
||||
}
|
||||
# remove stale volumes
|
||||
if ($found_replication_snapshots && !$wanted_volids->{$volid}) {
|
||||
$logfunc->("$jobid: delete stale volume '$volid'");
|
||||
PVE::Storage::vdisk_free($storecfg, $volid);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
print to_json($last_snapshots) . "\n";
|
||||
|
||||
@ -161,17 +201,7 @@ __PACKAGE__->register_method ({
|
||||
die "no volumes specified\n" if !scalar(@{$param->{'extra-args'}});
|
||||
|
||||
foreach my $volid (@{$param->{'extra-args'}}) {
|
||||
|
||||
my ($storeid, $volname) = PVE::Storage::parse_volume_id($volid);
|
||||
my $scfg = PVE::Storage::storage_check_enabled($storecfg, $storeid, $local_node);
|
||||
die "storage '$storeid' is a shared storage\n" if $scfg->{shared};
|
||||
|
||||
my ($vtype, undef, $ownervm) = PVE::Storage::parse_volname($storecfg, $volid);
|
||||
die "volume '$volid' has wrong vtype ($vtype != 'images')\n"
|
||||
if $vtype ne 'images';
|
||||
die "volume '$volid' has wrong owner\n"
|
||||
if !$ownervm || $vmid != $ownervm;
|
||||
|
||||
$check_wanted_volid->($storecfg, $vmid, $volid, $local_node);
|
||||
push @$volids, $volid;
|
||||
}
|
||||
|
||||
|
@ -113,13 +113,15 @@ my $get_next_job = sub {
|
||||
};
|
||||
|
||||
sub remote_prepare_local_job {
|
||||
my ($ssh_info, $jobid, $vmid, $volumes, $last_sync, $force) = @_;
|
||||
my ($ssh_info, $jobid, $vmid, $volumes, $storeid_list, $last_sync, $parent_snapname, $force) = @_;
|
||||
|
||||
my $ssh_cmd = PVE::Cluster::ssh_info_to_command($ssh_info);
|
||||
my $cmd = [@$ssh_cmd, '--', 'pvesr', 'prepare-local-job', $jobid];
|
||||
push @$cmd, '--scan', join(',', @$storeid_list) if scalar(@$storeid_list);
|
||||
push @$cmd, @$volumes if scalar(@$volumes);
|
||||
|
||||
push @$cmd, '--last_sync', $last_sync;
|
||||
push @$cmd, '--parent_snapname', $parent_snapname;
|
||||
push @$cmd, '--force' if $force;
|
||||
|
||||
my $remote_snapshots;
|
||||
@ -147,21 +149,25 @@ sub remote_finalize_local_job {
|
||||
PVE::Tools::run_command($cmd);
|
||||
}
|
||||
|
||||
# finds local replication snapshots from $last_sync
|
||||
# and removes all replication snapshots with other time stamps
|
||||
sub prepare {
|
||||
my ($storecfg, $volids, $jobid, $last_sync, $start_time, $logfunc) = @_;
|
||||
my ($storecfg, $volids, $jobid, $last_sync, $parent_snapname, $logfunc) = @_;
|
||||
|
||||
$last_sync //= 0;
|
||||
|
||||
my ($prefix, $snapname) =
|
||||
PVE::ReplicationState::replication_snapshot_name($jobid, $last_sync);
|
||||
|
||||
my $last_snapshots = {};
|
||||
foreach my $volid (@$volids) {
|
||||
my $list = PVE::Storage::volume_snapshot_list($storecfg, $volid, $prefix);
|
||||
my $list = PVE::Storage::volume_snapshot_list($storecfg, $volid);
|
||||
my $found = 0;
|
||||
foreach my $snap (@$list) {
|
||||
if ($snap eq $snapname) {
|
||||
$last_snapshots->{$volid} = 1;
|
||||
} else {
|
||||
$logfunc->("$jobid: delete stale snapshot '$snap' on $volid");
|
||||
if ($snap eq $snapname || (defined($parent_snapname) && ($snap eq $parent_snapname))) {
|
||||
$last_snapshots->{$volid}->{$snap} = 1;
|
||||
} elsif ($snap =~ m/^\Q$prefix\E/) {
|
||||
$logfunc->("$jobid: delete stale replication snapshot '$snap' on $volid");
|
||||
PVE::Storage::volume_snapshot_delete($storecfg, $volid, $snap);
|
||||
}
|
||||
}
|
||||
@ -251,11 +257,11 @@ sub replicate {
|
||||
if ($remove_job eq 'full' && $jobcfg->{target} ne $local_node) {
|
||||
# remove all remote volumes
|
||||
my $ssh_info = PVE::Cluster::get_ssh_info($jobcfg->{target});
|
||||
remote_prepare_local_job($ssh_info, $jobid, $vmid, [], 0, 1);
|
||||
remote_prepare_local_job($ssh_info, $jobid, $vmid, [], $state->{storeid_list}, 0, undef, 1);
|
||||
|
||||
}
|
||||
# remove all local replication snapshots (lastsync => 0)
|
||||
prepare($storecfg, $sorted_volids, $jobid, 0, $start_time, $logfunc);
|
||||
prepare($storecfg, $sorted_volids, $jobid, 0, undef, $logfunc);
|
||||
|
||||
delete_job($jobid); # update config
|
||||
$logfunc->("$jobid: job removed");
|
||||
@ -265,19 +271,22 @@ sub replicate {
|
||||
|
||||
my $ssh_info = PVE::Cluster::get_ssh_info($jobcfg->{target}, $migration_network);
|
||||
|
||||
# prepare remote side
|
||||
my $remote_snapshots = remote_prepare_local_job(
|
||||
$ssh_info, $jobid, $vmid, $sorted_volids, $last_sync);
|
||||
|
||||
# test if we have a replication_ snapshot from last sync
|
||||
# and remove all other/stale replication snapshots
|
||||
my $last_sync_snapname =
|
||||
PVE::ReplicationState::replication_snapshot_name($jobid, $last_sync);
|
||||
my $sync_snapname =
|
||||
PVE::ReplicationState::replication_snapshot_name($jobid, $start_time);
|
||||
|
||||
my $parent_snapname = $conf->{parent};
|
||||
|
||||
# test if we have a replication_ snapshot from last sync
|
||||
# and remove all other/stale replication snapshots
|
||||
|
||||
my $last_snapshots = prepare(
|
||||
$storecfg, $sorted_volids, $jobid, $last_sync, $start_time, $logfunc);
|
||||
$storecfg, $sorted_volids, $jobid, $last_sync, $parent_snapname, $logfunc);
|
||||
|
||||
# prepare remote side
|
||||
my $remote_snapshots = remote_prepare_local_job(
|
||||
$ssh_info, $jobid, $vmid, $sorted_volids, $state->{storeid_list}, $last_sync, $parent_snapname, 0);
|
||||
|
||||
my $storeid_hash = {};
|
||||
foreach my $volid (@$sorted_volids) {
|
||||
@ -313,7 +322,7 @@ sub replicate {
|
||||
my $cleanup_local_snapshots = sub {
|
||||
my ($volid_hash, $snapname) = @_;
|
||||
foreach my $volid (sort keys %$volid_hash) {
|
||||
$logfunc->("$jobid: delete snapshot '$snapname' on $volid");
|
||||
$logfunc->("$jobid: delete previous replication snapshot '$snapname' on $volid");
|
||||
eval { PVE::Storage::volume_snapshot_delete($storecfg, $volid, $snapname, $running); };
|
||||
warn $@ if $@;
|
||||
}
|
||||
@ -331,12 +340,21 @@ sub replicate {
|
||||
|
||||
foreach my $volid (@$sorted_volids) {
|
||||
my $base_snapname;
|
||||
if ($last_snapshots->{$volid} && $remote_snapshots->{$volid}) {
|
||||
$logfunc->("$jobid: incremental sync '$volid' ($last_sync_snapname => $sync_snapname)");
|
||||
$base_snapname = $last_sync_snapname;
|
||||
} else {
|
||||
$logfunc->("$jobid: full sync '$volid' ($sync_snapname)");
|
||||
|
||||
if (defined($last_snapshots->{$volid}) && defined($remote_snapshots->{$volid})) {
|
||||
if ($last_snapshots->{$volid}->{$last_sync_snapname} &&
|
||||
$remote_snapshots->{$volid}->{$last_sync_snapname}) {
|
||||
$logfunc->("$jobid: incremental sync '$volid' ($last_sync_snapname => $sync_snapname)");
|
||||
$base_snapname = $last_sync_snapname;
|
||||
} elsif (defined($parent_snapname) &&
|
||||
($last_snapshots->{$volid}->{$parent_snapname} &&
|
||||
$remote_snapshots->{$volid}->{$parent_snapname})) {
|
||||
$logfunc->("$jobid: incremental sync '$volid' ($parent_snapname => $sync_snapname)");
|
||||
$base_snapname = $parent_snapname;
|
||||
}
|
||||
}
|
||||
|
||||
$logfunc->("$jobid: full sync '$volid' ($sync_snapname)") if !defined($base_snapname);
|
||||
replicate_volume($ssh_info, $storecfg, $volid, $base_snapname, $sync_snapname, $rate, $insecure);
|
||||
}
|
||||
};
|
||||
|
@ -13,7 +13,7 @@
|
||||
1840 job_900_to_node2: volumes => local-zfs:vm-900-disk-1
|
||||
1840 job_900_to_node2: create snapshot '__replicate_job_900_to_node2_1840__' on local-zfs:vm-900-disk-1
|
||||
1840 job_900_to_node2: incremental sync 'local-zfs:vm-900-disk-1' (__replicate_job_900_to_node2_1000__ => __replicate_job_900_to_node2_1840__)
|
||||
1840 job_900_to_node2: delete snapshot '__replicate_job_900_to_node2_1000__' on local-zfs:vm-900-disk-1
|
||||
1840 job_900_to_node2: delete previous replication snapshot '__replicate_job_900_to_node2_1000__' on local-zfs:vm-900-disk-1
|
||||
1840 job_900_to_node2: end replication job
|
||||
1840 job_900_to_node2: changed config next_sync => 2700
|
||||
1840 job_900_to_node2: changed state last_try => 1840, last_sync => 1840
|
||||
@ -22,7 +22,7 @@
|
||||
2740 job_900_to_node2: volumes => local-zfs:vm-900-disk-1,local-zfs:vm-900-disk-2
|
||||
2740 job_900_to_node2: create snapshot '__replicate_job_900_to_node2_2740__' on local-zfs:vm-900-disk-1
|
||||
2740 job_900_to_node2: create snapshot '__replicate_job_900_to_node2_2740__' on local-zfs:vm-900-disk-2
|
||||
2740 job_900_to_node2: delete snapshot '__replicate_job_900_to_node2_2740__' on local-zfs:vm-900-disk-1
|
||||
2740 job_900_to_node2: delete previous replication snapshot '__replicate_job_900_to_node2_2740__' on local-zfs:vm-900-disk-1
|
||||
2740 job_900_to_node2: end replication job with error: no such volid 'local-zfs:vm-900-disk-2'
|
||||
2740 job_900_to_node2: changed config next_sync => 3040
|
||||
2740 job_900_to_node2: changed state last_try => 2740, fail_count => 1, error => no such volid 'local-zfs:vm-900-disk-2'
|
||||
@ -33,7 +33,7 @@
|
||||
3040 job_900_to_node2: create snapshot '__replicate_job_900_to_node2_3040__' on local-zfs:vm-900-disk-2
|
||||
3040 job_900_to_node2: incremental sync 'local-zfs:vm-900-disk-1' (__replicate_job_900_to_node2_1840__ => __replicate_job_900_to_node2_3040__)
|
||||
3040 job_900_to_node2: full sync 'local-zfs:vm-900-disk-2' (__replicate_job_900_to_node2_3040__)
|
||||
3040 job_900_to_node2: delete snapshot '__replicate_job_900_to_node2_1840__' on local-zfs:vm-900-disk-1
|
||||
3040 job_900_to_node2: delete previous replication snapshot '__replicate_job_900_to_node2_1840__' on local-zfs:vm-900-disk-1
|
||||
3040 job_900_to_node2: end replication job
|
||||
3040 job_900_to_node2: changed config next_sync => 3600
|
||||
3040 job_900_to_node2: changed state last_try => 3040, last_sync => 3040, fail_count => 0, error =>
|
||||
@ -44,8 +44,8 @@
|
||||
3640 job_900_to_node2: create snapshot '__replicate_job_900_to_node2_3640__' on local-zfs:vm-900-disk-2
|
||||
3640 job_900_to_node2: incremental sync 'local-zfs:vm-900-disk-1' (__replicate_job_900_to_node2_3040__ => __replicate_job_900_to_node2_3640__)
|
||||
3640 job_900_to_node2: incremental sync 'local-zfs:vm-900-disk-2' (__replicate_job_900_to_node2_3040__ => __replicate_job_900_to_node2_3640__)
|
||||
3640 job_900_to_node2: delete snapshot '__replicate_job_900_to_node2_3040__' on local-zfs:vm-900-disk-1
|
||||
3640 job_900_to_node2: delete snapshot '__replicate_job_900_to_node2_3040__' on local-zfs:vm-900-disk-2
|
||||
3640 job_900_to_node2: delete previous replication snapshot '__replicate_job_900_to_node2_3040__' on local-zfs:vm-900-disk-1
|
||||
3640 job_900_to_node2: delete previous replication snapshot '__replicate_job_900_to_node2_3040__' on local-zfs:vm-900-disk-2
|
||||
3640 job_900_to_node2: end replication job
|
||||
3640 job_900_to_node2: changed config next_sync => 4500
|
||||
3640 job_900_to_node2: changed state last_try => 3640, last_sync => 3640
|
||||
@ -53,8 +53,8 @@
|
||||
3700 job_900_to_node2: guest => 900, type => qemu, running => 0
|
||||
3700 job_900_to_node2: volumes => local-zfs:vm-900-disk-1,local-zfs:vm-900-disk-2
|
||||
3700 job_900_to_node2: start job removal - mode 'full'
|
||||
3700 job_900_to_node2: delete stale snapshot '__replicate_job_900_to_node2_3640__' on local-zfs:vm-900-disk-1
|
||||
3700 job_900_to_node2: delete stale snapshot '__replicate_job_900_to_node2_3640__' on local-zfs:vm-900-disk-2
|
||||
3700 job_900_to_node2: delete stale replication snapshot '__replicate_job_900_to_node2_3640__' on local-zfs:vm-900-disk-1
|
||||
3700 job_900_to_node2: delete stale replication snapshot '__replicate_job_900_to_node2_3640__' on local-zfs:vm-900-disk-2
|
||||
3700 job_900_to_node2: job removed
|
||||
3700 job_900_to_node2: end replication job
|
||||
3700 job_900_to_node2: vanished job
|
||||
|
@ -32,7 +32,7 @@ use PVE::Storage;
|
||||
my $replicated_volume_status = {};
|
||||
|
||||
my $mocked_remote_prepare_local_job = sub {
|
||||
my ($ssh_info, $jobid, $vmid, $volumes, $last_sync, $force) = @_;
|
||||
my ($ssh_info, $jobid, $vmid, $volumes, $storeid_list, $last_sync, $parent_snapname, $force) = @_;
|
||||
|
||||
my $target = $ssh_info->{node};
|
||||
|
||||
@ -49,7 +49,7 @@ my $mocked_remote_prepare_local_job = sub {
|
||||
}
|
||||
my $snapname = $replicated_volume_status->{$target}->{$volid};
|
||||
|
||||
$last_snapshots->{$volid} = 1 if $last_sync_snapname eq $snapname;
|
||||
$last_snapshots->{$volid}->{$snapname} = 1 if $last_sync_snapname eq $snapname;
|
||||
}
|
||||
|
||||
return $last_snapshots;
|
||||
|
Loading…
x
Reference in New Issue
Block a user