replication: improve stale volume detection, allow sync from parent snapshot

We now pass a list of storages to scan for stale volumes to prepare_local_job(),
so that we only activate/scan the related storages.

Snapshot rollback may remove local replication snapshots. In that case
we still have the $conf->{parent} snapshot on both sides, so we
can use that as base snapshot.
This commit is contained in:
Dietmar Maurer 2017-06-07 09:30:52 +02:00
parent 4ce4ae7047
commit 91ee6a2fec
4 changed files with 144 additions and 96 deletions

View File

@ -25,6 +25,28 @@ sub setup_environment {
PVE::RPCEnvironment->setup_default_cli_env();
}
# fixme: get from plugin??
my $replicatable_storage_types = {
zfspool => 1,
};
# Validate that $volid is a volume we accept for replication of guest $vmid.
#
# Arguments: storage configuration, guest ID, volume ID, local node name.
# Dies if the volume's storage type is not listed in
# $replicatable_storage_types, if the volume type is not 'images', or if the
# volume has no owner or is owned by a different guest.
# Returns the storage ID parsed from $volid.
my $check_wanted_volid = sub {
    my ($storecfg, $vmid, $volid, $local_node) = @_;

    my ($storeid) = PVE::Storage::parse_volume_id($volid);

    # NOTE(review): presumably storage_check_enabled() dies when the storage
    # is not usable on $local_node (no 'noerr' flag is passed) - confirm.
    my $scfg = PVE::Storage::storage_check_enabled($storecfg, $storeid, $local_node);
    my $storage_type = $scfg->{type};
    if (!$replicatable_storage_types->{$storage_type}) {
        die "storage '$storeid' is not replicatable\n";
    }

    my ($vtype, undef, $ownervm) = PVE::Storage::parse_volname($storecfg, $volid);
    if ($vtype ne 'images') {
        die "volume '$volid' has wrong vtype ($vtype != 'images')\n";
    }

    # the volume must have an owner, and that owner must be our guest
    unless ($ownervm && $vmid == $ownervm) {
        die "volume '$volid' has wrong owner\n";
    }

    return $storeid;
};
__PACKAGE__->register_method ({
name => 'prepare_local_job',
path => 'prepare_local_job',
@ -36,6 +58,11 @@ __PACKAGE__->register_method ({
id => get_standard_option('pve-replication-id'),
'extra-args' => get_standard_option('extra-args', {
description => "The list of volume IDs to consider." }),
scan => {
description => "List of storage IDs to scan for stale volumes.",
type => 'string', format => 'pve-storage-id-list',
optional => 1,
},
force => {
description => "Allow to remove all existion volumes (empty volume list).",
type => 'boolean',
@ -48,74 +75,87 @@ __PACKAGE__->register_method ({
minimum => 0,
optional => 1,
},
parent_snapname => get_standard_option('pve-snapshot-name', {
optional => 1,
}),
},
},
returns => { type => 'null' },
code => sub {
my ($param) = @_;
my ($vmid, undef, $jobid) = PVE::ReplicationConfig::parse_replication_job_id($param->{id});
my $last_sync = $param->{last_sync} // 0;
my $local_node = PVE::INotify::nodename();
my $vms = PVE::Cluster::get_vmlist();
die "guest '$vmid' is on local node\n"
if $vms->{ids}->{$vmid} && $vms->{ids}->{$vmid}->{node} eq $local_node;
my $storecfg = PVE::Storage::config();
my $dl = PVE::Storage::vdisk_list($storecfg, undef, $vmid);
my $volids = [];
die "no volumes specified\n"
if !$param->{force} && !scalar(@{$param->{'extra-args'}});
foreach my $volid (@{$param->{'extra-args'}}) {
my ($storeid, $volname) = PVE::Storage::parse_volume_id($volid);
my $scfg = PVE::Storage::storage_check_enabled($storecfg, $storeid, $local_node);
die "storage '$storeid' is a shared storage\n" if $scfg->{shared};
my ($vtype, undef, $ownervm) = PVE::Storage::parse_volname($storecfg, $volid);
die "volume '$volid' has wrong vtype ($vtype != 'images')\n"
if $vtype ne 'images';
die "volume '$volid' has wrong owner\n"
if !$ownervm || $vmid != $ownervm;
my $found = 0;
foreach my $info (@{$dl->{$storeid}}) {
if ($info->{volid} eq $volid) {
$found = 1;
last;
}
}
push @$volids, $volid if $found;
}
$volids = [ sort @$volids ];
my $logfunc = sub {
my ($msg) = @_;
print STDERR "$msg\n";
};
# remove stale volumes
foreach my $storeid (keys %$dl) {
my $local_node = PVE::INotify::nodename();
die "no volumes specified\n"
if !$param->{force} && !scalar(@{$param->{'extra-args'}});
my ($vmid, undef, $jobid) = PVE::ReplicationConfig::parse_replication_job_id($param->{id});
my $vms = PVE::Cluster::get_vmlist();
die "guest '$vmid' is on local node\n"
if $vms->{ids}->{$vmid} && $vms->{ids}->{$vmid}->{node} eq $local_node;
my $last_sync = $param->{last_sync} // 0;
my $parent_snapname = $param->{parent_snapname};
my $storecfg = PVE::Storage::config();
# compute list of storages we want to scan
my $storage_hash = {};
foreach my $storeid (PVE::Tools::split_list($param->{scan})) {
my $scfg = PVE::Storage::storage_check_enabled($storecfg, $storeid, $local_node, 1);
next if !$scfg || $scfg->{shared};
foreach my $info (@{$dl->{$storeid}}) {
my $volid = $info->{volid};
next if grep { $_ eq $volid } @$volids;
$logfunc->("$jobid: delete stale volume '$volid'");
PVE::Storage::vdisk_free($storecfg, $volid);
}
next if !$scfg; # simply ignore unavailable storages here
die "storage '$storeid' is not replicatable\n" if !$replicatable_storage_types->{$scfg->{type}};
$storage_hash->{$storeid} = 1;
}
my $last_snapshots = PVE::Replication::prepare(
$storecfg, $volids, $jobid, $last_sync, undef, $logfunc);
my $wanted_volids = {};
foreach my $volid (@{$param->{'extra-args'}}) {
my $storeid = $check_wanted_volid->($storecfg, $vmid, $volid, $local_node);
$wanted_volids->{$volid} = 1;
$storage_hash->{$storeid} = 1;
}
my $storage_list = [ sort keys %$storage_hash ];
# activate all used storage
my $cache = {};
PVE::Storage::activate_storage_list($storecfg, $storage_list, $cache);
my $snapname = PVE::ReplicationState::replication_snapshot_name($jobid, $last_sync);
# find replication snapshots
my $last_snapshots = {};
foreach my $storeid (@$storage_list) {
my $scfg = PVE::Storage::storage_config($storecfg, $storeid);
my $plugin = PVE::Storage::Plugin->lookup($scfg->{type});
my $volids = $plugin->list_images($storeid, $scfg, $vmid, undef, $cache);
foreach my $volid (@$volids) {
my ($storeid, $volname) = parse_volume_id($volid);
my $list = $plugin->volume_snapshot_list($scfg, $storeid, $volname); # fixme: pass $cache
my $found_replication_snapshots = 0;
foreach my $snap (@$list) {
if ($snap eq $snapname || (defined($parent_snapname) && ($snap eq $parent_snapname))) {
$last_snapshots->{$volid}->{$snap} = 1 if $wanted_volids->{$volid};
} elsif ($snap =~ m/^__replication_/) {
$found_replication_snapshots = 1;
if ($wanted_volids->{$volid}) {
$logfunc->("$jobid: delete stale replication snapshot '$snap' on $volid");
PVE::Storage::volume_snapshot_delete($storecfg, $volid, $snap);
}
}
}
# remove stale volumes
if ($found_replication_snapshots && !$wanted_volids->{$volid}) {
$logfunc->("$jobid: delete stale volume '$volid'");
PVE::Storage::vdisk_free($storecfg, $volid);
}
}
}
print to_json($last_snapshots) . "\n";
@ -161,17 +201,7 @@ __PACKAGE__->register_method ({
die "no volumes specified\n" if !scalar(@{$param->{'extra-args'}});
foreach my $volid (@{$param->{'extra-args'}}) {
my ($storeid, $volname) = PVE::Storage::parse_volume_id($volid);
my $scfg = PVE::Storage::storage_check_enabled($storecfg, $storeid, $local_node);
die "storage '$storeid' is a shared storage\n" if $scfg->{shared};
my ($vtype, undef, $ownervm) = PVE::Storage::parse_volname($storecfg, $volid);
die "volume '$volid' has wrong vtype ($vtype != 'images')\n"
if $vtype ne 'images';
die "volume '$volid' has wrong owner\n"
if !$ownervm || $vmid != $ownervm;
$check_wanted_volid->($storecfg, $vmid, $volid, $local_node);
push @$volids, $volid;
}

View File

@ -113,13 +113,15 @@ my $get_next_job = sub {
};
sub remote_prepare_local_job {
my ($ssh_info, $jobid, $vmid, $volumes, $last_sync, $force) = @_;
my ($ssh_info, $jobid, $vmid, $volumes, $storeid_list, $last_sync, $parent_snapname, $force) = @_;
my $ssh_cmd = PVE::Cluster::ssh_info_to_command($ssh_info);
my $cmd = [@$ssh_cmd, '--', 'pvesr', 'prepare-local-job', $jobid];
push @$cmd, '--scan', join(',', @$storeid_list) if scalar(@$storeid_list);
push @$cmd, @$volumes if scalar(@$volumes);
push @$cmd, '--last_sync', $last_sync;
push @$cmd, '--parent_snapname', $parent_snapname;
push @$cmd, '--force' if $force;
my $remote_snapshots;
@ -147,21 +149,25 @@ sub remote_finalize_local_job {
PVE::Tools::run_command($cmd);
}
# finds local replication snapshots from $last_sync
# and removes all replication snapshots with other time stamps
sub prepare {
my ($storecfg, $volids, $jobid, $last_sync, $start_time, $logfunc) = @_;
my ($storecfg, $volids, $jobid, $last_sync, $parent_snapname, $logfunc) = @_;
$last_sync //= 0;
my ($prefix, $snapname) =
PVE::ReplicationState::replication_snapshot_name($jobid, $last_sync);
my $last_snapshots = {};
foreach my $volid (@$volids) {
my $list = PVE::Storage::volume_snapshot_list($storecfg, $volid, $prefix);
my $list = PVE::Storage::volume_snapshot_list($storecfg, $volid);
my $found = 0;
foreach my $snap (@$list) {
if ($snap eq $snapname) {
$last_snapshots->{$volid} = 1;
} else {
$logfunc->("$jobid: delete stale snapshot '$snap' on $volid");
if ($snap eq $snapname || (defined($parent_snapname) && ($snap eq $parent_snapname))) {
$last_snapshots->{$volid}->{$snap} = 1;
} elsif ($snap =~ m/^\Q$prefix\E/) {
$logfunc->("$jobid: delete stale replication snapshot '$snap' on $volid");
PVE::Storage::volume_snapshot_delete($storecfg, $volid, $snap);
}
}
@ -251,11 +257,11 @@ sub replicate {
if ($remove_job eq 'full' && $jobcfg->{target} ne $local_node) {
# remove all remote volumes
my $ssh_info = PVE::Cluster::get_ssh_info($jobcfg->{target});
remote_prepare_local_job($ssh_info, $jobid, $vmid, [], 0, 1);
remote_prepare_local_job($ssh_info, $jobid, $vmid, [], $state->{storeid_list}, 0, undef, 1);
}
# remove all local replication snapshots (lastsync => 0)
prepare($storecfg, $sorted_volids, $jobid, 0, $start_time, $logfunc);
prepare($storecfg, $sorted_volids, $jobid, 0, undef, $logfunc);
delete_job($jobid); # update config
$logfunc->("$jobid: job removed");
@ -265,19 +271,22 @@ sub replicate {
my $ssh_info = PVE::Cluster::get_ssh_info($jobcfg->{target}, $migration_network);
# prepare remote side
my $remote_snapshots = remote_prepare_local_job(
$ssh_info, $jobid, $vmid, $sorted_volids, $last_sync);
# test if we have a replication_ snapshot from last sync
# and remove all other/stale replication snapshots
my $last_sync_snapname =
PVE::ReplicationState::replication_snapshot_name($jobid, $last_sync);
my $sync_snapname =
PVE::ReplicationState::replication_snapshot_name($jobid, $start_time);
my $parent_snapname = $conf->{parent};
# test if we have a replication_ snapshot from last sync
# and remove all other/stale replication snapshots
my $last_snapshots = prepare(
$storecfg, $sorted_volids, $jobid, $last_sync, $start_time, $logfunc);
$storecfg, $sorted_volids, $jobid, $last_sync, $parent_snapname, $logfunc);
# prepare remote side
my $remote_snapshots = remote_prepare_local_job(
$ssh_info, $jobid, $vmid, $sorted_volids, $state->{storeid_list}, $last_sync, $parent_snapname, 0);
my $storeid_hash = {};
foreach my $volid (@$sorted_volids) {
@ -313,7 +322,7 @@ sub replicate {
my $cleanup_local_snapshots = sub {
my ($volid_hash, $snapname) = @_;
foreach my $volid (sort keys %$volid_hash) {
$logfunc->("$jobid: delete snapshot '$snapname' on $volid");
$logfunc->("$jobid: delete previous replication snapshot '$snapname' on $volid");
eval { PVE::Storage::volume_snapshot_delete($storecfg, $volid, $snapname, $running); };
warn $@ if $@;
}
@ -331,12 +340,21 @@ sub replicate {
foreach my $volid (@$sorted_volids) {
my $base_snapname;
if ($last_snapshots->{$volid} && $remote_snapshots->{$volid}) {
$logfunc->("$jobid: incremental sync '$volid' ($last_sync_snapname => $sync_snapname)");
$base_snapname = $last_sync_snapname;
} else {
$logfunc->("$jobid: full sync '$volid' ($sync_snapname)");
if (defined($last_snapshots->{$volid}) && defined($remote_snapshots->{$volid})) {
if ($last_snapshots->{$volid}->{$last_sync_snapname} &&
$remote_snapshots->{$volid}->{$last_sync_snapname}) {
$logfunc->("$jobid: incremental sync '$volid' ($last_sync_snapname => $sync_snapname)");
$base_snapname = $last_sync_snapname;
} elsif (defined($parent_snapname) &&
($last_snapshots->{$volid}->{$parent_snapname} &&
$remote_snapshots->{$volid}->{$parent_snapname})) {
$logfunc->("$jobid: incremental sync '$volid' ($parent_snapname => $sync_snapname)");
$base_snapname = $parent_snapname;
}
}
$logfunc->("$jobid: full sync '$volid' ($sync_snapname)") if !defined($base_snapname);
replicate_volume($ssh_info, $storecfg, $volid, $base_snapname, $sync_snapname, $rate, $insecure);
}
};

View File

@ -13,7 +13,7 @@
1840 job_900_to_node2: volumes => local-zfs:vm-900-disk-1
1840 job_900_to_node2: create snapshot '__replicate_job_900_to_node2_1840__' on local-zfs:vm-900-disk-1
1840 job_900_to_node2: incremental sync 'local-zfs:vm-900-disk-1' (__replicate_job_900_to_node2_1000__ => __replicate_job_900_to_node2_1840__)
1840 job_900_to_node2: delete snapshot '__replicate_job_900_to_node2_1000__' on local-zfs:vm-900-disk-1
1840 job_900_to_node2: delete previous replication snapshot '__replicate_job_900_to_node2_1000__' on local-zfs:vm-900-disk-1
1840 job_900_to_node2: end replication job
1840 job_900_to_node2: changed config next_sync => 2700
1840 job_900_to_node2: changed state last_try => 1840, last_sync => 1840
@ -22,7 +22,7 @@
2740 job_900_to_node2: volumes => local-zfs:vm-900-disk-1,local-zfs:vm-900-disk-2
2740 job_900_to_node2: create snapshot '__replicate_job_900_to_node2_2740__' on local-zfs:vm-900-disk-1
2740 job_900_to_node2: create snapshot '__replicate_job_900_to_node2_2740__' on local-zfs:vm-900-disk-2
2740 job_900_to_node2: delete snapshot '__replicate_job_900_to_node2_2740__' on local-zfs:vm-900-disk-1
2740 job_900_to_node2: delete previous replication snapshot '__replicate_job_900_to_node2_2740__' on local-zfs:vm-900-disk-1
2740 job_900_to_node2: end replication job with error: no such volid 'local-zfs:vm-900-disk-2'
2740 job_900_to_node2: changed config next_sync => 3040
2740 job_900_to_node2: changed state last_try => 2740, fail_count => 1, error => no such volid 'local-zfs:vm-900-disk-2'
@ -33,7 +33,7 @@
3040 job_900_to_node2: create snapshot '__replicate_job_900_to_node2_3040__' on local-zfs:vm-900-disk-2
3040 job_900_to_node2: incremental sync 'local-zfs:vm-900-disk-1' (__replicate_job_900_to_node2_1840__ => __replicate_job_900_to_node2_3040__)
3040 job_900_to_node2: full sync 'local-zfs:vm-900-disk-2' (__replicate_job_900_to_node2_3040__)
3040 job_900_to_node2: delete snapshot '__replicate_job_900_to_node2_1840__' on local-zfs:vm-900-disk-1
3040 job_900_to_node2: delete previous replication snapshot '__replicate_job_900_to_node2_1840__' on local-zfs:vm-900-disk-1
3040 job_900_to_node2: end replication job
3040 job_900_to_node2: changed config next_sync => 3600
3040 job_900_to_node2: changed state last_try => 3040, last_sync => 3040, fail_count => 0, error =>
@ -44,8 +44,8 @@
3640 job_900_to_node2: create snapshot '__replicate_job_900_to_node2_3640__' on local-zfs:vm-900-disk-2
3640 job_900_to_node2: incremental sync 'local-zfs:vm-900-disk-1' (__replicate_job_900_to_node2_3040__ => __replicate_job_900_to_node2_3640__)
3640 job_900_to_node2: incremental sync 'local-zfs:vm-900-disk-2' (__replicate_job_900_to_node2_3040__ => __replicate_job_900_to_node2_3640__)
3640 job_900_to_node2: delete snapshot '__replicate_job_900_to_node2_3040__' on local-zfs:vm-900-disk-1
3640 job_900_to_node2: delete snapshot '__replicate_job_900_to_node2_3040__' on local-zfs:vm-900-disk-2
3640 job_900_to_node2: delete previous replication snapshot '__replicate_job_900_to_node2_3040__' on local-zfs:vm-900-disk-1
3640 job_900_to_node2: delete previous replication snapshot '__replicate_job_900_to_node2_3040__' on local-zfs:vm-900-disk-2
3640 job_900_to_node2: end replication job
3640 job_900_to_node2: changed config next_sync => 4500
3640 job_900_to_node2: changed state last_try => 3640, last_sync => 3640
@ -53,8 +53,8 @@
3700 job_900_to_node2: guest => 900, type => qemu, running => 0
3700 job_900_to_node2: volumes => local-zfs:vm-900-disk-1,local-zfs:vm-900-disk-2
3700 job_900_to_node2: start job removal - mode 'full'
3700 job_900_to_node2: delete stale snapshot '__replicate_job_900_to_node2_3640__' on local-zfs:vm-900-disk-1
3700 job_900_to_node2: delete stale snapshot '__replicate_job_900_to_node2_3640__' on local-zfs:vm-900-disk-2
3700 job_900_to_node2: delete stale replication snapshot '__replicate_job_900_to_node2_3640__' on local-zfs:vm-900-disk-1
3700 job_900_to_node2: delete stale replication snapshot '__replicate_job_900_to_node2_3640__' on local-zfs:vm-900-disk-2
3700 job_900_to_node2: job removed
3700 job_900_to_node2: end replication job
3700 job_900_to_node2: vanished job

View File

@ -32,7 +32,7 @@ use PVE::Storage;
my $replicated_volume_status = {};
my $mocked_remote_prepare_local_job = sub {
my ($ssh_info, $jobid, $vmid, $volumes, $last_sync, $force) = @_;
my ($ssh_info, $jobid, $vmid, $volumes, $storeid_list, $last_sync, $parent_snapname, $force) = @_;
my $target = $ssh_info->{node};
@ -49,7 +49,7 @@ my $mocked_remote_prepare_local_job = sub {
}
my $snapname = $replicated_volume_status->{$target}->{$volid};
$last_snapshots->{$volid} = 1 if $last_sync_snapname eq $snapname;
$last_snapshots->{$volid}->{$snapname} = 1 if $last_sync_snapname eq $snapname;
}
return $last_snapshots;