replication: split out error handling and include more info in mail

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
This commit is contained in:
Thomas Lamprecht 2022-04-27 09:49:11 +02:00
parent e6b8af20a6
commit c945aaef85

View File

@ -5,7 +5,9 @@ use strict;
use PVE::JSONSchema qw(get_standard_option);
use PVE::RPCEnvironment;
use PVE::Format qw(render_timestamp);
use PVE::ProcFSTools;
use PVE::ReplicationConfig;
use PVE::ReplicationState;
use PVE::Replication;
@ -71,6 +73,7 @@ sub run_single_job {
}
# TODO: below two should probably part of the general job framework/plugin system
my sub _should_mail_at_failcount {
my ($fail_count) = @_;
@ -84,6 +87,47 @@ my sub _should_mail_at_failcount {
return $i * 48 == $fail_count;
};
my sub _handle_job_err {
my ($job, $err, $mail) = @_;
warn "$job->{id}: got unexpected replication job error - $err";
return if !$mail;
my $state = PVE::ReplicationState::read_state();
my $jobstate = PVE::ReplicationState::extract_job_state($state, $job);
my $fail_count = $jobstate->{fail_count};
return if !_should_mail_at_failcount($fail_count);
my $msg = "Replication job $job->{id} with target '$job->{target}' and schedule";
$msg .= " '$job->{schedule}' failed!\n";
$msg .= " Last successful sync: ";
if (my $last_sync = $jobstate->{last_sync}) {
$msg .= render_timestamp($last_sync) ."\n";
} else {
$msg .= "None/Unknown\n";
}
# not yet updated, so $job->next_sync here is actually the current one.
# NOTE: Copied from PVE::ReplicationState::job_status()
my $next_sync = $job->{next_sync} + 60 * ($fail_count <= 3 ? 5 * $fail_count : 30);
$msg .= " Next sync try: " . render_timestamp($next_sync) ."\n";
$msg .= " Failure count: $fail_count\n";
if ($fail_count == 3) {
$msg .= "\nNote: The system will now reduce the frequency of error reports,";
$msg .= " as the job appears to be stuck.\n";
}
$msg .= "\nError:\n$err";
eval {
PVE::Tools::sendmail('root', "Replication Job: $job->{id} failed", $msg)
};
warn ": $@" if $@;
}
# passing $now and $verbose is useful for regression testing
sub run_jobs {
my ($now, $logfunc, $verbose, $mail) = @_;
@ -102,14 +146,7 @@ sub run_jobs {
PVE::Replication::run_replication($guest_class, $jobcfg, $iteration, $start_time, $logfunc, $verbose);
};
if (my $err = $@) {
warn "$jobcfg->{id}: got unexpected replication job error - $err";
my $state = PVE::ReplicationState::read_state();
my $jobstate = PVE::ReplicationState::extract_job_state($state, $jobcfg);
eval {
PVE::Tools::sendmail('root', "Replication Job: $jobcfg->{id} failed", $err)
if $mail && _should_mail_at_failcount($jobstate->{fail_count});
};
warn ": $@" if $@;
_handle_job_err($jobcfg, $err, $mail);
}
$start_time = $now // time();