diff --git a/PVE/API2/Cluster/MetricServer.pm b/PVE/API2/Cluster/MetricServer.pm
index 882cda101..209b92a75 100644
--- a/PVE/API2/Cluster/MetricServer.pm
+++ b/PVE/API2/Cluster/MetricServer.pm
@@ -6,8 +6,14 @@ use strict;
 use PVE::Tools qw(extract_param extract_sensitive_params);
 use PVE::Exception qw(raise_perm_exc raise_param_exc);
 use PVE::JSONSchema qw(get_standard_option);
+use PVE::AccessControl;
+use PVE::APIClient::LWP;
+use PVE::Cluster;
+use PVE::INotify;
 use PVE::RPCEnvironment;
 use PVE::ExtMetric;
+use PVE::PullMetric;
+use PVE::SafeSyslog;
 
 use PVE::RESTHandler;
 
@@ -288,4 +294,179 @@ __PACKAGE__->register_method ({
         return;
     }});
 
+__PACKAGE__->register_method ({
+    name => 'export',
+    path => 'export',
+    method => 'GET',
+    protected => 1,
+    description => "Retrieve metrics of the cluster.",
+    permissions => {
+        check => ['perm', '/', ['Sys.Audit']],
+    },
+    parameters => {
+        additionalProperties => 0,
+        properties => {
+            'local-only' => {
+                type => 'boolean',
+                description =>
+                    'Only return metrics for the current node instead of the whole cluster',
+                optional => 1,
+                default => 0,
+            },
+            'start-time' => {
+                type => 'integer',
+                description => 'Only include metrics with a timestamp > start-time.',
+                optional => 1,
+                default => 0,
+            },
+            'history' => {
+                type => 'boolean',
+                description => 'Also return historic values.'
+                    . ' Returns full available metric history unless `start-time` is also set',
+                optional => 1,
+                default => 0,
+            },
+        },
+    },
+    returns => {
+        type => 'object',
+        additionalProperties => 0,
+        properties => {
+            data => {
+                type => 'array',
+                description => 'Array of system metrics. Metrics are sorted by their timestamp.',
+                items => {
+                    type => 'object',
+                    additionalProperties => 0,
+                    properties => {
+                        timestamp => {
+                            type => 'integer',
+                            description => 'Time at which this metric was observed',
+                        },
+                        id => {
+                            type => 'string',
+                            description => "Unique identifier for this metric object,"
+                                . " for instance 'node/<nodename>' or"
+                                . " 'qemu/<vmid>'.",
+                        },
+                        metric => {
+                            type => 'string',
+                            description => "Name of the metric.",
+                        },
+                        value => {
+                            type => 'number',
+                            description => 'Metric value.',
+                        },
+                        type => {
+                            type => 'string',
+                            description => 'Type of the metric.',
+                            enum => [qw(gauge counter derive)],
+                        },
+                    },
+                },
+            },
+        },
+    },
+    code => sub {
+        my ($param) = @_;
+        my $local_only = $param->{'local-only'} // 0;
+        my $start = $param->{'start-time'};
+        my $history = $param->{'history'} // 0;
+
+        # Number of historic generations to read from the local cache;
+        # 0 requests only the most recent values.
+        my $generations = 0;
+        if ($history) {
+            my $max_generations = PVE::PullMetric::max_generations();
+            if (defined($start)) {
+                # Assuming update loop time of pvestatd of 10 seconds.
+                $generations = int((time() - $start) / 10);
+                # A start-time in the future would give a negative count, a
+                # very old one more generations than max_generations() reports.
+                $generations = 0 if $generations < 0;
+                $generations = $max_generations if $generations > $max_generations;
+            } else {
+                $generations = $max_generations;
+            }
+        }
+
+        my @metrics = @{PVE::PullMetric::get_local_metrics($generations)};
+
+        # The generation count is only an estimate, so filter by timestamp too.
+        if (defined($start)) {
+            @metrics = grep { $_->{timestamp} > $start } @metrics;
+        }
+
+        my $nodename = PVE::INotify::nodename();
+
+        # Fan out to cluster members
+        # Do NOT remove this check
+        if (!$local_only) {
+            my $members = PVE::Cluster::get_members() // {};
+
+            my $rpcenv = PVE::RPCEnvironment::get();
+            my $authuser = $rpcenv->get_user();
+
+            my ($user, undef) = PVE::AccessControl::split_tokenid($authuser, 1);
+
+            my $ticket;
+            if ($user) {
+                # Theoretically, we might now bypass token privilege separation, since
+                # we use the regular user instead of the token, but
+                # since we already passed the permission check for this handler,
+                # this should be fine.
+                $ticket = PVE::AccessControl::assemble_ticket($user);
+            } else {
+                $ticket = PVE::AccessControl::assemble_ticket($authuser);
+            }
+
+            for my $name (keys %$members) {
+                # Skip own node, for that one we already have the metrics
+                next if $name eq $nodename;
+                next if !$members->{$name}->{online};
+
+                my $status = eval {
+                    my $fingerprint = PVE::Cluster::get_node_fingerprint($name);
+                    my $ip = scalar(PVE::Cluster::remote_node_ip($name));
+
+                    my $conn_args = {
+                        protocol => 'https',
+                        host => $ip,
+                        port => 8006,
+                        ticket => $ticket,
+                        timeout => 5,
+                        cached_fingerprints => { $fingerprint => 1 },
+                    };
+
+                    my $api_client = PVE::APIClient::LWP->new(%$conn_args);
+
+                    my $params = {
+                        # Do NOT remove 'local-only' - potential for request recursion!
+                        'local-only' => 1,
+                        history => $history,
+                    };
+                    $params->{'start-time'} = $start if defined($start);
+
+                    $api_client->get('/cluster/metrics/export', $params);
+                };
+
+                if (my $err = $@) {
+                    syslog('warning', "could not fetch metrics from $name: $err");
+                } elsif (ref($status) eq 'HASH' && ref($status->{data}) eq 'ARRAY') {
+                    push @metrics, $status->{data}->@*;
+                } else {
+                    # Do not let one malformed reply fail the whole request.
+                    syslog('warning', "could not fetch metrics from $name: unexpected response");
+                }
+            }
+        }
+
+        my @sorted = sort { $a->{timestamp} <=> $b->{timestamp} } @metrics;
+
+        return {
+            data => \@sorted,
+        };
+    },
+});
+
 1;
diff --git a/PVE/PullMetric.pm b/PVE/PullMetric.pm
index c7cc12fb8..92f4daef1 100644
--- a/PVE/PullMetric.pm
+++ b/PVE/PullMetric.pm
@@ -51,4 +51,187 @@ sub update {
     $txn->{$subsystem}->{timestamp} = $timestamp;
 }
 
+# Build a 'gauge' metric entry; value and timestamp are numified.
+my sub gauge {
+    my ($id, $timestamp, $metric, $value) = @_;
+
+    return {
+        metric => $metric,
+        id => $id,
+        value => $value + 0,
+        timestamp => $timestamp + 0,
+        type => 'gauge',
+    };
+}
+
+# Build a 'derive' metric entry; value and timestamp are numified.
+my sub derive {
+    my ($id, $timestamp, $metric, $value) = @_;
+
+    return {
+        metric => $metric,
+        id => $id,
+        value => $value + 0,
+        timestamp => $timestamp + 0,
+        type => 'derive',
+    };
+}
+
+my $nodename = PVE::INotify::nodename();
+
+# Convert one generation of node stats ({ data, timestamp }) into metrics.
+my sub get_node_metrics {
+    my ($stats) = @_;
+
+    my $metrics = [];
+
+    my $data = $stats->{data};
+    my $timestamp = $stats->{timestamp};
+
+    my $id = "node/$nodename";
+
+    push @$metrics, gauge($id, $timestamp, "uptime", $data->{uptime});
+
+    # Sum up the traffic of all physical NICs.
+    my ($netin, $netout) = (0, 0);
+    for my $dev (grep { /^$PVE::Network::PHYSICAL_NIC_RE$/ } keys $data->{nics}->%*) {
+        $netin += $data->{nics}->{$dev}->{receive};
+        $netout += $data->{nics}->{$dev}->{transmit};
+    }
+    push @$metrics, derive($id, $timestamp, "net_in", $netin);
+    push @$metrics, derive($id, $timestamp, "net_out", $netout);
+
+    my $cpustat = $data->{cpustat};
+    push @$metrics, gauge($id, $timestamp, "cpu_avg1", $cpustat->{avg1});
+    push @$metrics, gauge($id, $timestamp, "cpu_avg5", $cpustat->{avg5});
+    push @$metrics, gauge($id, $timestamp, "cpu_avg15", $cpustat->{avg15});
+    push @$metrics, gauge($id, $timestamp, "cpu_max", $cpustat->{cpus});
+    push @$metrics, gauge($id, $timestamp, "cpu_current", $cpustat->{cpu});
+    push @$metrics, gauge($id, $timestamp, "cpu_iowait", $cpustat->{iowait});
+
+    my $memory = $data->{memory};
+    push @$metrics, gauge($id, $timestamp, "mem_total", $memory->{memtotal});
+    push @$metrics, gauge($id, $timestamp, "mem_used", $memory->{memused});
+    push @$metrics, gauge($id, $timestamp, "swap_total", $memory->{swaptotal});
+    push @$metrics, gauge($id, $timestamp, "swap_used", $memory->{swapused});
+
+    my $blockstat = $data->{blockstat};
+    my $dused = $blockstat->{blocks} - $blockstat->{bfree};
+    push @$metrics, gauge($id, $timestamp, "disk_total", $blockstat->{blocks});
+    push @$metrics, gauge($id, $timestamp, "disk_used", $dused);
+
+    return $metrics;
+}
+
+# Convert one generation of QEMU guest stats into metrics, keyed by VMID.
+my sub get_qemu_metrics {
+    my ($stats) = @_;
+
+    my $metrics = [];
+
+    my $timestamp = $stats->{timestamp};
+
+    for my $vmid (keys $stats->{data}->%*) {
+        my $id = "qemu/$vmid";
+        my $guest_data = $stats->{data}->{$vmid};
+
+        # Usage counters are only reported for running guests.
+        if (($guest_data->{status} // '') eq 'running') {
+            push @$metrics, gauge($id, $timestamp, "cpu_current", $guest_data->{cpu});
+            push @$metrics, gauge($id, $timestamp, "mem_used", $guest_data->{mem});
+            push @$metrics, derive($id, $timestamp, "disk_read", $guest_data->{diskread});
+            push @$metrics, derive($id, $timestamp, "disk_write", $guest_data->{diskwrite});
+            push @$metrics, derive($id, $timestamp, "net_in", $guest_data->{netin});
+            push @$metrics, derive($id, $timestamp, "net_out", $guest_data->{netout});
+        }
+
+        push @$metrics, gauge($id, $timestamp, "uptime", $guest_data->{uptime});
+        push @$metrics, gauge($id, $timestamp, "cpu_max", $guest_data->{cpus});
+        push @$metrics, gauge($id, $timestamp, "mem_total", $guest_data->{maxmem});
+        push @$metrics, gauge($id, $timestamp, "disk_total", $guest_data->{maxdisk});
+        # TODO: This one always seems to be 0?
+        # push @$metrics, gauge($id, $timestamp, "disk_used", $guest_data->{disk});
+    }
+
+    return $metrics;
+}
+
+# Convert one generation of LXC guest stats into metrics, keyed by VMID.
+my sub get_lxc_metrics {
+    my ($stats) = @_;
+
+    my $metrics = [];
+
+    my $timestamp = $stats->{timestamp};
+
+    for my $vmid (keys $stats->{data}->%*) {
+        my $id = "lxc/$vmid";
+        my $guest_data = $stats->{data}->{$vmid};
+
+        # Usage counters are only reported for running guests.
+        if (($guest_data->{status} // '') eq 'running') {
+            push @$metrics, gauge($id, $timestamp, "cpu_current", $guest_data->{cpu});
+            push @$metrics, gauge($id, $timestamp, "mem_used", $guest_data->{mem});
+            push @$metrics, derive($id, $timestamp, "disk_read", $guest_data->{diskread});
+            push @$metrics, derive($id, $timestamp, "disk_write", $guest_data->{diskwrite});
+            push @$metrics, derive($id, $timestamp, "net_in", $guest_data->{netin});
+            push @$metrics, derive($id, $timestamp, "net_out", $guest_data->{netout});
+            push @$metrics, gauge($id, $timestamp, "disk_used", $guest_data->{disk});
+        }
+
+        push @$metrics, gauge($id, $timestamp, "uptime", $guest_data->{uptime});
+        push @$metrics, gauge($id, $timestamp, "cpu_max", $guest_data->{cpus});
+        push @$metrics, gauge($id, $timestamp, "mem_total", $guest_data->{maxmem});
+        push @$metrics, gauge($id, $timestamp, "disk_total", $guest_data->{maxdisk});
+    }
+
+    return $metrics;
+}
+
+# Convert one generation of storage stats into metrics, keyed by storage ID.
+my sub get_storage_metrics {
+    my ($stats) = @_;
+
+    my $metrics = [];
+
+    my $timestamp = $stats->{timestamp};
+
+    for my $sid (keys $stats->{data}->%*) {
+        my $id = "storage/$nodename/$sid";
+        my $data = $stats->{data}->{$sid};
+
+        push @$metrics, gauge($id, $timestamp, "disk_total", $data->{total});
+        push @$metrics, gauge($id, $timestamp, "disk_used", $data->{used});
+    }
+
+    return $metrics;
+}
+
+# Return local metrics, including some recent history if needed.
+#
+# $history is the number of historic generations to query (default 0).
+sub get_local_metrics {
+    my ($history) = @_;
+
+    # If we do not provide the history parameter, set it to 0 -> only
+    # query most recent metrics from the cache.
+    $history = int($history // 0);
+    $history = 0 if $history < 0;
+
+    my $metrics = [];
+
+    my $data = $get_cache->()->get_last($history) // [];
+
+    for my $stat_gen ($data->@*) {
+        # Skip subsystems without data in this generation instead of
+        # emitting metrics built from undefined values.
+        push @$metrics, get_node_metrics($stat_gen->{node})->@* if $stat_gen->{node};
+        push @$metrics, get_qemu_metrics($stat_gen->{qemu})->@* if $stat_gen->{qemu};
+        push @$metrics, get_lxc_metrics($stat_gen->{lxc})->@* if $stat_gen->{lxc};
+        push @$metrics, get_storage_metrics($stat_gen->{storage})->@* if $stat_gen->{storage};
+    }
+
+    return $metrics;
+}
+
 1;