1
0
mirror of https://github.com/ansible/awx.git synced 2024-10-27 00:55:06 +03:00

Add subsystem metrics for the dispatcher (#13989)

This adds a handful of metrics to /api/v2/metrics/ recorded from the dispatcher main process

Adds logic in the dispatcher periodic tasks to calculate these for the last collection interval
Reports worker count, task count, scale up events, and availability

Add data to demo grafana dashboard
This commit is contained in:
Alan Rominger 2023-05-17 14:29:31 -04:00 committed by GitHub
parent 84f67c7f82
commit ef99770383
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 582 additions and 237 deletions

View File

@ -209,6 +209,11 @@ class Metrics:
SetFloatM('workflow_manager_recorded_timestamp', 'Unix timestamp when metrics were last recorded'),
SetFloatM('workflow_manager_spawn_workflow_graph_jobs_seconds', 'Time spent spawning workflow tasks'),
SetFloatM('workflow_manager_get_tasks_seconds', 'Time spent loading workflow tasks from db'),
# dispatcher subsystem metrics
SetIntM('dispatcher_pool_scale_up_events', 'Number of times local dispatcher scaled up a worker since startup'),
SetIntM('dispatcher_pool_active_task_count', 'Number of active tasks in the worker pool when last task was submitted'),
SetIntM('dispatcher_pool_max_worker_count', 'Highest number of workers in worker pool in last collection interval, about 20s'),
SetFloatM('dispatcher_availability', 'Fraction of time (in last collection interval) dispatcher was able to receive messages'),
]
# turn metric list into dictionary with the metric name as a key
self.METRICS = {}

View File

@ -339,6 +339,17 @@ class AutoscalePool(WorkerPool):
# but if the task takes longer than the time defined here, we will force it to stop here
self.task_manager_timeout = settings.TASK_MANAGER_TIMEOUT + settings.TASK_MANAGER_TIMEOUT_GRACE_PERIOD
# initialize some things for subsystem metrics periodic gathering
# the AutoscalePool class does not save these to redis directly, but reports via produce_subsystem_metrics
self.scale_up_ct = 0
self.worker_count_max = 0
def produce_subsystem_metrics(self, metrics_object):
metrics_object.set('dispatcher_pool_scale_up_events', self.scale_up_ct)
metrics_object.set('dispatcher_pool_active_task_count', sum(len(w.managed_tasks) for w in self.workers))
metrics_object.set('dispatcher_pool_max_worker_count', self.worker_count_max)
self.worker_count_max = len(self.workers)
@property
def should_grow(self):
if len(self.workers) < self.min_workers:
@ -443,7 +454,12 @@ class AutoscalePool(WorkerPool):
idx = random.choice(range(len(self.workers)))
return idx, self.workers[idx]
else:
return super(AutoscalePool, self).up()
self.scale_up_ct += 1
ret = super(AutoscalePool, self).up()
new_worker_ct = len(self.workers)
if new_worker_ct > self.worker_count_max:
self.worker_count_max = new_worker_ct
return ret
def write(self, preferred_queue, body):
if 'guid' in body:

View File

@ -19,6 +19,7 @@ from awx.main.dispatch.pool import WorkerPool
from awx.main.dispatch import pg_bus_conn
from awx.main.utils.common import log_excess_runtime
from awx.main.utils.db import set_connection_name
import awx.main.analytics.subsystem_metrics as s_metrics
if 'run_callback_receiver' in sys.argv:
logger = logging.getLogger('awx.main.commands.run_callback_receiver')
@ -154,17 +155,30 @@ class AWXConsumerPG(AWXConsumerBase):
self.pg_max_wait = settings.DISPATCHER_DB_DOWNTOWN_TOLLERANCE
# if no successful loops have run since startup, then we should fail right away
self.pg_is_down = True # set so that we fail if we get database errors on startup
self.pg_down_time = time.time() - self.pg_max_wait # allow no grace period
self.last_cleanup = time.time()
init_time = time.time()
self.pg_down_time = init_time - self.pg_max_wait # allow no grace period
self.last_cleanup = init_time
self.subsystem_metrics = s_metrics.Metrics(auto_pipe_execute=False)
self.last_metrics_gather = init_time
self.listen_cumulative_time = 0.0
def run_periodic_tasks(self):
self.record_statistics() # maintains time buffer in method
if time.time() - self.last_cleanup > 60: # same as cluster_node_heartbeat
current_time = time.time()
if current_time - self.last_cleanup > 60: # same as cluster_node_heartbeat
# NOTE: if we run out of database connections, it is important to still run cleanup
# so that we scale down workers and free up connections
self.pool.cleanup()
self.last_cleanup = time.time()
self.last_cleanup = current_time
# record subsystem metrics for the dispatcher
if current_time - self.last_metrics_gather > 20:
self.pool.produce_subsystem_metrics(self.subsystem_metrics)
self.subsystem_metrics.set('dispatcher_availability', self.listen_cumulative_time / (current_time - self.last_metrics_gather))
self.subsystem_metrics.pipe_execute()
self.listen_cumulative_time = 0.0
self.last_metrics_gather = current_time
def run(self, *args, **kwargs):
super(AWXConsumerPG, self).run(*args, **kwargs)
@ -180,11 +194,14 @@ class AWXConsumerPG(AWXConsumerBase):
if init is False:
self.worker.on_start()
init = True
self.listen_start = time.time()
for e in conn.events(yield_timeouts=True):
self.listen_cumulative_time += time.time() - self.listen_start
if e is not None:
self.process_task(json.loads(e.payload))
self.run_periodic_tasks()
self.pg_is_down = False
self.listen_start = time.time()
if self.should_stop:
return
except psycopg2.InterfaceError:

View File

@ -29,244 +29,308 @@
"liveNow": false,
"panels": [
{
"collapsed": false,
"collapsed": true,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 37,
"panels": [],
"title": "System",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "awx_prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 1
},
"id": 14,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
"id": 38,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "awx_prometheus"
},
"editorMode": "builder",
"expr": "awx_database_connections_total",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Database",
"type": "timeseries"
},
{
"datasource": {},
"fieldConfig": {
"defaults": {
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
"description": "Fraction of time dispatcher is listening for new messages",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"type": "special"
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 1
},
"id": 39,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "awx_prometheus"
},
"editorMode": "builder",
"expr": "dispatcher_availability",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "light-blue",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
"title": "Dispatcher Availability",
"type": "timeseries"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 5,
"x": 12,
"y": 1
},
"id": 25,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "/^tower_version$/",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.1.6",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "000000021"
},
"editorMode": "code",
"exemplar": false,
"expr": "awx_system_info",
"format": "table",
"instant": true,
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"title": "Controller Version",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "awx_prometheus"
},
"fieldConfig": {
"defaults": {
"displayName": "Instances",
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "light-blue",
"value": null
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 5,
"x": 12,
"y": 5
},
"id": 13,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.1.6",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "awx_prometheus"
},
"editorMode": "code",
"expr": "count(awx_instance_info)",
"interval": "",
"legendFormat": " ",
"range": true,
"refId": "A"
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 1
},
"id": 40,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "awx_prometheus"
},
"editorMode": "builder",
"expr": "dispatcher_pool_max_worker_count",
"legendFormat": "__auto",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "awx_prometheus"
},
"editorMode": "builder",
"expr": "dispatcher_pool_active_task_count",
"hide": false,
"legendFormat": "__auto",
"range": true,
"refId": "B"
}
],
"title": "Dispatcher Workers",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "awx_prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 9
},
"id": 41,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "9.5.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "awx_prometheus"
},
"editorMode": "builder",
"expr": "dispatcher_pool_scale_up_events",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Dispatcher Pool Scale-Up Events",
"type": "timeseries"
}
],
"title": "Controller Node Count",
"type": "stat"
"title": "Dispatcher",
"type": "row"
},
{
"collapsed": true,
@ -274,7 +338,248 @@
"h": 1,
"w": 24,
"x": 0,
"y": 9
"y": 1
},
"id": 37,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "awx_prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 26
},
"id": 14,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "awx_prometheus"
},
"editorMode": "builder",
"expr": "awx_database_connections_total",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Database",
"type": "timeseries"
},
{
"datasource": {},
"fieldConfig": {
"defaults": {
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "light-blue",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 5,
"x": 12,
"y": 26
},
"id": 25,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "/^tower_version$/",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.5.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "000000021"
},
"editorMode": "code",
"exemplar": false,
"expr": "awx_system_info",
"format": "table",
"instant": true,
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"title": "Controller Version",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "awx_prometheus"
},
"fieldConfig": {
"defaults": {
"displayName": "Instances",
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "light-blue",
"value": null
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 5,
"x": 12,
"y": 30
},
"id": 13,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.5.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "awx_prometheus"
},
"editorMode": "code",
"expr": "count(awx_instance_info)",
"interval": "",
"legendFormat": " ",
"range": true,
"refId": "A"
}
],
"title": "Controller Node Count",
"type": "stat"
}
],
"title": "System",
"type": "row"
},
{
"collapsed": true,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 2
},
"id": 35,
"panels": [
@ -385,7 +690,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 10
"y": 26
},
"id": 8,
"options": {
@ -523,7 +828,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 10
"y": 26
},
"id": 29,
"options": {
@ -616,7 +921,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 18
"y": 34
},
"id": 16,
"options": {
@ -740,7 +1045,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 18
"y": 34
},
"id": 18,
"options": {
@ -840,7 +1145,7 @@
"h": 6,
"w": 12,
"x": 0,
"y": 26
"y": 42
},
"id": 27,
"options": {
@ -932,7 +1237,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 26
"y": 42
},
"id": 20,
"options": {
@ -973,7 +1278,7 @@
"h": 1,
"w": 24,
"x": 0,
"y": 10
"y": 3
},
"id": 33,
"panels": [
@ -1022,7 +1327,8 @@
"mode": "absolute",
"steps": [
{
"color": "green"
"color": "green",
"value": null
},
{
"color": "red",
@ -1064,7 +1370,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 2
"y": 27
},
"id": 12,
"options": {
@ -1164,7 +1470,8 @@
"mode": "absolute",
"steps": [
{
"color": "green"
"color": "green",
"value": null
},
{
"color": "red",
@ -1179,7 +1486,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 2
"y": 27
},
"id": 10,
"options": {
@ -1268,7 +1575,7 @@
"h": 1,
"w": 24,
"x": 0,
"y": 11
"y": 4
},
"id": 31,
"panels": [
@ -1336,7 +1643,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 36
"y": 28
},
"id": 26,
"options": {
@ -1455,7 +1762,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 36
"y": 28
},
"id": 24,
"options": {
@ -1504,7 +1811,7 @@
}
],
"refresh": "5s",
"schemaVersion": 37,
"schemaVersion": 38,
"style": "dark",
"tags": [],
"templating": {
@ -1518,6 +1825,6 @@
"timezone": "",
"title": "awx-demo",
"uid": "GISWZOXnk",
"version": 12,
"version": 13,
"weekStart": ""
}