From 180ed4cd3dfd1c2cc8f25258a368c998d06829a8 Mon Sep 17 00:00:00 2001 From: Ryan Petrello Date: Tue, 23 Jun 2020 23:02:45 -0400 Subject: [PATCH 1/8] add a tool for discovering which tasks/hosts are slow for a JT this tool looks at the most recent jobs for a specific job template and attempts to discover the _slowest_ tasks and hosts $ awx-manage bottleneck --template N $ awx-manage bottleneck --template N --threshold 1 --ignore yum $ awx-manage bottleneck --template N --ignore pause --ignore yum --- awx/main/management/commands/bottleneck.py | 88 ++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 awx/main/management/commands/bottleneck.py diff --git a/awx/main/management/commands/bottleneck.py b/awx/main/management/commands/bottleneck.py new file mode 100644 index 0000000000..a9cc997a17 --- /dev/null +++ b/awx/main/management/commands/bottleneck.py @@ -0,0 +1,88 @@ +from django.core.management.base import BaseCommand +from django.db import connection + +from awx.main.models import JobTemplate + + +class Command(BaseCommand): + help = 'Discover the slowest tasks and hosts for a specific Job Template.' + + def add_arguments(self, parser): + parser.add_argument('--template', dest='jt', type=int, + help='ID of the Job Template to profile') + parser.add_argument('--threshold', dest='threshold', type=float, default=5, + help='Only show tasks that took at least this many seconds (defaults to 5)') + parser.add_argument('--ignore', action='append', help='ignore a specific action (e.g., --ignore git)') + + def handle(self, *args, **options): + jt = options['jt'] + threshold = options['threshold'] + ignore = options['ignore'] + + print('## ' + JobTemplate.objects.get(pk=jt).name + ' (last 25 runs)\n') + with connection.cursor() as cursor: + cursor.execute( + f''' + SELECT + b.id, b.job_id, b.host_name, b.created - a.created delta, + b.event_data::json->'task_action' task_action, + b.event_data::json->'task_path' task_path + FROM main_jobevent a JOIN main_jobevent b + ON b.parent_uuid = a.parent_uuid AND a.host_name = b.host_name + WHERE + a.event = 'runner_on_start' AND + b.event != 'runner_on_start' AND + b.event != 'runner_on_skipped' AND + b.failed = false AND + a.job_id IN ( + SELECT unifiedjob_ptr_id FROM main_job + WHERE job_template_id={jt} + ORDER BY unifiedjob_ptr_id DESC + LIMIT 25 + ) + ORDER BY delta DESC; + ''' + ) + slowest_events = cursor.fetchall() + + fastest = dict() + for event in slowest_events: + _id, job_id, host, duration, action, playbook = event + playbook = playbook.rsplit('/')[-1] + if ignore and action in ignore: + continue + if host: + fastest[(action, playbook)] = (_id, host, str(duration).split('.')[0]) + + host_counts = dict() + warned = set() + print(f'slowest tasks (--threshold={threshold})\n---') + for event in slowest_events: + _id, job_id, host, duration, action, playbook = event + playbook = playbook.rsplit('/')[-1] + if ignore and action in ignore: + continue + if duration.total_seconds() < threshold: + break + + fastest_summary = '' + fastest_match = fastest.get((action, playbook)) + if fastest_match[2] != duration.total_seconds() and (host, action, playbook) not in warned: + warned.add((host, action, playbook)) + fastest_summary = f' \033[93m{fastest_match[1]} ran this in {fastest_match[2]}s at /api/v2/job_events/{fastest_match[0]}/\033[0m' + + url = f'/api/v2/jobs/{job_id}/' + human_duration = str(duration).split('.')[0] + print(' -- '.join([url, host, human_duration, action, playbook]) + fastest_summary) + host_counts.setdefault(host, []) + host_counts[host].append(duration) + + host_counts = sorted(host_counts.items(), key=lambda item: [e.total_seconds() for e in item[1]], reverse=True) + + print('\nslowest hosts\n---') + for h, matches in host_counts: + total = len(matches) + total_seconds = sum([e.total_seconds() for e in matches]) + print(f'{h} had {total} tasks that ran longer than {threshold} second(s) for a total of {total_seconds}') + + print('') From b01ac1bfa667f278eef51dfeac28f11a8fe9bbc1 Mon Sep 17 00:00:00 2001 From: Ryan Petrello Date: Wed, 24 Jun 2020 09:22:53 -0400 Subject: [PATCH 2/8] add support to the bottleneck script for configuring history size --- awx/main/management/commands/bottleneck.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/awx/main/management/commands/bottleneck.py b/awx/main/management/commands/bottleneck.py index a9cc997a17..fec18c320f 100644 --- a/awx/main/management/commands/bottleneck.py +++ b/awx/main/management/commands/bottleneck.py @@ -12,14 +12,17 @@ class Command(BaseCommand): help='ID of the Job Template to profile') parser.add_argument('--threshold', dest='threshold', type=float, default=5, help='Only show tasks that took at least this many seconds (defaults to 5)') + parser.add_argument('--history', dest='history', type=float, default=25, + help='The number of historic jobs to look at') parser.add_argument('--ignore', action='append', help='ignore a specific action (e.g., --ignore git)') def handle(self, *args, **options): jt = options['jt'] threshold = options['threshold'] + history = options['history'] ignore = options['ignore'] - print('## ' + JobTemplate.objects.get(pk=jt).name + ' (last 25 runs)\n') + print('## ' + JobTemplate.objects.get(pk=jt).name + f' (last {history} runs)\n') with connection.cursor() as cursor: cursor.execute( f''' @@ -38,7 +41,7 @@ class Command(BaseCommand): SELECT unifiedjob_ptr_id FROM main_job WHERE job_template_id={jt} ORDER BY unifiedjob_ptr_id DESC - LIMIT 25 + LIMIT {history} ) ORDER BY delta DESC; ''' From 21ea7eb5dc2791d44873e66e0ce2ec4f28914022 Mon Sep 17 00:00:00 2001 From: Ryan Petrello Date: Wed, 24 Jun 2020 09:44:39 -0400 Subject: [PATCH 3/8] add the name of the task to awx-manage bottleneck output --- awx/main/management/commands/bottleneck.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/awx/main/management/commands/bottleneck.py b/awx/main/management/commands/bottleneck.py index fec18c320f..54de59a2b6 100644 --- a/awx/main/management/commands/bottleneck.py +++ b/awx/main/management/commands/bottleneck.py @@ -28,6 +28,7 @@ class Command(BaseCommand): f''' SELECT b.id, b.job_id, b.host_name, b.created - a.created delta, + b.task task, b.event_data::json->'task_action' task_action, b.event_data::json->'task_path' task_path FROM main_jobevent a JOIN main_jobevent b @@ -50,7 +51,7 @@ class Command(BaseCommand): fastest = dict() for event in slowest_events: - _id, job_id, host, duration, action, playbook = event + _id, job_id, host, duration, task, action, playbook = event playbook = playbook.rsplit('/')[-1] if ignore and action in ignore: continue @@ -61,7 +62,7 @@ class Command(BaseCommand): warned = set() print(f'slowest tasks (--threshold={threshold})\n---') for event in slowest_events: - _id, job_id, host, duration, action, playbook = event + _id, job_id, host, duration, task, action, playbook = event playbook = playbook.rsplit('/')[-1] if ignore and action in ignore: continue @@ -76,7 +77,7 @@ class Command(BaseCommand): url = f'/api/v2/jobs/{job_id}/' human_duration = str(duration).split('.')[0] - print(' -- '.join([url, host, human_duration, action, playbook]) + fastest_summary) + print(' -- '.join([url, host, human_duration, action, task, playbook]) + fastest_summary) host_counts.setdefault(host, []) host_counts[host].append(duration) From 5532760ccfdf6576880f387cb0fb2f03e3e4e564 Mon Sep 17 00:00:00 2001 From: Ryan Petrello Date: Wed, 24 Jun 2020 09:49:16 -0400 Subject: [PATCH 4/8] respect --no-color in awx-manage bottleneck --- awx/main/management/commands/bottleneck.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/awx/main/management/commands/bottleneck.py b/awx/main/management/commands/bottleneck.py index 54de59a2b6..c52d1f0535 100644 --- a/awx/main/management/commands/bottleneck.py +++ b/awx/main/management/commands/bottleneck.py @@ -21,6 +21,9 @@ class Command(BaseCommand): threshold = options['threshold'] history = options['history'] ignore = options['ignore'] + colorize = True + if options.get('no_color') is True: + colorize = False print('## ' + JobTemplate.objects.get(pk=jt).name + f' (last {history} runs)\n') with connection.cursor() as cursor: @@ -61,6 +64,7 @@ class Command(BaseCommand): host_counts = dict() warned = set() print(f'slowest tasks (--threshold={threshold})\n---') + for event in slowest_events: _id, job_id, host, duration, task, action, playbook = event playbook = playbook.rsplit('/')[-1] @@ -73,7 +77,7 @@ class Command(BaseCommand): fastest_match = fastest.get((action, playbook)) if fastest_match[2] != duration.total_seconds() and (host, action, playbook) not in warned: warned.add((host, action, playbook)) - fastest_summary = f' \033[93m{fastest_match[1]} ran this in {fastest_match[2]}s at /api/v2/job_events/{fastest_match[0]}/\033[0m' + fastest_summary = ' ' + self.style.WARNING(f'{fastest_match[1]} ran this in {fastest_match[2]}s at /api/v2/job_events/{fastest_match[0]}/') url = f'/api/v2/jobs/{job_id}/' human_duration = str(duration).split('.')[0] From ce65f994dcb6c9e47c93099bdb5403ac087fb598 Mon Sep 17 00:00:00 2001 From: Ryan Petrello Date: Wed, 24 Jun 2020 09:56:23 -0400 Subject: [PATCH 5/8] refactor some timedelta formatting in awx-manage bottleneck --- awx/main/management/commands/bottleneck.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/awx/main/management/commands/bottleneck.py b/awx/main/management/commands/bottleneck.py index c52d1f0535..0cc8399b66 100644 --- a/awx/main/management/commands/bottleneck.py +++ b/awx/main/management/commands/bottleneck.py @@ -52,6 +52,9 @@ class Command(BaseCommand): ) slowest_events = cursor.fetchall() + def format_td(x): + return str(x).split('.')[0] + fastest = dict() for event in slowest_events: _id, job_id, host, duration, task, action, playbook = event @@ -59,7 +62,7 @@ class Command(BaseCommand): if ignore and action in ignore: continue if host: - fastest[(action, playbook)] = (_id, host, str(duration).split('.')[0]) + fastest[(action, playbook)] = (_id, host, format_td(duration)) host_counts = dict() warned = set() @@ -75,12 +78,12 @@ class Command(BaseCommand): fastest_summary = '' fastest_match = fastest.get((action, playbook)) - if fastest_match[2] != duration.total_seconds() and (host, action, playbook) not in warned: + if fastest_match[2] != format_td(duration) and (host, action, playbook) not in warned: warned.add((host, action, playbook)) fastest_summary = ' ' + self.style.WARNING(f'{fastest_match[1]} ran this in {fastest_match[2]}s at /api/v2/job_events/{fastest_match[0]}/') url = f'/api/v2/jobs/{job_id}/' - human_duration = str(duration).split('.')[0] + human_duration = format_td(duration) print(' -- '.join([url, host, human_duration, action, task, playbook]) + fastest_summary) host_counts.setdefault(host, []) host_counts[host].append(duration) From dd42f6351b284b68084466112073cd3563d6933c Mon Sep 17 00:00:00 2001 From: Ryan Petrello Date: Wed, 24 Jun 2020 10:17:10 -0400 Subject: [PATCH 6/8] minor refactoring of awx-manage bottleneck --- awx/main/management/commands/bottleneck.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/awx/main/management/commands/bottleneck.py b/awx/main/management/commands/bottleneck.py index 0cc8399b66..7948507c70 100644 --- a/awx/main/management/commands/bottleneck.py +++ b/awx/main/management/commands/bottleneck.py @@ -21,9 +21,6 @@ class Command(BaseCommand): threshold = options['threshold'] history = options['history'] ignore = options['ignore'] - colorize = True - if options.get('no_color') is True: - colorize = False print('## ' + JobTemplate.objects.get(pk=jt).name + f' (last {history} runs)\n') with connection.cursor() as cursor: @@ -70,20 +67,20 @@ class Command(BaseCommand): for event in slowest_events: _id, job_id, host, duration, task, action, playbook = event - playbook = playbook.rsplit('/')[-1] if ignore and action in ignore: continue if duration.total_seconds() < threshold: break + playbook = playbook.rsplit('/')[-1] + human_duration = format_td(duration) fastest_summary = '' fastest_match = fastest.get((action, playbook)) - if fastest_match[2] != format_td(duration) and (host, action, playbook) not in warned: + if fastest_match[2] != human_duration and (host, action, playbook) not in warned: warned.add((host, action, playbook)) fastest_summary = ' ' + self.style.WARNING(f'{fastest_match[1]} ran this in {fastest_match[2]}s at /api/v2/job_events/{fastest_match[0]}/') url = f'/api/v2/jobs/{job_id}/' - human_duration = format_td(duration) print(' -- '.join([url, host, human_duration, action, task, playbook]) + fastest_summary) host_counts.setdefault(host, []) host_counts[host].append(duration) From b1fcc0578c6cb436892f67ec95562bd8ad8e8706 Mon Sep 17 00:00:00 2001 From: Ryan Petrello Date: Wed, 24 Jun 2020 10:23:01 -0400 Subject: [PATCH 7/8] default awx-manage bottleneck --threshold to 30s --- awx/main/management/commands/bottleneck.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/awx/main/management/commands/bottleneck.py b/awx/main/management/commands/bottleneck.py index 7948507c70..1636f7e969 100644 --- a/awx/main/management/commands/bottleneck.py +++ b/awx/main/management/commands/bottleneck.py @@ -10,8 +10,8 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('--template', dest='jt', type=int, help='ID of the Job Template to profile') - parser.add_argument('--threshold', dest='threshold', type=float, default=5, - help='Only show tasks that took at least this many seconds (defaults to 5)') + parser.add_argument('--threshold', dest='threshold', type=float, default=30, + help='Only show tasks that took at least this many seconds (defaults to 30)') parser.add_argument('--history', dest='history', type=float, default=25, help='The number of historic jobs to look at') parser.add_argument('--ignore', action='append', help='ignore a specific action (e.g., --ignore git)') From a62047809f96b042874ee3380be33af57c54938a Mon Sep 17 00:00:00 2001 From: Ryan Petrello Date: Wed, 24 Jun 2020 10:25:09 -0400 Subject: [PATCH 8/8] update awx-manage bottleneck's help text --- awx/main/management/commands/bottleneck.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/awx/main/management/commands/bottleneck.py b/awx/main/management/commands/bottleneck.py index 1636f7e969..beac1d0745 100644 --- a/awx/main/management/commands/bottleneck.py +++ b/awx/main/management/commands/bottleneck.py @@ -5,7 +5,7 @@ from awx.main.models import JobTemplate class Command(BaseCommand): - help = 'Discover the slowest tasks and hosts for a specific Job Template.' + help = "Find the slowest tasks and hosts for a Job Template's most recent runs." def add_arguments(self, parser): parser.add_argument('--template', dest='jt', type=int,