diff --git a/awx/api/serializers.py b/awx/api/serializers.py
index 3faa9c8dfb..c8d6028844 100644
--- a/awx/api/serializers.py
+++ b/awx/api/serializers.py
@@ -37,6 +37,7 @@ from polymorphic.models import PolymorphicModel
 # AWX
 from awx.main.constants import SCHEDULEABLE_PROVIDERS, ANSI_SGR_PATTERN
 from awx.main.models import * # noqa
+from awx.main.models.unified_jobs import ACTIVE_STATES
 from awx.main.access import get_user_capabilities
 from awx.main.fields import ImplicitRoleField
 from awx.main.utils import (
@@ -2642,9 +2643,18 @@ class JobCancelSerializer(JobSerializer):
 
 class JobRelaunchSerializer(JobSerializer):
     passwords_needed_to_start = serializers.SerializerMethodField()
+    retry_counts = serializers.SerializerMethodField()
+    hosts = serializers.ChoiceField(
+        required=False, allow_null=True, default='all',
+        choices=[
+            ('all', _('No change to job limit')),
+            ('failed', _('All failed and unreachable hosts'))
+        ],
+        write_only=True
+    )
 
     class Meta:
-        fields = ('passwords_needed_to_start',)
+        fields = ('passwords_needed_to_start', 'retry_counts', 'hosts',)
 
     def to_internal_value(self, data):
         obj = self.context.get('obj')
@@ -2666,6 +2676,14 @@ class JobRelaunchSerializer(JobSerializer):
             return obj.passwords_needed_to_start
         return ''
 
+    def get_retry_counts(self, obj):
+        if obj.status in ACTIVE_STATES:
+            return _('Relaunch by host status not available until job finishes running.')
+        data = OrderedDict([])
+        for status in self.fields['hosts'].choices.keys():
+            data[status] = obj.retry_qs(status).count()
+        return data
+
     def validate_passwords_needed_to_start(self, value):
         obj = self.context.get('obj')
         data = self.context.get('data')
@@ -2685,6 +2703,7 @@ class JobRelaunchSerializer(JobSerializer):
             raise serializers.ValidationError(dict(errors=[_("Job Template Project is missing or undefined.")]))
         if obj.inventory is None or obj.inventory.pending_deletion:
             raise serializers.ValidationError(dict(errors=[_("Job Template Inventory is missing or undefined.")]))
+        attrs.pop('hosts', None)
         attrs = super(JobRelaunchSerializer, self).validate(attrs)
         return attrs
 
diff --git a/awx/api/views.py b/awx/api/views.py
index d9a1ddfd50..2cc692e7e7 100644
--- a/awx/api/views.py
+++ b/awx/api/views.py
@@ -3834,7 +3834,26 @@ class JobRelaunch(RetrieveAPIView):
         if not serializer.is_valid():
             return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
 
-        new_job = obj.copy_unified_job()
+        copy_kwargs = {}
+        retry_hosts = request.data.get('hosts', None)
+        if retry_hosts and retry_hosts != 'all':
+            if obj.status in ACTIVE_STATES:
+                return Response({'hosts': _(
+                    'Wait until job finishes before retrying on {status_value} hosts.'
+                ).format(status_value=retry_hosts)}, status=status.HTTP_400_BAD_REQUEST)
+            host_qs = obj.retry_qs(retry_hosts)
+            if not obj.job_events.filter(event='playbook_on_stats').exists():
+                return Response({'hosts': _(
+                    'Cannot retry on {status_value} hosts, playbook stats not available.'
+                ).format(status_value=retry_hosts)}, status=status.HTTP_400_BAD_REQUEST)
+            retry_host_list = host_qs.values_list('name', flat=True)
+            if len(retry_host_list) == 0:
+                return Response({'hosts': _(
+                    'Cannot relaunch because previous job had 0 {status_value} hosts.'
+                ).format(status_value=retry_hosts)}, status=status.HTTP_400_BAD_REQUEST)
+            copy_kwargs['limit'] = ','.join(retry_host_list)
+
+        new_job = obj.copy_unified_job(**copy_kwargs)
         result = new_job.signal_start(**request.data)
         if not result:
             data = dict(passwords_needed_to_start=new_job.passwords_needed_to_start)
diff --git a/awx/main/models/jobs.py b/awx/main/models/jobs.py
index daf72ba006..8c6bec874b 100644
--- a/awx/main/models/jobs.py
+++ b/awx/main/models/jobs.py
@@ -23,6 +23,9 @@ from django.utils.timezone import utc
 from django.utils.translation import ugettext_lazy as _
 from django.core.exceptions import ValidationError
 
+# REST Framework
+from rest_framework.exceptions import ParseError
+
 # AWX
 from awx.api.versioning import reverse
 from awx.main.models.base import * # noqa
@@ -588,10 +591,33 @@ class Job(UnifiedJob, JobOptions, SurveyJobMixin, JobNotificationMixin, TaskMana
         return self.passwords_needed_to_start
 
     def _get_hosts(self, **kwargs):
-        from awx.main.models.inventory import Host
+        Host = JobHostSummary._meta.get_field('host').related_model
         kwargs['job_host_summaries__job__pk'] = self.pk
         return Host.objects.filter(**kwargs)
 
+    def retry_qs(self, status):
+        '''
+        Returns Host queryset that will be used to produce the `limit`
+        field in a retry on a subset of hosts
+        '''
+        kwargs = {}
+        if status == 'all':
+            pass
+        elif status == 'failed':
+            # Special case for parity with Ansible .retry files
+            kwargs['job_host_summaries__failed'] = True
+        elif status in ['ok', 'changed', 'unreachable']:
+            if status == 'unreachable':
+                status_field = 'dark'
+            else:
+                status_field = status
+            kwargs['job_host_summaries__{}__gt'.format(status_field)] = 0
+        else:
+            raise ParseError(_(
+                '{status_value} is not a valid status option.'
+            ).format(status_value=status))
+        return self._get_hosts(**kwargs)
+
     @property
     def task_impact(self):
         # NOTE: We sorta have to assume the host count matches and that forks default to 5
diff --git a/awx/main/models/unified_jobs.py b/awx/main/models/unified_jobs.py
index ae705d4075..78c4d07137 100644
--- a/awx/main/models/unified_jobs.py
+++ b/awx/main/models/unified_jobs.py
@@ -734,7 +734,7 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
             pass
         super(UnifiedJob, self).delete()
 
-    def copy_unified_job(self):
+    def copy_unified_job(self, limit=None):
         '''
         Returns saved object, including related fields.
         Create a copy of this unified job for the purpose of relaunch
@@ -746,6 +748,8 @@
         fields = unified_jt_class._get_unified_job_field_names() + [parent_field_name]
         unified_job = copy_model_by_class(self, unified_job_class, fields, {})
         unified_job.launch_type = 'relaunch'
+        if limit:
+            unified_job.limit = limit
         unified_job.save()
 
         # Labels coppied here
diff --git a/awx/main/tests/functional/api/test_job.py b/awx/main/tests/functional/api/test_job.py
index e10156ce62..614f04f930 100644
--- a/awx/main/tests/functional/api/test_job.py
+++ b/awx/main/tests/functional/api/test_job.py
@@ -37,3 +37,31 @@ def test_job_relaunch_permission_denied_response(
     r = post(reverse('api:job_relaunch', kwargs={'pk':job.pk}), {}, jt_user, expect=403)
     assert 'launched with prompted fields' in r.data['detail']
     assert 'do not have permission' in r.data['detail']
+
+
+@pytest.mark.django_db
+@pytest.mark.parametrize("status,hosts", [
+    ('all', 'host1,host2,host3'),
+    ('failed', 'host3'),
+])
+def test_job_relaunch_on_failed_hosts(post, inventory, project, machine_credential, admin_user, status, hosts):
+    h1 = inventory.hosts.create(name='host1')  # no-op
+    h2 = inventory.hosts.create(name='host2')  # changed host
+    h3 = inventory.hosts.create(name='host3')  # failed host
+    jt = JobTemplate.objects.create(
+        name='testjt', inventory=inventory,
+        project=project, credential=machine_credential
+    )
+    job = jt.create_unified_job(_eager_fields={'status': 'failed', 'limit': 'host1,host2,host3'})
+    job.job_events.create(event='playbook_on_stats')
+    job.job_host_summaries.create(host=h1, failed=False, ok=1, changed=0, failures=0, host_name=h1.name)
+    job.job_host_summaries.create(host=h2, failed=False, ok=0, changed=1, failures=0, host_name=h2.name)
+    job.job_host_summaries.create(host=h3, failed=False, ok=0, changed=0, failures=1, host_name=h3.name)
+
+    r = post(
+        url=reverse('api:job_relaunch', kwargs={'pk':job.pk}),
+        data={'hosts': status},
+        user=admin_user,
+        expect=201
+    )
+    assert r.data.get('limit') == hosts
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 9888c3bede..5d2a1f938a 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -53,3 +53,4 @@
   `deprovision_node` -> `deprovision_instance`, and `instance_group_remove` -> `remove_from_queue`,
   which backward compatibility support for 3.1 use pattern
   [[#6915](https://github.com/ansible/ansible-tower/issues/6915)]
+* Allow relaunching jobs on a subset of hosts, by status. [[#219](https://github.com/ansible/awx/issues/219)]
diff --git a/docs/retry_by_status.md b/docs/retry_by_status.md
new file mode 100644
index 0000000000..2156848a4f
--- /dev/null
+++ b/docs/retry_by_status.md
@@ -0,0 +1,87 @@
+# Relaunch on Hosts with Status
+
+This feature allows the user to relaunch a job, targeting only hosts marked
+as failed in the original job.
+
+### Definition of "failed"
+
+This feature will relaunch against "failed hosts" in the original job, which
+is different from "hosts with failed tasks". Unreachable hosts can have
+no failed tasks. This means that the count of "failed hosts" can differ
+from the failed count given in the summary at the end of a playbook run.
+
+This definition corresponds to Ansible .retry files.
+
+### API Design of Relaunch
+
+#### Basic Relaunch
+
+POST to `/api/v2/jobs/N/relaunch/` without any request data should relaunch
+the job with the same `limit` value that the original job used, which
+may be an empty string.
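+
+As an illustrative sketch (the server address, credentials, and job ID
+below are assumptions, not part of this design), a basic relaunch could
+be driven with the `requests` library:
+
+```python
+import requests
+
+AWX_HOST = 'https://awx.example.com'  # assumed server address
+AUTH = ('admin', 'password')          # assumed basic-auth credentials
+JOB_ID = 42                           # assumed pk of a finished job
+
+# POST with no request data relaunches with the original job's `limit`.
+resp = requests.post(
+    '{}/api/v2/jobs/{}/relaunch/'.format(AWX_HOST, JOB_ID),
+    auth=AUTH
+)
+print(resp.status_code)          # 201 when the relaunch is accepted
+print(resp.json().get('limit'))  # unchanged from the original job
+```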
+
+A plain relaunch like this is implicitly the "all" option described below.
+
+#### Relaunch by Status
+
+Providing request data containing `{"hosts": "failed"}` should change
+the `limit` of the relaunched job to target failed hosts from the previous
+job. Hosts will be provided as a comma-separated list in the limit. Formally,
+these are the options:
+
+ - all: relaunch without changing the job limit
+ - failed: relaunch against all failed and unreachable hosts
+
+### Relaunch Endpoint
+
+Doing a GET to the relaunch endpoint should return additional information
+regarding the host summary of the last job. Example response:
+
+```json
+{
+    "passwords_needed_to_start": [],
+    "retry_counts": {
+        "all": 30,
+        "failed": 18
+    }
+}
+```
+
+If the user relaunches with a status for which there were 0 hosts,
+the request will be rejected. For example, if a GET yielded:
+
+```json
+{
+    "passwords_needed_to_start": [],
+    "retry_counts": {
+        "all": 30,
+        "failed": 0
+    }
+}
+```
+
+Then a POST of `{"hosts": "failed"}` should return a descriptive response
+with a 400-level status code.
+
+# Acceptance Criteria
+
+Scenario: a user launches a job against host "foobar", and the run fails
+against this host. The user renames the host to "foo" and relaunches the
+job against failed hosts. The `limit` of the relaunched job should reference
+"foo" and not "foobar".
+
+The user should be able to provide passwords on relaunch, while also
+running against hosts of a particular status.
+
+Not providing the "hosts" key in a POST to the relaunch endpoint should
+relaunch the same way that relaunching has previously worked.
+
+If a playbook provisions a host, this feature should behave reasonably
+when relaunching against a status that includes these hosts.
+
+The feature should work even if hosts have tricky characters in their names,
+like commas.
+
+We also need to consider the case where a `meta: clear_host_errors` task is
+present inside a playbook, and confirm that the retry subset behavior is the
+same as Ansible's for this case.
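+
+As a companion sketch (same assumed server, credentials, and job ID as the
+example above), the GET-then-POST flow from the "Relaunch Endpoint" section
+might look like the following; the POST is skipped when the failed count is
+zero, since that request would be rejected with a 400 response:
+
+```python
+import requests
+
+AWX_HOST = 'https://awx.example.com'  # assumed server address
+AUTH = ('admin', 'password')          # assumed basic-auth credentials
+JOB_ID = 42                           # assumed pk of a finished job
+url = '{}/api/v2/jobs/{}/relaunch/'.format(AWX_HOST, JOB_ID)
+
+# GET the relaunch endpoint to see how many hosts each option targets.
+counts = requests.get(url, auth=AUTH).json()['retry_counts']
+
+# Relaunch against failed hosts only if the previous run had any.
+if counts.get('failed'):
+    resp = requests.post(url, json={'hosts': 'failed'}, auth=AUTH)
+    print(resp.status_code, resp.json().get('limit'))
+```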