Merge pull request #439 from AlanCoding/retry_subset

Feature: retry on subset of jobs hosts
2024-10-31 06:51:10 +03:00 · 2017-11-01 11:33:15 -04:00 · 2017-11-01 11:33:15 -04:00 · 0579db1162
commit 0579db1162
parent 7f20118d48 41c84b4652
7 changed files with 186 additions and 4 deletions
--- a/awx/api/serializers.py
+++ b/awx/api/serializers.py
@ -37,6 +37,7 @@ from polymorphic.models import PolymorphicModel
 # AWX
 from awx.main.constants import SCHEDULEABLE_PROVIDERS, ANSI_SGR_PATTERN
 from awx.main.models import * # noqa
+from awx.main.models.unified_jobs import ACTIVE_STATES
 from awx.main.access import get_user_capabilities
 from awx.main.fields import ImplicitRoleField
 from awx.main.utils import (
@ -2642,9 +2643,18 @@ class JobCancelSerializer(JobSerializer):
 class JobRelaunchSerializer(JobSerializer):

    passwords_needed_to_start = serializers.SerializerMethodField()
+    retry_counts = serializers.SerializerMethodField()
+    hosts = serializers.ChoiceField(
+        required=False, allow_null=True, default='all',
+        choices=[
+            ('all', _('No change to job limit')),
+            ('failed', _('All failed and unreachable hosts'))
+        ],
+        write_only=True
+    )

    class Meta:
-        fields = ('passwords_needed_to_start',)
+        fields = ('passwords_needed_to_start', 'retry_counts', 'hosts',)

    def to_internal_value(self, data):
        obj = self.context.get('obj')
@ -2666,6 +2676,14 @@ class JobRelaunchSerializer(JobSerializer):
            return obj.passwords_needed_to_start
        return ''

+    def get_retry_counts(self, obj):
+        if obj.status in ACTIVE_STATES:
+            return _('Relaunch by host status not available until job finishes running.')
+        data = OrderedDict([])
+        for status in self.fields['hosts'].choices.keys():
+            data[status] = obj.retry_qs(status).count()
+        return data
+
    def validate_passwords_needed_to_start(self, value):
        obj = self.context.get('obj')
        data = self.context.get('data')
@ -2685,6 +2703,7 @@ class JobRelaunchSerializer(JobSerializer):
            raise serializers.ValidationError(dict(errors=[_("Job Template Project is missing or undefined.")]))
        if obj.inventory is None or obj.inventory.pending_deletion:
            raise serializers.ValidationError(dict(errors=[_("Job Template Inventory is missing or undefined.")]))
+        attrs.pop('hosts', None)
        attrs = super(JobRelaunchSerializer, self).validate(attrs)
        return attrs

--- a/awx/api/views.py
+++ b/awx/api/views.py
@ -3834,7 +3834,26 @@ class JobRelaunch(RetrieveAPIView):
        if not serializer.is_valid():
            return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)

-        new_job = obj.copy_unified_job()
+        copy_kwargs = {}
+        retry_hosts = request.data.get('hosts', None)
+        if retry_hosts and retry_hosts != 'all':
+            if obj.status in ACTIVE_STATES:
+                return Response({'hosts': _(
+                    'Wait until job finishes before retrying on {status_value} hosts.'
+                ).format(status_value=retry_hosts)}, status=status.HTTP_400_BAD_REQUEST)
+            host_qs = obj.retry_qs(retry_hosts)
+            if not obj.job_events.filter(event='playbook_on_stats').exists():
+                return Response({'hosts': _(
+                    'Cannot retry on {status_value} hosts, playbook stats not available.'
+                ).format(status_value=retry_hosts)}, status=status.HTTP_400_BAD_REQUEST)
+            retry_host_list = host_qs.values_list('name', flat=True)
+            if len(retry_host_list) == 0:
+                return Response({'hosts': _(
+                    'Cannot relaunch because previous job had 0 {status_value} hosts.'
+                ).format(status_value=retry_hosts)}, status=status.HTTP_400_BAD_REQUEST)
+            copy_kwargs['limit'] = ','.join(retry_host_list)
+
+        new_job = obj.copy_unified_job(**copy_kwargs)
        result = new_job.signal_start(**request.data)
        if not result:
            data = dict(passwords_needed_to_start=new_job.passwords_needed_to_start)
--- a/awx/main/models/jobs.py
+++ b/awx/main/models/jobs.py
@ -23,6 +23,9 @@ from django.utils.timezone import utc
 from django.utils.translation import ugettext_lazy as _
 from django.core.exceptions import ValidationError

+# REST Framework
+from rest_framework.exceptions import ParseError
+
 # AWX
 from awx.api.versioning import reverse
 from awx.main.models.base import * # noqa
@ -588,10 +591,33 @@ class Job(UnifiedJob, JobOptions, SurveyJobMixin, JobNotificationMixin, TaskMana
        return self.passwords_needed_to_start

    def _get_hosts(self, **kwargs):
-        from awx.main.models.inventory import Host
+        Host = JobHostSummary._meta.get_field('host').related_model
        kwargs['job_host_summaries__job__pk'] = self.pk
        return Host.objects.filter(**kwargs)

+    def retry_qs(self, status):
+        '''
+        Returns Host queryset that will be used to produce the `limit`
+        field in a retry on a subset of hosts
+        '''
+        kwargs = {}
+        if status == 'all':
+            pass
+        elif status == 'failed':
+            # Special case for parity with Ansible .retry files
+            kwargs['job_host_summaries__failed'] = True
+        elif status in ['ok', 'changed', 'unreachable']:
+            if status == 'unreachable':
+                status_field = 'dark'
+            else:
+                status_field = status
+            kwargs['job_host_summaries__{}__gt'.format(status_field)] = 0
+        else:
+            raise ParseError(_(
+                '{status_value} is not a valid status option.'
+            ).format(status_value=status))
+        return self._get_hosts(**kwargs)
+
    @property
    def task_impact(self):
        # NOTE: We sorta have to assume the host count matches and that forks default to 5
--- a/awx/main/models/unified_jobs.py
+++ b/awx/main/models/unified_jobs.py
@ -734,7 +734,7 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
                pass
        super(UnifiedJob, self).delete()

-    def copy_unified_job(self):
+    def copy_unified_job(self, limit=None):
        '''
        Returns saved object, including related fields.
        Create a copy of this unified job for the purpose of relaunch
@ -746,6 +746,8 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
        fields = unified_jt_class._get_unified_job_field_names() + [parent_field_name]
        unified_job = copy_model_by_class(self, unified_job_class, fields, {})
        unified_job.launch_type = 'relaunch'
+        if limit:
+            unified_job.limit = limit
        unified_job.save()

        # Labels coppied here
--- a/awx/main/tests/functional/api/test_job.py
+++ b/awx/main/tests/functional/api/test_job.py
@ -37,3 +37,31 @@ def test_job_relaunch_permission_denied_response(
    r = post(reverse('api:job_relaunch', kwargs={'pk':job.pk}), {}, jt_user, expect=403)
    assert 'launched with prompted fields' in r.data['detail']
    assert 'do not have permission' in r.data['detail']
+
+
+@pytest.mark.django_db
+@pytest.mark.parametrize("status,hosts", [
+    ('all', 'host1,host2,host3'),
+    ('failed', 'host3'),
+])
+def test_job_relaunch_on_failed_hosts(post, inventory, project, machine_credential, admin_user, status, hosts):
+    h1 = inventory.hosts.create(name='host1')  # no-op
+    h2 = inventory.hosts.create(name='host2')  # changed host
+    h3 = inventory.hosts.create(name='host3')  # failed host
+    jt = JobTemplate.objects.create(
+        name='testjt', inventory=inventory,
+        project=project, credential=machine_credential
+    )
+    job = jt.create_unified_job(_eager_fields={'status': 'failed', 'limit': 'host1,host2,host3'})
+    job.job_events.create(event='playbook_on_stats')
+    job.job_host_summaries.create(host=h1, failed=False, ok=1, changed=0, failures=0, host_name=h1.name)
+    job.job_host_summaries.create(host=h2, failed=False, ok=0, changed=1, failures=0, host_name=h2.name)
+    job.job_host_summaries.create(host=h3, failed=False, ok=0, changed=0, failures=1, host_name=h3.name)
+
+    r = post(
+        url=reverse('api:job_relaunch', kwargs={'pk':job.pk}),
+        data={'hosts': status},
+        user=admin_user,
+        expect=201
+    )
+    assert r.data.get('limit') == hosts
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@ -53,3 +53,4 @@
  `deprovision_node` -> `deprovision_instance`, and `instance_group_remove` -> `remove_from_queue`,
  which backward compatibility support for 3.1 use pattern
  [[#6915](https://github.com/ansible/ansible-tower/issues/6915)]
+* Allow relaunching jobs on a subset of hosts, by status. [[#219](https://github.com/ansible/awx/issues/219)]
--- a/docs/retry_by_status.md
+++ b/docs/retry_by_status.md
@ -0,0 +1,87 @@
+# Relaunch on Hosts with Status
+
+This feature allows the user to relaunch a job, targeting only hosts marked
+as failed in the original job.
+
+### Definition of "failed"
+
+This feature will relaunch against "failed hosts" in the original job, which
+is different from "hosts with failed tasks". Unreachable hosts can have
+no failed tasks. This means that the count of "failed hosts" can be different
+from the failed count, given in the summary at the end of a playbook.
+
+This definition corresponds to Ansible .retry files.
+
+### API Design of Relaunch
+
+#### Basic Relaunch
+
+POST to `/api/v2/jobs/N/relaunch/` without any request data should relaunch
+the job with the same `limit` value that the original job used, which
+may be an empty string.
+
+This is implicitly the "all" option below.
+
+#### Relaunch by Status
+
+Providing request data containing `{"hosts": "failed"}` should change
+the `limit` of the relaunched job to target failed hosts from the previous
+job. Hosts will be provided as a comma-separated list in the limit. Formally,
+the available options are:
+
+ - all: relaunch without changing the job limit
+ - failed: relaunch against all failed and unreachable hosts
+
+### Relaunch Endpoint
+
+Doing a GET to the relaunch endpoint should return additional information
+regarding the host summary of the last job. Example response:
+
+```json
+{
+    "passwords_needed_to_start": [],
+    "retry_counts": {
+        "all": 30,
+        "failed": 18
+    }
+}
+```
+
+If the user launches, providing a status for which there were 0 hosts,
+then the request will be rejected. For example, if a GET yielded:
+
+```json
+{
+    "passwords_needed_to_start": [],
+    "retry_counts": {
+        "all": 30,
+        "failed": 0
+    }
+}
+```
+
+Then a POST of `{"hosts": "failed"}` should return a descriptive response
+with a 400-level status code.
+
+# Acceptance Criteria
+
+Scenario: user launches a job against host "foobar", and the run fails
+against this host. User changes name of host to "foo", and relaunches job
+against failed hosts. The `limit` of the relaunched job should reference
+"foo" and not "foobar".
+
+The user should be able to provide passwords on relaunch, while also
+running against hosts of a particular status.
+
+Not providing the "hosts" key in a POST to the relaunch endpoint should
+relaunch the same way that relaunching has previously worked.
+
+If a playbook provisions a host, this feature should behave reasonably
+when relaunching against a status that includes these hosts.
+
+Feature should work even if hosts have tricky characters in their names,
+like commas.
+
+We also need to consider the case where a `meta: clear_host_errors` task is
+present inside a playbook, and ensure that the retry subset behavior is the
+same as Ansible's for this case.