1
0
mirror of https://github.com/ansible/awx.git synced 2024-10-31 06:51:10 +03:00

Merge pull request #439 from AlanCoding/retry_subset

Feature: retry on subset of jobs hosts
This commit is contained in:
Alan Rominger 2017-11-01 11:33:15 -04:00 committed by GitHub
commit 0579db1162
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 186 additions and 4 deletions

View File

@ -37,6 +37,7 @@ from polymorphic.models import PolymorphicModel
# AWX
from awx.main.constants import SCHEDULEABLE_PROVIDERS, ANSI_SGR_PATTERN
from awx.main.models import * # noqa
from awx.main.models.unified_jobs import ACTIVE_STATES
from awx.main.access import get_user_capabilities
from awx.main.fields import ImplicitRoleField
from awx.main.utils import (
@ -2642,9 +2643,18 @@ class JobCancelSerializer(JobSerializer):
class JobRelaunchSerializer(JobSerializer):
passwords_needed_to_start = serializers.SerializerMethodField()
retry_counts = serializers.SerializerMethodField()
hosts = serializers.ChoiceField(
required=False, allow_null=True, default='all',
choices=[
('all', _('No change to job limit')),
('failed', _('All failed and unreachable hosts'))
],
write_only=True
)
class Meta:
fields = ('passwords_needed_to_start',)
fields = ('passwords_needed_to_start', 'retry_counts', 'hosts',)
def to_internal_value(self, data):
obj = self.context.get('obj')
@ -2666,6 +2676,14 @@ class JobRelaunchSerializer(JobSerializer):
return obj.passwords_needed_to_start
return ''
def get_retry_counts(self, obj):
    """Report how many hosts each `hosts` relaunch choice would target.

    While the job's status is one of ACTIVE_STATES, a translated
    explanatory message is returned instead of counts. Otherwise the
    result is an OrderedDict keyed by every choice of the `hosts` field,
    each mapped to the count of the queryset from `obj.retry_qs(choice)`.
    """
    if obj.status in ACTIVE_STATES:
        return _('Relaunch by host status not available until job finishes running.')
    # Walk the choices in declaration order so the output order is stable.
    return OrderedDict(
        (choice, obj.retry_qs(choice).count())
        for choice in self.fields['hosts'].choices.keys()
    )
def validate_passwords_needed_to_start(self, value):
obj = self.context.get('obj')
data = self.context.get('data')
@ -2685,6 +2703,7 @@ class JobRelaunchSerializer(JobSerializer):
raise serializers.ValidationError(dict(errors=[_("Job Template Project is missing or undefined.")]))
if obj.inventory is None or obj.inventory.pending_deletion:
raise serializers.ValidationError(dict(errors=[_("Job Template Inventory is missing or undefined.")]))
attrs.pop('hosts', None)
attrs = super(JobRelaunchSerializer, self).validate(attrs)
return attrs

View File

@ -3834,7 +3834,26 @@ class JobRelaunch(RetrieveAPIView):
if not serializer.is_valid():
return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
new_job = obj.copy_unified_job()
copy_kwargs = {}
retry_hosts = request.data.get('hosts', None)
if retry_hosts and retry_hosts != 'all':
if obj.status in ACTIVE_STATES:
return Response({'hosts': _(
'Wait until job finishes before retrying on {status_value} hosts.'
).format(status_value=retry_hosts)}, status=status.HTTP_400_BAD_REQUEST)
host_qs = obj.retry_qs(retry_hosts)
if not obj.job_events.filter(event='playbook_on_stats').exists():
return Response({'hosts': _(
'Cannot retry on {status_value} hosts, playbook stats not available.'
).format(status_value=retry_hosts)}, status=status.HTTP_400_BAD_REQUEST)
retry_host_list = host_qs.values_list('name', flat=True)
if len(retry_host_list) == 0:
return Response({'hosts': _(
'Cannot relaunch because previous job had 0 {status_value} hosts.'
).format(status_value=retry_hosts)}, status=status.HTTP_400_BAD_REQUEST)
copy_kwargs['limit'] = ','.join(retry_host_list)
new_job = obj.copy_unified_job(**copy_kwargs)
result = new_job.signal_start(**request.data)
if not result:
data = dict(passwords_needed_to_start=new_job.passwords_needed_to_start)

View File

@ -23,6 +23,9 @@ from django.utils.timezone import utc
from django.utils.translation import ugettext_lazy as _
from django.core.exceptions import ValidationError
# REST Framework
from rest_framework.exceptions import ParseError
# AWX
from awx.api.versioning import reverse
from awx.main.models.base import * # noqa
@ -588,10 +591,33 @@ class Job(UnifiedJob, JobOptions, SurveyJobMixin, JobNotificationMixin, TaskMana
return self.passwords_needed_to_start
def _get_hosts(self, **kwargs):
    """Return the Host queryset for hosts that have a summary in this job.

    Any keyword arguments are passed through as additional queryset
    filters; the restriction to this job's host summaries is always added.
    """
    # Resolve the Host model through the JobHostSummary.host relation.
    # NOTE(review): presumably done to avoid importing the inventory models
    # module directly from here -- confirm.
    Host = JobHostSummary._meta.get_field('host').related_model
    kwargs['job_host_summaries__job__pk'] = self.pk
    return Host.objects.filter(**kwargs)
def retry_qs(self, status):
    """Build the Host queryset used to derive `limit` for a partial retry.

    `status` selects which hosts of this job are included:
      - 'all': every host returned by `_get_hosts` with no extra filter
      - 'failed': hosts whose summary has `failed` set (parity with
        Ansible .retry files)
      - 'ok' / 'changed' / 'unreachable': hosts whose matching summary
        counter is greater than zero ('unreachable' is stored as `dark`)

    Raises ParseError for any other value.
    """
    filters = {}
    if status == 'failed':
        # Special case for parity with Ansible .retry files
        filters['job_host_summaries__failed'] = True
    elif status in ('ok', 'changed', 'unreachable'):
        # The summary column for unreachable hosts is named `dark`.
        column = 'dark' if status == 'unreachable' else status
        filters['job_host_summaries__{}__gt'.format(column)] = 0
    elif status != 'all':
        raise ParseError(_(
            '{status_value} is not a valid status option.'
        ).format(status_value=status))
    return self._get_hosts(**filters)
@property
def task_impact(self):
# NOTE: We sorta have to assume the host count matches and that forks default to 5

View File

@ -734,7 +734,7 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
pass
super(UnifiedJob, self).delete()
def copy_unified_job(self):
def copy_unified_job(self, limit=None):
'''
Returns saved object, including related fields.
Create a copy of this unified job for the purpose of relaunch
@ -746,6 +746,8 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
fields = unified_jt_class._get_unified_job_field_names() + [parent_field_name]
unified_job = copy_model_by_class(self, unified_job_class, fields, {})
unified_job.launch_type = 'relaunch'
if limit:
unified_job.limit = limit
unified_job.save()
# Labels copied here

View File

@ -37,3 +37,31 @@ def test_job_relaunch_permission_denied_response(
r = post(reverse('api:job_relaunch', kwargs={'pk':job.pk}), {}, jt_user, expect=403)
assert 'launched with prompted fields' in r.data['detail']
assert 'do not have permission' in r.data['detail']
@pytest.mark.django_db
@pytest.mark.parametrize("status,hosts", [
    ('all', 'host1,host2,host3'),
    ('failed', 'host3'),
])
def test_job_relaunch_on_failed_hosts(post, inventory, project, machine_credential, admin_user, status, hosts):
    """POSTing `hosts=<status>` to relaunch should yield the expected `limit`.

    'all' keeps the original job's limit; 'failed' narrows it to host3.
    """
    # host1 is a no-op host, host2 a changed host, host3 a failed host.
    noop_host, changed_host, failed_host = [
        inventory.hosts.create(name=host_name)
        for host_name in ('host1', 'host2', 'host3')
    ]
    template = JobTemplate.objects.create(
        name='testjt', inventory=inventory,
        project=project, credential=machine_credential
    )
    original_job = template.create_unified_job(
        _eager_fields={'status': 'failed', 'limit': 'host1,host2,host3'}
    )
    # The job gets a playbook_on_stats event before any host summaries exist.
    original_job.job_events.create(event='playbook_on_stats')
    # Each row is (host, ok, changed, failures).
    # NOTE(review): `failed=False` is passed for every host, including host3;
    # the 'failed' case presumably relies on the model deriving `failed`
    # from `failures` when the summary is saved -- confirm.
    summary_rows = [
        (noop_host, 1, 0, 0),
        (changed_host, 0, 1, 0),
        (failed_host, 0, 0, 1),
    ]
    for host, ok_count, changed_count, failure_count in summary_rows:
        original_job.job_host_summaries.create(
            host=host, failed=False, ok=ok_count, changed=changed_count,
            failures=failure_count, host_name=host.name
        )
    relaunch_url = reverse('api:job_relaunch', kwargs={'pk': original_job.pk})
    response = post(
        url=relaunch_url,
        data={'hosts': status},
        user=admin_user,
        expect=201
    )
    assert response.data.get('limit') == hosts

View File

@ -53,3 +53,4 @@
`deprovision_node` -> `deprovision_instance`, and `instance_group_remove` -> `remove_from_queue`,
with backward compatibility support for the 3.1 use pattern
[[#6915](https://github.com/ansible/ansible-tower/issues/6915)]
* Allow relaunching jobs on a subset of hosts, by status. [[#219](https://github.com/ansible/awx/issues/219)]

87
docs/retry_by_status.md Normal file
View File

@ -0,0 +1,87 @@
# Relaunch on Hosts with Status
This feature allows the user to relaunch a job, targeting only hosts marked
as failed in the original job.
### Definition of "failed"
This feature will relaunch against "failed hosts" in the original job, which
is different from "hosts with failed tasks". Unreachable hosts can have
no failed tasks. This means that the count of "failed hosts" can be different
from the failed count, given in the summary at the end of a playbook.
This definition corresponds to Ansible .retry files.
### API Design of Relaunch
#### Basic Relaunch
POST to `/api/v2/jobs/N/relaunch/` without any request data should relaunch
the job with the same `limit` value that the original job used, which
may be an empty string.
This is implicitly the "all" option below.
#### Relaunch by Status
Providing request data containing `{"hosts": "failed"}` should change
the `limit` of the relaunched job to target failed hosts from the previous
job. Hosts will be provided as a comma-separated list in the limit. Formally,
the options are:
- all: relaunch without changing the job limit
- failed: relaunch against all failed and unreachable hosts from the previous job
### Relaunch Endpoint
Doing a GET to the relaunch endpoint should return additional information
regarding the host summary of the last job. Example response:
```json
{
"passwords_needed_to_start": [],
"retry_counts": {
"all": 30,
"failed": 18
}
}
```
If the user launches, providing a status for which there were 0 hosts,
then the request will be rejected. For example, if a GET yielded:
```json
{
"passwords_needed_to_start": [],
"retry_counts": {
"all": 30,
"failed": 0
}
}
```
Then a POST of `{"hosts": "failed"}` should return a descriptive response
with a 400-level status code.
# Acceptance Criteria
Scenario: user launches a job against host "foobar", and the run fails
against this host. User changes name of host to "foo", and relaunches job
against failed hosts. The `limit` of the relaunched job should reference
"foo" and not "foobar".
The user should be able to provide passwords on relaunch, while also
running against hosts of a particular status.
Not providing the "hosts" key in a POST to the relaunch endpoint should
relaunch the same way that relaunching has previously worked.
If a playbook provisions a host, this feature should behave reasonably
when relaunching against a status that includes these hosts.
Feature should work even if hosts have tricky characters in their names,
like commas.
We also need to consider the case where a `meta: clear_host_errors` task is
present inside a playbook; the retry subset behavior should be the same as
Ansible's for this case.