Merge pull request #439 from AlanCoding/retry_subset

Feature: retry on subset of job's hosts
Commit 0579db1162
@@ -37,6 +37,7 @@ from polymorphic.models import PolymorphicModel
 # AWX
 from awx.main.constants import SCHEDULEABLE_PROVIDERS, ANSI_SGR_PATTERN
 from awx.main.models import * # noqa
+from awx.main.models.unified_jobs import ACTIVE_STATES
 from awx.main.access import get_user_capabilities
 from awx.main.fields import ImplicitRoleField
 from awx.main.utils import (
@@ -2642,9 +2643,18 @@ class JobCancelSerializer(JobSerializer):
 class JobRelaunchSerializer(JobSerializer):

     passwords_needed_to_start = serializers.SerializerMethodField()
+    retry_counts = serializers.SerializerMethodField()
+    hosts = serializers.ChoiceField(
+        required=False, allow_null=True, default='all',
+        choices=[
+            ('all', _('No change to job limit')),
+            ('failed', _('All failed and unreachable hosts'))
+        ],
+        write_only=True
+    )

     class Meta:
-        fields = ('passwords_needed_to_start',)
+        fields = ('passwords_needed_to_start', 'retry_counts', 'hosts',)

     def to_internal_value(self, data):
         obj = self.context.get('obj')
@@ -2666,6 +2676,14 @@ class JobRelaunchSerializer(JobSerializer):
             return obj.passwords_needed_to_start
         return ''

+    def get_retry_counts(self, obj):
+        if obj.status in ACTIVE_STATES:
+            return _('Relaunch by host status not available until job finishes running.')
+        data = OrderedDict([])
+        for status in self.fields['hosts'].choices.keys():
+            data[status] = obj.retry_qs(status).count()
+        return data
+
     def validate_passwords_needed_to_start(self, value):
         obj = self.context.get('obj')
         data = self.context.get('data')
@@ -2685,6 +2703,7 @@ class JobRelaunchSerializer(JobSerializer):
             raise serializers.ValidationError(dict(errors=[_("Job Template Project is missing or undefined.")]))
         if obj.inventory is None or obj.inventory.pending_deletion:
             raise serializers.ValidationError(dict(errors=[_("Job Template Inventory is missing or undefined.")]))
+        attrs.pop('hosts', None)
         attrs = super(JobRelaunchSerializer, self).validate(attrs)
         return attrs
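
The new read-only field is computed per choice of `hosts`. A minimal sketch of that computation (not part of the diff; `job` is assumed to be a finished Job instance):

```python
# Hedged sketch of get_retry_counts(): one queryset count per `hosts` choice.
# `job` is assumed to be a finished AWX Job with host summaries recorded.
from collections import OrderedDict

def retry_counts(job, choices=('all', 'failed')):
    counts = OrderedDict()
    for status in choices:
        counts[status] = job.retry_qs(status).count()
    return counts  # e.g. OrderedDict([('all', 30), ('failed', 18)])
```
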
@@ -3834,7 +3834,26 @@ class JobRelaunch(RetrieveAPIView):
         if not serializer.is_valid():
             return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)

-        new_job = obj.copy_unified_job()
+        copy_kwargs = {}
+        retry_hosts = request.data.get('hosts', None)
+        if retry_hosts and retry_hosts != 'all':
+            if obj.status in ACTIVE_STATES:
+                return Response({'hosts': _(
+                    'Wait until job finishes before retrying on {status_value} hosts.'
+                ).format(status_value=retry_hosts)}, status=status.HTTP_400_BAD_REQUEST)
+            host_qs = obj.retry_qs(retry_hosts)
+            if not obj.job_events.filter(event='playbook_on_stats').exists():
+                return Response({'hosts': _(
+                    'Cannot retry on {status_value} hosts, playbook stats not available.'
+                ).format(status_value=retry_hosts)}, status=status.HTTP_400_BAD_REQUEST)
+            retry_host_list = host_qs.values_list('name', flat=True)
+            if len(retry_host_list) == 0:
+                return Response({'hosts': _(
+                    'Cannot relaunch because previous job had 0 {status_value} hosts.'
+                ).format(status_value=retry_hosts)}, status=status.HTTP_400_BAD_REQUEST)
+            copy_kwargs['limit'] = ','.join(retry_host_list)
+
+        new_job = obj.copy_unified_job(**copy_kwargs)
         result = new_job.signal_start(**request.data)
         if not result:
             data = dict(passwords_needed_to_start=new_job.passwords_needed_to_start)
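
For reference, a hedged, self-contained restatement of the three guards above; the `ACTIVE_STATES` tuple here is an assumption standing in for `awx.main.models.unified_jobs.ACTIVE_STATES`:

```python
# Hedged sketch of the view's guard logic; `job` is assumed to be an AWX Job.
# ACTIVE_STATES below is an assumed stand-in, not the authoritative constant.
ACTIVE_STATES = ('pending', 'waiting', 'running')

def relaunch_limit_or_error(job, retry_hosts):
    # Returns (limit, error): exactly one of the pair is None.
    if job.status in ACTIVE_STATES:
        return None, 'Wait until job finishes before retrying on %s hosts.' % retry_hosts
    if not job.job_events.filter(event='playbook_on_stats').exists():
        return None, 'Cannot retry on %s hosts, playbook stats not available.' % retry_hosts
    names = list(job.retry_qs(retry_hosts).values_list('name', flat=True))
    if not names:
        return None, 'Cannot relaunch because previous job had 0 %s hosts.' % retry_hosts
    return ','.join(names), None  # becomes the relaunched job's `limit`
```
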
@@ -23,6 +23,9 @@ from django.utils.timezone import utc
 from django.utils.translation import ugettext_lazy as _
 from django.core.exceptions import ValidationError

+# REST Framework
+from rest_framework.exceptions import ParseError
+
 # AWX
 from awx.api.versioning import reverse
 from awx.main.models.base import * # noqa
@@ -588,10 +591,33 @@ class Job(UnifiedJob, JobOptions, SurveyJobMixin, JobNotificationMixin, TaskMana
         return self.passwords_needed_to_start

     def _get_hosts(self, **kwargs):
-        from awx.main.models.inventory import Host
+        Host = JobHostSummary._meta.get_field('host').related_model
         kwargs['job_host_summaries__job__pk'] = self.pk
         return Host.objects.filter(**kwargs)

+    def retry_qs(self, status):
+        '''
+        Returns Host queryset that will be used to produce the `limit`
+        field in a retry on a subset of hosts
+        '''
+        kwargs = {}
+        if status == 'all':
+            pass
+        elif status == 'failed':
+            # Special case for parity with Ansible .retry files
+            kwargs['job_host_summaries__failed'] = True
+        elif status in ['ok', 'changed', 'unreachable']:
+            if status == 'unreachable':
+                status_field = 'dark'
+            else:
+                status_field = status
+            kwargs['job_host_summaries__{}__gt'.format(status_field)] = 0
+        else:
+            raise ParseError(_(
+                '{status_value} is not a valid status option.'
+            ).format(status_value=status))
+        return self._get_hosts(**kwargs)
+
     @property
     def task_impact(self):
         # NOTE: We sorta have to assume the host count matches and that forks default to 5
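
A short usage sketch of `retry_qs`, assuming a finished job with recorded host summaries, joining names the same way the relaunch view does:

```python
# Hedged usage sketch; `job` is assumed to be a finished AWX Job instance.
def failed_host_limit(job):
    names = job.retry_qs('failed').values_list('name', flat=True)
    return ','.join(names)  # e.g. 'host3'
```
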
@@ -734,7 +734,7 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
             pass
         super(UnifiedJob, self).delete()

-    def copy_unified_job(self):
+    def copy_unified_job(self, limit=None):
         '''
         Returns saved object, including related fields.
         Create a copy of this unified job for the purpose of relaunch
@@ -746,6 +746,8 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
         fields = unified_jt_class._get_unified_job_field_names() + [parent_field_name]
         unified_job = copy_model_by_class(self, unified_job_class, fields, {})
         unified_job.launch_type = 'relaunch'
+        if limit:
+            unified_job.limit = limit
         unified_job.save()

         # Labels copied here
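
A hedged sketch of the new keyword in use, assuming a finished job; the asserted attributes mirror what the diff sets:

```python
# Hedged sketch; `job` is assumed to be a finished AWX UnifiedJob instance.
def relaunch_on_host(job, host_name):
    new_job = job.copy_unified_job(limit=host_name)
    assert new_job.launch_type == 'relaunch'
    assert new_job.limit == host_name
    return new_job
```
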
@@ -37,3 +37,31 @@ def test_job_relaunch_permission_denied_response(
     r = post(reverse('api:job_relaunch', kwargs={'pk': job.pk}), {}, jt_user, expect=403)
     assert 'launched with prompted fields' in r.data['detail']
     assert 'do not have permission' in r.data['detail']
+
+
+@pytest.mark.django_db
+@pytest.mark.parametrize("status,hosts", [
+    ('all', 'host1,host2,host3'),
+    ('failed', 'host3'),
+])
+def test_job_relaunch_on_failed_hosts(post, inventory, project, machine_credential, admin_user, status, hosts):
+    h1 = inventory.hosts.create(name='host1')  # no-op
+    h2 = inventory.hosts.create(name='host2')  # changed host
+    h3 = inventory.hosts.create(name='host3')  # failed host
+    jt = JobTemplate.objects.create(
+        name='testjt', inventory=inventory,
+        project=project, credential=machine_credential
+    )
+    job = jt.create_unified_job(_eager_fields={'status': 'failed', 'limit': 'host1,host2,host3'})
+    job.job_events.create(event='playbook_on_stats')
+    job.job_host_summaries.create(host=h1, failed=False, ok=1, changed=0, failures=0, host_name=h1.name)
+    job.job_host_summaries.create(host=h2, failed=False, ok=0, changed=1, failures=0, host_name=h2.name)
+    job.job_host_summaries.create(host=h3, failed=False, ok=0, changed=0, failures=1, host_name=h3.name)
+
+    r = post(
+        url=reverse('api:job_relaunch', kwargs={'pk': job.pk}),
+        data={'hosts': status},
+        user=admin_user,
+        expect=201
+    )
+    assert r.data.get('limit') == hosts
@@ -53,3 +53,4 @@
 `deprovision_node` -> `deprovision_instance`, and `instance_group_remove` -> `remove_from_queue`,
 with backward compatibility support for the 3.1 use pattern
 [[#6915](https://github.com/ansible/ansible-tower/issues/6915)]
+* Allow relaunching jobs on a subset of hosts, by status. [[#219](https://github.com/ansible/awx/issues/219)]
docs/retry_by_status.md (new file, 87 lines)
@@ -0,0 +1,87 @@
# Relaunch on Hosts with Status

This feature allows the user to relaunch a job, targeting only hosts marked
as failed in the original job.

### Definition of "failed"

This feature relaunches against the "failed hosts" of the original job, which
is not the same set as "hosts with failed tasks": an unreachable host may have
no failed tasks at all. The count of "failed hosts" can therefore differ from
the "failed" count given in the summary at the end of a playbook run.

This definition corresponds to the behavior of Ansible `.retry` files.

### API Design of Relaunch

#### Basic Relaunch

A POST to `/api/v2/jobs/N/relaunch/` without any request data should relaunch
the job with the same `limit` value that the original job used, which may be
an empty string.

This is implicitly the "all" option below.
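
A minimal sketch with the `requests` library; the server URL, job id, and credentials are placeholders:

```python
# Hedged sketch of a basic relaunch; URL and credentials are placeholders.
import requests

resp = requests.post(
    'https://awx.example.com/api/v2/jobs/42/relaunch/',
    json={},  # no request data: keep the original limit (the implicit "all")
    auth=('admin', 'password'),
)
print(resp.status_code)  # 201 expected when the relaunch is accepted
```
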

#### Relaunch by Status

Providing request data containing `{"hosts": "failed"}` should change
the `limit` of the relaunched job to target the failed hosts from the previous
job. Hosts will be provided as a comma-separated list in the limit. Formally,
the options are (see the sketch after this list):

- all: relaunch without changing the job limit
- failed: relaunch against all failed and unreachable hosts
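
A hedged sketch of the by-status form, with the same placeholder server and credentials:

```python
# Hedged sketch: relaunch only against failed hosts; placeholders as above.
import requests

resp = requests.post(
    'https://awx.example.com/api/v2/jobs/42/relaunch/',
    json={'hosts': 'failed'},
    auth=('admin', 'password'),
)
# The new job's `limit` becomes a comma-separated list of the failed host names.
```
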

### Relaunch Endpoint

A GET to the relaunch endpoint should return additional information
regarding the host summary of the last job. Example response:

```json
{
    "passwords_needed_to_start": [],
    "retry_counts": {
        "all": 30,
        "failed": 18
    }
}
```

If the user relaunches, providing a status for which there were 0 hosts,
then the request will be rejected. For example, if a GET yielded:

```json
{
    "passwords_needed_to_start": [],
    "retry_counts": {
        "all": 30,
        "failed": 0
    }
}
```

then a POST of `{"hosts": "failed"}` should return a descriptive response
with a 400-level status code.
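
A hedged client-side sketch of that flow, again with placeholder URL and credentials:

```python
# Hedged sketch: consult retry_counts before relaunching by status.
import requests

url = 'https://awx.example.com/api/v2/jobs/42/relaunch/'
auth = ('admin', 'password')  # placeholder credentials

info = requests.get(url, auth=auth).json()
if info['retry_counts'].get('failed', 0) == 0:
    print('Nothing to retry: previous job had 0 failed hosts.')
else:
    resp = requests.post(url, json={'hosts': 'failed'}, auth=auth)
    print(resp.status_code)  # 201 expected; 400 if the request is rejected
```
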

# Acceptance Criteria

Scenario: a user launches a job against host "foobar", and the run fails
against this host. The user renames the host to "foo" and relaunches the job
against failed hosts. The `limit` of the relaunched job should reference
"foo" and not "foobar".

The user should be able to provide passwords on relaunch, while also
running against hosts of a particular status.

Not providing the "hosts" key in a POST to the relaunch endpoint should
relaunch the same way that relaunching has previously worked.

If a playbook provisions a host, this feature should behave reasonably
when relaunching against a status that includes these hosts.

The feature should work even if hosts have tricky characters in their names,
like commas (see the sketch below).
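
A small illustration of why commas are tricky, since the view builds the new `limit` with a plain comma join:

```python
# Why comma-named hosts are tricky: after ','.join(), one host named
# "web,db" is indistinguishable from two hosts named "web" and "db".
retry_host_list = ['web,db']
limit = ','.join(retry_host_list)  # -> 'web,db'
print(limit.split(','))            # -> ['web', 'db'] (reads back as two hosts)
```
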

We also need to consider the case where a `meta: clear_host_errors` task is
present inside a playbook; the retry-subset behavior should match Ansible's
behavior in that case.