mirror of
https://github.com/ansible/awx.git
synced 2024-10-27 17:55:10 +03:00
Merge pull request #3852 from ansible/pod-reaper
implement a simple periodic pod reaper for container groups
This commit is contained in:
commit
85f118c17d
@ -1,3 +1,4 @@
|
||||
import collections
|
||||
import os
|
||||
import stat
|
||||
import time
|
||||
@ -47,6 +48,27 @@ class PodManager(object):
|
||||
else:
|
||||
logger.warn(f"Pod {self.pod_name} did not start. Status is {pod.status.phase}.")
|
||||
|
||||
@classmethod
|
||||
def list_active_jobs(self, instance_group):
|
||||
task = collections.namedtuple('Task', 'id instance_group')(
|
||||
id='',
|
||||
instance_group=instance_group
|
||||
)
|
||||
pm = PodManager(task)
|
||||
try:
|
||||
for pod in pm.kube_api.list_namespaced_pod(
|
||||
pm.namespace,
|
||||
label_selector='ansible-awx={}'.format(settings.INSTALL_UUID)
|
||||
).to_dict().get('items', []):
|
||||
job = pod['metadata'].get('labels', {}).get('ansible-awx-job-id')
|
||||
if job:
|
||||
try:
|
||||
yield int(job)
|
||||
except ValueError:
|
||||
pass
|
||||
except Exception:
|
||||
logger.exception('Failed to list pods for container group {}'.format(instance_group))
|
||||
|
||||
def delete(self):
|
||||
return self.kube_api.delete_namespaced_pod(name=self.pod_name,
|
||||
namespace=self.namespace,
|
||||
@ -71,7 +93,7 @@ class PodManager(object):
|
||||
|
||||
@property
|
||||
def pod_name(self):
|
||||
return f"job-{self.task.id}"
|
||||
return f"awx-job-{self.task.id}"
|
||||
|
||||
@property
|
||||
def pod_definition(self):
|
||||
@ -102,6 +124,10 @@ class PodManager(object):
|
||||
|
||||
if self.task:
|
||||
pod_spec['metadata']['name'] = self.pod_name
|
||||
pod_spec['metadata']['labels'] = {
|
||||
'ansible-awx': settings.INSTALL_UUID,
|
||||
'ansible-awx-job-id': str(self.task.id)
|
||||
}
|
||||
pod_spec['spec']['containers'][0]['name'] = self.pod_name
|
||||
|
||||
return pod_spec
|
||||
|
@ -458,6 +458,25 @@ def cluster_node_heartbeat():
|
||||
logger.exception('Error marking {} as lost'.format(other_inst.hostname))
|
||||
|
||||
|
||||
@task(queue=get_local_queuename)
|
||||
def awx_k8s_reaper():
|
||||
from awx.main.scheduler.kubernetes import PodManager # prevent circular import
|
||||
for group in InstanceGroup.objects.filter(credential__isnull=False).iterator():
|
||||
if group.is_containerized:
|
||||
logger.debug("Checking for orphaned k8s pods for {}.".format(group))
|
||||
for job in UnifiedJob.objects.filter(
|
||||
pk__in=list(PodManager.list_active_jobs(group))
|
||||
).exclude(status__in=ACTIVE_STATES):
|
||||
logger.debug('{} is no longer active, reaping orphaned k8s pod'.format(job.log_format))
|
||||
try:
|
||||
PodManager(job).delete()
|
||||
except Exception:
|
||||
logger.exception("Failed to delete orphaned pod {} from {}".format(
|
||||
job.log_format, group
|
||||
))
|
||||
|
||||
|
||||
|
||||
@task(queue=get_local_queuename)
|
||||
def awx_isolated_heartbeat():
|
||||
local_hostname = settings.CLUSTER_HOST_ID
|
||||
|
@ -479,6 +479,11 @@ CELERYBEAT_SCHEDULE = {
|
||||
'schedule': timedelta(seconds=20),
|
||||
'options': {'expires': 20}
|
||||
},
|
||||
'k8s_reaper': {
|
||||
'task': 'awx.main.tasks.awx_k8s_reaper',
|
||||
'schedule': timedelta(seconds=60),
|
||||
'options': {'expires': 50,}
|
||||
},
|
||||
# 'isolated_heartbeat': set up at the end of production.py and development.py
|
||||
}
|
||||
AWX_INCONSISTENT_TASK_INTERVAL = 60 * 3
|
||||
|
Loading…
Reference in New Issue
Block a user