mirror of
https://github.com/ansible/awx.git
synced 2024-11-01 16:51:11 +03:00
isolated connection timeout and log file for playbook out
This commit is contained in:
parent
48371d433f
commit
70b1b9c81d
@ -159,7 +159,7 @@ register(
|
||||
'AWX_ISOLATED_CHECK_INTERVAL',
|
||||
field_class=fields.IntegerField,
|
||||
label=_('Isolated status check interval'),
|
||||
help_text=_('The number of seconds to sleep between status checks for jobs running on isolated instances.'), # noqa
|
||||
help_text=_('The number of seconds to sleep between status checks for jobs running on isolated instances.'),
|
||||
category=_('Jobs'),
|
||||
category_slug='jobs',
|
||||
)
|
||||
@ -168,7 +168,19 @@ register(
|
||||
'AWX_ISOLATED_LAUNCH_TIMEOUT',
|
||||
field_class=fields.IntegerField,
|
||||
label=_('Isolated launch timeout'),
|
||||
help_text=_('The timeout (in seconds) for launching jobs on isolated instances. This includes the time needed to copy source control files (playbooks) to the isolated instance.'),
|
||||
help_text=_('The timeout (in seconds) for launching jobs on isolated instances. '
|
||||
'This includes the time needed to copy source control files (playbooks) to the isolated instance.'),
|
||||
category=_('Jobs'),
|
||||
category_slug='jobs',
|
||||
)
|
||||
|
||||
register(
|
||||
'AWX_ISOLATED_CONNECTION_TIMEOUT',
|
||||
field_class=fields.IntegerField,
|
||||
default=10,
|
||||
label=_('Isolated connection timeout'),
|
||||
help_text=_('Ansible SSH connection timeout (in seconds) to use when communicating with isolated instances. '
|
||||
'Value should be substantially greater than expected network latency.'),
|
||||
category=_('Jobs'),
|
||||
category_slug='jobs',
|
||||
)
|
||||
|
@ -19,6 +19,7 @@ from awx.main.utils import OutputEventFilter
|
||||
from awx.main.queue import CallbackQueueDispatcher
|
||||
|
||||
logger = logging.getLogger('awx.isolated.manager')
|
||||
playbook_logger = logging.getLogger('awx.isolated.manager.playbooks')
|
||||
|
||||
|
||||
class IsolatedManager(object):
|
||||
@ -57,9 +58,8 @@ class IsolatedManager(object):
|
||||
"""
|
||||
self.args = args
|
||||
self.cwd = cwd
|
||||
self.env = env.copy()
|
||||
# Do not use callbacks for controller's management jobs
|
||||
self.env.update(self._base_management_env())
|
||||
self.isolated_env = self._redact_isolated_env(env.copy())
|
||||
self.management_env = self._base_management_env()
|
||||
self.stdout_handle = stdout_handle
|
||||
self.ssh_key_path = ssh_key_path
|
||||
self.expect_passwords = {k.pattern: v for k, v in expect_passwords.items()}
|
||||
@ -73,14 +73,48 @@ class IsolatedManager(object):
|
||||
|
||||
@staticmethod
|
||||
def _base_management_env():
|
||||
return {
|
||||
'ANSIBLE_CALLBACK_PLUGINS': '',
|
||||
'CALLBACK_QUEUE': '',
|
||||
'CALLBACK_CONNECTION': '',
|
||||
'ANSIBLE_RETRY_FILES_ENABLED': 'False',
|
||||
'ANSIBLE_HOST_KEY_CHECKING': 'False',
|
||||
'ANSIBLE_LIBRARY': os.path.join(os.path.dirname(awx.__file__), 'plugins', 'isolated')
|
||||
}
|
||||
'''
|
||||
Returns environment variables to use when running a playbook
|
||||
that manages the isolated instance.
|
||||
Use of normal job callback and other such configurations are avoided.
|
||||
'''
|
||||
env = dict(os.environ.items())
|
||||
env['ANSIBLE_RETRY_FILES_ENABLED'] = 'False'
|
||||
env['ANSIBLE_HOST_KEY_CHECKING'] = 'False'
|
||||
env['ANSIBLE_LIBRARY'] = os.path.join(os.path.dirname(awx.__file__), 'plugins', 'isolated')
|
||||
return env
|
||||
|
||||
@staticmethod
|
||||
def _build_args(playbook, hosts, extra_vars=None):
|
||||
'''
|
||||
Returns list of Ansible CLI command arguments for a management task
|
||||
|
||||
:param playbook: name of the playbook to run
|
||||
:param hosts: host pattern to operate on, ex. "localhost,"
|
||||
:param extra_vars: optional dictionary of extra_vars to apply
|
||||
'''
|
||||
args = [
|
||||
'ansible-playbook',
|
||||
playbook,
|
||||
'-u', settings.AWX_ISOLATED_USERNAME,
|
||||
'-T', str(settings.AWX_ISOLATED_CONNECTION_TIMEOUT),
|
||||
'-i', hosts
|
||||
]
|
||||
if extra_vars:
|
||||
args.extend(['-e', json.dumps(extra_vars)])
|
||||
return args
|
||||
|
||||
@staticmethod
|
||||
def _redact_isolated_env(env):
|
||||
'''
|
||||
strips some environment variables that aren't applicable to
|
||||
job execution within the isolated instance
|
||||
'''
|
||||
for var in (
|
||||
'HOME', 'RABBITMQ_HOST', 'RABBITMQ_PASS', 'RABBITMQ_USER', 'CACHE',
|
||||
'DJANGO_PROJECT_DIR', 'DJANGO_SETTINGS_MODULE', 'RABBITMQ_VHOST'):
|
||||
env.pop(var, None)
|
||||
return env
|
||||
|
||||
@classmethod
|
||||
def awx_playbook_path(cls):
|
||||
@ -99,7 +133,7 @@ class IsolatedManager(object):
|
||||
'''
|
||||
self.started_at = time.time()
|
||||
secrets = {
|
||||
'env': self.env.copy(),
|
||||
'env': self.isolated_env,
|
||||
'passwords': self.expect_passwords,
|
||||
'ssh_key_data': None,
|
||||
'idle_timeout': self.idle_timeout,
|
||||
@ -116,12 +150,6 @@ class IsolatedManager(object):
|
||||
secrets['ssh_key_data'] = buff.getvalue()
|
||||
os.remove(self.ssh_key_path)
|
||||
|
||||
# strip some environment variables that aren't applicable to isolated
|
||||
# execution
|
||||
for var in (
|
||||
'HOME', 'RABBITMQ_HOST', 'RABBITMQ_PASS', 'RABBITMQ_USER', 'CACHE',
|
||||
'DJANGO_PROJECT_DIR', 'DJANGO_SETTINGS_MODULE', 'RABBITMQ_VHOST'):
|
||||
secrets['env'].pop(var, None)
|
||||
self.build_isolated_job_data()
|
||||
|
||||
extra_vars = {
|
||||
@ -137,15 +165,13 @@ class IsolatedManager(object):
|
||||
# - copies encrypted job data from the controlling host to the isolated host (with rsync)
|
||||
# - writes the encryption secret to a named pipe on the isolated host
|
||||
# - launches the isolated playbook runner via `tower-expect start <job-id>`
|
||||
args = ['ansible-playbook', '-u', settings.AWX_ISOLATED_USERNAME, '-i',
|
||||
'%s,' % self.host, 'run_isolated.yml', '-e',
|
||||
json.dumps(extra_vars)]
|
||||
args = self._build_args('run_isolated.yml', '%s,' % self.host, extra_vars)
|
||||
if self.instance.verbosity:
|
||||
args.append('-%s' % ('v' * min(5, self.instance.verbosity)))
|
||||
buff = StringIO.StringIO()
|
||||
logger.debug('Starting job on isolated host with `run_isolated.yml` playbook.')
|
||||
status, rc = IsolatedManager.run_pexpect(
|
||||
args, self.awx_playbook_path(), self.env, buff,
|
||||
args, self.awx_playbook_path(), self.management_env, buff,
|
||||
expect_passwords={
|
||||
re.compile(r'Secret:\s*?$', re.M): base64.b64encode(json.dumps(secrets))
|
||||
},
|
||||
@ -153,8 +179,10 @@ class IsolatedManager(object):
|
||||
job_timeout=settings.AWX_ISOLATED_LAUNCH_TIMEOUT,
|
||||
pexpect_timeout=5
|
||||
)
|
||||
output = buff.getvalue()
|
||||
playbook_logger.info('Job {} management started\n{}'.format(self.instance.id, output))
|
||||
if status != 'successful':
|
||||
self.stdout_handle.write(buff.getvalue())
|
||||
self.stdout_handle.write(output)
|
||||
return status, rc
|
||||
|
||||
@classmethod
|
||||
@ -201,13 +229,13 @@ class IsolatedManager(object):
|
||||
os.chmod(path, stat.S_IRUSR)
|
||||
|
||||
# symlink the scm checkout (if there is one) so that it's rsync'ed over, too
|
||||
if 'AD_HOC_COMMAND_ID' not in self.env:
|
||||
if 'AD_HOC_COMMAND_ID' not in self.isolated_env:
|
||||
os.symlink(self.cwd, self.path_to('project'))
|
||||
|
||||
# create directories for build artifacts to live in
|
||||
os.makedirs(self.path_to('artifacts', 'job_events'), mode=stat.S_IXUSR + stat.S_IWUSR + stat.S_IRUSR)
|
||||
|
||||
def _missing_artifacts(self, path_list, buff):
|
||||
def _missing_artifacts(self, path_list, output):
|
||||
missing_artifacts = filter(lambda path: not os.path.exists(path), path_list)
|
||||
for path in missing_artifacts:
|
||||
self.stdout_handle.write('ansible did not exit cleanly, missing `{}`.\n'.format(path))
|
||||
@ -219,7 +247,7 @@ class IsolatedManager(object):
|
||||
self.stdout_handle.write(f.read())
|
||||
else:
|
||||
# Provide the management playbook standard out if not available
|
||||
self.stdout_handle.write(buff.getvalue())
|
||||
self.stdout_handle.write(output)
|
||||
return True
|
||||
return False
|
||||
|
||||
@ -239,9 +267,7 @@ class IsolatedManager(object):
|
||||
"""
|
||||
interval = interval if interval is not None else settings.AWX_ISOLATED_CHECK_INTERVAL
|
||||
extra_vars = {'src': self.private_data_dir}
|
||||
args = ['ansible-playbook', '-u', settings.AWX_ISOLATED_USERNAME, '-i',
|
||||
'%s,' % self.host, 'check_isolated.yml', '-e',
|
||||
json.dumps(extra_vars)]
|
||||
args = self._build_args('check_isolated.yml', '%s,' % self.host, extra_vars)
|
||||
if self.instance.verbosity:
|
||||
args.append('-%s' % ('v' * min(5, self.instance.verbosity)))
|
||||
|
||||
@ -271,13 +297,15 @@ class IsolatedManager(object):
|
||||
buff = cStringIO.StringIO()
|
||||
logger.debug('Checking job on isolated host with `check_isolated.yml` playbook.')
|
||||
status, rc = IsolatedManager.run_pexpect(
|
||||
args, self.awx_playbook_path(), self.env, buff,
|
||||
args, self.awx_playbook_path(), self.management_env, buff,
|
||||
cancelled_callback=self.cancelled_callback,
|
||||
idle_timeout=remaining,
|
||||
job_timeout=remaining,
|
||||
pexpect_timeout=5,
|
||||
proot_cmd=self.proot_cmd
|
||||
)
|
||||
output = buff.getvalue()
|
||||
playbook_logger.info(output)
|
||||
|
||||
path = self.path_to('artifacts', 'stdout')
|
||||
if os.path.exists(path):
|
||||
@ -292,7 +320,7 @@ class IsolatedManager(object):
|
||||
if status == 'successful':
|
||||
status_path = self.path_to('artifacts', 'status')
|
||||
rc_path = self.path_to('artifacts', 'rc')
|
||||
if self._missing_artifacts([status_path, rc_path], buff):
|
||||
if self._missing_artifacts([status_path, rc_path], output):
|
||||
status = 'failed'
|
||||
rc = 1
|
||||
else:
|
||||
@ -303,7 +331,7 @@ class IsolatedManager(object):
|
||||
elif status == 'failed':
|
||||
# if we were unable to retrieve job reults from the isolated host,
|
||||
# print stdout of the `check_isolated.yml` playbook for clues
|
||||
self.stdout_handle.write(buff.getvalue())
|
||||
self.stdout_handle.write(output)
|
||||
|
||||
return status, rc
|
||||
|
||||
@ -316,20 +344,21 @@ class IsolatedManager(object):
|
||||
self.proot_temp_dir,
|
||||
],
|
||||
}
|
||||
args = ['ansible-playbook', '-u', settings.AWX_ISOLATED_USERNAME, '-i',
|
||||
'%s,' % self.host, 'clean_isolated.yml', '-e',
|
||||
json.dumps(extra_vars)]
|
||||
args = self._build_args('clean_isolated.yml', '%s,' % self.host, extra_vars)
|
||||
logger.debug('Cleaning up job on isolated host with `clean_isolated.yml` playbook.')
|
||||
buff = cStringIO.StringIO()
|
||||
timeout = max(60, 2 * settings.AWX_ISOLATED_CONNECTION_TIMEOUT)
|
||||
status, rc = IsolatedManager.run_pexpect(
|
||||
args, self.awx_playbook_path(), self.env, buff,
|
||||
idle_timeout=60, job_timeout=60,
|
||||
args, self.awx_playbook_path(), self.management_env, buff,
|
||||
idle_timeout=timeout, job_timeout=timeout,
|
||||
pexpect_timeout=5
|
||||
)
|
||||
output = buff.getvalue()
|
||||
playbook_logger.info(output)
|
||||
|
||||
if status != 'successful':
|
||||
# stdout_handle is closed by this point so writing output to logs is our only option
|
||||
logger.warning('Cleanup from isolated job encountered error, output:\n{}'.format(buff.getvalue()))
|
||||
logger.warning('Cleanup from isolated job encountered error, output:\n{}'.format(output))
|
||||
|
||||
@classmethod
|
||||
def health_check(cls, instance_qs):
|
||||
@ -345,15 +374,16 @@ class IsolatedManager(object):
|
||||
hostname_string = ''
|
||||
for instance in instance_qs:
|
||||
hostname_string += '{},'.format(instance.hostname)
|
||||
args = ['ansible-playbook', '-u', settings.AWX_ISOLATED_USERNAME, '-i',
|
||||
hostname_string, 'heartbeat_isolated.yml']
|
||||
args = cls._build_args('heartbeat_isolated.yml', hostname_string)
|
||||
args.extend(['--forks', str(len(instance_qs))])
|
||||
env = cls._base_management_env()
|
||||
env['ANSIBLE_STDOUT_CALLBACK'] = 'json'
|
||||
|
||||
buff = cStringIO.StringIO()
|
||||
timeout = max(60, 2 * settings.AWX_ISOLATED_CONNECTION_TIMEOUT)
|
||||
status, rc = IsolatedManager.run_pexpect(
|
||||
args, cls.awx_playbook_path(), env, buff,
|
||||
idle_timeout=60, job_timeout=60,
|
||||
idle_timeout=timeout, job_timeout=timeout,
|
||||
pexpect_timeout=5
|
||||
)
|
||||
output = buff.getvalue()
|
||||
|
@ -259,7 +259,7 @@ def __run__(private_data_dir):
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
__version__ = 'devel'
|
||||
__version__ = '3.2.0'
|
||||
try:
|
||||
import awx
|
||||
__version__ = awx.__version__
|
||||
|
@ -189,7 +189,7 @@ def cluster_node_heartbeat(self):
|
||||
for other_inst in recent_inst:
|
||||
if other_inst.version == "":
|
||||
continue
|
||||
if Version(other_inst.version.split('-', 1)[0]) > Version(tower_application_version):
|
||||
if Version(other_inst.version.split('-', 1)[0]) > Version(tower_application_version) and not settings.DEBUG:
|
||||
logger.error("Host {} reports Tower version {}, but this node {} is at {}, shutting down".format(other_inst.hostname,
|
||||
other_inst.version,
|
||||
inst.hostname,
|
||||
|
@ -264,11 +264,14 @@ def test_check_isolated_job(private_data_dir, rsa_key):
|
||||
|
||||
run_pexpect.assert_called_with(
|
||||
[
|
||||
'ansible-playbook', '-u', settings.AWX_ISOLATED_USERNAME, '-i', 'isolated-host,',
|
||||
'check_isolated.yml', '-e', '{"src": "%s"}' % private_data_dir,
|
||||
'ansible-playbook', 'check_isolated.yml',
|
||||
'-u', settings.AWX_ISOLATED_USERNAME,
|
||||
'-T', str(settings.AWX_ISOLATED_CONNECTION_TIMEOUT),
|
||||
'-i', 'isolated-host,',
|
||||
'-e', '{"src": "%s"}' % private_data_dir,
|
||||
'-vvvvv'
|
||||
],
|
||||
'/tower_devel/awx/playbooks', mgr.env, mock.ANY,
|
||||
'/tower_devel/awx/playbooks', mgr.management_env, mock.ANY,
|
||||
cancelled_callback=None,
|
||||
idle_timeout=0,
|
||||
job_timeout=0,
|
||||
|
@ -341,8 +341,9 @@ class TestIsolatedExecution(TestJobExecution):
|
||||
|
||||
playbook_run = self.run_pexpect.call_args_list[0][0]
|
||||
assert ' '.join(playbook_run[0]).startswith(' '.join([
|
||||
'ansible-playbook', '-u', settings.AWX_ISOLATED_USERNAME, '-i', self.REMOTE_HOST + ',',
|
||||
'run_isolated.yml', '-e',
|
||||
'ansible-playbook', 'run_isolated.yml', '-u', settings.AWX_ISOLATED_USERNAME,
|
||||
'-T', str(settings.AWX_ISOLATED_CONNECTION_TIMEOUT), '-i', self.REMOTE_HOST + ',',
|
||||
'-e',
|
||||
]))
|
||||
extra_vars = playbook_run[0][playbook_run[0].index('-e') + 1]
|
||||
extra_vars = json.loads(extra_vars)
|
||||
|
@ -3,22 +3,25 @@
|
||||
# The following variables will be set by the runner of this playbook:
|
||||
# src: /tmp/some/path/private_data_dir/
|
||||
|
||||
- hosts: all
|
||||
- name: Poll for status of active job.
|
||||
hosts: all
|
||||
gather_facts: false
|
||||
|
||||
tasks:
|
||||
|
||||
- shell: "tower-expect is-alive {{src}}"
|
||||
- name: Determine if daemon process is alive.
|
||||
shell: "tower-expect is-alive {{src}}"
|
||||
register: is_alive
|
||||
ignore_errors: true
|
||||
|
||||
- name: copy artifacts from the isolated host
|
||||
- name: Copy artifacts from the isolated host.
|
||||
synchronize:
|
||||
src: "{{src}}/artifacts/"
|
||||
dest: "{{src}}/artifacts/"
|
||||
mode: pull
|
||||
recursive: yes
|
||||
|
||||
- fail:
|
||||
- name: Fail if previous check determined that process is not alive.
|
||||
fail:
|
||||
msg: "isolated task is still running"
|
||||
when: "is_alive.rc == 0"
|
||||
|
@ -4,7 +4,8 @@
|
||||
# cleanup_dirs: ['/tmp/path/private_data_dir/', '/tmp//path/proot_temp_dir/']
|
||||
# private_data_dir: '/tmp/path/private_data_dir/'
|
||||
|
||||
- hosts: all
|
||||
- name: Clean up from isolated job run.
|
||||
hosts: all
|
||||
gather_facts: false
|
||||
|
||||
tasks:
|
||||
|
@ -1,5 +1,6 @@
|
||||
---
|
||||
- hosts: all
|
||||
- name: Periodic background status check of isolated instances.
|
||||
hosts: all
|
||||
gather_facts: false
|
||||
|
||||
tasks:
|
||||
|
@ -5,11 +5,13 @@
|
||||
# dest: /tmp/some/path/
|
||||
# proot_temp_dir: /tmp/some/path
|
||||
|
||||
- hosts: all
|
||||
- name: Prepare data, dispatch job in isolated environment.
|
||||
hosts: all
|
||||
gather_facts: false
|
||||
vars_prompt:
|
||||
- prompt: "Secret"
|
||||
name: "secret"
|
||||
private: yes
|
||||
|
||||
tasks:
|
||||
|
||||
|
@ -611,6 +611,9 @@ AWX_ISOLATED_CHECK_INTERVAL = 30
|
||||
# The timeout (in seconds) for launching jobs on isolated nodes
|
||||
AWX_ISOLATED_LAUNCH_TIMEOUT = 600
|
||||
|
||||
# Ansible connection timeout (in seconds) for communicating with isolated instances
|
||||
AWX_ISOLATED_CONNECTION_TIMEOUT = 10
|
||||
|
||||
# The time (in seconds) between the periodic isolated heartbeat status check
|
||||
AWX_ISOLATED_PERIODIC_CHECK = 600
|
||||
|
||||
@ -1008,6 +1011,15 @@ LOGGING = {
|
||||
'backupCount': 5,
|
||||
'formatter':'simple',
|
||||
},
|
||||
'management_playbooks': {
|
||||
'level': 'DEBUG',
|
||||
'class':'logging.handlers.RotatingFileHandler',
|
||||
'filters': ['require_debug_false'],
|
||||
'filename': os.path.join(LOG_ROOT, 'management_playbooks.log'),
|
||||
'maxBytes': 1024 * 1024 * 5, # 5 MB
|
||||
'backupCount': 5,
|
||||
'formatter':'simple',
|
||||
},
|
||||
'fact_receiver': {
|
||||
'level': 'WARNING',
|
||||
'class':'logging.handlers.RotatingFileHandler',
|
||||
@ -1066,10 +1078,13 @@ LOGGING = {
|
||||
},
|
||||
'awx.main': {
|
||||
'handlers': ['null']
|
||||
},
|
||||
'awx.main.commands.run_callback_receiver': {
|
||||
}, 'awx.main.commands.run_callback_receiver': {
|
||||
'handlers': ['callback_receiver'],
|
||||
},
|
||||
'awx.isolated.manager.playbooks': {
|
||||
'handlers': ['management_playbooks'],
|
||||
'propagate': False
|
||||
},
|
||||
'awx.main.commands.inventory_import': {
|
||||
'handlers': ['inventory_import'],
|
||||
'propagate': False
|
||||
|
@ -24,6 +24,9 @@ from defaults import * # NOQA
|
||||
LOGGING['handlers']['console']['()'] = 'awx.main.utils.handlers.ColorHandler'
|
||||
COLOR_LOGS = True
|
||||
|
||||
# Pipe management playbook output to console
|
||||
LOGGING['loggers']['awx.isolated.manager.playbooks']['propagate'] = True
|
||||
|
||||
ALLOWED_HOSTS = ['*']
|
||||
|
||||
mimetypes.add_type("image/svg+xml", ".svg", True)
|
||||
|
@ -54,6 +54,7 @@ LOGGING['handlers']['tower_warnings']['filename'] = '/var/log/tower/tower.log'
|
||||
LOGGING['handlers']['callback_receiver']['filename'] = '/var/log/tower/callback_receiver.log'
|
||||
LOGGING['handlers']['task_system']['filename'] = '/var/log/tower/task_system.log'
|
||||
LOGGING['handlers']['fact_receiver']['filename'] = '/var/log/tower/fact_receiver.log'
|
||||
LOGGING['handlers']['management_playbooks']['filename'] = '/var/log/tower/management_playbooks.log'
|
||||
LOGGING['handlers']['system_tracking_migrations']['filename'] = '/var/log/tower/tower_system_tracking_migrations.log'
|
||||
LOGGING['handlers']['rbac_migrations']['filename'] = '/var/log/tower/tower_rbac_migrations.log'
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user