1
0
mirror of https://github.com/ansible/awx.git synced 2024-11-01 16:51:11 +03:00

isolated connection timeout and log file for playbook out

This commit is contained in:
AlanCoding 2017-06-30 11:15:51 -04:00
parent 48371d433f
commit 70b1b9c81d
13 changed files with 131 additions and 59 deletions

View File

@ -159,7 +159,7 @@ register(
'AWX_ISOLATED_CHECK_INTERVAL',
field_class=fields.IntegerField,
label=_('Isolated status check interval'),
help_text=_('The number of seconds to sleep between status checks for jobs running on isolated instances.'), # noqa
help_text=_('The number of seconds to sleep between status checks for jobs running on isolated instances.'),
category=_('Jobs'),
category_slug='jobs',
)
@ -168,7 +168,19 @@ register(
'AWX_ISOLATED_LAUNCH_TIMEOUT',
field_class=fields.IntegerField,
label=_('Isolated launch timeout'),
help_text=_('The timeout (in seconds) for launching jobs on isolated instances. This includes the time needed to copy source control files (playbooks) to the isolated instance.'),
help_text=_('The timeout (in seconds) for launching jobs on isolated instances. '
'This includes the time needed to copy source control files (playbooks) to the isolated instance.'),
category=_('Jobs'),
category_slug='jobs',
)
register(
'AWX_ISOLATED_CONNECTION_TIMEOUT',
field_class=fields.IntegerField,
default=10,
label=_('Isolated connection timeout'),
help_text=_('Ansible SSH connection timeout (in seconds) to use when communicating with isolated instances. '
'Value should be substantially greater than expected network latency.'),
category=_('Jobs'),
category_slug='jobs',
)

View File

@ -19,6 +19,7 @@ from awx.main.utils import OutputEventFilter
from awx.main.queue import CallbackQueueDispatcher
logger = logging.getLogger('awx.isolated.manager')
playbook_logger = logging.getLogger('awx.isolated.manager.playbooks')
class IsolatedManager(object):
@ -57,9 +58,8 @@ class IsolatedManager(object):
"""
self.args = args
self.cwd = cwd
self.env = env.copy()
# Do not use callbacks for controller's management jobs
self.env.update(self._base_management_env())
self.isolated_env = self._redact_isolated_env(env.copy())
self.management_env = self._base_management_env()
self.stdout_handle = stdout_handle
self.ssh_key_path = ssh_key_path
self.expect_passwords = {k.pattern: v for k, v in expect_passwords.items()}
@ -73,14 +73,48 @@ class IsolatedManager(object):
@staticmethod
def _base_management_env():
return {
'ANSIBLE_CALLBACK_PLUGINS': '',
'CALLBACK_QUEUE': '',
'CALLBACK_CONNECTION': '',
'ANSIBLE_RETRY_FILES_ENABLED': 'False',
'ANSIBLE_HOST_KEY_CHECKING': 'False',
'ANSIBLE_LIBRARY': os.path.join(os.path.dirname(awx.__file__), 'plugins', 'isolated')
}
'''
Returns environment variables to use when running a playbook
that manages the isolated instance.
Use of normal job callback and other such configurations are avoided.
'''
env = dict(os.environ.items())
env['ANSIBLE_RETRY_FILES_ENABLED'] = 'False'
env['ANSIBLE_HOST_KEY_CHECKING'] = 'False'
env['ANSIBLE_LIBRARY'] = os.path.join(os.path.dirname(awx.__file__), 'plugins', 'isolated')
return env
@staticmethod
def _build_args(playbook, hosts, extra_vars=None):
'''
Returns list of Ansible CLI command arguments for a management task
:param playbook: name of the playbook to run
:param hosts: host pattern to operate on, ex. "localhost,"
:param extra_vars: optional dictionary of extra_vars to apply
'''
args = [
'ansible-playbook',
playbook,
'-u', settings.AWX_ISOLATED_USERNAME,
'-T', str(settings.AWX_ISOLATED_CONNECTION_TIMEOUT),
'-i', hosts
]
if extra_vars:
args.extend(['-e', json.dumps(extra_vars)])
return args
@staticmethod
def _redact_isolated_env(env):
'''
strips some environment variables that aren't applicable to
job execution within the isolated instance
'''
for var in (
'HOME', 'RABBITMQ_HOST', 'RABBITMQ_PASS', 'RABBITMQ_USER', 'CACHE',
'DJANGO_PROJECT_DIR', 'DJANGO_SETTINGS_MODULE', 'RABBITMQ_VHOST'):
env.pop(var, None)
return env
@classmethod
def awx_playbook_path(cls):
@ -99,7 +133,7 @@ class IsolatedManager(object):
'''
self.started_at = time.time()
secrets = {
'env': self.env.copy(),
'env': self.isolated_env,
'passwords': self.expect_passwords,
'ssh_key_data': None,
'idle_timeout': self.idle_timeout,
@ -116,12 +150,6 @@ class IsolatedManager(object):
secrets['ssh_key_data'] = buff.getvalue()
os.remove(self.ssh_key_path)
# strip some environment variables that aren't applicable to isolated
# execution
for var in (
'HOME', 'RABBITMQ_HOST', 'RABBITMQ_PASS', 'RABBITMQ_USER', 'CACHE',
'DJANGO_PROJECT_DIR', 'DJANGO_SETTINGS_MODULE', 'RABBITMQ_VHOST'):
secrets['env'].pop(var, None)
self.build_isolated_job_data()
extra_vars = {
@ -137,15 +165,13 @@ class IsolatedManager(object):
# - copies encrypted job data from the controlling host to the isolated host (with rsync)
# - writes the encryption secret to a named pipe on the isolated host
# - launches the isolated playbook runner via `tower-expect start <job-id>`
args = ['ansible-playbook', '-u', settings.AWX_ISOLATED_USERNAME, '-i',
'%s,' % self.host, 'run_isolated.yml', '-e',
json.dumps(extra_vars)]
args = self._build_args('run_isolated.yml', '%s,' % self.host, extra_vars)
if self.instance.verbosity:
args.append('-%s' % ('v' * min(5, self.instance.verbosity)))
buff = StringIO.StringIO()
logger.debug('Starting job on isolated host with `run_isolated.yml` playbook.')
status, rc = IsolatedManager.run_pexpect(
args, self.awx_playbook_path(), self.env, buff,
args, self.awx_playbook_path(), self.management_env, buff,
expect_passwords={
re.compile(r'Secret:\s*?$', re.M): base64.b64encode(json.dumps(secrets))
},
@ -153,8 +179,10 @@ class IsolatedManager(object):
job_timeout=settings.AWX_ISOLATED_LAUNCH_TIMEOUT,
pexpect_timeout=5
)
output = buff.getvalue()
playbook_logger.info('Job {} management started\n{}'.format(self.instance.id, output))
if status != 'successful':
self.stdout_handle.write(buff.getvalue())
self.stdout_handle.write(output)
return status, rc
@classmethod
@ -201,13 +229,13 @@ class IsolatedManager(object):
os.chmod(path, stat.S_IRUSR)
# symlink the scm checkout (if there is one) so that it's rsync'ed over, too
if 'AD_HOC_COMMAND_ID' not in self.env:
if 'AD_HOC_COMMAND_ID' not in self.isolated_env:
os.symlink(self.cwd, self.path_to('project'))
# create directories for build artifacts to live in
os.makedirs(self.path_to('artifacts', 'job_events'), mode=stat.S_IXUSR + stat.S_IWUSR + stat.S_IRUSR)
def _missing_artifacts(self, path_list, buff):
def _missing_artifacts(self, path_list, output):
missing_artifacts = filter(lambda path: not os.path.exists(path), path_list)
for path in missing_artifacts:
self.stdout_handle.write('ansible did not exit cleanly, missing `{}`.\n'.format(path))
@ -219,7 +247,7 @@ class IsolatedManager(object):
self.stdout_handle.write(f.read())
else:
# Provide the management playbook standard out if not available
self.stdout_handle.write(buff.getvalue())
self.stdout_handle.write(output)
return True
return False
@ -239,9 +267,7 @@ class IsolatedManager(object):
"""
interval = interval if interval is not None else settings.AWX_ISOLATED_CHECK_INTERVAL
extra_vars = {'src': self.private_data_dir}
args = ['ansible-playbook', '-u', settings.AWX_ISOLATED_USERNAME, '-i',
'%s,' % self.host, 'check_isolated.yml', '-e',
json.dumps(extra_vars)]
args = self._build_args('check_isolated.yml', '%s,' % self.host, extra_vars)
if self.instance.verbosity:
args.append('-%s' % ('v' * min(5, self.instance.verbosity)))
@ -271,13 +297,15 @@ class IsolatedManager(object):
buff = cStringIO.StringIO()
logger.debug('Checking job on isolated host with `check_isolated.yml` playbook.')
status, rc = IsolatedManager.run_pexpect(
args, self.awx_playbook_path(), self.env, buff,
args, self.awx_playbook_path(), self.management_env, buff,
cancelled_callback=self.cancelled_callback,
idle_timeout=remaining,
job_timeout=remaining,
pexpect_timeout=5,
proot_cmd=self.proot_cmd
)
output = buff.getvalue()
playbook_logger.info(output)
path = self.path_to('artifacts', 'stdout')
if os.path.exists(path):
@ -292,7 +320,7 @@ class IsolatedManager(object):
if status == 'successful':
status_path = self.path_to('artifacts', 'status')
rc_path = self.path_to('artifacts', 'rc')
if self._missing_artifacts([status_path, rc_path], buff):
if self._missing_artifacts([status_path, rc_path], output):
status = 'failed'
rc = 1
else:
@ -303,7 +331,7 @@ class IsolatedManager(object):
elif status == 'failed':
# if we were unable to retrieve job reults from the isolated host,
# print stdout of the `check_isolated.yml` playbook for clues
self.stdout_handle.write(buff.getvalue())
self.stdout_handle.write(output)
return status, rc
@ -316,20 +344,21 @@ class IsolatedManager(object):
self.proot_temp_dir,
],
}
args = ['ansible-playbook', '-u', settings.AWX_ISOLATED_USERNAME, '-i',
'%s,' % self.host, 'clean_isolated.yml', '-e',
json.dumps(extra_vars)]
args = self._build_args('clean_isolated.yml', '%s,' % self.host, extra_vars)
logger.debug('Cleaning up job on isolated host with `clean_isolated.yml` playbook.')
buff = cStringIO.StringIO()
timeout = max(60, 2 * settings.AWX_ISOLATED_CONNECTION_TIMEOUT)
status, rc = IsolatedManager.run_pexpect(
args, self.awx_playbook_path(), self.env, buff,
idle_timeout=60, job_timeout=60,
args, self.awx_playbook_path(), self.management_env, buff,
idle_timeout=timeout, job_timeout=timeout,
pexpect_timeout=5
)
output = buff.getvalue()
playbook_logger.info(output)
if status != 'successful':
# stdout_handle is closed by this point so writing output to logs is our only option
logger.warning('Cleanup from isolated job encountered error, output:\n{}'.format(buff.getvalue()))
logger.warning('Cleanup from isolated job encountered error, output:\n{}'.format(output))
@classmethod
def health_check(cls, instance_qs):
@ -345,15 +374,16 @@ class IsolatedManager(object):
hostname_string = ''
for instance in instance_qs:
hostname_string += '{},'.format(instance.hostname)
args = ['ansible-playbook', '-u', settings.AWX_ISOLATED_USERNAME, '-i',
hostname_string, 'heartbeat_isolated.yml']
args = cls._build_args('heartbeat_isolated.yml', hostname_string)
args.extend(['--forks', str(len(instance_qs))])
env = cls._base_management_env()
env['ANSIBLE_STDOUT_CALLBACK'] = 'json'
buff = cStringIO.StringIO()
timeout = max(60, 2 * settings.AWX_ISOLATED_CONNECTION_TIMEOUT)
status, rc = IsolatedManager.run_pexpect(
args, cls.awx_playbook_path(), env, buff,
idle_timeout=60, job_timeout=60,
idle_timeout=timeout, job_timeout=timeout,
pexpect_timeout=5
)
output = buff.getvalue()

View File

@ -259,7 +259,7 @@ def __run__(private_data_dir):
if __name__ == '__main__':
__version__ = 'devel'
__version__ = '3.2.0'
try:
import awx
__version__ = awx.__version__

View File

@ -189,7 +189,7 @@ def cluster_node_heartbeat(self):
for other_inst in recent_inst:
if other_inst.version == "":
continue
if Version(other_inst.version.split('-', 1)[0]) > Version(tower_application_version):
if Version(other_inst.version.split('-', 1)[0]) > Version(tower_application_version) and not settings.DEBUG:
logger.error("Host {} reports Tower version {}, but this node {} is at {}, shutting down".format(other_inst.hostname,
other_inst.version,
inst.hostname,

View File

@ -264,11 +264,14 @@ def test_check_isolated_job(private_data_dir, rsa_key):
run_pexpect.assert_called_with(
[
'ansible-playbook', '-u', settings.AWX_ISOLATED_USERNAME, '-i', 'isolated-host,',
'check_isolated.yml', '-e', '{"src": "%s"}' % private_data_dir,
'ansible-playbook', 'check_isolated.yml',
'-u', settings.AWX_ISOLATED_USERNAME,
'-T', str(settings.AWX_ISOLATED_CONNECTION_TIMEOUT),
'-i', 'isolated-host,',
'-e', '{"src": "%s"}' % private_data_dir,
'-vvvvv'
],
'/tower_devel/awx/playbooks', mgr.env, mock.ANY,
'/tower_devel/awx/playbooks', mgr.management_env, mock.ANY,
cancelled_callback=None,
idle_timeout=0,
job_timeout=0,

View File

@ -341,8 +341,9 @@ class TestIsolatedExecution(TestJobExecution):
playbook_run = self.run_pexpect.call_args_list[0][0]
assert ' '.join(playbook_run[0]).startswith(' '.join([
'ansible-playbook', '-u', settings.AWX_ISOLATED_USERNAME, '-i', self.REMOTE_HOST + ',',
'run_isolated.yml', '-e',
'ansible-playbook', 'run_isolated.yml', '-u', settings.AWX_ISOLATED_USERNAME,
'-T', str(settings.AWX_ISOLATED_CONNECTION_TIMEOUT), '-i', self.REMOTE_HOST + ',',
'-e',
]))
extra_vars = playbook_run[0][playbook_run[0].index('-e') + 1]
extra_vars = json.loads(extra_vars)

View File

@ -3,22 +3,25 @@
# The following variables will be set by the runner of this playbook:
# src: /tmp/some/path/private_data_dir/
- hosts: all
- name: Poll for status of active job.
hosts: all
gather_facts: false
tasks:
- shell: "tower-expect is-alive {{src}}"
- name: Determine if daemon process is alive.
shell: "tower-expect is-alive {{src}}"
register: is_alive
ignore_errors: true
- name: copy artifacts from the isolated host
- name: Copy artifacts from the isolated host.
synchronize:
src: "{{src}}/artifacts/"
dest: "{{src}}/artifacts/"
mode: pull
recursive: yes
- fail:
- name: Fail if previous check determined that process is not alive.
fail:
msg: "isolated task is still running"
when: "is_alive.rc == 0"

View File

@ -4,7 +4,8 @@
# cleanup_dirs: ['/tmp/path/private_data_dir/', '/tmp//path/proot_temp_dir/']
# private_data_dir: '/tmp/path/private_data_dir/'
- hosts: all
- name: Clean up from isolated job run.
hosts: all
gather_facts: false
tasks:

View File

@ -1,5 +1,6 @@
---
- hosts: all
- name: Periodic background status check of isolated instances.
hosts: all
gather_facts: false
tasks:

View File

@ -5,11 +5,13 @@
# dest: /tmp/some/path/
# proot_temp_dir: /tmp/some/path
- hosts: all
- name: Prepare data, dispatch job in isolated environment.
hosts: all
gather_facts: false
vars_prompt:
- prompt: "Secret"
name: "secret"
private: yes
tasks:

View File

@ -611,6 +611,9 @@ AWX_ISOLATED_CHECK_INTERVAL = 30
# The timeout (in seconds) for launching jobs on isolated nodes
AWX_ISOLATED_LAUNCH_TIMEOUT = 600
# Ansible connection timeout (in seconds) for communicating with isolated instances
AWX_ISOLATED_CONNECTION_TIMEOUT = 10
# The time (in seconds) between the periodic isolated heartbeat status check
AWX_ISOLATED_PERIODIC_CHECK = 600
@ -1008,6 +1011,15 @@ LOGGING = {
'backupCount': 5,
'formatter':'simple',
},
'management_playbooks': {
'level': 'DEBUG',
'class':'logging.handlers.RotatingFileHandler',
'filters': ['require_debug_false'],
'filename': os.path.join(LOG_ROOT, 'management_playbooks.log'),
'maxBytes': 1024 * 1024 * 5, # 5 MB
'backupCount': 5,
'formatter':'simple',
},
'fact_receiver': {
'level': 'WARNING',
'class':'logging.handlers.RotatingFileHandler',
@ -1066,10 +1078,13 @@ LOGGING = {
},
'awx.main': {
'handlers': ['null']
},
'awx.main.commands.run_callback_receiver': {
}, 'awx.main.commands.run_callback_receiver': {
'handlers': ['callback_receiver'],
},
'awx.isolated.manager.playbooks': {
'handlers': ['management_playbooks'],
'propagate': False
},
'awx.main.commands.inventory_import': {
'handlers': ['inventory_import'],
'propagate': False

View File

@ -24,6 +24,9 @@ from defaults import * # NOQA
LOGGING['handlers']['console']['()'] = 'awx.main.utils.handlers.ColorHandler'
COLOR_LOGS = True
# Pipe management playbook output to console
LOGGING['loggers']['awx.isolated.manager.playbooks']['propagate'] = True
ALLOWED_HOSTS = ['*']
mimetypes.add_type("image/svg+xml", ".svg", True)

View File

@ -54,6 +54,7 @@ LOGGING['handlers']['tower_warnings']['filename'] = '/var/log/tower/tower.log'
LOGGING['handlers']['callback_receiver']['filename'] = '/var/log/tower/callback_receiver.log'
LOGGING['handlers']['task_system']['filename'] = '/var/log/tower/task_system.log'
LOGGING['handlers']['fact_receiver']['filename'] = '/var/log/tower/fact_receiver.log'
LOGGING['handlers']['management_playbooks']['filename'] = '/var/log/tower/management_playbooks.log'
LOGGING['handlers']['system_tracking_migrations']['filename'] = '/var/log/tower/tower_system_tracking_migrations.log'
LOGGING['handlers']['rbac_migrations']['filename'] = '/var/log/tower/tower_rbac_migrations.log'