diff --git a/awx/main/conf.py b/awx/main/conf.py index 0015e768d0..fc13e4fa23 100644 --- a/awx/main/conf.py +++ b/awx/main/conf.py @@ -159,7 +159,7 @@ register( 'AWX_ISOLATED_CHECK_INTERVAL', field_class=fields.IntegerField, label=_('Isolated status check interval'), - help_text=_('The number of seconds to sleep between status checks for jobs running on isolated instances.'), # noqa + help_text=_('The number of seconds to sleep between status checks for jobs running on isolated instances.'), category=_('Jobs'), category_slug='jobs', ) @@ -168,7 +168,19 @@ register( 'AWX_ISOLATED_LAUNCH_TIMEOUT', field_class=fields.IntegerField, label=_('Isolated launch timeout'), - help_text=_('The timeout (in seconds) for launching jobs on isolated instances. This includes the time needed to copy source control files (playbooks) to the isolated instance.'), + help_text=_('The timeout (in seconds) for launching jobs on isolated instances. ' + 'This includes the time needed to copy source control files (playbooks) to the isolated instance.'), + category=_('Jobs'), + category_slug='jobs', +) + +register( + 'AWX_ISOLATED_CONNECTION_TIMEOUT', + field_class=fields.IntegerField, + default=10, + label=_('Isolated connection timeout'), + help_text=_('Ansible SSH connection timeout (in seconds) to use when communicating with isolated instances. ' + 'Value should be substantially greater than expected network latency.'), category=_('Jobs'), category_slug='jobs', ) diff --git a/awx/main/isolated/isolated_manager.py b/awx/main/isolated/isolated_manager.py index 9a5e940a58..73d8e2c970 100644 --- a/awx/main/isolated/isolated_manager.py +++ b/awx/main/isolated/isolated_manager.py @@ -19,6 +19,7 @@ from awx.main.utils import OutputEventFilter from awx.main.queue import CallbackQueueDispatcher logger = logging.getLogger('awx.isolated.manager') +playbook_logger = logging.getLogger('awx.isolated.manager.playbooks') class IsolatedManager(object): @@ -57,9 +58,8 @@ class IsolatedManager(object): """ self.args = args self.cwd = cwd - self.env = env.copy() - # Do not use callbacks for controller's management jobs - self.env.update(self._base_management_env()) + self.isolated_env = self._redact_isolated_env(env.copy()) + self.management_env = self._base_management_env() self.stdout_handle = stdout_handle self.ssh_key_path = ssh_key_path self.expect_passwords = {k.pattern: v for k, v in expect_passwords.items()} @@ -73,14 +73,48 @@ class IsolatedManager(object): @staticmethod def _base_management_env(): - return { - 'ANSIBLE_CALLBACK_PLUGINS': '', - 'CALLBACK_QUEUE': '', - 'CALLBACK_CONNECTION': '', - 'ANSIBLE_RETRY_FILES_ENABLED': 'False', - 'ANSIBLE_HOST_KEY_CHECKING': 'False', - 'ANSIBLE_LIBRARY': os.path.join(os.path.dirname(awx.__file__), 'plugins', 'isolated') - } + ''' + Returns environment variables to use when running a playbook + that manages the isolated instance. + Use of normal job callback and other such configurations are avoided. + ''' + env = dict(os.environ.items()) + env['ANSIBLE_RETRY_FILES_ENABLED'] = 'False' + env['ANSIBLE_HOST_KEY_CHECKING'] = 'False' + env['ANSIBLE_LIBRARY'] = os.path.join(os.path.dirname(awx.__file__), 'plugins', 'isolated') + return env + + @staticmethod + def _build_args(playbook, hosts, extra_vars=None): + ''' + Returns list of Ansible CLI command arguments for a management task + + :param playbook: name of the playbook to run + :param hosts: host pattern to operate on, ex. "localhost," + :param extra_vars: optional dictionary of extra_vars to apply + ''' + args = [ + 'ansible-playbook', + playbook, + '-u', settings.AWX_ISOLATED_USERNAME, + '-T', str(settings.AWX_ISOLATED_CONNECTION_TIMEOUT), + '-i', hosts + ] + if extra_vars: + args.extend(['-e', json.dumps(extra_vars)]) + return args + + @staticmethod + def _redact_isolated_env(env): + ''' + strips some environment variables that aren't applicable to + job execution within the isolated instance + ''' + for var in ( + 'HOME', 'RABBITMQ_HOST', 'RABBITMQ_PASS', 'RABBITMQ_USER', 'CACHE', + 'DJANGO_PROJECT_DIR', 'DJANGO_SETTINGS_MODULE', 'RABBITMQ_VHOST'): + env.pop(var, None) + return env @classmethod def awx_playbook_path(cls): @@ -99,7 +133,7 @@ class IsolatedManager(object): ''' self.started_at = time.time() secrets = { - 'env': self.env.copy(), + 'env': self.isolated_env, 'passwords': self.expect_passwords, 'ssh_key_data': None, 'idle_timeout': self.idle_timeout, @@ -116,12 +150,6 @@ class IsolatedManager(object): secrets['ssh_key_data'] = buff.getvalue() os.remove(self.ssh_key_path) - # strip some environment variables that aren't applicable to isolated - # execution - for var in ( - 'HOME', 'RABBITMQ_HOST', 'RABBITMQ_PASS', 'RABBITMQ_USER', 'CACHE', - 'DJANGO_PROJECT_DIR', 'DJANGO_SETTINGS_MODULE', 'RABBITMQ_VHOST'): - secrets['env'].pop(var, None) self.build_isolated_job_data() extra_vars = { @@ -137,15 +165,13 @@ class IsolatedManager(object): # - copies encrypted job data from the controlling host to the isolated host (with rsync) # - writes the encryption secret to a named pipe on the isolated host # - launches the isolated playbook runner via `tower-expect start ` - args = ['ansible-playbook', '-u', settings.AWX_ISOLATED_USERNAME, '-i', - '%s,' % self.host, 'run_isolated.yml', '-e', - json.dumps(extra_vars)] + args = self._build_args('run_isolated.yml', '%s,' % self.host, extra_vars) if self.instance.verbosity: args.append('-%s' % ('v' * min(5, self.instance.verbosity))) buff = StringIO.StringIO() logger.debug('Starting job on isolated host with `run_isolated.yml` playbook.') status, rc = IsolatedManager.run_pexpect( - args, self.awx_playbook_path(), self.env, buff, + args, self.awx_playbook_path(), self.management_env, buff, expect_passwords={ re.compile(r'Secret:\s*?$', re.M): base64.b64encode(json.dumps(secrets)) }, @@ -153,8 +179,10 @@ class IsolatedManager(object): job_timeout=settings.AWX_ISOLATED_LAUNCH_TIMEOUT, pexpect_timeout=5 ) + output = buff.getvalue() + playbook_logger.info('Job {} management started\n{}'.format(self.instance.id, output)) if status != 'successful': - self.stdout_handle.write(buff.getvalue()) + self.stdout_handle.write(output) return status, rc @classmethod @@ -201,13 +229,13 @@ class IsolatedManager(object): os.chmod(path, stat.S_IRUSR) # symlink the scm checkout (if there is one) so that it's rsync'ed over, too - if 'AD_HOC_COMMAND_ID' not in self.env: + if 'AD_HOC_COMMAND_ID' not in self.isolated_env: os.symlink(self.cwd, self.path_to('project')) # create directories for build artifacts to live in os.makedirs(self.path_to('artifacts', 'job_events'), mode=stat.S_IXUSR + stat.S_IWUSR + stat.S_IRUSR) - def _missing_artifacts(self, path_list, buff): + def _missing_artifacts(self, path_list, output): missing_artifacts = filter(lambda path: not os.path.exists(path), path_list) for path in missing_artifacts: self.stdout_handle.write('ansible did not exit cleanly, missing `{}`.\n'.format(path)) @@ -219,7 +247,7 @@ class IsolatedManager(object): self.stdout_handle.write(f.read()) else: # Provide the management playbook standard out if not available - self.stdout_handle.write(buff.getvalue()) + self.stdout_handle.write(output) return True return False @@ -239,9 +267,7 @@ class IsolatedManager(object): """ interval = interval if interval is not None else settings.AWX_ISOLATED_CHECK_INTERVAL extra_vars = {'src': self.private_data_dir} - args = ['ansible-playbook', '-u', settings.AWX_ISOLATED_USERNAME, '-i', - '%s,' % self.host, 'check_isolated.yml', '-e', - json.dumps(extra_vars)] + args = self._build_args('check_isolated.yml', '%s,' % self.host, extra_vars) if self.instance.verbosity: args.append('-%s' % ('v' * min(5, self.instance.verbosity))) @@ -271,13 +297,15 @@ class IsolatedManager(object): buff = cStringIO.StringIO() logger.debug('Checking job on isolated host with `check_isolated.yml` playbook.') status, rc = IsolatedManager.run_pexpect( - args, self.awx_playbook_path(), self.env, buff, + args, self.awx_playbook_path(), self.management_env, buff, cancelled_callback=self.cancelled_callback, idle_timeout=remaining, job_timeout=remaining, pexpect_timeout=5, proot_cmd=self.proot_cmd ) + output = buff.getvalue() + playbook_logger.info(output) path = self.path_to('artifacts', 'stdout') if os.path.exists(path): @@ -292,7 +320,7 @@ class IsolatedManager(object): if status == 'successful': status_path = self.path_to('artifacts', 'status') rc_path = self.path_to('artifacts', 'rc') - if self._missing_artifacts([status_path, rc_path], buff): + if self._missing_artifacts([status_path, rc_path], output): status = 'failed' rc = 1 else: @@ -303,7 +331,7 @@ class IsolatedManager(object): elif status == 'failed': # if we were unable to retrieve job reults from the isolated host, # print stdout of the `check_isolated.yml` playbook for clues - self.stdout_handle.write(buff.getvalue()) + self.stdout_handle.write(output) return status, rc @@ -316,20 +344,21 @@ class IsolatedManager(object): self.proot_temp_dir, ], } - args = ['ansible-playbook', '-u', settings.AWX_ISOLATED_USERNAME, '-i', - '%s,' % self.host, 'clean_isolated.yml', '-e', - json.dumps(extra_vars)] + args = self._build_args('clean_isolated.yml', '%s,' % self.host, extra_vars) logger.debug('Cleaning up job on isolated host with `clean_isolated.yml` playbook.') buff = cStringIO.StringIO() + timeout = max(60, 2 * settings.AWX_ISOLATED_CONNECTION_TIMEOUT) status, rc = IsolatedManager.run_pexpect( - args, self.awx_playbook_path(), self.env, buff, - idle_timeout=60, job_timeout=60, + args, self.awx_playbook_path(), self.management_env, buff, + idle_timeout=timeout, job_timeout=timeout, pexpect_timeout=5 ) + output = buff.getvalue() + playbook_logger.info(output) if status != 'successful': # stdout_handle is closed by this point so writing output to logs is our only option - logger.warning('Cleanup from isolated job encountered error, output:\n{}'.format(buff.getvalue())) + logger.warning('Cleanup from isolated job encountered error, output:\n{}'.format(output)) @classmethod def health_check(cls, instance_qs): @@ -345,15 +374,16 @@ class IsolatedManager(object): hostname_string = '' for instance in instance_qs: hostname_string += '{},'.format(instance.hostname) - args = ['ansible-playbook', '-u', settings.AWX_ISOLATED_USERNAME, '-i', - hostname_string, 'heartbeat_isolated.yml'] + args = cls._build_args('heartbeat_isolated.yml', hostname_string) + args.extend(['--forks', str(len(instance_qs))]) env = cls._base_management_env() env['ANSIBLE_STDOUT_CALLBACK'] = 'json' buff = cStringIO.StringIO() + timeout = max(60, 2 * settings.AWX_ISOLATED_CONNECTION_TIMEOUT) status, rc = IsolatedManager.run_pexpect( args, cls.awx_playbook_path(), env, buff, - idle_timeout=60, job_timeout=60, + idle_timeout=timeout, job_timeout=timeout, pexpect_timeout=5 ) output = buff.getvalue() diff --git a/awx/main/isolated/run.py b/awx/main/isolated/run.py index 0b466228aa..00c6c423e9 100755 --- a/awx/main/isolated/run.py +++ b/awx/main/isolated/run.py @@ -259,7 +259,7 @@ def __run__(private_data_dir): if __name__ == '__main__': - __version__ = 'devel' + __version__ = '3.2.0' try: import awx __version__ = awx.__version__ diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 68172a3085..5ccdf589d9 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -189,7 +189,7 @@ def cluster_node_heartbeat(self): for other_inst in recent_inst: if other_inst.version == "": continue - if Version(other_inst.version.split('-', 1)[0]) > Version(tower_application_version): + if Version(other_inst.version.split('-', 1)[0]) > Version(tower_application_version) and not settings.DEBUG: logger.error("Host {} reports Tower version {}, but this node {} is at {}, shutting down".format(other_inst.hostname, other_inst.version, inst.hostname, diff --git a/awx/main/tests/unit/isolated/test_expect.py b/awx/main/tests/unit/isolated/test_expect.py index 5d5c79f150..c44ba681f3 100644 --- a/awx/main/tests/unit/isolated/test_expect.py +++ b/awx/main/tests/unit/isolated/test_expect.py @@ -264,11 +264,14 @@ def test_check_isolated_job(private_data_dir, rsa_key): run_pexpect.assert_called_with( [ - 'ansible-playbook', '-u', settings.AWX_ISOLATED_USERNAME, '-i', 'isolated-host,', - 'check_isolated.yml', '-e', '{"src": "%s"}' % private_data_dir, + 'ansible-playbook', 'check_isolated.yml', + '-u', settings.AWX_ISOLATED_USERNAME, + '-T', str(settings.AWX_ISOLATED_CONNECTION_TIMEOUT), + '-i', 'isolated-host,', + '-e', '{"src": "%s"}' % private_data_dir, '-vvvvv' ], - '/tower_devel/awx/playbooks', mgr.env, mock.ANY, + '/tower_devel/awx/playbooks', mgr.management_env, mock.ANY, cancelled_callback=None, idle_timeout=0, job_timeout=0, diff --git a/awx/main/tests/unit/test_tasks.py b/awx/main/tests/unit/test_tasks.py index c7abfe52cb..6607c25df2 100644 --- a/awx/main/tests/unit/test_tasks.py +++ b/awx/main/tests/unit/test_tasks.py @@ -341,8 +341,9 @@ class TestIsolatedExecution(TestJobExecution): playbook_run = self.run_pexpect.call_args_list[0][0] assert ' '.join(playbook_run[0]).startswith(' '.join([ - 'ansible-playbook', '-u', settings.AWX_ISOLATED_USERNAME, '-i', self.REMOTE_HOST + ',', - 'run_isolated.yml', '-e', + 'ansible-playbook', 'run_isolated.yml', '-u', settings.AWX_ISOLATED_USERNAME, + '-T', str(settings.AWX_ISOLATED_CONNECTION_TIMEOUT), '-i', self.REMOTE_HOST + ',', + '-e', ])) extra_vars = playbook_run[0][playbook_run[0].index('-e') + 1] extra_vars = json.loads(extra_vars) diff --git a/awx/playbooks/check_isolated.yml b/awx/playbooks/check_isolated.yml index 5d4753025f..60ccfd1ddf 100644 --- a/awx/playbooks/check_isolated.yml +++ b/awx/playbooks/check_isolated.yml @@ -3,22 +3,25 @@ # The following variables will be set by the runner of this playbook: # src: /tmp/some/path/private_data_dir/ -- hosts: all +- name: Poll for status of active job. + hosts: all gather_facts: false tasks: - - shell: "tower-expect is-alive {{src}}" + - name: Determine if daemon process is alive. + shell: "tower-expect is-alive {{src}}" register: is_alive ignore_errors: true - - name: copy artifacts from the isolated host + - name: Copy artifacts from the isolated host. synchronize: src: "{{src}}/artifacts/" dest: "{{src}}/artifacts/" mode: pull recursive: yes - - fail: + - name: Fail if previous check determined that process is not alive. + fail: msg: "isolated task is still running" when: "is_alive.rc == 0" diff --git a/awx/playbooks/clean_isolated.yml b/awx/playbooks/clean_isolated.yml index 0ab4be6d48..0945411318 100644 --- a/awx/playbooks/clean_isolated.yml +++ b/awx/playbooks/clean_isolated.yml @@ -4,7 +4,8 @@ # cleanup_dirs: ['/tmp/path/private_data_dir/', '/tmp//path/proot_temp_dir/'] # private_data_dir: '/tmp/path/private_data_dir/' -- hosts: all +- name: Clean up from isolated job run. + hosts: all gather_facts: false tasks: diff --git a/awx/playbooks/heartbeat_isolated.yml b/awx/playbooks/heartbeat_isolated.yml index 58b2f52b3c..9c100c84af 100644 --- a/awx/playbooks/heartbeat_isolated.yml +++ b/awx/playbooks/heartbeat_isolated.yml @@ -1,5 +1,6 @@ --- -- hosts: all +- name: Periodic background status check of isolated instances. + hosts: all gather_facts: false tasks: diff --git a/awx/playbooks/run_isolated.yml b/awx/playbooks/run_isolated.yml index c5f20c29c1..7bec2bf8a2 100644 --- a/awx/playbooks/run_isolated.yml +++ b/awx/playbooks/run_isolated.yml @@ -5,11 +5,13 @@ # dest: /tmp/some/path/ # proot_temp_dir: /tmp/some/path -- hosts: all +- name: Prepare data, dispatch job in isolated environment. + hosts: all gather_facts: false vars_prompt: - prompt: "Secret" name: "secret" + private: yes tasks: diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py index 263a0ab569..6ad511ef50 100644 --- a/awx/settings/defaults.py +++ b/awx/settings/defaults.py @@ -611,6 +611,9 @@ AWX_ISOLATED_CHECK_INTERVAL = 30 # The timeout (in seconds) for launching jobs on isolated nodes AWX_ISOLATED_LAUNCH_TIMEOUT = 600 +# Ansible connection timeout (in seconds) for communicating with isolated instances +AWX_ISOLATED_CONNECTION_TIMEOUT = 10 + # The time (in seconds) between the periodic isolated heartbeat status check AWX_ISOLATED_PERIODIC_CHECK = 600 @@ -1008,6 +1011,15 @@ LOGGING = { 'backupCount': 5, 'formatter':'simple', }, + 'management_playbooks': { + 'level': 'DEBUG', + 'class':'logging.handlers.RotatingFileHandler', + 'filters': ['require_debug_false'], + 'filename': os.path.join(LOG_ROOT, 'management_playbooks.log'), + 'maxBytes': 1024 * 1024 * 5, # 5 MB + 'backupCount': 5, + 'formatter':'simple', + }, 'fact_receiver': { 'level': 'WARNING', 'class':'logging.handlers.RotatingFileHandler', @@ -1066,10 +1078,13 @@ LOGGING = { }, 'awx.main': { 'handlers': ['null'] - }, - 'awx.main.commands.run_callback_receiver': { + }, 'awx.main.commands.run_callback_receiver': { 'handlers': ['callback_receiver'], }, + 'awx.isolated.manager.playbooks': { + 'handlers': ['management_playbooks'], + 'propagate': False + }, 'awx.main.commands.inventory_import': { 'handlers': ['inventory_import'], 'propagate': False diff --git a/awx/settings/development.py b/awx/settings/development.py index 8ca0f923a9..f098833ba1 100644 --- a/awx/settings/development.py +++ b/awx/settings/development.py @@ -24,6 +24,9 @@ from defaults import * # NOQA LOGGING['handlers']['console']['()'] = 'awx.main.utils.handlers.ColorHandler' COLOR_LOGS = True +# Pipe management playbook output to console +LOGGING['loggers']['awx.isolated.manager.playbooks']['propagate'] = True + ALLOWED_HOSTS = ['*'] mimetypes.add_type("image/svg+xml", ".svg", True) diff --git a/awx/settings/production.py b/awx/settings/production.py index d0e014f196..84179f643a 100644 --- a/awx/settings/production.py +++ b/awx/settings/production.py @@ -54,6 +54,7 @@ LOGGING['handlers']['tower_warnings']['filename'] = '/var/log/tower/tower.log' LOGGING['handlers']['callback_receiver']['filename'] = '/var/log/tower/callback_receiver.log' LOGGING['handlers']['task_system']['filename'] = '/var/log/tower/task_system.log' LOGGING['handlers']['fact_receiver']['filename'] = '/var/log/tower/fact_receiver.log' +LOGGING['handlers']['management_playbooks']['filename'] = '/var/log/tower/management_playbooks.log' LOGGING['handlers']['system_tracking_migrations']['filename'] = '/var/log/tower/tower_system_tracking_migrations.log' LOGGING['handlers']['rbac_migrations']['filename'] = '/var/log/tower/tower_rbac_migrations.log'