From 9b034ad5742930f1122846edb5d27cb7ecdf17f0 Mon Sep 17 00:00:00 2001 From: Shane McDonald Date: Mon, 1 Aug 2022 14:13:59 -0400 Subject: [PATCH] generate control node receptor.conf when a new remote execution/hop node is added regenerate the receptor.conf for all control node to peer out to the new remote execution node Signed-off-by: Hao Liu Co-Authored-By: Seth Foster Co-Authored-By: Shane McDonald --- awx/main/models/ha.py | 7 ++ awx/main/scheduler/task_manager.py | 4 - awx/main/tasks/jobs.py | 2 +- awx/main/tasks/receptor.py | 103 ++++++++++++++++-- awx/main/tasks/system.py | 8 +- docs/licenses/filelock.txt | 24 ++++ requirements/requirements.in | 1 + requirements/requirements.txt | 2 + .../ansible/roles/sources/tasks/main.yml | 14 +++ .../sources/templates/docker-compose.yml.j2 | 1 + 10 files changed, 150 insertions(+), 16 deletions(-) create mode 100644 docs/licenses/filelock.txt diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py index 7de957d4d5..1e52646958 100644 --- a/awx/main/models/ha.py +++ b/awx/main/models/ha.py @@ -423,6 +423,13 @@ def on_instance_group_saved(sender, instance, created=False, raw=False, **kwargs @receiver(post_save, sender=Instance) def on_instance_saved(sender, instance, created=False, raw=False, **kwargs): + # TODO: handle update to instance + if settings.IS_K8S and created and instance.node_type in ('execution', 'hop'): + from awx.main.tasks.receptor import write_receptor_config # prevents circular import + + # on commit broadcast to all control instance to update their receptor configs + connection.on_commit(lambda: write_receptor_config.apply_async(queue='tower_broadcast_all')) + if created or instance.has_policy_changes(): schedule_policy_task() diff --git a/awx/main/scheduler/task_manager.py b/awx/main/scheduler/task_manager.py index 45f262ebe6..a0a125729d 100644 --- a/awx/main/scheduler/task_manager.py +++ b/awx/main/scheduler/task_manager.py @@ -642,10 +642,6 @@ class TaskManager(TaskBase): found_acceptable_queue = True break - # TODO: remove this after we have confidence that OCP control nodes are reporting node_type=control - if settings.IS_K8S and task.capacity_type == 'execution': - logger.debug("Skipping group {}, task cannot run on control plane".format(instance_group.name)) - continue # at this point we know the instance group is NOT a container group # because if it was, it would have started the task and broke out of the loop. execution_instance = self.instance_groups.fit_task_to_most_remaining_capacity_instance( diff --git a/awx/main/tasks/jobs.py b/awx/main/tasks/jobs.py index 33cfc30cd1..ff64f8ee64 100644 --- a/awx/main/tasks/jobs.py +++ b/awx/main/tasks/jobs.py @@ -145,7 +145,7 @@ class BaseTask(object): """ Return params structure to be executed by the container runtime """ - if settings.IS_K8S: + if settings.IS_K8S and instance.instance_group.is_container_group: return {} image = instance.execution_environment.image diff --git a/awx/main/tasks/receptor.py b/awx/main/tasks/receptor.py index 0350a96836..0dda5b48ad 100644 --- a/awx/main/tasks/receptor.py +++ b/awx/main/tasks/receptor.py @@ -27,12 +27,17 @@ from awx.main.utils.common import ( ) from awx.main.constants import MAX_ISOLATED_PATH_COLON_DELIMITER from awx.main.tasks.signals import signal_state, signal_callback, SignalExit +from awx.main.models import Instance +from awx.main.dispatch.publish import task # Receptorctl from receptorctl.socket_interface import ReceptorControl +from filelock import FileLock + logger = logging.getLogger('awx.main.tasks.receptor') __RECEPTOR_CONF = '/etc/receptor/receptor.conf' +__RECEPTOR_CONF_LOCKFILE = f'{__RECEPTOR_CONF}.lock' RECEPTOR_ACTIVE_STATES = ('Pending', 'Running') @@ -43,8 +48,10 @@ class ReceptorConnectionType(Enum): def get_receptor_sockfile(): - with open(__RECEPTOR_CONF, 'r') as f: - data = yaml.safe_load(f) + lock = FileLock(__RECEPTOR_CONF_LOCKFILE) + with lock: + with open(__RECEPTOR_CONF, 'r') as f: + data = yaml.safe_load(f) for section in data: for entry_name, entry_data in section.items(): if entry_name == 'control-service': @@ -60,8 +67,10 @@ def get_tls_client(use_stream_tls=None): if not use_stream_tls: return None - with open(__RECEPTOR_CONF, 'r') as f: - data = yaml.safe_load(f) + lock = FileLock(__RECEPTOR_CONF_LOCKFILE) + with lock: + with open(__RECEPTOR_CONF, 'r') as f: + data = yaml.safe_load(f) for section in data: for entry_name, entry_data in section.items(): if entry_name == 'tls-client': @@ -78,12 +87,25 @@ def get_receptor_ctl(): return ReceptorControl(receptor_sockfile) +def find_node_in_mesh(node_name, receptor_ctl): + attempts = 10 + backoff = 1 + for attempt in range(attempts): + all_nodes = receptor_ctl.simple_command("status").get('Advertisements', None) + for node in all_nodes: + if node.get('NodeID') == node_name: + return node + else: + logger.warning(f"Instance {node_name} is not in the receptor mesh. {attempts-attempt} attempts left.") + time.sleep(backoff) + backoff += 1 + else: + raise ReceptorNodeNotFound(f'Instance {node_name} is not in the receptor mesh') + + def get_conn_type(node_name, receptor_ctl): - all_nodes = receptor_ctl.simple_command("status").get('Advertisements', None) - for node in all_nodes: - if node.get('NodeID') == node_name: - return ReceptorConnectionType(node.get('ConnType')) - raise ReceptorNodeNotFound(f'Instance {node_name} is not in the receptor mesh') + node = find_node_in_mesh(node_name, receptor_ctl) + return ReceptorConnectionType(node.get('ConnType')) def administrative_workunit_reaper(work_list=None): @@ -574,3 +596,66 @@ class AWXReceptorJob: else: config["clusters"][0]["cluster"]["insecure-skip-tls-verify"] = True return config + + +RECEPTOR_CONFIG_STARTER = ( + {'control-service': {'service': 'control', 'filename': '/var/run/receptor/receptor.sock', 'permissions': '0600'}}, + {'local-only': None}, + {'work-command': {'worktype': 'local', 'command': 'ansible-runner', 'params': 'worker', 'allowruntimeparams': True}}, + { + 'work-kubernetes': { + 'worktype': 'kubernetes-runtime-auth', + 'authmethod': 'runtime', + 'allowruntimeauth': True, + 'allowruntimepod': True, + 'allowruntimeparams': True, + } + }, + { + 'work-kubernetes': { + 'worktype': 'kubernetes-incluster-auth', + 'authmethod': 'incluster', + 'allowruntimeauth': True, + 'allowruntimepod': True, + 'allowruntimeparams': True, + } + }, + { + 'tls-client': { + 'name': 'tlsclient', + 'rootcas': '/etc/receptor/tls/ca/receptor-ca.crt', + 'cert': '/etc/receptor/tls/receptor.crt', + 'key': '/etc/receptor/tls/receptor.key', + } + }, +) + + +@task() +def write_receptor_config(): + receptor_config = list(RECEPTOR_CONFIG_STARTER) + + instances = Instance.objects.exclude(node_type='control') + for instance in instances: + peer = {'tcp-peer': {'address': f'{instance.hostname}:{instance.listener_port}', 'tls': 'tlsclient'}} + receptor_config.append(peer) + + lock = FileLock(__RECEPTOR_CONF_LOCKFILE) + with lock: + with open(__RECEPTOR_CONF, 'w') as file: + yaml.dump(receptor_config, file, default_flow_style=False) + + receptor_ctl = get_receptor_ctl() + + attempts = 10 + backoff = 1 + for attempt in range(attempts): + try: + receptor_ctl.simple_command("reload") + break + except ValueError: + logger.warning(f"Unable to reload Receptor configuration. {attempts-attempt} attempts left.") + time.sleep(backoff) + backoff += 1 + else: + raise RuntimeError("Receptor reload failed") diff --git a/awx/main/tasks/system.py b/awx/main/tasks/system.py index c2443b1a51..0c22051f5a 100644 --- a/awx/main/tasks/system.py +++ b/awx/main/tasks/system.py @@ -61,7 +61,7 @@ from awx.main.utils.common import ( from awx.main.utils.external_logging import reconfigure_rsyslog from awx.main.utils.reload import stop_local_services from awx.main.utils.pglock import advisory_lock -from awx.main.tasks.receptor import get_receptor_ctl, worker_info, worker_cleanup, administrative_workunit_reaper +from awx.main.tasks.receptor import get_receptor_ctl, worker_info, worker_cleanup, administrative_workunit_reaper, write_receptor_config from awx.main.consumers import emit_channel_notification from awx.main import analytics from awx.conf import settings_registry @@ -81,6 +81,10 @@ Try upgrading OpenSSH or providing your private key in an different format. \ def dispatch_startup(): startup_logger = logging.getLogger('awx.main.tasks') + # TODO: Enable this on VM installs + if settings.IS_K8S: + write_receptor_config() + startup_logger.debug("Syncing Schedules") for sch in Schedule.objects.all(): try: @@ -555,7 +559,7 @@ def cluster_node_heartbeat(dispatch_time=None, worker_tasks=None): except Exception: logger.exception('failed to reap jobs for {}'.format(other_inst.hostname)) try: - if settings.AWX_AUTO_DEPROVISION_INSTANCES: + if settings.AWX_AUTO_DEPROVISION_INSTANCES and other_inst.node_type == "control": deprovision_hostname = other_inst.hostname other_inst.delete() # FIXME: what about associated inbound links? logger.info("Host {} Automatically Deprovisioned.".format(deprovision_hostname)) diff --git a/docs/licenses/filelock.txt b/docs/licenses/filelock.txt new file mode 100644 index 0000000000..cf1ab25da0 --- /dev/null +++ b/docs/licenses/filelock.txt @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to diff --git a/requirements/requirements.in b/requirements/requirements.in index e0f707b8bc..3951faf06c 100644 --- a/requirements/requirements.in +++ b/requirements/requirements.in @@ -25,6 +25,7 @@ django-split-settings django-taggit djangorestframework==3.13.1 djangorestframework-yaml +filelock GitPython>=3.1.1 # minimum to fix https://github.com/ansible/awx/issues/6119 irc jinja2>=2.11.3 # CVE-2020-28493 diff --git a/requirements/requirements.txt b/requirements/requirements.txt index f05f27b807..143506c21e 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -132,6 +132,8 @@ docutils==0.16 # via python-daemon ecdsa==0.18.0 # via python-jose +filelock==3.8.0 + # via -r /awx_devel/requirements/requirements.in # via # -r /awx_devel/requirements/requirements_git.txt # django-radius diff --git a/tools/docker-compose/ansible/roles/sources/tasks/main.yml b/tools/docker-compose/ansible/roles/sources/tasks/main.yml index e566a97073..b6dd95aedb 100644 --- a/tools/docker-compose/ansible/roles/sources/tasks/main.yml +++ b/tools/docker-compose/ansible/roles/sources/tasks/main.yml @@ -109,6 +109,20 @@ mode: '0600' with_sequence: start=1 end={{ control_plane_node_count }} +- name: Create Receptor Config Lock File + file: + path: "{{ sources_dest }}/receptor/receptor-awx-{{ item }}.conf.lock" + state: touch + mode: '0600' + with_sequence: start=1 end={{ control_plane_node_count }} + +- name: Render Receptor Config(s) for Control Plane + template: + src: "receptor-awx.conf.j2" + dest: "{{ sources_dest }}/receptor/receptor-awx-{{ item }}.conf" + mode: '0600' + with_sequence: start=1 end={{ control_plane_node_count }} + - name: Render Receptor Hop Config template: src: "receptor-hop.conf.j2" diff --git a/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 b/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 index 2f7fc3cf41..db4988b207 100644 --- a/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 +++ b/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 @@ -42,6 +42,7 @@ services: - "../../docker-compose/_sources/local_settings.py:/etc/tower/conf.d/local_settings.py" - "../../docker-compose/_sources/SECRET_KEY:/etc/tower/SECRET_KEY" - "../../docker-compose/_sources/receptor/receptor-awx-{{ loop.index }}.conf:/etc/receptor/receptor.conf" + - "../../docker-compose/_sources/receptor/receptor-awx-{{ loop.index }}.conf.lock:/etc/receptor/receptor.conf.lock" - "../../docker-compose/_sources/receptor/work_public_key.pem:/etc/receptor/work_public_key.pem" - "../../docker-compose/_sources/receptor/work_private_key.pem:/etc/receptor/work_private_key.pem" # - "../../docker-compose/_sources/certs:/etc/receptor/certs" # TODO: optionally generate certs