diff --git a/.yamllint b/.yamllint index a937588cdc..87a0d311a6 100644 --- a/.yamllint +++ b/.yamllint @@ -11,6 +11,8 @@ ignore: | # django template files awx/api/templates/instance_install_bundle/** .readthedocs.yaml + tools/loki + tools/otel extends: default diff --git a/Makefile b/Makefile index 2f5223621f..5df99c544a 100644 --- a/Makefile +++ b/Makefile @@ -47,6 +47,10 @@ VAULT ?= false VAULT_TLS ?= false # If set to true docker-compose will also start a tacacs+ instance TACACS ?= false +# If set to true docker-compose will also start an OpenTelemetry Collector instance +OTEL ?= false +# If set to true docker-compose will also start a Loki instance +LOKI ?= false # If set to true docker-compose will install editable dependencies EDITABLE_DEPENDENCIES ?= false @@ -535,6 +539,8 @@ docker-compose-sources: .git/hooks/pre-commit -e enable_vault=$(VAULT) \ -e vault_tls=$(VAULT_TLS) \ -e enable_tacacs=$(TACACS) \ + -e enable_otel=$(OTEL) \ + -e enable_loki=$(LOKI) \ -e install_editable_dependencies=$(EDITABLE_DEPENDENCIES) \ $(EXTRA_SOURCES_ANSIBLE_OPTS) diff --git a/awx/main/utils/handlers.py b/awx/main/utils/handlers.py index 15343463e8..4def0b6ba0 100644 --- a/awx/main/utils/handlers.py +++ b/awx/main/utils/handlers.py @@ -2,9 +2,11 @@ # All Rights Reserved. 
class OTLPHandler(LoggingHandler):
    """Logging handler that forwards log records to an OTLP log exporter.

    On construction it builds a ``LoggerProvider`` whose resource carries the
    ``service.name`` and ``service.instance.id`` attributes, registers that
    provider through ``set_logger_provider``, and attaches a
    ``BatchLogRecordProcessor`` wrapping either the gRPC or the HTTP OTLP log
    exporter. The handler itself is initialised at ``logging.NOTSET``.
    """

    def __init__(self, endpoint=None, protocol='grpc', service_name=None, instance_id=None, auth=None, username=None, password=None):
        """Validate the configuration, build the export pipeline and initialise the handler.

        Args:
            endpoint: Address of the OTLP receiver. Required.
            protocol: ``'grpc'`` (default) or ``'http'``; selects which OTLP
                log exporter class is used.
            service_name: Value of the ``service.name`` resource attribute.
                Defaults to ``sys.argv[1]`` when present, otherwise
                ``sys.argv[0]``, otherwise ``'unknown_service'``.
            instance_id: Value of the ``service.instance.id`` resource
                attribute. Defaults to ``os.uname().nodename``.
            auth: ``'basic'`` adds an ``Authorization: Basic ...`` header built
                from ``username`` and ``password``; any other value sends no
                authorization header.
            username: User name for basic auth.
            password: Password for basic auth.

        Raises:
            ValueError: If ``endpoint`` is empty, ``protocol`` is not one of
                the supported values, or ``auth == 'basic'`` without both
                ``username`` and ``password``.
        """
        # Validate everything up front so that a bad configuration fails
        # before set_logger_provider() is called.
        if not endpoint:
            raise ValueError("endpoint required")

        if protocol not in ('grpc', 'http'):
            # Previously an unknown protocol left the exporter variable unbound
            # and surfaced later as an UnboundLocalError.
            raise ValueError(f"unsupported protocol {protocol!r}, expected 'grpc' or 'http'")

        if auth == 'basic' and (username is None or password is None):
            raise ValueError("auth type basic requires username and password parameters")

        self.endpoint = endpoint
        # Fall back to the first CLI argument, then to the program name.
        self.service_name = service_name or (sys.argv[1] if len(sys.argv) > 1 else (sys.argv[0] or 'unknown_service'))
        self.instance_id = instance_id or os.uname().nodename

        logger_provider = LoggerProvider(
            resource=Resource.create(
                {
                    "service.name": self.service_name,
                    "service.instance.id": self.instance_id,
                }
            ),
        )
        set_logger_provider(logger_provider)

        headers = {}
        if auth == 'basic':
            secret = f'{username}:{password}'
            headers['Authorization'] = "Basic " + base64.b64encode(secret.encode()).decode()

        if protocol == 'grpc':
            # NOTE(review): insecure=True is hard-coded for gRPC; presumably this
            # requests a non-TLS channel -- confirm before targeting a TLS endpoint.
            otlp_exporter = OTLPGrpcLogExporter(endpoint=self.endpoint, insecure=True, headers=headers)
        else:
            otlp_exporter = OTLPHttpLogExporter(endpoint=self.endpoint, headers=headers)
        logger_provider.add_log_record_processor(BatchLogRecordProcessor(otlp_exporter))

        super().__init__(level=logging.NOTSET, logger_provider=logger_provider)
+ +Grafana is then available at `http://localhost:3001`. diff --git a/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 b/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 index e6cb929482..c6a0b4ed90 100644 --- a/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 +++ b/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 @@ -269,6 +269,42 @@ services: # pg_notify will NOT work in transaction mode. PGBOUNCER_POOL_MODE: session {% endif %} +{% if enable_otel|bool %} + otel: + image: otel/opentelemetry-collector-contrib:0.88.0 + container_name: tools_otel_1 + hostname: otel + command: ["--config=/etc/otel-collector-config.yaml", ""] + networks: + - awx + ports: + - "4317:4317" # OTLP gRPC receiver + - "4318:4318" # OTLP http receiver + - "55679:55679" # zpages http://localhost:55679/debug/servicez /tracez + volumes: + - "../../otel/otel-collector-config.yaml:/etc/otel-collector-config.yaml" + depends_on: + - loki +{% endif %} +{% if enable_loki|bool %} + loki: + image: grafana/loki:2.9.5 + container_name: tools_loki_1 + hostname: loki + ports: + - "3100:3100" + command: -config.file=/etc/loki/local-config.yaml + networks: + - awx + volumes: + - "loki_storage:/loki:rw" + #- "../../docker-compose/loki/volumes/index:/loki/index" + #- "../../docker-compose/loki/volumes/boltdb-cache:/loki/boltdb-cache" + - "../../loki/local-config.yaml:/etc/loki/local-config.yaml" + depends_on: + - grafana +{% endif %} + {% if execution_node_count|int > 0 %} receptor-hop: image: {{ receptor_image }} @@ -360,6 +396,10 @@ volumes: grafana_storage: name: tools_grafana_storage {% endif %} +{% if enable_loki|bool %} + loki_storage: + name: tools_loki_storage +{% endif %} networks: awx: diff --git a/tools/docker-compose/ansible/roles/sources/templates/local_settings.py.j2 b/tools/docker-compose/ansible/roles/sources/templates/local_settings.py.j2 index fe9596a7b0..fa93ccecc5 100644 ---
a/tools/docker-compose/ansible/roles/sources/templates/local_settings.py.j2 +++ b/tools/docker-compose/ansible/roles/sources/templates/local_settings.py.j2 @@ -46,6 +46,18 @@ OPTIONAL_API_URLPATTERN_PREFIX = '{{ api_urlpattern_prefix }}' # LOGGING['loggers']['django_auth_ldap']['handlers'] = ['console'] # LOGGING['loggers']['django_auth_ldap']['level'] = 'DEBUG' +{% if enable_otel|bool %} +LOGGING['handlers']['otel'] |= { + 'class': 'awx.main.utils.handlers.OTLPHandler', + 'endpoint': 'http://otel:4317', +} +# Add otel log handler to all log handlers +for name in LOGGING['loggers'].keys(): + handler = LOGGING['loggers'][name].get('handlers', []) + if 'otel' not in handler: + LOGGING['loggers'][name].get('handlers', []).append('otel') +{% endif %} + BROADCAST_WEBSOCKET_PORT = 8013 BROADCAST_WEBSOCKET_VERIFY_CERT = False BROADCAST_WEBSOCKET_PROTOCOL = 'http' diff --git a/tools/grafana/datasources/loki_source.yml b/tools/grafana/datasources/loki_source.yml new file mode 100644 index 0000000000..4a6c740f34 --- /dev/null +++ b/tools/grafana/datasources/loki_source.yml @@ -0,0 +1,11 @@ +--- +apiVersion: 1 + +datasources: + - name: Loki + type: loki + access: proxy + url: http://loki:3100 + jsonData: + timeout: 60 + maxLines: 100000 diff --git a/tools/loki/local-config.yaml b/tools/loki/local-config.yaml new file mode 100644 index 0000000000..dde03673aa --- /dev/null +++ b/tools/loki/local-config.yaml @@ -0,0 +1,96 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_server_max_recv_msg_size: 524288000 # 500 MB + grpc_server_max_send_msg_size: 524288000 # 500 MB, might be too much, be careful + +frontend_worker: + match_max_concurrent: true + grpc_client_config: + max_send_msg_size: 524288000 # 500 MB + + +ingester: + max_chunk_age: 8766h + +common: + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +# compactor: +# retention_enabled: 
true +# # cmeyers: YOLO. 1s seems wrong but it works so right +# compaction_interval: 1s # default 10m + +schema_config: + configs: + - from: 2020-10-24 + store: boltdb-shipper + object_store: filesystem + schema: v11 + index: + prefix: index_ + period: 24h + +storage_config: + boltdb_shipper: + active_index_directory: /loki/index + cache_location: /loki/boltdb-cache + +ruler: + alertmanager_url: http://localhost:9093 + +limits_config: + retention_period: 3y + # cmeyers: The default of 30m triggers a loop of queries that take a long time + # to complete and the UI times out + split_queries_by_interval: 1d + # cmeyers: Default of 30d1h limits grafana time queries. Can't, for example, + # query last 90 days + max_query_length: 3y + # cmeyers: Made the batch post request succeed. + reject_old_samples: false + reject_old_samples_max_age: 365d + + ingestion_rate_mb: 32 + ingestion_burst_size_mb: 32 + per_stream_rate_limit: 32M + per_stream_rate_limit_burst: 32M + ingestion_rate_strategy: local # Default: global + max_global_streams_per_user: 100000000 + max_entries_limit_per_query: 100000000 + max_query_series: 1000000 + max_query_parallelism: 32 # Old Default: 14 + max_streams_per_user: 100000000 # Old Default: 10000 + +# Taken from aap-log-visualizer +frontend: + max_outstanding_per_tenant: 2048 + +query_scheduler: + max_outstanding_requests_per_tenant: 2048 + +query_range: + parallelise_shardable_queries: false + split_queries_by_interval: 0 + +# By default, Loki will send anonymous, but uniquely-identifiable usage and configuration +# analytics to Grafana Labs. These statistics are sent to https://stats.grafana.org/ +# +# Statistics help us better understand how Loki is used, and they show us performance +# levels for most users. This helps us prioritize features and documentation. +# For more information on what's sent, look at +# https://github.com/grafana/loki/blob/main/pkg/usagestats/stats.go +# Refer to the buildReport method to see what goes into a report. 
+# +# If you would like to disable reporting, uncomment the following lines: +#analytics: +# reporting_enabled: false diff --git a/tools/otel/otel-collector-config.yaml b/tools/otel/otel-collector-config.yaml new file mode 100644 index 0000000000..ebbf0606cb --- /dev/null +++ b/tools/otel/otel-collector-config.yaml @@ -0,0 +1,39 @@ +receivers: + otlp: + protocols: + grpc: + +exporters: + debug: + verbosity: detailed + + loki: + endpoint: http://loki:3100/loki/api/v1/push + tls: + insecure: true + headers: + "X-Scope-OrgID": "1" + default_labels_enabled: + exporter: true + job: true + instance: true + level: true + +processors: + batch: + +extensions: + health_check: + zpages: + endpoint: ":55679" + +service: + pipelines: + logs: + receivers: [otlp] + processors: [batch] + exporters: [loki] + + extensions: + - health_check + - zpages