diff --git a/Dockerfile b/Dockerfile index 451d41f..6136472 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,7 +16,7 @@ WORKDIR /etc/pdagentd/ssl RUN openssl req -x509 -newkey rsa:4096 -nodes -out cert.pem -keyout key.pem -days 3650 -subj "/C=US/ST=CA/L=San Francisco/CN=pagerduty.com" WORKDIR /tmp -COPY dist/pdaltagent-0.5.0*.whl . -RUN pip3 install ./pdaltagent-0.5.0*.whl --break-system-packages +COPY dist/pdaltagent-0.6.0*.whl . +RUN pip3 install ./pdaltagent-0.6.0*.whl --break-system-packages ENTRYPOINT /etc/run_supervisord.sh diff --git a/README.md b/README.md index eb61a45..55c799d 100644 --- a/README.md +++ b/README.md @@ -37,8 +37,8 @@ Okay, here's how you can get started: * When you run PDaltagent using the Docker Compose in this repo, you will get four running containers: * `pdaltagent_pdagentd` is the container that is running the queue consumers and the HTTPS listener. It also has [PagerDuty CLI](https://github.com/martindstone/pagerduty-cli) installed for your convenience. * `pdaltagent_rabbitmq` is the container that is running the RabbitMQ backend for Celery. You can access the Rabbit management interface at `http://your_pdaltagent_host:15672`. See or change the username and password in docker-compose.yml. - * `pdaltagent_mongo` is the container running MongoDB. PDaltagent uses Mongo to hold recently seen log entries when polling for webhooks, but you can also use it in your plugins. - * `pdaltagent_mongo-express` is the container runing [Mongo-Express](https://github.com/mongo-express/mongo-express), which is a simple web UI for managing data stored in Mongo. See docker-compose.yml for the port, username and password. You can remove this container if you don't want to use Mongo-Express. +* `pdaltagent_mongo` is the container running MongoDB. PDaltagent uses Mongo to hold recently seen log entries when polling for webhooks, but you can also use it in your plugins. +* `pdaltagent_mongo-express` is the container runing [Mongo-Express](https://github.com/mongo-express/mongo-express), which is a simple web UI for managing data stored in Mongo. See docker-compose.yml for the port, username and password. You can remove this container if you don't want to use Mongo-Express. * Docker Compose will also create a directory called `pdaltagent_pdagentd` in the current directory when you run it. This directory has the following subdirectories: * `plugins` is where you put plugins that you write. There are a couple of example plugins added to this directory when it is first created. You can read these to find out how to write plugins, or just ignore them; they are disabled by default. @@ -49,6 +49,55 @@ Okay, here's how you can get started: * By default, the pdagentd listener uses a self-signed cert and key for HTTPS. This can cause warnings and failures on some clients, unless you tell them to skip certificate verification. If you cant to use your own cert and key, see the example in docker-compose.yml to create a bindmoint for `/etc/pdagentd/ssl/cert.pem` and `/etc/pdagentd/ssl/key.pem`. * If you want to change the services that are run in the container, see the example in docker-compose.yml to create a bindmount for `/etc/supervisord.conf`. +### SNMP trap ingestion + +PDaltagent now ships with an optional SNMP trap listener that can turn traps into PagerDuty events. To enable it: + +1. Install with the SNMP extra (`pip install pdaltagent[snmp]`) or use the Docker image that already bundles the dependency. +2. Set the environment variables: + * `PDAGENTD_SNMP_ENABLED=true` + * `PDAGENTD_SNMP_ROUTING_KEY=` + * `PDAGENTD_SNMP_COMMUNITY` (defaults to `public`) + * `PDAGENTD_SNMP_PORT` (defaults to `9162`) + * `PDAGENTD_SNMP_SEVERITY` (defaults to `error`) +3. Allow UDP traffic to port `9162` (or your chosen port). + +Traps are converted to v2 events with a summary such as `SNMP trap from ` and the full bindings available in `custom_details`. + +### Kubernetes quick start + +Example manifests are provided under `k8s/`: + +```bash +kubectl apply -f k8s/deployment.yaml +kubectl apply -f k8s/service.yaml +``` + +The deployment exposes HTTP/HTTPS listeners and the optional SNMP trap port (UDP/9162). Override `CELERY_BROKER_URL`, `MONGODB_URL`, `PDAGENTD_SNMP_*`, and worker tuning variables (`PDAGENTD_WORKER_CONCURRENCY`, `PDAGENTD_PREFETCH_MULTIPLIER`) with environment variables or a ConfigMap/Secret to match your environment. Scale replicas horizontally and point them at a shared RabbitMQ and MongoDB for throughput. + +To persist or share plugins across replicas, swap the `emptyDir` volume examples in `k8s/deployment.yaml` for real mounts, e.g.: + +```yaml +volumes: + - name: plugins + persistentVolumeClaim: + claimName: pdaltagent-plugins-pvc + - name: plugin-lib + hostPath: + path: /srv/pdaltagent/plugin-lib +``` + +### Scaling and tuning + +Celery worker behavior can now be tuned via environment: + +* `PDAGENTD_WORKER_CONCURRENCY` — cap worker processes/threads (defaults to Celery auto-detect). +* `PDAGENTD_PREFETCH_MULTIPLIER` — number of tasks each worker prefetches (defaults to 1). +* `PDAGENTD_MAX_TASKS_PER_CHILD` — recycle workers after N tasks to control memory. +* `PDAGENTD_WORKER_SEND_EVENTS` — emit worker events for monitoring. + +Adjust these alongside your broker backend (e.g., RabbitMQ) to run under heavier load. + ## How to send events to PagerDuty through PDaltagent * If you have tools that want to send events directly via HTTPS POST to `events.pagerduty.com`, you can change the beginning of the URL to point to HTTP/HTTPS on the listening ports on the `pdaltagent_pdagentd` Docker container and leave the path the same, and the PDaltagent will enqueue the messages. This works for paths that look like `/integration//enqueue`, `/x-ere/`, and `/v2/enqueue`. For example, if you have an event that you send to `https://events.pagerduty.com/v2/enqueue`, and your pdagentd is listening for HTTPS on port 8443 on host 10.0.0.10, you can send the same event to `https://10.0.0.10:8443/v2/enqueue`. diff --git a/docker-compose.yml b/docker-compose.yml index ff65448..43f7c24 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -15,7 +15,7 @@ services: restart: always pdagentd: - image: martindstone/pdaltagent:0.4-beta3 + image: martindstone/pdaltagent:0.6.0 container_name: pdaltagent_pdagentd environment: # Set HTTP_PROXY, HTTPS_PROXY and NO_PROXY if you are in a proxy environment @@ -57,6 +57,17 @@ services: # for details and regexes): # - PDAGENTD_SCRUB_PII=true + # Optional: Tune worker throughput + # - PDAGENTD_WORKER_CONCURRENCY=4 + # - PDAGENTD_PREFETCH_MULTIPLIER=4 + # - PDAGENTD_MAX_TASKS_PER_CHILD=1000 + + # Optional: Enable SNMP trap ingestion (listens on UDP/9162 by default) + # - PDAGENTD_SNMP_ENABLED=true + # - PDAGENTD_SNMP_ROUTING_KEY= + # - PDAGENTD_SNMP_COMMUNITY=public + # - PDAGENTD_SNMP_PORT=9162 + # Set PDSEND_EVENTS_BASE_URL to a URL where the pd-send command should send event payloads: - PDSEND_EVENTS_BASE_URL=https://localhost:8443 @@ -78,6 +89,8 @@ services: - '8443:8443' # admin - '8444:8444' + # SNMP traps (UDP) + - '9162:9162/udp' # supervisor - '9001:9001' depends_on: diff --git a/k8s/deployment.yaml b/k8s/deployment.yaml new file mode 100644 index 0000000..f6c74fc --- /dev/null +++ b/k8s/deployment.yaml @@ -0,0 +1,78 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: pdaltagent + labels: + app: pdaltagent +spec: + replicas: 1 # scale horizontally for throughput (ensure shared RabbitMQ/Mongo) + selector: + matchLabels: + app: pdaltagent + template: + metadata: + labels: + app: pdaltagent + spec: + containers: + - name: pdaltagent + image: martindstone/pdaltagent:0.6.0 + imagePullPolicy: IfNotPresent + ports: + - containerPort: 8080 + name: http + protocol: TCP + - containerPort: 8443 + name: https + protocol: TCP + - containerPort: 9162 + name: snmp-trap + protocol: UDP + env: + # Queue backend (RabbitMQ/AMQP). Point at your shared broker service. + - name: CELERY_BROKER_URL + value: amqp://pdaltagent:pdaltagent@rabbitmq:5672// + # MongoDB connection for webhook polling/plugin storage. + - name: MONGODB_URL + value: mongodb://root:example@mongo:27017/ + # Admin credentials (consider sourcing from Secrets) + - name: PDAGENTD_ADMIN_USER + valueFrom: + secretKeyRef: + name: pdaltagent-admin + key: user + - name: PDAGENTD_ADMIN_PASS + valueFrom: + secretKeyRef: + name: pdaltagent-admin + key: password + # Disable SNMP listener by default; enable and set routing key to ingest traps. + - name: PDAGENTD_SNMP_ENABLED + value: "false" + - name: PDAGENTD_SNMP_PORT + value: "9162" + # Worker tuning examples; adjust to match CPU availability. + - name: PDAGENTD_WORKER_CONCURRENCY + value: "4" + - name: PDAGENTD_PREFETCH_MULTIPLIER + value: "4" + volumeMounts: + # Mount in-cluster plugins and their dependencies. Replace with PersistentVolumeClaims + # or ConfigMap/Secret mounts if you want shared/plugin-managed code. + - name: plugins + mountPath: /usr/lib/python3.12/site-packages/pdaltagent/plugins + - name: plugin-lib + mountPath: /usr/lib/python3.12/site-packages/pdaltagent/plugin-lib + volumes: + - name: plugins + emptyDir: {} # Example: use emptyDir for ephemeral plugins during development + # persistentVolumeClaim: + # claimName: pdaltagent-plugins-pvc + # configMap: + # name: pdaltagent-plugins + - name: plugin-lib + emptyDir: {} # Example: layer dependencies shared by plugins + # persistentVolumeClaim: + # claimName: pdaltagent-plugin-lib-pvc + # hostPath: + # path: /srv/pdaltagent/plugin-lib diff --git a/k8s/service.yaml b/k8s/service.yaml new file mode 100644 index 0000000..c885391 --- /dev/null +++ b/k8s/service.yaml @@ -0,0 +1,24 @@ +apiVersion: v1 +kind: Service +metadata: + name: pdaltagent + labels: + app: pdaltagent +spec: + selector: + app: pdaltagent + ports: + # HTTP/HTTPS endpoints for enqueueing events and UI + - name: http + protocol: TCP + port: 8080 + targetPort: 8080 + - name: https + protocol: TCP + port: 8443 + targetPort: 8443 + # Optional SNMP trap listener (UDP) + - name: snmp-trap + protocol: UDP + port: 9162 + targetPort: 9162 diff --git a/pdaltagent/__init__.py b/pdaltagent/__init__.py index b794fd4..ef7eb44 100644 --- a/pdaltagent/__init__.py +++ b/pdaltagent/__init__.py @@ -1 +1 @@ -__version__ = '0.1.0' +__version__ = '0.6.0' diff --git a/pdaltagent/config.py b/pdaltagent/config.py index 75afedd..990ece4 100644 --- a/pdaltagent/config.py +++ b/pdaltagent/config.py @@ -1,6 +1,13 @@ import os from celery import Celery + +def env_flag(name, default=False): + value = os.getenv(name) + if value is None: + return default + return str(value).lower() not in ('false', '0', 'no', 'off', '') + MONGODB_URL = os.getenv('MONGODB_URL', 'mongodb://root:example@mongo:27017/') PDAGENTD_ADMIN_USER = os.getenv('PDAGENTD_ADMIN_USER', 'pdaltagent@example.com') @@ -12,26 +19,65 @@ PD_API_TOKEN = os.environ.get("PDAGENTD_API_TOKEN") WEBHOOK_DEST_URL = os.environ.get("PDAGENTD_WEBHOOK_DEST_URL") -IS_OVERVIEW = 'false' if os.environ.get("PDAGENTD_GET_ALL_LOG_ENTRIES") and os.environ.get("PDAGENTD_GET_ALL_LOG_ENTRIES").lower != 'false' else 'true' +IS_OVERVIEW = 'false' if env_flag("PDAGENTD_GET_ALL_LOG_ENTRIES") else 'true' POLLING_INTERVAL_SECONDS = 10 if os.environ.get("PDAGENTD_POLLING_INTERVAL_SECONDS"): try: POLLING_INTERVAL_SECONDS = int(os.environ.get("PDAGENTD_POLLING_INTERVAL_SECONDS")) - except: + except Exception: pass # keep activity db rows for 30 days -KEEP_ACTIVITY_SECONDS = 30*24*60*60 +KEEP_ACTIVITY_SECONDS = 30 * 24 * 60 * 60 if os.environ.get("PDAGENTD_KEEP_ACTIVITY_SECONDS"): try: KEEP_ACTIVITY_SECONDS = int(os.environ.get("PDAGENTD_KEEP_ACTIVITY_SECONDS")) - except: + except Exception: pass +CELERY_BROKER_URL = os.getenv('CELERY_BROKER_URL', 'pyamqp://pdaltagent:pdaltagent@rabbit//') +CELERY_RESULT_BACKEND = os.getenv('CELERY_RESULT_BACKEND') +try: + CELERY_PREFETCH = int(os.getenv('PDAGENTD_PREFETCH_MULTIPLIER', '1')) +except ValueError: + CELERY_PREFETCH = 1 + +try: + CELERY_CONCURRENCY = int(os.getenv('PDAGENTD_WORKER_CONCURRENCY', '0')) + if CELERY_CONCURRENCY <= 0: + CELERY_CONCURRENCY = None +except ValueError: + CELERY_CONCURRENCY = None + +try: + CELERY_MAX_TASKS_PER_CHILD = int(os.getenv('PDAGENTD_MAX_TASKS_PER_CHILD', '0')) + if CELERY_MAX_TASKS_PER_CHILD <= 0: + CELERY_MAX_TASKS_PER_CHILD = None +except ValueError: + CELERY_MAX_TASKS_PER_CHILD = None + +CELERY_WORKER_SEND_EVENTS = env_flag('PDAGENTD_WORKER_SEND_EVENTS', False) + app = Celery('tasks') +celery_config = { + "broker_url": CELERY_BROKER_URL, + "result_backend": CELERY_RESULT_BACKEND, + "worker_prefetch_multiplier": CELERY_PREFETCH, + "worker_send_task_events": CELERY_WORKER_SEND_EVENTS, + "task_serializer": "json", + "accept_content": ["json"], + "result_serializer": "json", + "task_acks_late": True, +} +if CELERY_CONCURRENCY is not None: + celery_config["worker_concurrency"] = CELERY_CONCURRENCY +if CELERY_MAX_TASKS_PER_CHILD is not None: + celery_config["worker_max_tasks_per_child"] = CELERY_MAX_TASKS_PER_CHILD + +app.conf.update(**celery_config) app.conf.task_routes = { - 'pdaltagent.tasks.send_to_pd': { 'queue': 'pd_events' }, - 'pdaltagent.tasks.send_webhook': { 'queue': 'pd_webhooks' }, - 'pdaltagent.periodic_tasks.*': { 'queue': 'pd_periodic' }, + 'pdaltagent.tasks.send_to_pd': {'queue': 'pd_events'}, + 'pdaltagent.tasks.send_webhook': {'queue': 'pd_webhooks'}, + 'pdaltagent.periodic_tasks.*': {'queue': 'pd_periodic'}, } diff --git a/pdaltagent/listener.py b/pdaltagent/listener.py index 8ffb3dd..bae1422 100644 --- a/pdaltagent/listener.py +++ b/pdaltagent/listener.py @@ -6,9 +6,10 @@ import os from flask import Flask, request +from pdaltagent.config import env_flag app = Flask(__name__) -SCRUB = True if os.environ.get("PDAGENTD_SCRUB_PII") and os.environ.get("PDAGENTD_SCRUB_PII").lower != 'false' else False +SCRUB = env_flag("PDAGENTD_SCRUB_PII") @app.route('/integration//enqueue', methods=['POST']) def enqueue_integration(routing_key): diff --git a/pdaltagent/scripts/start_snmp.sh b/pdaltagent/scripts/start_snmp.sh new file mode 100644 index 0000000..ec98a01 --- /dev/null +++ b/pdaltagent/scripts/start_snmp.sh @@ -0,0 +1,8 @@ +#!/bin/sh + +if [ "${PDAGENTD_SNMP_ENABLED:-false}" = "true" ]; then + exec pdagent-snmp +fi + +echo "SNMP trap listener disabled; set PDAGENTD_SNMP_ENABLED=true to enable." +sleep infinity diff --git a/pdaltagent/scripts/supervisord.conf b/pdaltagent/scripts/supervisord.conf index 680243d..e6e9921 100644 --- a/pdaltagent/scripts/supervisord.conf +++ b/pdaltagent/scripts/supervisord.conf @@ -76,6 +76,15 @@ stderr_logfile = /dev/stderr stderr_logfile_maxbytes = 0 command=gunicorn -b 0.0.0.0:8444 -w 1 pdaltagent.api.api:app --certfile=/etc/pdagentd/ssl/cert.pem --keyfile=/etc/pdagentd/ssl/key.pem +[program:snmp] +stdout_logfile = /dev/stdout +stdout_logfile_maxbytes = 0 +stderr_logfile = /dev/stderr +stderr_logfile_maxbytes = 0 +command=sh /usr/lib/python3.12/site-packages/pdaltagent/scripts/start_snmp.sh +startsecs = 5 +autorestart = true + [program:run_once] stdout_logfile = /dev/stdout stdout_logfile_maxbytes = 0 @@ -87,4 +96,4 @@ autorestart = false startretries = 1 [group:workers] -programs=events,webhooks,periodic,beat,listener,listener_ssl,admin +programs=events,webhooks,periodic,beat,listener,listener_ssl,admin,snmp diff --git a/pdaltagent/snmp.py b/pdaltagent/snmp.py new file mode 100644 index 0000000..a19f1c8 --- /dev/null +++ b/pdaltagent/snmp.py @@ -0,0 +1,120 @@ +import logging +import os +from typing import Dict + +import pdaltagent.pd as pd +from pdaltagent.config import env_flag +from pdaltagent.tasks import send_to_pd + +try: + from pysnmp.carrier.asyncore.dgram import udp + from pysnmp.entity import config, engine + from pysnmp.entity.rfc3413 import ntfrcv + + SNMP_AVAILABLE = True +except ImportError: # pragma: no cover - runtime guard only + SNMP_AVAILABLE = False + +logger = logging.getLogger(__name__) + + +def _as_details(var_binds) -> Dict[str, str]: + return {str(name): str(val) for name, val in var_binds} + + +def _trap_oid(var_binds) -> str: + for name, val in var_binds: + if str(name).endswith("1.3.6.1.6.3.1.1.4.1.0"): + return str(val) + return str(var_binds[0][0]) if var_binds else "unknown" + + +def _build_payload( + routing_key: str, + source: str, + severity: str, + summary_template: str, + var_binds, +) -> Dict: + details = _as_details(var_binds) + trap_oid = _trap_oid(var_binds) + summary = summary_template.format(trap_oid=trap_oid, source=source) + return { + "routing_key": routing_key, + "event_action": "trigger", + "payload": { + "summary": summary, + "source": source, + "severity": severity, + "custom_details": details, + }, + } + + +def _trap_callback(snmp_engine, state_reference, context_engine_id, context_name, var_binds, ctx): + transport_domain, transport_address = snmp_engine.msgAndPduDsp.getTransportInfo(state_reference) + source = f"{transport_address[0]}:{transport_address[1]}" + payload = _build_payload( + routing_key=ctx["routing_key"], + source=source, + severity=ctx["severity"], + summary_template=ctx["summary_template"], + var_binds=var_binds, + ) + logger.info("Received SNMP trap from %s with %d bindings", source, len(var_binds)) + send_to_pd.delay(ctx["routing_key"], payload, base_url=ctx["base_url"], destination_type="v2") + + +def run_server(): + if not SNMP_AVAILABLE: + raise RuntimeError( + "SNMP trap support requires the pysnmp-lextudio extra. " + "Install with `pip install pdaltagent[snmp]`." + ) + + routing_key = os.getenv("PDAGENTD_SNMP_ROUTING_KEY") + if not routing_key or not pd.is_valid_integration_key(routing_key): + raise RuntimeError("PDAGENTD_SNMP_ROUTING_KEY must be set to a valid PagerDuty routing key.") + + host = os.getenv("PDAGENTD_SNMP_HOST", "0.0.0.0") + port = int(os.getenv("PDAGENTD_SNMP_PORT", "9162")) + community = os.getenv("PDAGENTD_SNMP_COMMUNITY", "public") + severity = os.getenv("PDAGENTD_SNMP_SEVERITY", "error") + summary_template = os.getenv("PDAGENTD_SNMP_SUMMARY", "SNMP trap {trap_oid} from {source}") + base_url = os.getenv("PDAGENTD_SNMP_EVENTS_URL") or os.getenv("PDSEND_EVENTS_BASE_URL") or "https://events.pagerduty.com" + + snmp_engine = engine.SnmpEngine() + config.addTransport( + snmp_engine, + udp.domainName, + udp.UdpTransport().openServerMode((host, port)), + ) + config.addV1System(snmp_engine, "pdaltagent-snmp", community) + config.addVacmUser(snmp_engine, 2, "pdaltagent-snmp", "noAuthNoPriv", readSubTree=(1, 3, 6, 1, 4, 1)) + + ctx = { + "routing_key": routing_key, + "severity": severity, + "summary_template": summary_template, + "base_url": base_url, + } + ntfrcv.NotificationReceiver(snmp_engine, lambda *args: _trap_callback(*args, ctx)) + + logger.info("Listening for SNMP traps on %s:%s (community=%s)", host, port, community) + snmp_engine.transportDispatcher.jobStarted(1) + try: + snmp_engine.transportDispatcher.runDispatcher() + finally: + snmp_engine.transportDispatcher.closeDispatcher() + + +def main(): + logging.basicConfig(level=logging.INFO) + if not env_flag("PDAGENTD_SNMP_ENABLED"): + logger.info("SNMP trap listener disabled (set PDAGENTD_SNMP_ENABLED=true to enable)") + return + run_server() + + +if __name__ == "__main__": # pragma: no cover - manual execution only + main() diff --git a/pyproject.toml b/pyproject.toml index ec60d03..c55fa5c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "PDaltagent" -version = "0.5.0" +version = "0.6.0" description = "" authors = ["Martin Stone "] exclude = ["pdaltagent/ui"] @@ -21,6 +21,7 @@ flask-security-too = "~5.4" bcrypt = "~4.0.1" bleach = "~6.1" setuptools = "^70.0.0" +pysnmp-lextudio = "~5.0" [tool.poetry.dev-dependencies] pytest = "^7" @@ -30,6 +31,7 @@ pdagentd = 'pdaltagent.tasks:consume' pdpollerd = 'pdaltagent.tasks:poll' pd-send = 'pdaltagent.pdsend:main' pdaltagentui = 'pdaltagent.api.api:app' +pdagent-snmp = 'pdaltagent.snmp:main' [build-system] requires = ["poetry>=1.1.14"] diff --git a/tests/test_pdaltagent.py b/tests/test_pdaltagent.py index 14bf58d..bcf293d 100644 --- a/tests/test_pdaltagent.py +++ b/tests/test_pdaltagent.py @@ -2,4 +2,4 @@ def test_version(): - assert __version__ == '0.1.0' + assert __version__ == '0.6.0'