
Commit 6bee1fdcf5 (parent d92973d6fd) by StellaOps Bot, 2025-11-25 08:01:23 +02:00
207 changed files with 12816 additions and 2295 deletions


@@ -0,0 +1,36 @@
# Orchestrator Infra Bootstrap (DEVOPS-ORCH-32-001)

## Components

- Postgres 16 (state/config)
- Mongo 7 (job ledger history)
- NATS 2.10 JetStream (queue/bus)

Compose file: `ops/devops/orchestrator/docker-compose.orchestrator.yml`

## Quick start (offline-friendly)

```bash
# bring up infra
COMPOSE_FILE=ops/devops/orchestrator/docker-compose.orchestrator.yml docker compose up -d
# smoke check and emit connection strings
scripts/orchestrator/smoke.sh
cat out/orchestrator-smoke/readiness.txt
```
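
To tear the stack back down, the inverse of the quick start is a plain `docker compose down`; a minimal sketch:

```bash
# Stop the infra containers; add -v only if you also want to drop the
# orch_pg_data / orch_mongo_data volumes and start from a clean slate.
COMPOSE_FILE=ops/devops/orchestrator/docker-compose.orchestrator.yml docker compose down
```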

## Connection strings

- Postgres: `postgres://orch:orchpass@localhost:55432/orchestrator`
- Mongo: `mongodb://localhost:57017`
- NATS: `nats://localhost:5422` (host port mapped to the container's 4222; monitoring on `http://localhost:5822`)
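
If the relevant client tools happen to be installed on the host (an assumption, not something this bootstrap provides), connectivity can be spot-checked directly; a minimal sketch:

```bash
# Assumes psql, mongosh, and curl are available locally.
psql "postgres://orch:orchpass@localhost:55432/orchestrator" -c "SELECT 1;"
mongosh "mongodb://localhost:57017" --quiet --eval "db.adminCommand('ping')"
# NATS monitoring endpoint (host port 5822 maps to the container's 8222)
curl -fsS http://localhost:5822/healthz
```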

## Observability

- Alerts: `ops/devops/orchestrator/alerts.yaml`
- Grafana dashboard: `ops/devops/orchestrator/grafana/orchestrator-overview.json`
- Metrics expected: `job_queue_depth`, `job_failures_total`, `lease_extensions_total`, `job_latency_seconds_bucket`
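
Assuming a Prometheus instance is already scraping the orchestrator (the `localhost:9090` address below is a placeholder, not something this bootstrap provisions), the expected metrics can be spot-checked from the shell:

```bash
# Placeholder Prometheus address; adjust to wherever your instance actually listens.
PROM="http://localhost:9090"
for metric in job_queue_depth job_failures_total lease_extensions_total job_latency_seconds_bucket; do
  # /api/v1/query returns JSON; an empty "result" array means the metric is not being scraped yet.
  curl -fsS "${PROM}/api/v1/query" --data-urlencode "query=${metric}"; echo
done
```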

## CI hook (suggested)

Add a workflow step (or a local cron job) that runs `scripts/orchestrator/smoke.sh` with `SKIP_UP=1` against the already-running infra and publishes the `readiness.txt` artifact for traceability.
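
One possible shape for such a step, assuming the workflow runs from the repository root and that `artifacts/` is whatever directory your CI collects (both are placeholders):

```bash
# Reuse the infra that is already up (SKIP_UP=1), then keep a timestamped copy of the report.
SKIP_UP=1 scripts/orchestrator/smoke.sh
install -D out/orchestrator-smoke/readiness.txt \
  "artifacts/readiness-$(date -u +%Y%m%dT%H%M%SZ).txt"
```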

## Notes

- Fixed host ports are used for determinism; adjust them via Compose override files if needed (see the sketch below).
- Data volumes: `orch_pg_data`, `orch_mongo_data` (Docker named volumes).
- No external downloads beyond the base images; keep the images in the compose file pinned to specific tags (or digests) for reproducible pulls.
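
A port override would typically live in a second compose file layered on top of the first; the override file name below is a placeholder you would create yourself:

```bash
# docker compose merges -f files left to right, so the override's port mappings win.
docker compose \
  -f ops/devops/orchestrator/docker-compose.orchestrator.yml \
  -f ops/devops/orchestrator/docker-compose.orchestrator.override.yml \
  up -d
```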


@@ -0,0 +1,30 @@
groups:
  - name: orchestrator-core
    rules:
      - alert: OrchestratorQueueDepthHigh
        expr: job_queue_depth > 500
        for: 10m
        labels:
          severity: warning
          service: orchestrator
        annotations:
          summary: "Queue depth high"
          description: "job_queue_depth exceeded 500 for 10m"
      - alert: OrchestratorFailuresHigh
        # rate() is per-second; scale to per-minute so the threshold matches the description
        expr: rate(job_failures_total[5m]) * 60 > 5
        for: 5m
        labels:
          severity: critical
          service: orchestrator
        annotations:
          summary: "Job failures elevated"
          description: "Failure rate above 5/min over the last 5m"
      - alert: OrchestratorLeaseStall
        # sum() collapses both sides to an empty label set so the vector match on `and` succeeds
        expr: sum(rate(lease_extensions_total[5m])) == 0 and sum(job_queue_depth) > 0
        for: 5m
        labels:
          severity: critical
          service: orchestrator
        annotations:
          summary: "Leases stalled"
          description: "No lease renewals while the queue has items"
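
The rule file can be validated offline with promtool before it is wired into Prometheus; a minimal check (assumes promtool, which ships with the Prometheus release, is on PATH):

```bash
# Parses the rule file and reports syntax or expression errors without contacting a server.
promtool check rules ops/devops/orchestrator/alerts.yaml
```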


@@ -0,0 +1,49 @@
version: "3.9"
services:
  orchestrator-postgres:
    image: postgres:16-alpine
    environment:
      POSTGRES_USER: orch
      POSTGRES_PASSWORD: orchpass
      POSTGRES_DB: orchestrator
    volumes:
      - orch_pg_data:/var/lib/postgresql/data
    ports:
      - "55432:5432"
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U orch"]
      interval: 10s
      timeout: 5s
      retries: 5
    restart: unless-stopped
  orchestrator-mongo:
    image: mongo:7
    command: ["mongod", "--quiet", "--storageEngine=wiredTiger"]
    ports:
      - "57017:27017"
    volumes:
      - orch_mongo_data:/data/db
    healthcheck:
      test: ["CMD", "mongosh", "--quiet", "--eval", "db.adminCommand('ping')"]
      interval: 10s
      timeout: 5s
      retries: 5
    restart: unless-stopped
  orchestrator-nats:
    image: nats:2.10-alpine
    ports:
      - "5422:4222"
      - "5822:8222"
    command: ["-js", "-m", "8222"]
    healthcheck:
      # The official nats image ships nats-server only (no `nats` CLI), so probe the
      # monitoring endpoint enabled by "-m 8222" with busybox wget instead.
      test: ["CMD", "wget", "-q", "-O", "-", "http://localhost:8222/healthz"]
      interval: 10s
      timeout: 5s
      retries: 5
    restart: unless-stopped
volumes:
  orch_pg_data:
  orch_mongo_data:
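
Before the first `up`, the file can be sanity-checked without starting anything:

```bash
# --quiet only parses and validates; nothing is pulled or started.
docker compose -f ops/devops/orchestrator/docker-compose.orchestrator.yml config --quiet
# After `up -d`, the STATUS column should eventually show all three services as healthy.
docker compose -f ops/devops/orchestrator/docker-compose.orchestrator.yml ps
```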


@@ -0,0 +1,42 @@
{
  "schemaVersion": 39,
  "title": "Orchestrator Overview",
  "panels": [
    {
      "type": "stat",
      "title": "Queue Depth",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "none"}},
      "targets": [{"expr": "sum(job_queue_depth)"}]
    },
    {
      "type": "timeseries",
      "title": "Queue Depth by Job Type",
      "datasource": "Prometheus",
      "targets": [{"expr": "job_queue_depth"}],
      "fieldConfig": {"defaults": {"unit": "none"}}
    },
    {
      "type": "timeseries",
      "title": "Failures per minute",
      "datasource": "Prometheus",
      "targets": [{"expr": "rate(job_failures_total[5m]) * 60"}],
      "fieldConfig": {"defaults": {"unit": "short"}}
    },
    {
      "type": "timeseries",
      "title": "Leases per second",
      "datasource": "Prometheus",
      "targets": [{"expr": "rate(lease_extensions_total[5m])"}],
      "fieldConfig": {"defaults": {"unit": "ops"}}
    },
    {
      "type": "timeseries",
      "title": "Job latency p95",
      "datasource": "Prometheus",
      "targets": [{"expr": "histogram_quantile(0.95, sum(rate(job_latency_seconds_bucket[5m])) by (le))"}],
      "fieldConfig": {"defaults": {"unit": "s"}}
    }
  ],
  "time": {"from": "now-6h", "to": "now"}
}
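
If a Grafana instance's HTTP API is reachable, the dashboard can be pushed without using the UI; the URL and token below are placeholders, not values provisioned by this bootstrap:

```bash
# POST the dashboard wrapped in the payload shape the /api/dashboards/db endpoint expects.
GRAFANA_URL="${GRAFANA_URL:-http://localhost:3000}"   # placeholder address
curl -fsS -X POST "${GRAFANA_URL}/api/dashboards/db" \
  -H "Authorization: Bearer ${GRAFANA_TOKEN}" \
  -H "Content-Type: application/json" \
  -d "{\"dashboard\": $(cat ops/devops/orchestrator/grafana/orchestrator-overview.json), \"overwrite\": true}"
```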