CI/CD consolidation

StellaOps Bot
2025-12-26 17:32:23 +02:00
parent a866eb6277
commit c786faae84
638 changed files with 3821 additions and 181 deletions

@@ -1 +0,0 @@
certs/

@@ -1,35 +0,0 @@
# Telemetry Collector Assets
These assets provision the default OpenTelemetry Collector instance required by
`DEVOPS-OBS-50-001`. The collector acts as the secured ingest point for traces,
metrics, and logs emitted by StellaOps services.
## Contents
| File | Purpose |
| ---- | ------- |
| `otel-collector-config.yaml` | Baseline collector configuration (mutual TLS, OTLP receivers, Prometheus exporter). |
| `storage/prometheus.yaml` | Prometheus scrape configuration tuned for the collector and service tenants. |
| `storage/tempo.yaml` | Tempo configuration with multitenancy, WAL, and compaction settings. |
| `storage/loki.yaml` | Loki configuration enabling multitenant log ingestion with retention policies. |
| `storage/tenants/*.yaml` | Per-tenant overrides for Tempo and Loki rate/retention controls. |
## Development workflow
1. Generate development certificates (collector + client) using
`ops/devops/telemetry/generate_dev_tls.sh`.
2. Launch the collector via `docker compose -f docker-compose.telemetry.yaml up`.
3. Launch the storage backends (Prometheus, Tempo, Loki) via
`docker compose -f docker-compose.telemetry-storage.yaml up`.
4. Run the smoke test: `python ops/devops/telemetry/smoke_otel_collector.py`.
5. Explore the storage configuration (`storage/README.md`) to tune retention/limits.
The smoke test sends OTLP traffic over TLS and asserts the collector accepted
traces, metrics, and logs by scraping the Prometheus metrics endpoint.
## Kubernetes
The Helm chart consumes the same configuration (see `values.yaml`). Provide TLS
material via a secret referenced by `telemetry.collector.tls.secretName`,
containing `ca.crt`, `tls.crt`, and `tls.key`. Client certificates are required
for ingestion and should be issued by the same CA.
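For orientation, here is a minimal sketch of the wiring. The secret name, namespace, and surrounding values layout are illustrative and should be checked against the chart's `values.yaml`; only `telemetry.collector.tls.secretName` and the `ca.crt`/`tls.crt`/`tls.key` keys come from the documentation above:

```yaml
# Hypothetical secret holding the collector TLS material.
# Typically created with:
#   kubectl create secret generic otel-collector-tls \
#     --from-file=ca.crt --from-file=tls.crt --from-file=tls.key
apiVersion: v1
kind: Secret
metadata:
  name: otel-collector-tls
  namespace: observability
type: Opaque
data:
  ca.crt: <base64-encoded CA certificate>
  tls.crt: <base64-encoded server certificate>
  tls.key: <base64-encoded private key>
---
# Helm values excerpt referencing the secret (assumed nesting around the
# documented telemetry.collector.tls.secretName key).
telemetry:
  collector:
    tls:
      secretName: otel-collector-tls
```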

@@ -1,164 +0,0 @@
# ExportCenter Alert Rules
# SLO Burn-rate alerts for export service reliability
groups:
- name: export-center-slo
interval: 30s
rules:
# SLO: 99.5% success rate target
# Error budget: 0.5% (432 errors per day at 86400 requests/day)
# Fast burn - 2% budget consumption in 1 hour (critical)
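# Burn-rate arithmetic (assuming the 30-day / 720 h budget window implied above):
#   budget consumed = burn rate x window / 720 h
#   fast burn:  14.4 x 1 h / 720 h ~ 2% of the monthly error budget
#   slow burn:   6.0 x 6 h / 720 h ~ 5% of the monthly error budget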
- alert: ExportCenterHighErrorBurnRate
expr: |
(
sum(rate(export_runs_failed_total[1h]))
/
sum(rate(export_runs_total[1h]))
) > (14.4 * 0.005)
for: 2m
labels:
severity: critical
service: export-center
slo: availability
annotations:
summary: "ExportCenter high error burn rate"
description: "Error rate is {{ $value | humanizePercentage }} over the last hour, consuming error budget at 14.4x the sustainable rate."
runbook_url: "https://docs.stellaops.io/runbooks/export-center/high-error-rate"
# Slow burn - 5% budget consumption in 6 hours (warning)
- alert: ExportCenterElevatedErrorBurnRate
expr: |
(
sum(rate(export_runs_failed_total[6h]))
/
sum(rate(export_runs_total[6h]))
) > (6 * 0.005)
for: 5m
labels:
severity: warning
service: export-center
slo: availability
annotations:
summary: "ExportCenter elevated error burn rate"
description: "Error rate is {{ $value | humanizePercentage }} over the last 6 hours, consuming error budget at 6x the sustainable rate."
runbook_url: "https://docs.stellaops.io/runbooks/export-center/elevated-error-rate"
- name: export-center-latency
interval: 30s
rules:
# SLO: 95% of exports complete within 120s
# Fast burn - p95 latency exceeding threshold
- alert: ExportCenterHighLatency
expr: |
histogram_quantile(0.95,
sum(rate(export_run_duration_seconds_bucket[5m])) by (le)
) > 120
for: 5m
labels:
severity: warning
service: export-center
slo: latency
annotations:
summary: "ExportCenter high latency"
description: "95th percentile export duration is {{ $value | humanizeDuration }}, exceeding 120s SLO target."
runbook_url: "https://docs.stellaops.io/runbooks/export-center/high-latency"
# Critical latency - p99 exceeding 5 minutes
- alert: ExportCenterCriticalLatency
expr: |
histogram_quantile(0.99,
sum(rate(export_run_duration_seconds_bucket[5m])) by (le)
) > 300
for: 2m
labels:
severity: critical
service: export-center
slo: latency
annotations:
summary: "ExportCenter critical latency"
description: "99th percentile export duration is {{ $value | humanizeDuration }}, indicating severe performance degradation."
runbook_url: "https://docs.stellaops.io/runbooks/export-center/critical-latency"
- name: export-center-capacity
interval: 60s
rules:
# Queue buildup warning
- alert: ExportCenterHighConcurrency
expr: sum(export_runs_in_progress) > 50
for: 5m
labels:
severity: warning
service: export-center
annotations:
summary: "ExportCenter high concurrency"
description: "{{ $value }} exports currently in progress. Consider scaling or investigating slow exports."
runbook_url: "https://docs.stellaops.io/runbooks/export-center/high-concurrency"
# Stuck exports - exports running longer than 30 minutes
- alert: ExportCenterStuckExports
expr: |
histogram_quantile(0.99,
sum(rate(export_run_duration_seconds_bucket{status!="completed"}[1h])) by (le)
) > 1800
for: 10m
labels:
severity: warning
service: export-center
annotations:
summary: "ExportCenter potentially stuck exports"
description: "Some exports may be stuck - 99th percentile duration for incomplete exports exceeds 30 minutes."
runbook_url: "https://docs.stellaops.io/runbooks/export-center/stuck-exports"
- name: export-center-errors
interval: 30s
rules:
# Specific error code spike
- alert: ExportCenterErrorCodeSpike
expr: |
sum by (error_code) (
rate(export_runs_failed_total[5m])
) > 0.1
for: 5m
labels:
severity: warning
service: export-center
annotations:
summary: "ExportCenter error code spike: {{ $labels.error_code }}"
description: "Error code {{ $labels.error_code }} is occurring at {{ $value | humanize }}/s rate."
runbook_url: "https://docs.stellaops.io/runbooks/export-center/error-codes"
# No successful exports in 15 minutes (when there is traffic)
- alert: ExportCenterNoSuccessfulExports
expr: |
(
sum(rate(export_runs_total[15m])) > 0
)
and
(
sum(rate(export_runs_success_total[15m])) == 0
)
for: 10m
labels:
severity: critical
service: export-center
annotations:
summary: "ExportCenter no successful exports"
description: "No exports have completed successfully in the last 15 minutes despite ongoing attempts."
runbook_url: "https://docs.stellaops.io/runbooks/export-center/no-successful-exports"
- name: export-center-deprecation
interval: 5m
rules:
# Deprecated endpoint usage
- alert: ExportCenterDeprecatedEndpointUsage
expr: |
sum(rate(export_center_deprecated_endpoint_access_total[1h])) > 0
for: 1h
labels:
severity: info
service: export-center
annotations:
summary: "Deprecated export endpoints still in use"
description: "Legacy /exports endpoints are still being accessed at {{ $value | humanize }}/s. Migration to v1 API recommended."
runbook_url: "https://docs.stellaops.io/api/export-center/migration"

@@ -1,42 +0,0 @@
# Scanner FN-Drift Alert Rules
# SLO alerts for false-negative drift thresholds (30-day rolling window)
groups:
- name: scanner-fn-drift
interval: 30s
rules:
- alert: ScannerFnDriftWarning
expr: scanner_fn_drift_percent > 1.0
for: 5m
labels:
severity: warning
service: scanner
slo: fn-drift
annotations:
summary: "Scanner FN-Drift rate above warning threshold"
description: "FN-Drift is {{ $value | humanizePercentage }} (> 1.0%) over the 30-day rolling window."
runbook_url: "https://docs.stellaops.io/runbooks/scanner/fn-drift-warning"
- alert: ScannerFnDriftCritical
expr: scanner_fn_drift_percent > 2.5
for: 5m
labels:
severity: critical
service: scanner
slo: fn-drift
annotations:
summary: "Scanner FN-Drift rate above critical threshold"
description: "FN-Drift is {{ $value | humanizePercentage }} (> 2.5%) over the 30-day rolling window."
runbook_url: "https://docs.stellaops.io/runbooks/scanner/fn-drift-critical"
- alert: ScannerFnDriftEngineViolation
expr: scanner_fn_drift_cause_engine > 0
for: 1m
labels:
severity: page
service: scanner
slo: determinism
annotations:
summary: "Engine-caused FN drift detected (determinism violation)"
description: "Engine-caused FN drift count is {{ $value }} (> 0). This indicates non-feed, non-policy changes affecting outcomes."
runbook_url: "https://docs.stellaops.io/runbooks/scanner/fn-drift-engine-violation"

@@ -1,638 +0,0 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": { "type": "grafana", "uid": "-- Grafana --" },
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "ExportCenter service observability dashboard",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 1,
"panels": [],
"title": "Export Runs Overview",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null }
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 },
"id": 2,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "10.0.0",
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"editorMode": "code",
"expr": "sum(increase(export_runs_total{tenant=~\"$tenant\"}[$__range]))",
"legendFormat": "Total Runs",
"range": true,
"refId": "A"
}
],
"title": "Total Export Runs",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null }
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 },
"id": 3,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "10.0.0",
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"editorMode": "code",
"expr": "sum(increase(export_runs_success_total{tenant=~\"$tenant\"}[$__range]))",
"legendFormat": "Successful",
"range": true,
"refId": "A"
}
],
"title": "Successful Runs",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 },
{ "color": "red", "value": 5 }
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 },
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "10.0.0",
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"editorMode": "code",
"expr": "sum(increase(export_runs_failed_total{tenant=~\"$tenant\"}[$__range]))",
"legendFormat": "Failed",
"range": true,
"refId": "A"
}
],
"title": "Failed Runs",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "yellow", "value": 95 },
{ "color": "green", "value": 99 }
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 },
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "10.0.0",
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"editorMode": "code",
"expr": "100 * sum(increase(export_runs_success_total{tenant=~\"$tenant\"}[$__range])) / sum(increase(export_runs_total{tenant=~\"$tenant\"}[$__range]))",
"legendFormat": "Success Rate",
"range": true,
"refId": "A"
}
],
"title": "Success Rate",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null }
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 },
"id": 6,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "10.0.0",
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"editorMode": "code",
"expr": "sum(export_runs_in_progress{tenant=~\"$tenant\"})",
"legendFormat": "In Progress",
"range": true,
"refId": "A"
}
],
"title": "Runs In Progress",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "auto",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 },
"id": 7,
"options": {
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"editorMode": "code",
"expr": "sum by (export_type) (rate(export_runs_total{tenant=~\"$tenant\"}[5m]))",
"legendFormat": "{{export_type}}",
"range": true,
"refId": "A"
}
],
"title": "Export Runs by Type (rate/5m)",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "auto",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "s"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 },
"id": 8,
"options": {
"legend": { "calcs": ["mean", "max", "p95"], "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"editorMode": "code",
"expr": "histogram_quantile(0.50, sum by (le) (rate(export_run_duration_seconds_bucket{tenant=~\"$tenant\"}[5m])))",
"legendFormat": "p50",
"range": true,
"refId": "A"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"editorMode": "code",
"expr": "histogram_quantile(0.95, sum by (le) (rate(export_run_duration_seconds_bucket{tenant=~\"$tenant\"}[5m])))",
"legendFormat": "p95",
"range": true,
"refId": "B"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by (le) (rate(export_run_duration_seconds_bucket{tenant=~\"$tenant\"}[5m])))",
"legendFormat": "p99",
"range": true,
"refId": "C"
}
],
"title": "Export Run Duration (latency percentiles)",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 },
"id": 9,
"panels": [],
"title": "Artifacts & Bundle Sizes",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "bars",
"fillOpacity": 50,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "normal" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 },
"id": 10,
"options": {
"legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"editorMode": "code",
"expr": "sum by (artifact_type) (increase(export_artifacts_total{tenant=~\"$tenant\"}[1h]))",
"legendFormat": "{{artifact_type}}",
"range": true,
"refId": "A"
}
],
"title": "Artifacts Exported by Type (per hour)",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "auto",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "bytes"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 },
"id": 11,
"options": {
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"editorMode": "code",
"expr": "histogram_quantile(0.50, sum by (le, export_type) (rate(export_bundle_size_bytes_bucket{tenant=~\"$tenant\"}[5m])))",
"legendFormat": "{{export_type}} p50",
"range": true,
"refId": "A"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"editorMode": "code",
"expr": "histogram_quantile(0.95, sum by (le, export_type) (rate(export_bundle_size_bytes_bucket{tenant=~\"$tenant\"}[5m])))",
"legendFormat": "{{export_type}} p95",
"range": true,
"refId": "B"
}
],
"title": "Bundle Size Distribution by Type",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 },
"id": 12,
"panels": [],
"title": "Error Analysis",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"hideFrom": { "legend": false, "tooltip": false, "viz": false }
},
"mappings": [],
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 23 },
"id": 13,
"options": {
"legend": { "displayMode": "table", "placement": "right", "showLegend": true },
"pieType": "pie",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"tooltip": { "mode": "single", "sort": "none" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"editorMode": "code",
"expr": "sum by (error_code) (increase(export_runs_failed_total{tenant=~\"$tenant\"}[$__range]))",
"legendFormat": "{{error_code}}",
"range": true,
"refId": "A"
}
],
"title": "Failures by Error Code",
"type": "piechart"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "line" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "red", "value": 0.01 }
]
},
"unit": "percentunit"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 16, "x": 8, "y": 23 },
"id": 14,
"options": {
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"editorMode": "code",
"expr": "sum(rate(export_runs_failed_total{tenant=~\"$tenant\"}[5m])) / sum(rate(export_runs_total{tenant=~\"$tenant\"}[5m]))",
"legendFormat": "Error Rate",
"range": true,
"refId": "A"
}
],
"title": "Error Rate (5m window)",
"type": "timeseries"
}
],
"refresh": "30s",
"schemaVersion": 38,
"style": "dark",
"tags": ["export-center", "stellaops"],
"templating": {
"list": [
{
"current": {},
"hide": 0,
"includeAll": false,
"multi": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
},
{
"allValue": ".*",
"current": {},
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"definition": "label_values(export_runs_total, tenant)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "tenant",
"options": [],
"query": { "query": "label_values(export_runs_total, tenant)", "refId": "StandardVariableQuery" },
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
}
]
},
"time": { "from": "now-6h", "to": "now" },
"timepicker": {},
"timezone": "utc",
"title": "ExportCenter Service",
"uid": "export-center-overview",
"version": 1,
"weekStart": ""
}

@@ -1,92 +0,0 @@
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
tls:
cert_file: ${STELLAOPS_OTEL_TLS_CERT:?STELLAOPS_OTEL_TLS_CERT not set}
key_file: ${STELLAOPS_OTEL_TLS_KEY:?STELLAOPS_OTEL_TLS_KEY not set}
client_ca_file: ${STELLAOPS_OTEL_TLS_CA:?STELLAOPS_OTEL_TLS_CA not set}
require_client_certificate: ${STELLAOPS_OTEL_REQUIRE_CLIENT_CERT:true}
http:
endpoint: 0.0.0.0:4318
tls:
cert_file: ${STELLAOPS_OTEL_TLS_CERT:?STELLAOPS_OTEL_TLS_CERT not set}
key_file: ${STELLAOPS_OTEL_TLS_KEY:?STELLAOPS_OTEL_TLS_KEY not set}
client_ca_file: ${STELLAOPS_OTEL_TLS_CA:?STELLAOPS_OTEL_TLS_CA not set}
require_client_certificate: ${STELLAOPS_OTEL_REQUIRE_CLIENT_CERT:true}
processors:
attributes/tenant-tag:
actions:
- key: tenant.id
action: insert
value: ${STELLAOPS_TENANT_ID:unknown}
batch:
send_batch_size: 1024
timeout: 5s
exporters:
logging:
verbosity: normal
prometheus:
endpoint: ${STELLAOPS_OTEL_PROMETHEUS_ENDPOINT:0.0.0.0:9464}
enable_open_metrics: true
metric_expiration: 5m
tls:
cert_file: ${STELLAOPS_OTEL_TLS_CERT:?STELLAOPS_OTEL_TLS_CERT not set}
key_file: ${STELLAOPS_OTEL_TLS_KEY:?STELLAOPS_OTEL_TLS_KEY not set}
client_ca_file: ${STELLAOPS_OTEL_TLS_CA:?STELLAOPS_OTEL_TLS_CA not set}
otlphttp/tempo:
endpoint: ${STELLAOPS_TEMPO_ENDPOINT:https://stellaops-tempo:3200}
compression: gzip
tls:
ca_file: ${STELLAOPS_TEMPO_TLS_CA_FILE:/etc/otel-collector/tls/ca.crt}
cert_file: ${STELLAOPS_TEMPO_TLS_CERT_FILE:/etc/otel-collector/tls/client.crt}
key_file: ${STELLAOPS_TEMPO_TLS_KEY_FILE:/etc/otel-collector/tls/client.key}
insecure_skip_verify: false
headers:
"X-Scope-OrgID": ${STELLAOPS_TENANT_ID:unknown}
loki/tenant:
endpoint: ${STELLAOPS_LOKI_ENDPOINT:https://stellaops-loki:3100/loki/api/v1/push}
tenant_id: ${STELLAOPS_TENANT_ID:unknown}
tls:
ca_file: ${STELLAOPS_LOKI_TLS_CA_FILE:/etc/otel-collector/tls/ca.crt}
cert_file: ${STELLAOPS_LOKI_TLS_CERT_FILE:/etc/otel-collector/tls/client.crt}
key_file: ${STELLAOPS_LOKI_TLS_KEY_FILE:/etc/otel-collector/tls/client.key}
insecure_skip_verify: false
default_labels_enabled:
exporter: false
job: false
instance: false
format: json
drain_interval: 5s
queue:
enabled: true
queue_size: 1024
retry_on_failure: true
extensions:
health_check:
endpoint: ${STELLAOPS_OTEL_HEALTH_ENDPOINT:0.0.0.0:13133}
pprof:
endpoint: ${STELLAOPS_OTEL_PPROF_ENDPOINT:0.0.0.0:1777}
service:
telemetry:
logs:
level: ${STELLAOPS_OTEL_LOG_LEVEL:info}
extensions: [health_check, pprof]
pipelines:
traces:
receivers: [otlp]
processors: [attributes/tenant-tag, batch]
exporters: [logging, otlphttp/tempo]
metrics:
receivers: [otlp]
processors: [attributes/tenant-tag, batch]
exporters: [logging, prometheus]
logs:
receivers: [otlp]
processors: [attributes/tenant-tag, batch]
exporters: [logging, loki/tenant]

@@ -1,36 +0,0 @@
# Telemetry Storage Stack
Configuration snippets for the default StellaOps observability backends used in
staging and production environments. The stack comprises:
- **Prometheus** for metrics (scraping the collector's Prometheus exporter)
- **Tempo** for traces (OTLP ingest via mTLS)
- **Loki** for logs (HTTP ingest with tenant isolation)
## Files
| Path | Description |
| ---- | ----------- |
| `prometheus.yaml` | Scrape configuration for the collector (mTLS + bearer token placeholder). |
| `tempo.yaml` | Tempo configuration with multitenancy enabled and local storage paths. |
| `loki.yaml` | Loki configuration enabling per-tenant overrides and boltdb-shipper storage. |
| `tenants/tempo-overrides.yaml` | Example tenant overrides for Tempo (retention, limits). |
| `tenants/loki-overrides.yaml` | Example tenant overrides for Loki (rate limits, retention). |
| `auth/` | Placeholder directory for Prometheus bearer token files (e.g., `token`). |
These configurations are referenced by the Docker Compose overlay
(`deploy/compose/docker-compose.telemetry-storage.yaml`) and the staging rollout documented in
`docs/modules/telemetry/operations/storage.md`. Adjust paths, credentials, and overrides before running in
connected environments. Place the Prometheus bearer token in `auth/token` when using the
Compose overlay (the directory contains a `.gitkeep` placeholder and is gitignored by default).
Run `python ops/devops/telemetry/validate_storage_stack.py` after editing any of these files to
ensure TLS, multitenancy, and override references remain intact.
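As a rough orientation, the overlay is expected to mount these files so that the container paths match the references in the configurations above (`/etc/telemetry/tls`, `/etc/telemetry/auth/token`, `/etc/telemetry/tenants/*`). Service names and exact mount paths below are illustrative; the authoritative definitions live in `deploy/compose/docker-compose.telemetry-storage.yaml`:

```yaml
# Illustrative excerpt only — not the actual overlay.
services:
  prometheus:
    volumes:
      - ./prometheus.yaml:/etc/prometheus/prometheus.yml:ro
      - ./auth/token:/etc/telemetry/auth/token:ro
      - ./tls:/etc/telemetry/tls:ro
  tempo:
    volumes:
      - ./tempo.yaml:/etc/tempo/tempo.yaml:ro
      - ./tenants/tempo-overrides.yaml:/etc/telemetry/tenants/tempo-overrides.yaml:ro
      - ./tls:/etc/telemetry/tls:ro
  loki:
    volumes:
      - ./loki.yaml:/etc/loki/loki.yaml:ro
      - ./tenants/loki-overrides.yaml:/etc/telemetry/tenants/loki-overrides.yaml:ro
```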
## Security
- Both Tempo and Loki require mutual TLS.
- Prometheus uses mTLS plus a bearer token that should be minted by Authority.
- Update the overrides files to enforce per-tenant retention/ingestion limits.
For comprehensive deployment steps see `docs/modules/telemetry/operations/storage.md`.

@@ -1,48 +0,0 @@
auth_enabled: true
server:
http_listen_port: 3100
log_level: info
common:
ring:
instance_addr: 127.0.0.1
kvstore:
store: inmemory
replication_factor: 1
path_prefix: /var/loki
schema_config:
configs:
- from: 2024-01-01
store: boltdb-shipper
object_store: filesystem
schema: v13
index:
prefix: loki_index_
period: 24h
storage_config:
filesystem:
directory: /var/loki/chunks
boltdb_shipper:
active_index_directory: /var/loki/index
cache_location: /var/loki/index_cache
shared_store: filesystem
ruler:
storage:
type: local
local:
directory: /var/loki/rules
rule_path: /tmp/loki-rules
enable_api: true
limits_config:
enforce_metric_name: false
reject_old_samples: true
reject_old_samples_max_age: 168h
max_entries_limit_per_query: 5000
ingestion_rate_mb: 10
ingestion_burst_size_mb: 20
per_tenant_override_config: /etc/telemetry/tenants/loki-overrides.yaml

@@ -1,19 +0,0 @@
global:
scrape_interval: 15s
evaluation_interval: 30s
scrape_configs:
- job_name: "stellaops-otel-collector"
scheme: https
metrics_path: /
tls_config:
ca_file: ${PROMETHEUS_TLS_CA_FILE:-/etc/telemetry/tls/ca.crt}
cert_file: ${PROMETHEUS_TLS_CERT_FILE:-/etc/telemetry/tls/client.crt}
key_file: ${PROMETHEUS_TLS_KEY_FILE:-/etc/telemetry/tls/client.key}
insecure_skip_verify: false
authorization:
type: Bearer
credentials_file: ${PROMETHEUS_BEARER_TOKEN_FILE:-/etc/telemetry/auth/token}
static_configs:
- targets:
- ${PROMETHEUS_COLLECTOR_TARGET:-stellaops-otel-collector:9464}

@@ -1,56 +0,0 @@
multitenancy_enabled: true
usage_report:
reporting_enabled: false
server:
http_listen_port: 3200
log_level: info
distributor:
receivers:
otlp:
protocols:
grpc:
tls:
cert_file: ${TEMPO_TLS_CERT_FILE:-/etc/telemetry/tls/server.crt}
key_file: ${TEMPO_TLS_KEY_FILE:-/etc/telemetry/tls/server.key}
client_ca_file: ${TEMPO_TLS_CA_FILE:-/etc/telemetry/tls/ca.crt}
require_client_cert: true
http:
tls:
cert_file: ${TEMPO_TLS_CERT_FILE:-/etc/telemetry/tls/server.crt}
key_file: ${TEMPO_TLS_KEY_FILE:-/etc/telemetry/tls/server.key}
client_ca_file: ${TEMPO_TLS_CA_FILE:-/etc/telemetry/tls/ca.crt}
require_client_cert: true
ingester:
lifecycler:
ring:
instance_availability_zone: ${TEMPO_ZONE:-zone-a}
trace_idle_period: 10s
max_block_bytes: 1_048_576
compactor:
compaction:
block_retention: 168h
metrics_generator:
registry:
external_labels:
cluster: stellaops
storage:
trace:
backend: local
local:
path: /var/tempo/traces
wal:
path: /var/tempo/wal
metrics:
backend: prometheus
overrides:
defaults:
ingestion_rate_limit_bytes: 1048576
max_traces_per_user: 200000
per_tenant_override_config: /etc/telemetry/tenants/tempo-overrides.yaml

@@ -1,19 +0,0 @@
# Example Loki per-tenant overrides
# Adjust according to https://grafana.com/docs/loki/latest/configuration/#limits_config
stellaops-dev:
ingestion_rate_mb: 10
ingestion_burst_size_mb: 20
max_global_streams_per_user: 5000
retention_period: 168h
stellaops-stage:
ingestion_rate_mb: 20
ingestion_burst_size_mb: 40
max_global_streams_per_user: 10000
retention_period: 336h
__default__:
ingestion_rate_mb: 5
ingestion_burst_size_mb: 10
retention_period: 72h

@@ -1,16 +0,0 @@
# Example Tempo per-tenant overrides
# Consult https://grafana.com/docs/tempo/latest/configuration/#limits-configuration
# before applying in production.
stellaops-dev:
traces_per_second_limit: 100000
max_bytes_per_trace: 10485760
max_search_bytes_per_trace: 20971520
stellaops-stage:
traces_per_second_limit: 200000
max_bytes_per_trace: 20971520
__default__:
traces_per_second_limit: 50000
max_bytes_per_trace: 5242880