devops folders consolidate

2026-01-25 23:27:41 +02:00
parent 6e687b523a
commit a50bbb38ef
334 changed files with 35079 additions and 5569 deletions
--- a/deploy/telemetry/alerts/alerts-slo.yaml
+++ b/deploy/telemetry/alerts/alerts-slo.yaml
@@ -0,0 +1,36 @@
+groups:
+  - name: slo-burn
+    rules:
+      # Critical: 5m error ratio above 4x the 1% error budget of the 99% SLO,
+      # sustained for 5m.
+      - alert: SLOBurnRateFast
+        expr: |
+          (
+            rate(service_request_errors_total[5m])
+            /
+            rate(service_requests_total[5m])
+          ) > 4 * (1 - 0.99)
+        for: 5m
+        labels:
+          severity: critical
+          team: devops
+        annotations:
+          summary: 'Fast burn: 99% SLO breached'
+          description: 'Error budget burn (5m) exceeds fast threshold.'
+      # Warning: 1h error ratio above 1x the 1% error budget of the 99% SLO,
+      # sustained for 1h.
+      - alert: SLOBurnRateSlow
+        expr: |
+          (
+            rate(service_request_errors_total[1h])
+            /
+            rate(service_requests_total[1h])
+          ) > 1 * (1 - 0.99)
+        for: 1h
+        labels:
+          severity: warning
+          team: devops
+        annotations:
+          summary: 'Slow burn: 99% SLO at risk'
+          description: 'Error budget burn (1h) exceeds slow threshold.'
+  - name: slo-webhook
+    rules:
+      # Warning: slo_webhook_failures_total shows a non-zero 5m rate for 10m.
+      - alert: SLOWebhookFailures
+        expr: |
+          rate(slo_webhook_failures_total[5m]) > 0
+        for: 10m
+        labels:
+          severity: warning
+          team: devops
+        annotations:
+          summary: 'SLO webhook failures'
+          description: 'Webhook emitter has failures in last 5m.'
--- a/deploy/telemetry/alerts/export-center-alerts.yaml
+++ b/deploy/telemetry/alerts/export-center-alerts.yaml
@@ -0,0 +1,164 @@
+# ExportCenter Alert Rules
+# SLO Burn-rate alerts for export service reliability
+
+groups:
+  - name: export-center-slo
+    interval: 30s
+    rules:
+      # SLO: 99.5% success rate target
+      # Error budget: 0.5% (432 errors per day at 86400 requests/day)
+
+      # Fast burn (critical): 2% budget consumption in 1 hour, i.e. the 1h
+      # failure ratio exceeds 14.4x the 0.5% error budget.
+      - alert: ExportCenterHighErrorBurnRate
+        expr: |
+          sum(rate(export_runs_failed_total[1h])) / sum(rate(export_runs_total[1h]))
+          > (14.4 * 0.005)
+        for: 2m
+        labels:
+          severity: critical
+          service: export-center
+          slo: availability
+        annotations:
+          summary: 'ExportCenter high error burn rate'
+          description: 'Error rate is {{ $value | humanizePercentage }} over the last hour, consuming error budget at 14.4x the sustainable rate.'
+          runbook_url: 'https://docs.stellaops.io/runbooks/export-center/high-error-rate'
+
+      # Slow burn - 5% budget consumption in 6 hours (warning)
+      - alert: ExportCenterElevatedErrorBurnRate
+        # 6h failure ratio compared against 6x the 0.5% error budget.
+        expr: |
+          sum(rate(export_runs_failed_total[6h])) / sum(rate(export_runs_total[6h]))
+          > (6 * 0.005)
+        for: 5m
+        labels:
+          severity: warning
+          service: export-center
+          slo: availability
+        annotations:
+          summary: 'ExportCenter elevated error burn rate'
+          description: 'Error rate is {{ $value | humanizePercentage }} over the last 6 hours, consuming error budget at 6x the sustainable rate.'
+          runbook_url: 'https://docs.stellaops.io/runbooks/export-center/elevated-error-rate'
+
+  - name: export-center-latency
+    interval: 30s
+    rules:
+      # SLO: 95% of exports complete within 120s
+      # Warning: p95 export duration (5m window) above the 120s target for 5m
+      - alert: ExportCenterHighLatency
+        expr: |
+          histogram_quantile(
+            0.95,
+            sum by (le) (rate(export_run_duration_seconds_bucket[5m]))
+          ) > 120
+        for: 5m
+        labels:
+          severity: warning
+          service: export-center
+          slo: latency
+        annotations:
+          summary: 'ExportCenter high latency'
+          description: '95th percentile export duration is {{ $value | humanizeDuration }}, exceeding 120s SLO target.'
+          runbook_url: 'https://docs.stellaops.io/runbooks/export-center/high-latency'
+
+      # Critical: p99 export duration (5m window) above 300s (5 minutes) for 2m
+      - alert: ExportCenterCriticalLatency
+        expr: |
+          histogram_quantile(
+            0.99,
+            sum by (le) (rate(export_run_duration_seconds_bucket[5m]))
+          ) > 300
+        for: 2m
+        labels:
+          severity: critical
+          service: export-center
+          slo: latency
+        annotations:
+          summary: 'ExportCenter critical latency'
+          description: '99th percentile export duration is {{ $value | humanizeDuration }}, indicating severe performance degradation.'
+          runbook_url: 'https://docs.stellaops.io/runbooks/export-center/critical-latency'
+
+  - name: export-center-capacity
+    interval: 60s
+    rules:
+      # Queue buildup warning: more than 50 exports in progress (summed over
+      # all series) for 5m
+      - alert: ExportCenterHighConcurrency
+        expr: |
+          sum(export_runs_in_progress) > 50
+        for: 5m
+        labels:
+          severity: warning
+          service: export-center
+        annotations:
+          summary: 'ExportCenter high concurrency'
+          description: '{{ $value }} exports currently in progress. Consider scaling or investigating slow exports.'
+          runbook_url: 'https://docs.stellaops.io/runbooks/export-center/high-concurrency'
+
+      # Possibly stuck exports: p99 duration of runs whose status is not
+      # "completed" (1h window) above 1800s (30 minutes) for 10m
+      - alert: ExportCenterStuckExports
+        expr: |
+          histogram_quantile(
+            0.99,
+            sum by (le) (
+              rate(export_run_duration_seconds_bucket{status!="completed"}[1h])
+            )
+          ) > 1800
+        for: 10m
+        labels:
+          severity: warning
+          service: export-center
+        annotations:
+          summary: 'ExportCenter potentially stuck exports'
+          description: 'Some exports may be stuck - 99th percentile duration for incomplete exports exceeds 30 minutes.'
+          runbook_url: 'https://docs.stellaops.io/runbooks/export-center/stuck-exports'
+
+  - name: export-center-errors
+    interval: 30s
+    rules:
+      # Per-error_code failure rate (5m window) above 0.1/s for 5m
+      - alert: ExportCenterErrorCodeSpike
+        expr: |
+          sum(rate(export_runs_failed_total[5m])) by (error_code) > 0.1
+        for: 5m
+        labels:
+          severity: warning
+          service: export-center
+        annotations:
+          summary: 'ExportCenter error code spike: {{ $labels.error_code }}'
+          description: 'Error code {{ $labels.error_code }} is occurring at {{ $value | humanize }}/s rate.'
+          runbook_url: 'https://docs.stellaops.io/runbooks/export-center/error-codes'
+
+      # No successful exports in 15 minutes (when there is traffic).
+      # `or vector(0)` makes the success side evaluate to 0 when no
+      # export_runs_success_total series exists at all; without it sum() over
+      # an empty selector returns no sample, so the `and` could never match and
+      # the alert would stay silent if nothing had ever succeeded.
+      - alert: ExportCenterNoSuccessfulExports
+        expr: |
+          (
+            sum(rate(export_runs_total[15m])) > 0
+          )
+          and
+          (
+            (sum(rate(export_runs_success_total[15m])) or vector(0)) == 0
+          )
+        for: 10m
+        labels:
+          severity: critical
+          service: export-center
+        annotations:
+          summary: "ExportCenter no successful exports"
+          description: "No exports have completed successfully in the last 15 minutes despite ongoing attempts."
+          runbook_url: "https://docs.stellaops.io/runbooks/export-center/no-successful-exports"
+
+  - name: export-center-deprecation
+    interval: 5m
+    rules:
+      # Info: deprecated endpoint access counter shows a non-zero 1h rate for 1h
+      - alert: ExportCenterDeprecatedEndpointUsage
+        expr: sum(rate(export_center_deprecated_endpoint_access_total[1h])) > 0
+        for: 1h
+        labels:
+          severity: info
+          service: export-center
+        annotations:
+          summary: 'Deprecated export endpoints still in use'
+          description: 'Legacy /exports endpoints are still being accessed at {{ $value | humanize }}/s. Migration to v1 API recommended.'
+          runbook_url: 'https://docs.stellaops.io/api/export-center/migration'
--- a/deploy/telemetry/alerts/policy-alerts.yaml
+++ b/deploy/telemetry/alerts/policy-alerts.yaml
@@ -0,0 +1,52 @@
+groups:
+  - name: policy-pipeline
+    rules:
+      # Warning: p99 of policy_compile_duration_seconds (5m window) above 5s
+      # for 10m.
+      - alert: PolicyCompileLatencyP99High
+        expr: |
+          histogram_quantile(
+            0.99,
+            sum by (le) (rate(policy_compile_duration_seconds_bucket[5m]))
+          ) > 5
+        for: 10m
+        labels:
+          severity: warning
+          service: policy
+        annotations:
+          summary: 'Policy compile latency elevated (p99)'
+          description: 'p99 compile duration has been >5s for 10m'
+
+      # Warning: total policy_simulation_queue_depth above 100 for 10m.
+      - alert: PolicySimulationQueueBacklog
+        expr: |
+          sum(policy_simulation_queue_depth) > 100
+        for: 10m
+        labels:
+          severity: warning
+          service: policy
+        annotations:
+          summary: 'Policy simulation backlog'
+          description: 'Simulation queue depth above 100 for 10m'
+
+      # Critical: p95 of policy_approval_latency_seconds (5m window) above 30s
+      # for 15m.
+      - alert: PolicyApprovalLatencyHigh
+        expr: |
+          histogram_quantile(
+            0.95,
+            sum by (le) (rate(policy_approval_latency_seconds_bucket[5m]))
+          ) > 30
+        for: 15m
+        labels:
+          severity: critical
+          service: policy
+        annotations:
+          summary: 'Policy approval latency high'
+          description: 'p95 approval latency above 30s for 15m'
+
+      # Critical: more than 20% of policy promotions failed over a 15m window.
+      # Both sides are aggregated with sum() so the ratio is failures / all
+      # outcomes; un-aggregated, one-to-one vector matching would pair the
+      # outcome="failure" series only with itself. The denominator is not
+      # clamped: rate() is per-second, so a floor of 1 would mean "at least one
+      # promotion per second" and distort the ratio at lower traffic. With no
+      # promotions the ratio is NaN, which never satisfies the comparison.
+      - alert: PolicyPromotionFailureRate
+        expr: |
+          sum(rate(policy_promotion_outcomes_total{outcome="failure"}[15m]))
+          /
+          sum(rate(policy_promotion_outcomes_total[15m]))
+          > 0.2
+        for: 10m
+        labels:
+          severity: critical
+          service: policy
+        annotations:
+          summary: "Policy promotion failure rate elevated"
+          description: "Failures exceed 20% of promotions over 15m"
+
+      # Warning: no successful promotions in 10m while simulation work is queued.
+      # The success rate is aggregated with sum() and joined with `and on()`
+      # so both operands have an empty label set; a raw rate() vector still
+      # carries its series labels and would never match the label-less
+      # sum(policy_simulation_queue_depth). `or vector(0)` treats a missing
+      # success series as a zero rate.
+      - alert: PolicyPromotionStall
+        expr: |
+          (sum(rate(policy_promotion_outcomes_total{outcome="success"}[10m])) or vector(0)) == 0
+          and on()
+          sum(policy_simulation_queue_depth) > 0
+        for: 10m
+        labels:
+          severity: warning
+          service: policy
+        annotations:
+          summary: "Policy promotion stalled"
+          description: "No successful promotions while work is queued"
--- a/deploy/telemetry/alerts/scanner-fn-drift-alerts.yaml
+++ b/deploy/telemetry/alerts/scanner-fn-drift-alerts.yaml
@@ -0,0 +1,42 @@
+# Scanner FN-Drift Alert Rules
+# SLO alerts for false-negative drift thresholds (30-day rolling window)
+
+groups:
+  - name: scanner-fn-drift
+    interval: 30s
+    rules:
+      # Warning when scanner_fn_drift_percent exceeds 1.0 for 5m.
+      # The value is already expressed in percent (threshold 1.0 is described
+      # as 1.0%), so the annotation prints the raw number; humanizePercentage
+      # expects a 0-1 ratio and would render 1.5 as "150%".
+      - alert: ScannerFnDriftWarning
+        expr: scanner_fn_drift_percent > 1.0
+        for: 5m
+        labels:
+          severity: warning
+          service: scanner
+          slo: fn-drift
+        annotations:
+          summary: "Scanner FN-Drift rate above warning threshold"
+          description: 'FN-Drift is {{ printf "%.2f" $value }}% (> 1.0%) over the 30-day rolling window.'
+          runbook_url: "https://docs.stellaops.io/runbooks/scanner/fn-drift-warning"
+
+      # Critical when scanner_fn_drift_percent exceeds 2.5 for 5m.
+      # The value is already expressed in percent (threshold 2.5 is described
+      # as 2.5%), so the annotation prints the raw number; humanizePercentage
+      # expects a 0-1 ratio and would render 2.7 as "270%".
+      - alert: ScannerFnDriftCritical
+        expr: scanner_fn_drift_percent > 2.5
+        for: 5m
+        labels:
+          severity: critical
+          service: scanner
+          slo: fn-drift
+        annotations:
+          summary: "Scanner FN-Drift rate above critical threshold"
+          description: 'FN-Drift is {{ printf "%.2f" $value }}% (> 2.5%) over the 30-day rolling window.'
+          runbook_url: "https://docs.stellaops.io/runbooks/scanner/fn-drift-critical"
+
+      # Page when scanner_fn_drift_cause_engine is above zero for 1m.
+      - alert: ScannerFnDriftEngineViolation
+        expr: |
+          scanner_fn_drift_cause_engine > 0
+        for: 1m
+        labels:
+          severity: page
+          service: scanner
+          slo: determinism
+        annotations:
+          summary: 'Engine-caused FN drift detected (determinism violation)'
+          description: 'Engine-caused FN drift count is {{ $value }} (> 0). This indicates non-feed, non-policy changes affecting outcomes.'
+          runbook_url: 'https://docs.stellaops.io/runbooks/scanner/fn-drift-engine-violation'
--- a/deploy/telemetry/alerts/signals-alerts.yaml
+++ b/deploy/telemetry/alerts/signals-alerts.yaml
@@ -0,0 +1,54 @@
+groups:
+  - name: signals-pipeline
+    rules:
+      # Warning: p95 of signals_reachability_scoring_duration_seconds
+      # (5m window) above 2s for 10m.
+      - alert: SignalsScoringLatencyP95High
+        expr: |
+          histogram_quantile(
+            0.95,
+            sum by (le) (
+              rate(signals_reachability_scoring_duration_seconds_bucket[5m])
+            )
+          ) > 2
+        for: 10m
+        labels:
+          severity: warning
+          service: signals
+        annotations:
+          summary: 'Signals scoring latency high (p95)'
+          description: 'Reachability scoring p95 exceeds 2s for 10m'
+
+      # Warning: cache miss ratio misses / (hits + misses) above 30%, computed
+      # from 5m rates and sustained for 10m.
+      # The denominator is no longer clamped to 1: rate() is per-second, so the
+      # clamp understated the ratio whenever total cache traffic was below one
+      # operation per second. With no traffic the ratio is NaN and the
+      # comparison is false. `or vector(0)` on the hits side keeps the ratio
+      # defined when only misses have been recorded.
+      - alert: SignalsCacheMissRateHigh
+        expr: |
+          sum(rate(signals_cache_misses_total[5m]))
+          /
+          (
+            (sum(rate(signals_cache_hits_total[5m])) or vector(0))
+            + sum(rate(signals_cache_misses_total[5m]))
+          )
+          > 0.3
+        for: 10m
+        labels:
+          severity: warning
+          service: signals
+        annotations:
+          summary: "Signals cache miss rate high"
+          description: "Cache miss ratio >30% over 10m; investigate Redis or key churn."
+
+      # Critical: signals_cache_available reports 0 for 2m.
+      - alert: SignalsCacheDown
+        expr: |
+          signals_cache_available == 0
+        for: 2m
+        labels:
+          severity: critical
+          service: signals
+        annotations:
+          summary: 'Signals cache unavailable'
+          description: 'Redis cache reported unavailable for >2m'
+
+      # Warning: per sensor, the newest last-seen timestamp is more than 900s
+      # (15 minutes) older than the evaluation time, for 5m.
+      - alert: SignalsSensorStaleness
+        expr: |
+          time() - max by (sensor) (signals_sensor_last_seen_timestamp_seconds) > 900
+        for: 5m
+        labels:
+          severity: warning
+          service: signals
+        annotations:
+          summary: 'Signals sensor stale'
+          description: 'No updates from sensor for >15 minutes'
+
+      # Critical: ingestion failures exceed 5% of all ingestions, computed
+      # from 5m rates and sustained for 5m.
+      # Both sides are summed and the denominator is not clamped: rate() is
+      # per-second, so a floor of 1 understated the ratio whenever ingestion
+      # ran below one event per second. With no traffic the ratio is NaN and
+      # the comparison is false.
+      - alert: SignalsIngestionErrorRate
+        expr: |
+          sum(rate(signals_ingestion_failures_total[5m]))
+          /
+          sum(rate(signals_ingestion_total[5m]))
+          > 0.05
+        for: 5m
+        labels:
+          severity: critical
+          service: signals
+        annotations:
+          summary: "Signals ingestion failures elevated"
+          description: "Ingestion failure ratio above 5% over 5m"
--- a/deploy/telemetry/alerts/stella-p0-alerts.yml
+++ b/deploy/telemetry/alerts/stella-p0-alerts.yml
@@ -0,0 +1,118 @@
+# Sprint: SPRINT_20260117_028_Telemetry_p0_metrics
+# Task: P0M-006 - Alerting Rules
+# P0 Product Metrics Alert Rules
+
+groups:
+  - name: stella-p0-metrics
+    rules:
+      # P0M-001: Time to First Verified Release
+      # Warning: per-tenant P90 time to first verified release (24h window)
+      # above 14400s (4 hours) for 1h.
+      - alert: StellaTimeToFirstReleaseHigh
+        expr: |
+          histogram_quantile(
+            0.90,
+            sum by (le, tenant) (
+              rate(stella_time_to_first_verified_release_seconds_bucket[24h])
+            )
+          ) > 14400
+        for: 1h
+        labels:
+          severity: warning
+          category: adoption
+        annotations:
+          summary: 'Time to first verified release is high for tenant {{ $labels.tenant }}'
+          description: 'P90 time to first verified release is {{ $value | humanizeDuration }} (threshold: 4 hours)'
+          runbook_url: 'https://docs.stella-ops.org/runbooks/adoption-onboarding'
+          
+      # Critical: per-tenant P90 time to first verified release (24h window)
+      # above 86400s (24 hours) for 1h.
+      - alert: StellaTimeToFirstReleaseCritical
+        expr: |
+          histogram_quantile(
+            0.90,
+            sum by (le, tenant) (
+              rate(stella_time_to_first_verified_release_seconds_bucket[24h])
+            )
+          ) > 86400
+        for: 1h
+        labels:
+          severity: critical
+          category: adoption
+        annotations:
+          summary: 'Time to first verified release critically high for tenant {{ $labels.tenant }}'
+          description: 'P90 time to first verified release is {{ $value | humanizeDuration }} (threshold: 24 hours)'
+          runbook_url: 'https://docs.stella-ops.org/runbooks/adoption-onboarding'
+
+      # P0M-002: Why Blocked Latency
+      # Warning: per-tenant P90 why-blocked latency (1h window) above 300s
+      # (5 minutes) for 30m.
+      - alert: StellaWhyBlockedLatencyHigh
+        expr: |
+          histogram_quantile(
+            0.90,
+            sum by (le, tenant) (
+              rate(stella_why_blocked_latency_seconds_bucket[1h])
+            )
+          ) > 300
+        for: 30m
+        labels:
+          severity: warning
+          category: usability
+        annotations:
+          summary: 'Why-blocked latency is high for tenant {{ $labels.tenant }}'
+          description: "P90 time to answer 'why blocked' is {{ $value | humanizeDuration }} (threshold: 5 minutes)"
+          runbook_url: 'https://docs.stella-ops.org/runbooks/usability-explain'
+
+      # Critical: per-tenant P90 why-blocked latency (1h window) above 3600s
+      # (1 hour) for 30m.
+      - alert: StellaWhyBlockedLatencyCritical
+        expr: |
+          histogram_quantile(
+            0.90,
+            sum by (le, tenant) (
+              rate(stella_why_blocked_latency_seconds_bucket[1h])
+            )
+          ) > 3600
+        for: 30m
+        labels:
+          severity: critical
+          category: usability
+        annotations:
+          summary: 'Why-blocked latency critically high for tenant {{ $labels.tenant }}'
+          description: "P90 time to answer 'why blocked' is {{ $value | humanizeDuration }} (threshold: 1 hour)"
+          runbook_url: 'https://docs.stella-ops.org/runbooks/usability-explain'
+
+      # P0M-003: Support Burden
+      # Warning: support minutes summed per (tenant, month) exceed 30; fires
+      # immediately (for: 0m).
+      - alert: StellaSupportBurdenHigh
+        expr: sum(stella_support_burden_minutes_total) by (tenant, month) > 30
+        for: 0m
+        labels:
+          severity: warning
+          category: operations
+        annotations:
+          summary: 'Support burden high for tenant {{ $labels.tenant }}'
+          description: 'Support time for {{ $labels.tenant }} in {{ $labels.month }} is {{ $value }} minutes (threshold: 30 minutes)'
+          runbook_url: 'https://docs.stella-ops.org/runbooks/support-optimization'
+
+      # Critical: support minutes summed per (tenant, month) exceed 60; fires
+      # immediately (for: 0m).
+      - alert: StellaSupportBurdenCritical
+        expr: sum(stella_support_burden_minutes_total) by (tenant, month) > 60
+        for: 0m
+        labels:
+          severity: critical
+          category: operations
+        annotations:
+          summary: 'Support burden critically high for tenant {{ $labels.tenant }}'
+          description: 'Support time for {{ $labels.tenant }} in {{ $labels.month }} is {{ $value }} minutes (threshold: 60 minutes)'
+          runbook_url: 'https://docs.stella-ops.org/runbooks/support-optimization'
+
+      # P0M-004: Determinism Regressions
+      # Critical: any increase of the severity="policy" regression counter
+      # within 5m; fires immediately (for: 0m).
+      - alert: StellaDeterminismRegression
+        expr: increase(stella_determinism_regressions_total{severity="policy"}[5m]) > 0
+        for: 0m
+        labels:
+          severity: critical
+          category: reliability
+        annotations:
+          summary: 'Policy-level determinism regression detected for tenant {{ $labels.tenant }}'
+          description: 'Determinism failure in {{ $labels.component }} component - same inputs produced different policy decisions'
+          runbook_url: 'https://docs.stella-ops.org/runbooks/determinism-failure'
+
+      # Warning: any increase of the severity="semantic" regression counter
+      # within 1h; fires immediately (for: 0m).
+      - alert: StellaDeterminismRegressionSemantic
+        expr: increase(stella_determinism_regressions_total{severity="semantic"}[1h]) > 0
+        for: 0m
+        labels:
+          severity: warning
+          category: reliability
+        annotations:
+          summary: 'Semantic determinism regression detected for tenant {{ $labels.tenant }}'
+          description: 'Semantic-level determinism failure in {{ $labels.component }} - outputs differ but policy decision unchanged'
+          runbook_url: 'https://docs.stella-ops.org/runbooks/determinism-failure'
+
+      # Warning: the severity="bitwise" regression counter increased by more
+      # than 5 within 24h; fires immediately (for: 0m).
+      - alert: StellaDeterminismRegressionBitwise
+        expr: increase(stella_determinism_regressions_total{severity="bitwise"}[24h]) > 5
+        for: 0m
+        labels:
+          severity: warning
+          category: reliability
+        annotations:
+          summary: 'Multiple bitwise determinism regressions for tenant {{ $labels.tenant }}'
+          description: '{{ $value }} bitwise-level determinism failures in {{ $labels.component }} in last 24h'
+          runbook_url: 'https://docs.stella-ops.org/runbooks/determinism-failure'
--- a/deploy/telemetry/alerts/triage-alerts.yaml
+++ b/deploy/telemetry/alerts/triage-alerts.yaml
@@ -0,0 +1,62 @@
+groups:
+  - name: triage-ttfs
+    rules:
+      # Critical: p95 of stellaops_ttfs_first_evidence_seconds (5m window)
+      # above 1.5s for 10m.
+      - alert: TriageTtfsFirstEvidenceP95High
+        expr: |
+          histogram_quantile(
+            0.95,
+            sum by (le) (rate(stellaops_ttfs_first_evidence_seconds_bucket[5m]))
+          ) > 1.5
+        for: 10m
+        labels:
+          severity: critical
+          service: triage
+        annotations:
+          summary: 'TTFS first evidence p95 high'
+          description: 'TTFS first-evidence p95 exceeds 1.5s for 10m (triage experience degraded).'
+
+      # Warning: p95 of stellaops_ttfs_skeleton_seconds (5m window) above
+      # 0.2s (200ms) for 10m.
+      - alert: TriageTtfsSkeletonP95High
+        expr: |
+          histogram_quantile(
+            0.95,
+            sum by (le) (rate(stellaops_ttfs_skeleton_seconds_bucket[5m]))
+          ) > 0.2
+        for: 10m
+        labels:
+          severity: warning
+          service: triage
+        annotations:
+          summary: 'TTFS skeleton p95 high'
+          description: 'TTFS skeleton p95 exceeds 200ms for 10m.'
+
+      # Warning: p95 of stellaops_ttfs_full_evidence_seconds (5m window)
+      # above 1.5s for 10m.
+      - alert: TriageTtfsFullEvidenceP95High
+        expr: |
+          histogram_quantile(
+            0.95,
+            sum by (le) (rate(stellaops_ttfs_full_evidence_seconds_bucket[5m]))
+          ) > 1.5
+        for: 10m
+        labels:
+          severity: warning
+          service: triage
+        annotations:
+          summary: 'TTFS full evidence p95 high'
+          description: 'TTFS full-evidence p95 exceeds 1.5s for 10m.'
+
+      # Warning: median (p50) of stellaops_clicks_to_closure (5m window)
+      # above 6 for 15m.
+      - alert: TriageClicksToClosureMedianHigh
+        expr: |
+          histogram_quantile(
+            0.50,
+            sum by (le) (rate(stellaops_clicks_to_closure_bucket[5m]))
+          ) > 6
+        for: 15m
+        labels:
+          severity: warning
+          service: triage
+        annotations:
+          summary: 'Clicks-to-closure median high'
+          description: 'Median clicks-to-closure exceeds 6 for 15m.'
+
+      # Warning: average evidence completeness score (rate of _sum divided by
+      # rate of _count over 15m) below 3.6 for 30m.
+      # The denominator is not clamped: rate() is per-second, so a floor of 1
+      # turned the average into 0 when no scores were recorded (0 / 1 < 3.6, a
+      # false alert) and understated it below one sample per second. With no
+      # samples the division yields NaN, which never satisfies the comparison.
+      - alert: TriageEvidenceCompletenessAvgLow
+        expr: |
+          (
+            sum(rate(stellaops_evidence_completeness_score_sum[15m]))
+            /
+            sum(rate(stellaops_evidence_completeness_score_count[15m]))
+          ) < 3.6
+        for: 30m
+        labels:
+          severity: warning
+          service: triage
+        annotations:
+          summary: "Evidence completeness below target"
+          description: "Average evidence completeness score below 3.6 (90%) for 30m."
+
+      # Warning: per-phase performance budget violation rate (5m window)
+      # above 0.05/s for 10m.
+      - alert: TriageBudgetViolationRateHigh
+        expr: |
+          sum by (phase) (
+            rate(stellaops_performance_budget_violations_total[5m])
+          ) > 0.05
+        for: 10m
+        labels:
+          severity: warning
+          service: triage
+        annotations:
+          summary: 'Performance budget violations elevated'
+          description: 'Performance budget violation rate exceeds 0.05/s for 10m.'