devops folders consolidate

This commit is contained in:
master
2026-01-25 23:27:41 +02:00
parent 6e687b523a
commit a50bbb38ef
334 changed files with 35079 additions and 5569 deletions

View File

@@ -0,0 +1,36 @@
groups:
# Burn-rate alerts for a 99% SLO. Each threshold is written as a multiple of
# the 1% error budget, i.e. N * (1 - 0.99).
- name: slo-burn
  rules:
  # 5m error ratio above 4x the budget, sustained for 5 minutes.
  - alert: SLOBurnRateFast
    expr: |
      (rate(service_request_errors_total[5m]) / rate(service_requests_total[5m])) >
      4 * (1 - 0.99)
    for: 5m
    labels:
      severity: critical
      team: devops
    annotations:
      summary: 'Fast burn: 99% SLO breached'
      description: 'Error budget burn (5m) exceeds fast threshold.'

  # 1h error ratio above 1x the budget, sustained for 1 hour.
  - alert: SLOBurnRateSlow
    expr: |
      (rate(service_request_errors_total[1h]) / rate(service_requests_total[1h])) >
      1 * (1 - 0.99)
    for: 1h
    labels:
      severity: warning
      team: devops
    annotations:
      summary: 'Slow burn: 99% SLO at risk'
      description: 'Error budget burn (1h) exceeds slow threshold.'
# Health of the SLO webhook emitter.
- name: slo-webhook
  rules:
  # Any non-zero 5m failure rate that persists for 10 minutes.
  - alert: SLOWebhookFailures
    expr: rate(slo_webhook_failures_total[5m]) > 0
    for: 10m
    labels:
      severity: warning
      team: devops
    annotations:
      summary: 'SLO webhook failures'
      description: 'Webhook emitter has failures in last 5m.'

View File

@@ -0,0 +1,164 @@
# ExportCenter Alert Rules
# SLO Burn-rate alerts for export service reliability
groups:
# Availability burn-rate alerts for the export service.
#
# SLO: 99.5% success rate target
# Error budget: 0.5% (432 errors per day at 86400 requests/day)
#
# Thresholds are burn-rate multiples of the 0.5% budget. The budget
# percentages quoted below assume a 30-day (720h) SLO window:
#   consumed = burn_rate * window_hours / 720
- name: export-center-slo
  interval: 30s
  rules:
  # Fast burn - 2% budget consumption in 1 hour (critical)
  # 14.4 * 1h / 720h = 2%
  - alert: ExportCenterHighErrorBurnRate
    expr: |
      (
        sum(rate(export_runs_failed_total[1h]))
        /
        sum(rate(export_runs_total[1h]))
      ) > (14.4 * 0.005)
    for: 2m
    labels:
      severity: critical
      service: export-center
      slo: availability
    annotations:
      summary: 'ExportCenter high error burn rate'
      description: 'Error rate is {{ $value | humanizePercentage }} over the last hour, consuming error budget at 14.4x the sustainable rate.'
      runbook_url: 'https://docs.stellaops.io/runbooks/export-center/high-error-rate'

  # Slow burn - 5% budget consumption in 6 hours (warning)
  # 6 * 6h / 720h = 5%
  - alert: ExportCenterElevatedErrorBurnRate
    expr: |
      (
        sum(rate(export_runs_failed_total[6h]))
        /
        sum(rate(export_runs_total[6h]))
      ) > (6 * 0.005)
    for: 5m
    labels:
      severity: warning
      service: export-center
      slo: availability
    annotations:
      summary: 'ExportCenter elevated error burn rate'
      description: 'Error rate is {{ $value | humanizePercentage }} over the last 6 hours, consuming error budget at 6x the sustainable rate.'
      runbook_url: 'https://docs.stellaops.io/runbooks/export-center/elevated-error-rate'
# Latency alerts derived from the export_run_duration_seconds histogram.
# SLO: 95% of exports complete within 120s
- name: export-center-latency
  interval: 30s
  rules:
  # p95 over a 5m rate window above the 120s target for 5 minutes.
  - alert: ExportCenterHighLatency
    expr: |
      histogram_quantile(0.95,
        sum(rate(export_run_duration_seconds_bucket[5m])) by (le)
      ) > 120
    for: 5m
    labels:
      severity: warning
      service: export-center
      slo: latency
    annotations:
      summary: 'ExportCenter high latency'
      description: '95th percentile export duration is {{ $value | humanizeDuration }}, exceeding 120s SLO target.'
      runbook_url: 'https://docs.stellaops.io/runbooks/export-center/high-latency'

  # p99 over a 5m rate window above 300s (5 minutes) for 2 minutes.
  - alert: ExportCenterCriticalLatency
    expr: |
      histogram_quantile(0.99,
        sum(rate(export_run_duration_seconds_bucket[5m])) by (le)
      ) > 300
    for: 2m
    labels:
      severity: critical
      service: export-center
      slo: latency
    annotations:
      summary: 'ExportCenter critical latency'
      description: '99th percentile export duration is {{ $value | humanizeDuration }}, indicating severe performance degradation.'
      runbook_url: 'https://docs.stellaops.io/runbooks/export-center/critical-latency'
# Capacity signals: concurrent export count and long-running exports.
- name: export-center-capacity
  interval: 60s
  rules:
  # More than 50 exports in progress (summed over all series) for 5 minutes.
  - alert: ExportCenterHighConcurrency
    expr: sum(export_runs_in_progress) > 50
    for: 5m
    labels:
      severity: warning
      service: export-center
    annotations:
      summary: 'ExportCenter high concurrency'
      description: '{{ $value }} exports currently in progress. Consider scaling or investigating slow exports.'
      runbook_url: 'https://docs.stellaops.io/runbooks/export-center/high-concurrency'

  # p99 duration of runs whose status is not "completed", over a 1h rate
  # window, above 1800s (30 minutes) for 10 minutes.
  - alert: ExportCenterStuckExports
    expr: |
      histogram_quantile(0.99,
        sum(rate(export_run_duration_seconds_bucket{status!="completed"}[1h])) by (le)
      ) > 1800
    for: 10m
    labels:
      severity: warning
      service: export-center
    annotations:
      summary: 'ExportCenter potentially stuck exports'
      description: 'Some exports may be stuck - 99th percentile duration for incomplete exports exceeds 30 minutes.'
      runbook_url: 'https://docs.stellaops.io/runbooks/export-center/stuck-exports'
# Error-shape alerts: per-error-code spikes and total loss of successes.
- name: export-center-errors
  interval: 30s
  rules:
  # Specific error code spike: any single error_code failing faster than
  # 0.1/s (5m rate) for 5 minutes. One alert instance per error_code.
  - alert: ExportCenterErrorCodeSpike
    expr: |
      sum by (error_code) (
        rate(export_runs_failed_total[5m])
      ) > 0.1
    for: 5m
    labels:
      severity: warning
      service: export-center
    annotations:
      summary: 'ExportCenter error code spike: {{ $labels.error_code }}'
      description: 'Error code {{ $labels.error_code }} is occurring at {{ $value | humanize }}/s rate.'
      runbook_url: 'https://docs.stellaops.io/runbooks/export-center/error-codes'

  # No successful exports in 15 minutes (when there is traffic).
  #
  # `or vector(0)` makes the success side evaluate to 0 when
  # export_runs_success_total has no series at all (e.g. nothing has ever
  # succeeded since the process started). Without it sum() returns an empty
  # vector, `== 0` yields nothing, and the alert can never fire in exactly
  # the situation it is meant to catch.
  - alert: ExportCenterNoSuccessfulExports
    expr: |
      (
        sum(rate(export_runs_total[15m])) > 0
      )
      and
      (
        (sum(rate(export_runs_success_total[15m])) or vector(0)) == 0
      )
    for: 10m
    labels:
      severity: critical
      service: export-center
    annotations:
      summary: 'ExportCenter no successful exports'
      description: 'No exports have completed successfully in the last 15 minutes despite ongoing attempts.'
      runbook_url: 'https://docs.stellaops.io/runbooks/export-center/no-successful-exports'
# Informational tracking of traffic to deprecated export endpoints.
- name: export-center-deprecation
  interval: 5m
  rules:
  # Any non-zero 1h access rate to deprecated endpoints, sustained for 1 hour.
  - alert: ExportCenterDeprecatedEndpointUsage
    expr: |
      sum(rate(export_center_deprecated_endpoint_access_total[1h])) > 0
    for: 1h
    labels:
      severity: info
      service: export-center
    annotations:
      summary: 'Deprecated export endpoints still in use'
      description: 'Legacy /exports endpoints are still being accessed at {{ $value | humanize }}/s. Migration to v1 API recommended.'
      runbook_url: 'https://docs.stellaops.io/api/export-center/migration'

View File

@@ -0,0 +1,52 @@
groups:
# Alerts for the policy pipeline: compile, simulation, approval, promotion.
- name: policy-pipeline
  rules:
  # p99 compile duration (5m rate window) above 5s for 10 minutes.
  - alert: PolicyCompileLatencyP99High
    expr: histogram_quantile(0.99, sum(rate(policy_compile_duration_seconds_bucket[5m])) by (le)) > 5
    for: 10m
    labels:
      severity: warning
      service: policy
    annotations:
      summary: 'Policy compile latency elevated (p99)'
      description: 'p99 compile duration has been >5s for 10m'

  # Total simulation queue depth above 100 for 10 minutes.
  - alert: PolicySimulationQueueBacklog
    expr: sum(policy_simulation_queue_depth) > 100
    for: 10m
    labels:
      severity: warning
      service: policy
    annotations:
      summary: 'Policy simulation backlog'
      description: 'Simulation queue depth above 100 for 10m'

  # p95 approval latency (5m rate window) above 30s for 15 minutes.
  - alert: PolicyApprovalLatencyHigh
    expr: histogram_quantile(0.95, sum(rate(policy_approval_latency_seconds_bucket[5m])) by (le)) > 30
    for: 15m
    labels:
      severity: critical
      service: policy
    annotations:
      summary: 'Policy approval latency high'
      description: 'p95 approval latency above 30s for 15m'

  # Share of promotions that failed over 15m, above 20% for 10 minutes.
  #
  # Both sides are aggregated with sum() so the division is between two
  # label-less series. Dividing the raw vectors matches one-to-one on all
  # labels, so the outcome="failure" numerator would only ever pair with the
  # outcome="failure" series of the denominator and never see the total.
  #
  # The denominator is deliberately not clamped: clamping it to >= 1 turns the
  # result into "failures per second" whenever promotions run slower than 1/s.
  # With no promotions at all the division yields NaN, which the `>`
  # comparison filters out, so there is no divide-by-zero alert.
  - alert: PolicyPromotionFailureRate
    expr: |
      sum(rate(policy_promotion_outcomes_total{outcome="failure"}[15m]))
      /
      sum(rate(policy_promotion_outcomes_total[15m]))
      > 0.2
    for: 10m
    labels:
      severity: critical
      service: policy
    annotations:
      summary: 'Policy promotion failure rate elevated'
      description: 'Failures exceed 20% of promotions over 15m'

  # No successful promotions in 10m while the simulation queue is non-empty.
  #
  # `and` only keeps left-hand series whose label set equals a right-hand
  # series. The right side is a label-less sum(), so the left side must be
  # aggregated too; otherwise its outcome="success" label prevents any match
  # and the alert can never fire. `or vector(0)` covers the case where no
  # success series exists yet.
  - alert: PolicyPromotionStall
    expr: |
      (
        (sum(rate(policy_promotion_outcomes_total{outcome="success"}[10m])) or vector(0)) == 0
      )
      and
      (
        sum(policy_simulation_queue_depth) > 0
      )
    for: 10m
    labels:
      severity: warning
      service: policy
    annotations:
      summary: 'Policy promotion stalled'
      description: 'No successful promotions while work is queued'

View File

@@ -0,0 +1,42 @@
# Scanner FN-Drift Alert Rules
# SLO alerts for false-negative drift thresholds (30-day rolling window)
groups:
# False-negative drift alerts for the scanner.
#
# scanner_fn_drift_percent is compared against 1.0 and 2.5, which the
# descriptions label as "1.0%" and "2.5%", so the metric is already expressed
# in percent. It is therefore rendered with printf rather than
# humanizePercentage: that filter expects a 0-1 ratio and multiplies by 100,
# which would display a 1.5% drift as "150%".
- name: scanner-fn-drift
  interval: 30s
  rules:
  # Drift above 1.0 (percent) for 5 minutes.
  - alert: ScannerFnDriftWarning
    expr: scanner_fn_drift_percent > 1.0
    for: 5m
    labels:
      severity: warning
      service: scanner
      slo: fn-drift
    annotations:
      summary: 'Scanner FN-Drift rate above warning threshold'
      description: 'FN-Drift is {{ printf "%.2f" $value }}% (> 1.0%) over the 30-day rolling window.'
      runbook_url: 'https://docs.stellaops.io/runbooks/scanner/fn-drift-warning'

  # Drift above 2.5 (percent) for 5 minutes.
  - alert: ScannerFnDriftCritical
    expr: scanner_fn_drift_percent > 2.5
    for: 5m
    labels:
      severity: critical
      service: scanner
      slo: fn-drift
    annotations:
      summary: 'Scanner FN-Drift rate above critical threshold'
      description: 'FN-Drift is {{ printf "%.2f" $value }}% (> 2.5%) over the 30-day rolling window.'
      runbook_url: 'https://docs.stellaops.io/runbooks/scanner/fn-drift-critical'

  # Any engine-attributed drift (count > 0) for 1 minute.
  # NOTE(review): severity "page" differs from the warning/critical values
  # used by the other rules in this group - confirm the Alertmanager routing
  # expects it.
  - alert: ScannerFnDriftEngineViolation
    expr: scanner_fn_drift_cause_engine > 0
    for: 1m
    labels:
      severity: page
      service: scanner
      slo: determinism
    annotations:
      summary: 'Engine-caused FN drift detected (determinism violation)'
      description: 'Engine-caused FN drift count is {{ $value }} (> 0). This indicates non-feed, non-policy changes affecting outcomes.'
      runbook_url: 'https://docs.stellaops.io/runbooks/scanner/fn-drift-engine-violation'

View File

@@ -0,0 +1,54 @@
groups:
# Alerts for the signals pipeline: scoring latency, cache health, sensor
# freshness and ingestion failures.
#
# Ratio rules in this group divide by the raw rate with no clamp_min(..., 1)
# on the denominator. Clamping to >= 1 silently turns the "ratio" into a
# per-second rate whenever traffic is below 1/s. With zero traffic the
# division yields NaN, which the `>` comparison filters out, so no guard
# against divide-by-zero is needed.
- name: signals-pipeline
  rules:
  # p95 reachability scoring duration (5m rate window) above 2s for 10 minutes.
  - alert: SignalsScoringLatencyP95High
    expr: histogram_quantile(0.95, sum(rate(signals_reachability_scoring_duration_seconds_bucket[5m])) by (le)) > 2
    for: 10m
    labels:
      severity: warning
      service: signals
    annotations:
      summary: 'Signals scoring latency high (p95)'
      description: 'Reachability scoring p95 exceeds 2s for 10m'

  # misses / (hits + misses) above 30% for 10 minutes, per series.
  - alert: SignalsCacheMissRateHigh
    expr: |
      rate(signals_cache_misses_total[5m])
      /
      (rate(signals_cache_hits_total[5m]) + rate(signals_cache_misses_total[5m]))
      > 0.3
    for: 10m
    labels:
      severity: warning
      service: signals
    annotations:
      summary: 'Signals cache miss rate high'
      description: 'Cache miss ratio >30% over 10m; investigate Redis or key churn.'

  # Cache availability gauge reads 0 for 2 minutes.
  - alert: SignalsCacheDown
    expr: signals_cache_available == 0
    for: 2m
    labels:
      severity: critical
      service: signals
    annotations:
      summary: 'Signals cache unavailable'
      description: 'Redis cache reported unavailable for >2m'

  # Newest last-seen timestamp per sensor is more than 900s (15 minutes) old.
  - alert: SignalsSensorStaleness
    expr: time() - max(signals_sensor_last_seen_timestamp_seconds) by (sensor) > 900
    for: 5m
    labels:
      severity: warning
      service: signals
    annotations:
      summary: 'Signals sensor stale'
      description: 'No updates from sensor for >15 minutes'

  # failures / total above 5% for 5 minutes, per series.
  - alert: SignalsIngestionErrorRate
    expr: |
      rate(signals_ingestion_failures_total[5m])
      /
      rate(signals_ingestion_total[5m])
      > 0.05
    for: 5m
    labels:
      severity: critical
      service: signals
    annotations:
      summary: 'Signals ingestion failures elevated'
      description: 'Ingestion failure ratio above 5% over 5m'

View File

@@ -0,0 +1,118 @@
# Sprint: SPRINT_20260117_028_Telemetry_p0_metrics
# Task: P0M-006 - Alerting Rules
# P0 Product Metrics Alert Rules
groups:
# P0 product-metric alerts. Rules come in warning/critical pairs that share
# an expression and differ only in threshold, except the determinism rules,
# which are split by the `severity` label on the regression counter.
- name: stella-p0-metrics
  rules:
  # P0M-001: Time to First Verified Release
  # Per-tenant p90 over a 24h rate window; 14400s = 4 hours.
  - alert: StellaTimeToFirstReleaseHigh
    expr: |
      histogram_quantile(0.90, sum(rate(stella_time_to_first_verified_release_seconds_bucket[24h])) by (le, tenant)) > 14400
    for: 1h
    labels:
      severity: warning
      category: adoption
    annotations:
      summary: 'Time to first verified release is high for tenant {{ $labels.tenant }}'
      description: 'P90 time to first verified release is {{ $value | humanizeDuration }} (threshold: 4 hours)'
      runbook_url: 'https://docs.stella-ops.org/runbooks/adoption-onboarding'

  # Same expression; 86400s = 24 hours.
  - alert: StellaTimeToFirstReleaseCritical
    expr: |
      histogram_quantile(0.90, sum(rate(stella_time_to_first_verified_release_seconds_bucket[24h])) by (le, tenant)) > 86400
    for: 1h
    labels:
      severity: critical
      category: adoption
    annotations:
      summary: 'Time to first verified release critically high for tenant {{ $labels.tenant }}'
      description: 'P90 time to first verified release is {{ $value | humanizeDuration }} (threshold: 24 hours)'
      runbook_url: 'https://docs.stella-ops.org/runbooks/adoption-onboarding'

  # P0M-002: Why Blocked Latency
  # Per-tenant p90 over a 1h rate window; 300s = 5 minutes.
  - alert: StellaWhyBlockedLatencyHigh
    expr: |
      histogram_quantile(0.90, sum(rate(stella_why_blocked_latency_seconds_bucket[1h])) by (le, tenant)) > 300
    for: 30m
    labels:
      severity: warning
      category: usability
    annotations:
      summary: 'Why-blocked latency is high for tenant {{ $labels.tenant }}'
      description: "P90 time to answer 'why blocked' is {{ $value | humanizeDuration }} (threshold: 5 minutes)"
      runbook_url: 'https://docs.stella-ops.org/runbooks/usability-explain'

  # Same expression; 3600s = 1 hour.
  - alert: StellaWhyBlockedLatencyCritical
    expr: |
      histogram_quantile(0.90, sum(rate(stella_why_blocked_latency_seconds_bucket[1h])) by (le, tenant)) > 3600
    for: 30m
    labels:
      severity: critical
      category: usability
    annotations:
      summary: 'Why-blocked latency critically high for tenant {{ $labels.tenant }}'
      description: "P90 time to answer 'why blocked' is {{ $value | humanizeDuration }} (threshold: 1 hour)"
      runbook_url: 'https://docs.stella-ops.org/runbooks/usability-explain'

  # P0M-003: Support Burden
  # Support minutes summed per (tenant, month); fires immediately (for: 0m).
  - alert: StellaSupportBurdenHigh
    expr: |
      sum by (tenant, month) (stella_support_burden_minutes_total) > 30
    for: 0m
    labels:
      severity: warning
      category: operations
    annotations:
      summary: 'Support burden high for tenant {{ $labels.tenant }}'
      description: 'Support time for {{ $labels.tenant }} in {{ $labels.month }} is {{ $value }} minutes (threshold: 30 minutes)'
      runbook_url: 'https://docs.stella-ops.org/runbooks/support-optimization'

  - alert: StellaSupportBurdenCritical
    expr: |
      sum by (tenant, month) (stella_support_burden_minutes_total) > 60
    for: 0m
    labels:
      severity: critical
      category: operations
    annotations:
      summary: 'Support burden critically high for tenant {{ $labels.tenant }}'
      description: 'Support time for {{ $labels.tenant }} in {{ $labels.month }} is {{ $value }} minutes (threshold: 60 minutes)'
      runbook_url: 'https://docs.stella-ops.org/runbooks/support-optimization'

  # P0M-004: Determinism Regressions
  # Any policy-severity regression counted in the last 5 minutes.
  - alert: StellaDeterminismRegression
    expr: |
      increase(stella_determinism_regressions_total{severity="policy"}[5m]) > 0
    for: 0m
    labels:
      severity: critical
      category: reliability
    annotations:
      summary: 'Policy-level determinism regression detected for tenant {{ $labels.tenant }}'
      description: 'Determinism failure in {{ $labels.component }} component - same inputs produced different policy decisions'
      runbook_url: 'https://docs.stella-ops.org/runbooks/determinism-failure'

  # Any semantic-severity regression counted in the last hour.
  - alert: StellaDeterminismRegressionSemantic
    expr: |
      increase(stella_determinism_regressions_total{severity="semantic"}[1h]) > 0
    for: 0m
    labels:
      severity: warning
      category: reliability
    annotations:
      summary: 'Semantic determinism regression detected for tenant {{ $labels.tenant }}'
      description: 'Semantic-level determinism failure in {{ $labels.component }} - outputs differ but policy decision unchanged'
      runbook_url: 'https://docs.stella-ops.org/runbooks/determinism-failure'

  # More than 5 bitwise-severity regressions counted in the last 24 hours.
  - alert: StellaDeterminismRegressionBitwise
    expr: |
      increase(stella_determinism_regressions_total{severity="bitwise"}[24h]) > 5
    for: 0m
    labels:
      severity: warning
      category: reliability
    annotations:
      summary: 'Multiple bitwise determinism regressions for tenant {{ $labels.tenant }}'
      description: '{{ $value }} bitwise-level determinism failures in {{ $labels.component }} in last 24h'
      runbook_url: 'https://docs.stella-ops.org/runbooks/determinism-failure'

View File

@@ -0,0 +1,62 @@
groups:
# Triage experience alerts: time-to-first-signal (TTFS) phases, interaction
# cost, evidence completeness and performance-budget violations.
- name: triage-ttfs
  rules:
  # p95 first-evidence time (5m rate window) above 1.5s for 10 minutes.
  - alert: TriageTtfsFirstEvidenceP95High
    expr: histogram_quantile(0.95, sum(rate(stellaops_ttfs_first_evidence_seconds_bucket[5m])) by (le)) > 1.5
    for: 10m
    labels:
      severity: critical
      service: triage
    annotations:
      summary: 'TTFS first evidence p95 high'
      description: 'TTFS first-evidence p95 exceeds 1.5s for 10m (triage experience degraded).'

  # p95 skeleton time (5m rate window) above 0.2s (200ms) for 10 minutes.
  - alert: TriageTtfsSkeletonP95High
    expr: histogram_quantile(0.95, sum(rate(stellaops_ttfs_skeleton_seconds_bucket[5m])) by (le)) > 0.2
    for: 10m
    labels:
      severity: warning
      service: triage
    annotations:
      summary: 'TTFS skeleton p95 high'
      description: 'TTFS skeleton p95 exceeds 200ms for 10m.'

  # p95 full-evidence time (5m rate window) above 1.5s for 10 minutes.
  - alert: TriageTtfsFullEvidenceP95High
    expr: histogram_quantile(0.95, sum(rate(stellaops_ttfs_full_evidence_seconds_bucket[5m])) by (le)) > 1.5
    for: 10m
    labels:
      severity: warning
      service: triage
    annotations:
      summary: 'TTFS full evidence p95 high'
      description: 'TTFS full-evidence p95 exceeds 1.5s for 10m.'

  # Median clicks-to-closure (5m rate window) above 6 for 15 minutes.
  - alert: TriageClicksToClosureMedianHigh
    expr: histogram_quantile(0.50, sum(rate(stellaops_clicks_to_closure_bucket[5m])) by (le)) > 6
    for: 15m
    labels:
      severity: warning
      service: triage
    annotations:
      summary: 'Clicks-to-closure median high'
      description: 'Median clicks-to-closure exceeds 6 for 15m.'

  # Mean completeness score over 15m (rate of _sum / rate of _count) below 3.6.
  #
  # The denominator is deliberately not clamped. Clamping the count rate to
  # >= 1 understates the mean whenever fewer than one score per second is
  # recorded, and makes it 0 when idle, so this "below target" alert would
  # fire with no data. Unclamped, an idle period gives 0/0 = NaN, which the
  # `<` comparison filters out.
  - alert: TriageEvidenceCompletenessAvgLow
    expr: |
      (
        sum(rate(stellaops_evidence_completeness_score_sum[15m]))
        /
        sum(rate(stellaops_evidence_completeness_score_count[15m]))
      ) < 3.6
    for: 30m
    labels:
      severity: warning
      service: triage
    annotations:
      summary: 'Evidence completeness below target'
      description: 'Average evidence completeness score below 3.6 (90%) for 30m.'

  # Per-phase budget violation rate above 0.05/s for 10 minutes.
  - alert: TriageBudgetViolationRateHigh
    expr: sum(rate(stellaops_performance_budget_violations_total[5m])) by (phase) > 0.05
    for: 10m
    labels:
      severity: warning
      service: triage
    annotations:
      summary: 'Performance budget violations elevated'
      description: 'Performance budget violation rate exceeds 0.05/s for 10m.'