devops folders consolidate

This commit is contained in:
master
2026-01-25 23:27:41 +02:00
parent 6e687b523a
commit a743bb9a1d
613 changed files with 8611 additions and 41846 deletions

View File

@@ -0,0 +1,36 @@
groups:
- name: slo-burn
rules:
- alert: SLOBurnRateFast
expr: |
(rate(service_request_errors_total[5m]) / rate(service_requests_total[5m])) >
4 * (1 - 0.99)
for: 5m
labels:
severity: critical
team: devops
annotations:
summary: "Fast burn: 99% SLO breached"
description: "Error budget burn (5m) exceeds fast threshold."
- alert: SLOBurnRateSlow
expr: |
(rate(service_request_errors_total[1h]) / rate(service_requests_total[1h])) >
1 * (1 - 0.99)
for: 1h
labels:
severity: warning
team: devops
annotations:
summary: "Slow burn: 99% SLO at risk"
description: "Error budget burn (1h) exceeds slow threshold."
- name: slo-webhook
rules:
- alert: SLOWebhookFailures
expr: rate(slo_webhook_failures_total[5m]) > 0
for: 10m
labels:
severity: warning
team: devops
annotations:
summary: "SLO webhook failures"
description: "Webhook emitter has failures in last 5m."

View File

@@ -0,0 +1,43 @@
groups:
- name: attestor-latency
rules:
- alert: AttestorSignLatencyP95High
expr: histogram_quantile(0.95, sum(rate(attestor_sign_duration_seconds_bucket[5m])) by (le)) > 2
for: 5m
labels:
severity: warning
team: devops
annotations:
summary: "Attestor signing latency p95 high"
description: "Signing p95 is {{ $value }}s over the last 5m (threshold 2s)."
- alert: AttestorVerifyLatencyP95High
expr: histogram_quantile(0.95, sum(rate(attestor_verify_duration_seconds_bucket[5m])) by (le)) > 2
for: 5m
labels:
severity: warning
team: devops
annotations:
summary: "Attestor verification latency p95 high"
description: "Verification p95 is {{ $value }}s over the last 5m (threshold 2s)."
- name: attestor-errors
rules:
- alert: AttestorVerifyFailureRate
expr: rate(attestor_verify_failures_total[5m]) / rate(attestor_verify_requests_total[5m]) > 0.02
for: 5m
labels:
severity: critical
team: devops
annotations:
summary: "Attestor verification failure rate above 2%"
description: "Verification failure rate is {{ $value | humanizePercentage }} over last 5m."
- name: attestor-keys
rules:
- alert: AttestorKeyRotationStale
expr: (time() - attestor_key_last_rotated_seconds) > 60*60*24*30
for: 10m
labels:
severity: warning
team: devops
annotations:
summary: "Attestor signing key rotation overdue"
description: "Signing key has not rotated in >30d ({{ $value }} seconds)."

View File

@@ -0,0 +1,52 @@
groups:
- name: policy-pipeline
rules:
- alert: PolicyCompileLatencyP99High
expr: histogram_quantile(0.99, sum(rate(policy_compile_duration_seconds_bucket[5m])) by (le)) > 5
for: 10m
labels:
severity: warning
service: policy
annotations:
summary: "Policy compile latency elevated (p99)"
description: "p99 compile duration has been >5s for 10m"
- alert: PolicySimulationQueueBacklog
expr: sum(policy_simulation_queue_depth) > 100
for: 10m
labels:
severity: warning
service: policy
annotations:
summary: "Policy simulation backlog"
description: "Simulation queue depth above 100 for 10m"
- alert: PolicyApprovalLatencyHigh
expr: histogram_quantile(0.95, sum(rate(policy_approval_latency_seconds_bucket[5m])) by (le)) > 30
for: 15m
labels:
severity: critical
service: policy
annotations:
summary: "Policy approval latency high"
description: "p95 approval latency above 30s for 15m"
- alert: PolicyPromotionFailureRate
expr: clamp_min(rate(policy_promotion_outcomes_total{outcome="failure"}[15m]), 0) / clamp_min(rate(policy_promotion_outcomes_total[15m]), 1) > 0.2
for: 10m
labels:
severity: critical
service: policy
annotations:
summary: "Policy promotion failure rate elevated"
description: "Failures exceed 20% of promotions over 15m"
- alert: PolicyPromotionStall
expr: rate(policy_promotion_outcomes_total{outcome="success"}[10m]) == 0 and sum(policy_simulation_queue_depth) > 0
for: 10m
labels:
severity: warning
service: policy
annotations:
summary: "Policy promotion stalled"
description: "No successful promotions while work is queued"

View File

@@ -0,0 +1,54 @@
groups:
- name: signals-pipeline
rules:
- alert: SignalsScoringLatencyP95High
expr: histogram_quantile(0.95, sum(rate(signals_reachability_scoring_duration_seconds_bucket[5m])) by (le)) > 2
for: 10m
labels:
severity: warning
service: signals
annotations:
summary: "Signals scoring latency high (p95)"
description: "Reachability scoring p95 exceeds 2s for 10m"
- alert: SignalsCacheMissRateHigh
expr: |
clamp_min(rate(signals_cache_misses_total[5m]), 0)
/ clamp_min(rate(signals_cache_hits_total[5m]) + rate(signals_cache_misses_total[5m]), 1) > 0.3
for: 10m
labels:
severity: warning
service: signals
annotations:
summary: "Signals cache miss rate high"
description: "Cache miss ratio >30% over 10m; investigate Redis or key churn."
- alert: SignalsCacheDown
expr: signals_cache_available == 0
for: 2m
labels:
severity: critical
service: signals
annotations:
summary: "Signals cache unavailable"
description: "Redis cache reported unavailable for >2m"
- alert: SignalsSensorStaleness
expr: time() - max(signals_sensor_last_seen_timestamp_seconds) by (sensor) > 900
for: 5m
labels:
severity: warning
service: signals
annotations:
summary: "Signals sensor stale"
description: "No updates from sensor for >15 minutes"
- alert: SignalsIngestionErrorRate
expr: clamp_min(rate(signals_ingestion_failures_total[5m]), 0) / clamp_min(rate(signals_ingestion_total[5m]), 1) > 0.05
for: 5m
labels:
severity: critical
service: signals
annotations:
summary: "Signals ingestion failures elevated"
description: "Ingestion failure ratio above 5% over 5m"

View File

@@ -0,0 +1,62 @@
groups:
- name: triage-ttfs
rules:
- alert: TriageTtfsFirstEvidenceP95High
expr: histogram_quantile(0.95, sum(rate(stellaops_ttfs_first_evidence_seconds_bucket[5m])) by (le)) > 1.5
for: 10m
labels:
severity: critical
service: triage
annotations:
summary: "TTFS first evidence p95 high"
description: "TTFS first-evidence p95 exceeds 1.5s for 10m (triage experience degraded)."
- alert: TriageTtfsSkeletonP95High
expr: histogram_quantile(0.95, sum(rate(stellaops_ttfs_skeleton_seconds_bucket[5m])) by (le)) > 0.2
for: 10m
labels:
severity: warning
service: triage
annotations:
summary: "TTFS skeleton p95 high"
description: "TTFS skeleton p95 exceeds 200ms for 10m."
- alert: TriageTtfsFullEvidenceP95High
expr: histogram_quantile(0.95, sum(rate(stellaops_ttfs_full_evidence_seconds_bucket[5m])) by (le)) > 1.5
for: 10m
labels:
severity: warning
service: triage
annotations:
summary: "TTFS full evidence p95 high"
description: "TTFS full-evidence p95 exceeds 1.5s for 10m."
- alert: TriageClicksToClosureMedianHigh
expr: histogram_quantile(0.50, sum(rate(stellaops_clicks_to_closure_bucket[5m])) by (le)) > 6
for: 15m
labels:
severity: warning
service: triage
annotations:
summary: "Clicks-to-closure median high"
description: "Median clicks-to-closure exceeds 6 for 15m."
- alert: TriageEvidenceCompletenessAvgLow
expr: (sum(rate(stellaops_evidence_completeness_score_sum[15m])) / clamp_min(sum(rate(stellaops_evidence_completeness_score_count[15m])), 1)) < 3.6
for: 30m
labels:
severity: warning
service: triage
annotations:
summary: "Evidence completeness below target"
description: "Average evidence completeness score below 3.6 (90%) for 30m."
- alert: TriageBudgetViolationRateHigh
expr: sum(rate(stellaops_performance_budget_violations_total[5m])) by (phase) > 0.05
for: 10m
labels:
severity: warning
service: triage
annotations:
summary: "Performance budget violations elevated"
description: "Performance budget violation rate exceeds 0.05/s for 10m."