Files
git.stella-ops.org/devops/observability/policy-alerts.yaml
2025-12-26 18:11:06 +02:00

53 lines
1.9 KiB
YAML

groups:
  # Alerting rules for the policy pipeline: compile, simulation, approval,
  # and promotion stages. All rules carry `service: policy`.
  - name: policy-pipeline
    rules:
      # p99 of compile duration, computed from histogram buckets over a 5m
      # rate window; must stay above 5 (seconds) for 10m before firing.
      - alert: PolicyCompileLatencyP99High
        expr: histogram_quantile(0.99, sum(rate(policy_compile_duration_seconds_bucket[5m])) by (le)) > 5
        for: 10m
        labels:
          severity: warning
          service: policy
        annotations:
          summary: "Policy compile latency elevated (p99)"
          description: "p99 compile duration has been >5s for 10m"

      # Total simulation queue depth summed across all series.
      - alert: PolicySimulationQueueBacklog
        expr: sum(policy_simulation_queue_depth) > 100
        for: 10m
        labels:
          severity: warning
          service: policy
        annotations:
          summary: "Policy simulation backlog"
          description: "Simulation queue depth above 100 for 10m"

      # p95 of approval latency, computed from histogram buckets over a 5m
      # rate window; must stay above 30 (seconds) for 15m before firing.
      - alert: PolicyApprovalLatencyHigh
        expr: histogram_quantile(0.95, sum(rate(policy_approval_latency_seconds_bucket[5m])) by (le)) > 30
        for: 15m
        labels:
          severity: critical
          service: policy
        annotations:
          summary: "Policy approval latency high"
          description: "p95 approval latency above 30s for 15m"

      # Fraction of promotions that failed over 15m.
      # Both sides are aggregated with sum() so the division is between two
      # label-less vectors (failures / all outcomes) rather than a
      # series-by-series match. When there are no promotions the result is
      # NaN, which the `> 0.2` comparison filters out, so no clamp is needed.
      - alert: PolicyPromotionFailureRate
        expr: >-
          sum(rate(policy_promotion_outcomes_total{outcome="failure"}[15m]))
          /
          sum(rate(policy_promotion_outcomes_total[15m]))
          > 0.2
        for: 10m
        labels:
          severity: critical
          service: policy
        annotations:
          summary: "Policy promotion failure rate elevated"
          description: "Failures exceed 20% of promotions over 15m"

      # No successful promotions in the last 10m while the simulation queue
      # is non-empty.
      # The success rate is wrapped in sum() so that both operands of `and`
      # have an empty label set; `and` only keeps left-hand series whose
      # labels exactly match a right-hand series.
      # NOTE(review): if no `outcome="success"` series exists at all, the
      # left side is empty and this rule stays silent - confirm whether that
      # case should also alert.
      - alert: PolicyPromotionStall
        expr: >-
          sum(rate(policy_promotion_outcomes_total{outcome="success"}[10m])) == 0
          and
          sum(policy_simulation_queue_depth) > 0
        for: 10m
        labels:
          severity: warning
          service: policy
        annotations:
          summary: "Policy promotion stalled"
          description: "No successful promotions while work is queued"