# Prometheus alerting rules for the policy pipeline: compile, simulation,
# approval and promotion stages.
groups:
  - name: policy-pipeline
    rules:
      # p99 of policy compile duration, derived from the histogram buckets
      # aggregated across all series (only `le` is kept for the quantile).
      - alert: PolicyCompileLatencyP99High
        expr: |-
          histogram_quantile(
            0.99,
            sum by (le) (rate(policy_compile_duration_seconds_bucket[5m]))
          ) > 5
        for: 10m
        labels:
          severity: warning
          service: policy
        annotations:
          summary: "Policy compile latency elevated (p99)"
          description: "p99 compile duration has been >5s for 10m"

      # Total simulation queue depth summed over every reporting series.
      - alert: PolicySimulationQueueBacklog
        expr: sum(policy_simulation_queue_depth) > 100
        for: 10m
        labels:
          severity: warning
          service: policy
        annotations:
          summary: "Policy simulation backlog"
          description: "Simulation queue depth above 100 for 10m"

      # p95 of approval latency, aggregated the same way as the compile
      # latency rule above.
      - alert: PolicyApprovalLatencyHigh
        expr: |-
          histogram_quantile(
            0.95,
            sum by (le) (rate(policy_approval_latency_seconds_bucket[5m]))
          ) > 30
        for: 15m
        labels:
          severity: critical
          service: policy
        annotations:
          summary: "Policy approval latency high"
          description: "p95 approval latency above 30s for 15m"

      # Fraction of promotions that failed over the last 15m.
      # Both sides are aggregated with sum() so the failure rate is divided
      # by the rate of ALL outcomes; without aggregation, one-to-one vector
      # matching would pair the failure series only with itself.
      # The denominator is deliberately not clamped: when there are no
      # promotions at all the ratio is 0/0 = NaN, which does not satisfy
      # `> 0.2`, so the alert stays silent instead of dividing by a fake
      # floor of 1 promotion/second.
      - alert: PolicyPromotionFailureRate
        expr: |-
          sum(rate(policy_promotion_outcomes_total{outcome="failure"}[15m]))
            /
          sum(rate(policy_promotion_outcomes_total[15m]))
            > 0.2
        for: 10m
        labels:
          severity: critical
          service: policy
        annotations:
          summary: "Policy promotion failure rate elevated"
          description: "Failures exceed 20% of promotions over 15m"

      # No successful promotions while the simulation queue is non-empty.
      # Both operands are aggregated to a single label-less series so that
      # `and` (which matches on label sets) can actually pair them.
      # NOTE(review): if no outcome="success" series exists yet, the left
      # side is empty and this rule does not fire - confirm that is the
      # desired behaviour for a freshly deployed pipeline.
      - alert: PolicyPromotionStall
        expr: |-
          sum(rate(policy_promotion_outcomes_total{outcome="success"}[10m])) == 0
            and
          sum(policy_simulation_queue_depth) > 0
        for: 10m
        labels:
          severity: warning
          service: policy
        annotations:
          summary: "Policy promotion stalled"
          description: "No successful promotions while work is queued"