---
# Prometheus alerting rules for the orchestrator service.
#
# Every rule carries `service: orchestrator` plus a `severity` label
# (`warning` or `critical`).
#
# Units: PromQL `rate()` returns a per-second average over the range window,
# so every threshold compared against a bare `rate(...)` below is in
# events per second.
groups:
  - name: orchestrator-core
    rules:
      # Queue depth has stayed above 500 for 10 minutes.
      - alert: OrchestratorQueueDepthHigh
        expr: job_queue_depth > 500
        for: 10m
        labels:
          severity: warning
          service: orchestrator
        annotations:
          summary: "Queue depth high"
          description: "job_queue_depth exceeded 500 for 10m"

      # Per-series failure rate above 5 per second, sustained for 5 minutes.
      # NOTE(review): the description previously said "5/min", which did not
      # match the expression. If a per-minute threshold was intended, scale
      # the expression instead (e.g. `rate(...[5m]) * 60 > 5`).
      - alert: OrchestratorFailuresHigh
        expr: rate(job_failures_total[5m]) > 5
        for: 5m
        labels:
          severity: critical
          service: orchestrator
        annotations:
          summary: "Job failures elevated"
          description: "Failure rate above 5/s averaged over the last 5m"

      # No lease extensions recorded over 5m while the queue is non-empty.
      # NOTE(review): `and` only keeps series whose label sets match on both
      # sides; confirm lease_extensions_total and job_queue_depth share
      # labels, otherwise this rule can never fire.
      - alert: OrchestratorLeaseStall
        expr: rate(lease_extensions_total[5m]) == 0 and job_queue_depth > 0
        for: 5m
        labels:
          severity: critical
          service: orchestrator
        annotations:
          summary: "Leases stalled"
          description: "No lease renewals while queue has items"

      # Dead-letter queue depth has stayed above 10 for 10 minutes.
      - alert: OrchestratorDLQDepthHigh
        expr: job_dlq_depth > 10
        for: 10m
        labels:
          severity: warning
          service: orchestrator
        annotations:
          summary: "DLQ depth high"
          description: "Dead-letter queue depth above 10 for 10m"

      # 5-minute average of the backpressure ratio above 0.5, for 5 minutes.
      - alert: OrchestratorBackpressure
        expr: avg_over_time(rate_limiter_backpressure_ratio[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
          service: orchestrator
        annotations:
          summary: "Backpressure elevated"
          description: "Rate limiter backpressure >50% over 5m"

      # Failure rate summed per jobType above 3 per second, for 5 minutes.
      # NOTE(review): the description previously said "3/min"; see the unit
      # note on OrchestratorFailuresHigh if per-minute was intended.
      - alert: OrchestratorErrorCluster
        expr: sum by(jobType) (rate(job_failures_total[5m])) > 3
        for: 5m
        labels:
          severity: critical
          service: orchestrator
        annotations:
          summary: "Error cluster detected"
          description: "Failure rate >3/s for a job type"

      # Two-window check: failures/processed above 0.02 over 5m AND above
      # 0.01 over 30m, sustained for 10 minutes.
      # NOTE(review): clamp_min(..., 1) floors the denominator at 1 job/s.
      # Below that throughput the quotient is the raw failure rate rather
      # than a failure ratio, so low-traffic series are under-reported;
      # confirm the floor value is intended.
      - alert: OrchestratorFailureBurnRateHigh
        expr: |
          (rate(job_failures_total[5m]) / clamp_min(rate(job_processed_total[5m]), 1)) > 0.02
          and
          (rate(job_failures_total[30m]) / clamp_min(rate(job_processed_total[30m]), 1)) > 0.01
        for: 10m
        labels:
          severity: critical
          service: orchestrator
        annotations:
          summary: "Failure burn rate breaching SLO"
          description: "5m/30m failure burn rate above 2%/1% SLO; investigate upstream jobs and dependencies."