# Alerting rule groups in the Prometheus rule-file layout
# (groups -> rules -> alert / expr / for / labels / annotations).
groups:
  # Rule group holding the orchestrator alerts.
  - name: orchestrator-core
    rules:
      # Warning: job_queue_depth has stayed above 500 for the whole `for`
      # window (10 minutes) before the alert fires.
      - alert: OrchestratorQueueDepthHigh
        expr: >-
          job_queue_depth > 500
        for: 10m
        annotations:
          summary: 'Queue depth high'
          description: 'job_queue_depth exceeded 500 for 10m'
        labels:
          service: orchestrator
          severity: warning
      # Critical: a job_failures_total series is failing faster than 5 jobs per
      # minute (averaged over a 5m window), sustained for 5 minutes.
      - alert: OrchestratorFailuresHigh
        # rate() returns a per-second average. Multiply by 60 so the threshold
        # is compared in failures per minute, the unit the description states;
        # without the scaling the alert only fired at 5 failures per second.
        expr: rate(job_failures_total[5m]) * 60 > 5
        for: 5m
        labels:
          severity: critical
          service: orchestrator
        annotations:
          summary: "Job failures elevated"
          description: "Failure rate above 5/min in last 5m"
      # Critical: the lease_extensions_total counter has not increased over the
      # last 5m while job_queue_depth is still above zero, held for 5 minutes.
      # NOTE(review): `and` keeps only left-hand series whose label set exactly
      # matches a right-hand series - confirm both metrics carry the same
      # labels. A missing lease_extensions_total series yields no result, so
      # the alert stays silent in that case - confirm that is acceptable.
      - alert: OrchestratorLeaseStall
        expr: >-
          rate(lease_extensions_total[5m]) == 0
          and job_queue_depth > 0
        for: 5m
        annotations:
          summary: 'Leases stalled'
          description: 'No lease renewals while queue has items'
        labels:
          service: orchestrator
          severity: critical
      # Warning: job_dlq_depth has stayed above 10 for the whole `for` window
      # (10 minutes) before the alert fires.
      - alert: OrchestratorDLQDepthHigh
        expr: 'job_dlq_depth > 10'
        for: 10m
        annotations:
          summary: 'DLQ depth high'
          description: 'Dead-letter queue depth above 10 for 10m'
        labels:
          service: orchestrator
          severity: warning
      # Warning: the 5-minute average of rate_limiter_backpressure_ratio is
      # above 0.5, and has been for 5 minutes.
      - alert: OrchestratorBackpressure
        expr: >-
          avg_over_time(rate_limiter_backpressure_ratio[5m]) > 0.5
        for: 5m
        annotations:
          summary: 'Backpressure elevated'
          description: 'Rate limiter backpressure >50% over 5m'
        labels:
          service: orchestrator
          severity: warning
      # Critical: failures summed per jobType exceed 3 per minute (averaged
      # over a 5m window), sustained for 5 minutes. One alert per jobType.
      - alert: OrchestratorErrorCluster
        # rate() returns a per-second average. Multiply by 60 so the threshold
        # is compared in failures per minute, the unit the description states;
        # without the scaling the alert only fired at 3 failures per second.
        expr: sum by(jobType) (rate(job_failures_total[5m])) * 60 > 3
        for: 5m
        labels:
          severity: critical
          service: orchestrator
        annotations:
          summary: "Error cluster detected"
          # jobType is the only label left after the aggregation; include it
          # so the notification says which job type is failing.
          description: "Failure rate >3/min for job type {{ $labels.jobType }}"
      # Critical: the failures/processed ratio is above 0.02 over 5m AND above
      # 0.01 over 30m, with both conditions holding for 10 minutes.
      # NOTE(review): clamp_min(..., 1) floors the denominator at 1 job/s, so
      # below that throughput the ratio is understated - confirm intended.
      # NOTE(review): the division pairs series by identical label sets;
      # presumably both counters share the same labels - verify.
      - alert: OrchestratorFailureBurnRateHigh
        expr: |
          (rate(job_failures_total[5m]) / clamp_min(rate(job_processed_total[5m]), 1)) > 0.02
          and
          (rate(job_failures_total[30m]) / clamp_min(rate(job_processed_total[30m]), 1)) > 0.01
        for: 10m
        annotations:
          summary: 'Failure burn rate breaching SLO'
          description: >-
            5m/30m failure burn rate above 2%/1% SLO;
            investigate upstream jobs and dependencies.
        labels:
          service: orchestrator
          severity: critical