# Unknowns Queue Alert Rules
# Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement (UQ-007)
# 
# Load this file through Prometheus `rule_files`; firing alerts are routed by Alertmanager.

# Every alert in this file lives in the single rule group below.
groups:
  - name: unknowns-queue
    # Evaluation interval for all rules in this group.
    interval: 1m
    rules:
      # =============================================================================
      # SLA Alerts
      # =============================================================================
      
      # Critical: unknowns_sla_compliance has stayed below 0.80 for 5 minutes.
      # $value is the compliance ratio; humanizePercentage shows it as a percent.
      - alert: UnknownsSlaBreachCritical
        expr: unknowns_sla_compliance < 0.80
        for: 5m
        labels:
          severity: critical
          team: security
        annotations:
          summary: 'SLA compliance dropped below 80%'
          description: |
            SLA compliance is {{ $value | humanizePercentage }}.
            Multiple unknowns have breached their SLA deadlines.
            Immediate action required.
          runbook_url: 'https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-breach'
          
      # Warning band: compliance in [0.80, 0.95) sustained for 15 minutes.
      # Values below 0.80 are excluded here; UnknownsSlaBreachCritical covers them.
      - alert: UnknownsSlaBreachWarning
        expr: unknowns_sla_compliance < 0.95 and unknowns_sla_compliance >= 0.80
        for: 15m
        labels:
          severity: warning
          team: security
        annotations:
          summary: 'SLA compliance below 95%'
          description: |
            SLA compliance is {{ $value | humanizePercentage }}.
            Some unknowns are approaching or have breached SLA.
          runbook_url: 'https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-warning'
          
      # Fires immediately (for: 0m) when the SLA breach counter grew at all
      # within the trailing hour.
      # increase() extrapolates across the window, so $value is usually not a
      # whole number; it is rounded for display in the description.
      - alert: UnknownsSlaBreach
        expr: increase(unknowns_sla_breach_total[1h]) > 0
        for: 0m
        labels:
          severity: critical
          team: security
        annotations:
          summary: 'Unknown SLA breached'
          description: |
            {{ $value | printf "%.0f" }} unknown(s) have breached SLA in the last hour.
            Check the unknowns queue dashboard for affected entries.
          runbook_url: 'https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-breach'

      # =============================================================================
      # Queue Depth Alerts
      # =============================================================================
      
      # Critical: more than 5 entries in the HOT queue gauge for 10 minutes.
      # $value is the current unknowns_queue_depth_hot reading.
      - alert: UnknownsHotQueueHigh
        expr: unknowns_queue_depth_hot > 5
        for: 10m
        labels:
          severity: critical
          team: security
        annotations:
          summary: 'High number of HOT unknowns'
          description: |
            {{ $value }} HOT unknowns in queue.
            HOT unknowns have 24-hour SLA and block releases.
            Prioritize resolution immediately.
          runbook_url: 'https://docs.stella-ops.org/operations/unknowns-queue-runbook#hot-queue'
          
      # Warning: the HOT queue gauge has been non-zero for a full hour.
      # One hour is about 4% of the 24-hour HOT SLA (1/24), and the description
      # says so.
      # NOTE(review): if the intent was to warn at half the SLA, `for` should
      # be 12h and the summary updated to match - confirm with the owners.
      - alert: UnknownsHotQueuePresent
        expr: unknowns_queue_depth_hot > 0
        for: 1h
        labels:
          severity: warning
          team: security
        annotations:
          summary: 'HOT unknowns present for over 1 hour'
          description: |
            {{ $value }} HOT unknown(s) have been in queue for over 1 hour.
            About 4% of the 24-hour SLA has elapsed.
          runbook_url: 'https://docs.stella-ops.org/operations/unknowns-queue-runbook#hot-queue'
          
      # Warning: hot + warm + cold queue depth has exceeded 100 for 30 minutes.
      # NOTE(review): vector addition only pairs series whose label sets match;
      # presumably the three gauges share labels - confirm against the exporter.
      - alert: UnknownsQueueBacklog
        expr: (unknowns_queue_depth_hot + unknowns_queue_depth_warm + unknowns_queue_depth_cold) > 100
        for: 30m
        labels:
          severity: warning
          team: operations
        annotations:
          summary: 'Unknowns queue backlog growing'
          description: |
            Total queue depth is {{ $value }}.
            Consider scaling processing capacity or reviewing automation.
          runbook_url: 'https://docs.stella-ops.org/operations/unknowns-queue-runbook#backlog'

      # =============================================================================
      # Processing Alerts
      # =============================================================================
      
      # Warning: greyqueue_processing_count has stayed above 10 for 30 minutes.
      - alert: UnknownsStuckProcessing
        expr: greyqueue_processing_count > 10
        for: 30m
        labels:
          severity: warning
          team: operations
        annotations:
          summary: 'Many entries stuck in processing'
          description: |
            {{ $value }} entries in Processing status for extended period.
            Check for processing bottlenecks or failures.
          runbook_url: 'https://docs.stella-ops.org/operations/unknowns-queue-runbook#stuck-processing'
          
      # Warning: the timeout counter grew by more than 5 within the trailing
      # hour; fires without a hold period (for: 0m).
      # increase() extrapolates across the window, so $value is usually not a
      # whole number; it is rounded for display in the description.
      - alert: UnknownsProcessingTimeout
        expr: increase(greyqueue_timeout_total[1h]) > 5
        for: 0m
        labels:
          severity: warning
          team: operations
        annotations:
          summary: 'Processing timeouts occurring'
          description: |
            {{ $value | printf "%.0f" }} processing timeouts in the last hour.
            Entries are being forcefully retried.
          runbook_url: 'https://docs.stella-ops.org/operations/unknowns-queue-runbook#timeouts'
          
      # Critical: the watchdog failure counter grew at all within the trailing
      # hour; fires without a hold period (for: 0m).
      # increase() extrapolates across the window, so $value is usually not a
      # whole number; it is rounded for display in the description.
      - alert: UnknownsProcessingFailures
        expr: increase(greyqueue_watchdog_failed_total[1h]) > 0
        for: 0m
        labels:
          severity: critical
          team: operations
        annotations:
          summary: 'Processing failures detected'
          description: |
            {{ $value | printf "%.0f" }} entries moved to Failed status in the last hour.
            Manual intervention may be required.
          runbook_url: 'https://docs.stella-ops.org/operations/unknowns-queue-runbook#failures'

      # =============================================================================
      # Escalation Alerts
      # =============================================================================
      
      # Warning: the escalation counter grew by more than 10 within the
      # trailing hour; fires without a hold period (for: 0m).
      # increase() extrapolates across the window, so $value is usually not a
      # whole number; it is rounded for display in the description.
      - alert: UnknownsEscalationRate
        expr: increase(unknowns_escalated_total[1h]) > 10
        for: 0m
        labels:
          severity: warning
          team: security
        annotations:
          summary: 'High escalation rate'
          description: |
            {{ $value | printf "%.0f" }} unknowns escalated in the last hour.
            Review escalation criteria or upstream data quality.
          runbook_url: 'https://docs.stella-ops.org/operations/unknowns-queue-runbook#escalations'

      # =============================================================================
      # Service Health Alerts
      # =============================================================================
      
      # Critical: neither the hot nor the warm depth gauge has had any series
      # for 5 minutes. absent() yields a value only when its selector matches
      # nothing, so the `and` requires both gauges to be missing.
      # NOTE(review): unknowns_queue_depth_cold is not checked - confirm that
      # is intended.
      - alert: UnknownsSlaMonitorDown
        expr: absent(unknowns_queue_depth_hot) and absent(unknowns_queue_depth_warm)
        for: 5m
        labels:
          severity: critical
          team: operations
        annotations:
          summary: 'Unknowns SLA monitor not reporting'
          description: |
            No metrics received from unknowns SLA monitor.
            Check if the service is running.
          runbook_url: 'https://docs.stella-ops.org/operations/unknowns-queue-runbook#service-down'
          
      # Critical: probe_success for job "unknowns-healthcheck" has been 0 for
      # 5 minutes.
      - alert: UnknownsHealthCheckUnhealthy
        expr: probe_success{job="unknowns-healthcheck"} == 0
        for: 5m
        labels:
          severity: critical
          team: operations
        annotations:
          summary: 'Unknowns service health check failing'
          description: |
            Health check endpoint returning unhealthy.
            SLA breaches may exist.
          runbook_url: 'https://docs.stella-ops.org/operations/unknowns-queue-runbook#health-check'