# Unknowns Queue Alert Rules
# Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement (UQ-007)
#
# Deploy to Prometheus/Alertmanager
#
# One rule group, evaluated every minute. Every rule carries a `severity`
# and a `team` label plus summary/description/runbook_url annotations.
# Descriptions are literal block scalars (`|`) so each sentence renders on
# its own line.

groups:
  - name: unknowns-queue
    interval: 1m
    rules:
      # =============================================================================
      # SLA Alerts
      # =============================================================================

      # Compliance is compared as a ratio (0.80 == 80%); must hold for 5 minutes.
      - alert: UnknownsSlaBreachCritical
        expr: unknowns_sla_compliance < 0.80
        for: 5m
        labels:
          severity: critical
          team: security
        annotations:
          summary: "SLA compliance dropped below 80%"
          description: |
            SLA compliance is {{ $value | humanizePercentage }}.
            Multiple unknowns have breached their SLA deadlines.
            Immediate action required.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-breach"

      # Covers the [0.80, 0.95) band only, so it does not overlap with the
      # critical rule above.
      - alert: UnknownsSlaBreachWarning
        expr: unknowns_sla_compliance < 0.95 and unknowns_sla_compliance >= 0.80
        for: 15m
        labels:
          severity: warning
          team: security
        annotations:
          summary: "SLA compliance below 95%"
          description: |
            SLA compliance is {{ $value | humanizePercentage }}.
            Some unknowns are approaching or have breached SLA.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-warning"

      # `for: 0m` fires on the first evaluation in which the breach counter
      # shows any increase over the trailing hour.
      - alert: UnknownsSlaBreach
        expr: increase(unknowns_sla_breach_total[1h]) > 0
        for: 0m
        labels:
          severity: critical
          team: security
        annotations:
          summary: "Unknown SLA breached"
          description: |
            {{ $value }} unknown(s) have breached SLA in the last hour.
            Check the unknowns queue dashboard for affected entries.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-breach"

      # =============================================================================
      # Queue Depth Alerts
      # =============================================================================

      # More than 5 HOT entries sustained for 10 minutes.
      - alert: UnknownsHotQueueHigh
        expr: unknowns_queue_depth_hot > 5
        for: 10m
        labels:
          severity: critical
          team: security
        annotations:
          summary: "High number of HOT unknowns"
          description: |
            {{ $value }} HOT unknowns in queue.
            HOT unknowns have 24-hour SLA and block releases.
            Prioritize resolution immediately.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#hot-queue"

      # The HOT queue has been non-empty for a full hour.
      # NOTE(review): the description previously claimed "50% of 24-hour SLA
      # elapsed", which contradicts `for: 1h` (1h is ~4% of 24h). The text was
      # corrected; the timing is unchanged. If 50% was the real intent, `for`
      # should become 12h instead - confirm with the rule owner.
      - alert: UnknownsHotQueuePresent
        expr: unknowns_queue_depth_hot > 0
        for: 1h
        labels:
          severity: warning
          team: security
        annotations:
          summary: "HOT unknowns present for over 1 hour"
          description: |
            {{ $value }} HOT unknown(s) have been in queue for over 1 hour.
            HOT unknowns have a 24-hour SLA; prioritize triage.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#hot-queue"

      # Sum of the three depth gauges. PromQL `+` only yields a result when
      # all three series are present with matching labels.
      - alert: UnknownsQueueBacklog
        expr: (unknowns_queue_depth_hot + unknowns_queue_depth_warm + unknowns_queue_depth_cold) > 100
        for: 30m
        labels:
          severity: warning
          team: operations
        annotations:
          summary: "Unknowns queue backlog growing"
          description: |
            Total queue depth is {{ $value }}.
            Consider scaling processing capacity or reviewing automation.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#backlog"

      # =============================================================================
      # Processing Alerts
      # =============================================================================

      # More than 10 entries in Processing, sustained for 30 minutes.
      - alert: UnknownsStuckProcessing
        expr: greyqueue_processing_count > 10
        for: 30m
        labels:
          severity: warning
          team: operations
        annotations:
          summary: "Many entries stuck in processing"
          description: |
            {{ $value }} entries in Processing status for extended period.
            Check for processing bottlenecks or failures.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#stuck-processing"

      # More than 5 timeouts within the trailing hour; fires immediately.
      - alert: UnknownsProcessingTimeout
        expr: increase(greyqueue_timeout_total[1h]) > 5
        for: 0m
        labels:
          severity: warning
          team: operations
        annotations:
          summary: "Processing timeouts occurring"
          description: |
            {{ $value }} processing timeouts in the last hour.
            Entries are being forcefully retried.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#timeouts"

      # Any increase of the watchdog failure counter in the trailing hour.
      - alert: UnknownsProcessingFailures
        expr: increase(greyqueue_watchdog_failed_total[1h]) > 0
        for: 0m
        labels:
          severity: critical
          team: operations
        annotations:
          summary: "Processing failures detected"
          description: |
            {{ $value }} entries moved to Failed status in the last hour.
            Manual intervention may be required.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#failures"

      # =============================================================================
      # Escalation Alerts
      # =============================================================================

      # More than 10 escalations within the trailing hour; fires immediately.
      - alert: UnknownsEscalationRate
        expr: increase(unknowns_escalated_total[1h]) > 10
        for: 0m
        labels:
          severity: warning
          team: security
        annotations:
          summary: "High escalation rate"
          description: |
            {{ $value }} unknowns escalated in the last hour.
            Review escalation criteria or upstream data quality.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#escalations"

      # =============================================================================
      # Service Health Alerts
      # =============================================================================

      # `and` requires BOTH depth metrics to be missing, so a single missing
      # series does not trigger this alert.
      - alert: UnknownsSlaMonitorDown
        expr: absent(unknowns_queue_depth_hot) and absent(unknowns_queue_depth_warm)
        for: 5m
        labels:
          severity: critical
          team: operations
        annotations:
          summary: "Unknowns SLA monitor not reporting"
          description: |
            No metrics received from unknowns SLA monitor.
            Check if the service is running.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#service-down"

      # Probe for job "unknowns-healthcheck" has reported 0 for 5 minutes.
      - alert: UnknownsHealthCheckUnhealthy
        expr: probe_success{job="unknowns-healthcheck"} == 0
        for: 5m
        labels:
          severity: critical
          team: operations
        annotations:
          summary: "Unknowns service health check failing"
          description: |
            Health check endpoint returning unhealthy.
            SLA breaches may exist.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#health-check"