187 lines
7.2 KiB
YAML
187 lines
7.2 KiB
YAML
# Unknowns Queue Alert Rules
|
|
# Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement (UQ-007)
|
|
#
|
|
# Deploy to Prometheus/Alertmanager
|
|
|
|
# One rule group; its rules are evaluated together once per `interval`.
groups:
  - name: unknowns-queue
    interval: 1m
    rules:
|
|
# =============================================================================
|
|
# SLA Alerts
|
|
# =============================================================================
|
|
|
|
# Critical tier: unknowns_sla_compliance has stayed under 0.80 for 5 minutes.
# The value is rendered as a percentage through humanizePercentage.
- alert: UnknownsSlaBreachCritical
  expr: unknowns_sla_compliance < 0.80
  for: 5m
  labels:
    team: security
    severity: critical
  annotations:
    summary: 'SLA compliance dropped below 80%'
    runbook_url: 'https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-breach'
    description: |
      SLA compliance is {{ $value | humanizePercentage }}.
      Multiple unknowns have breached their SLA deadlines.
      Immediate action required.
|
|
|
|
# Warning tier: compliance in the half-open band [0.80, 0.95) for 15 minutes.
# The lower bound keeps this rule from firing once the value drops under 0.80.
- alert: UnknownsSlaBreachWarning
  # Folded scalar: the two lines join with a single space into one expression.
  expr: >-
    unknowns_sla_compliance < 0.95
    and unknowns_sla_compliance >= 0.80
  for: 15m
  labels:
    team: security
    severity: warning
  annotations:
    summary: 'SLA compliance below 95%'
    runbook_url: 'https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-warning'
    description: |
      SLA compliance is {{ $value | humanizePercentage }}.
      Some unknowns are approaching or have breached SLA.
|
|
|
|
# Fires immediately (for: 0m) whenever the breach counter grew during the
# trailing hour.
- alert: UnknownsSlaBreach
  expr: increase(unknowns_sla_breach_total[1h]) > 0
  for: 0m
  labels:
    severity: critical
    team: security
  annotations:
    summary: "Unknown SLA breached"
    # increase() extrapolates to the window edges, so the raw value is usually
    # fractional (e.g. 1.0169...); print it as a whole number of breaches.
    description: |
      {{ $value | printf "%.0f" }} unknown(s) have breached SLA in the last hour.
      Check the unknowns queue dashboard for affected entries.
    runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-breach"
|
|
|
|
# =============================================================================
|
|
# Queue Depth Alerts
|
|
# =============================================================================
|
|
|
|
# Critical: unknowns_queue_depth_hot has been above 5 for 10 minutes straight.
- alert: UnknownsHotQueueHigh
  expr: unknowns_queue_depth_hot > 5
  for: 10m
  labels:
    team: security
    severity: critical
  annotations:
    summary: 'High number of HOT unknowns'
    runbook_url: 'https://docs.stella-ops.org/operations/unknowns-queue-runbook#hot-queue'
    description: |
      {{ $value }} HOT unknowns in queue.
      HOT unknowns have 24-hour SLA and block releases.
      Prioritize resolution immediately.
|
|
|
|
# Warning: the HOT queue has been non-empty for a full hour.
# Note: the condition is "depth > 0 for 1h", i.e. the queue never drained in
# that time; it does not track the age of any individual entry.
- alert: UnknownsHotQueuePresent
  expr: unknowns_queue_depth_hot > 0
  for: 1h
  labels:
    severity: warning
    team: security
  annotations:
    summary: "HOT unknowns present for over 1 hour"
    # The previous text said "50% of 24-hour SLA elapsed", which contradicts
    # the 1h hold time above (1h is roughly 4% of 24h).
    description: |
      {{ $value }} HOT unknown(s) have been in queue for over 1 hour.
      HOT unknowns have a 24-hour SLA; resolve them before the deadline.
    runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#hot-queue"
|
|
|
|
# Warning: combined hot + warm + cold depth has exceeded 100 for 30 minutes.
- alert: UnknownsQueueBacklog
  # Folded scalar: the lines join with single spaces into one expression.
  expr: >-
    (unknowns_queue_depth_hot
    + unknowns_queue_depth_warm
    + unknowns_queue_depth_cold) > 100
  for: 30m
  labels:
    team: operations
    severity: warning
  annotations:
    summary: 'Unknowns queue backlog growing'
    runbook_url: 'https://docs.stella-ops.org/operations/unknowns-queue-runbook#backlog'
    description: |
      Total queue depth is {{ $value }}.
      Consider scaling processing capacity or reviewing automation.
|
|
|
|
# =============================================================================
|
|
# Processing Alerts
|
|
# =============================================================================
|
|
|
|
# Warning: greyqueue_processing_count has stayed above 10 for 30 minutes.
# NOTE(review): presumably a gauge of entries in Processing status, going by
# the description text - confirm against the exporter.
- alert: UnknownsStuckProcessing
  expr: greyqueue_processing_count > 10
  for: 30m
  labels:
    team: operations
    severity: warning
  annotations:
    summary: 'Many entries stuck in processing'
    runbook_url: 'https://docs.stella-ops.org/operations/unknowns-queue-runbook#stuck-processing'
    description: |
      {{ $value }} entries in Processing status for extended period.
      Check for processing bottlenecks or failures.
|
|
|
|
# Warning: the timeout counter grew by more than 5 during the trailing hour.
# Fires as soon as the condition holds (for: 0m).
- alert: UnknownsProcessingTimeout
  expr: increase(greyqueue_timeout_total[1h]) > 5
  for: 0m
  labels:
    severity: warning
    team: operations
  annotations:
    summary: "Processing timeouts occurring"
    # increase() extrapolates to the window edges, so the raw value is usually
    # fractional; print it as a whole number of timeouts.
    description: |
      {{ $value | printf "%.0f" }} processing timeouts in the last hour.
      Entries are being forcefully retried.
    runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#timeouts"
|
|
|
|
# Critical: any growth of the watchdog failure counter in the trailing hour.
# Fires as soon as the condition holds (for: 0m).
- alert: UnknownsProcessingFailures
  expr: increase(greyqueue_watchdog_failed_total[1h]) > 0
  for: 0m
  labels:
    severity: critical
    team: operations
  annotations:
    summary: "Processing failures detected"
    # increase() extrapolates to the window edges, so the raw value is usually
    # fractional; print it as a whole number of entries.
    description: |
      {{ $value | printf "%.0f" }} entries moved to Failed status in the last hour.
      Manual intervention may be required.
    runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#failures"
|
|
|
|
# =============================================================================
|
|
# Escalation Alerts
|
|
# =============================================================================
|
|
|
|
# Warning: more than 10 escalations counted during the trailing hour.
# Fires as soon as the condition holds (for: 0m).
- alert: UnknownsEscalationRate
  expr: increase(unknowns_escalated_total[1h]) > 10
  for: 0m
  labels:
    severity: warning
    team: security
  annotations:
    summary: "High escalation rate"
    # increase() extrapolates to the window edges, so the raw value is usually
    # fractional; print it as a whole number of escalations.
    description: |
      {{ $value | printf "%.0f" }} unknowns escalated in the last hour.
      Review escalation criteria or upstream data quality.
    runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#escalations"
|
|
|
|
# =============================================================================
|
|
# Service Health Alerts
|
|
# =============================================================================
|
|
|
|
# Critical: neither the hot nor the warm queue-depth series has existed for
# 5 minutes. Both must be absent for the rule to fire.
- alert: UnknownsSlaMonitorDown
  # Folded scalar: the two lines join with a single space into one expression.
  expr: >-
    absent(unknowns_queue_depth_hot)
    and absent(unknowns_queue_depth_warm)
  for: 5m
  labels:
    team: operations
    severity: critical
  annotations:
    summary: 'Unknowns SLA monitor not reporting'
    runbook_url: 'https://docs.stella-ops.org/operations/unknowns-queue-runbook#service-down'
    description: |
      No metrics received from unknowns SLA monitor.
      Check if the service is running.
|
|
|
|
# Critical: probe_success for job "unknowns-healthcheck" has been 0 for
# 5 minutes.
# NOTE(review): probe_success looks like a blackbox-style probe metric -
# confirm which exporter feeds this job.
- alert: UnknownsHealthCheckUnhealthy
  expr: 'probe_success{job="unknowns-healthcheck"} == 0'
  for: 5m
  labels:
    team: operations
    severity: critical
  annotations:
    summary: 'Unknowns service health check failing'
    runbook_url: 'https://docs.stella-ops.org/operations/unknowns-queue-runbook#health-check'
    description: |
      Health check endpoint returning unhealthy.
      SLA breaches may exist.
|