Doctor enhancements, setup enhancements, UI functionality and design consolidation, test project fixes, product advisory attestation/Rekor and delta verification enhancements
This commit is contained in:
186
devops/observability/prometheus/rules/unknowns-queue-alerts.yaml
Normal file
186
devops/observability/prometheus/rules/unknowns-queue-alerts.yaml
Normal file
@@ -0,0 +1,186 @@
|
||||
# Unknowns Queue Alert Rules
# Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement (UQ-007)
#
# Deploy to Prometheus as a rule file; firing alerts are routed by Alertmanager.
|
||||
|
||||
groups:
  # Single rule group holding every unknowns-queue alert.
  - name: unknowns-queue
    # Evaluate the rules of this group once per minute.
    interval: 1m
    rules:
      # =============================================================================
      # SLA Alerts
      # =============================================================================

      # Compliance ratio has stayed below 0.80 for 5 minutes.
      - alert: UnknownsSlaBreachCritical
        expr: unknowns_sla_compliance < 0.80
        for: 5m
        labels:
          severity: critical
          team: security
        annotations:
          summary: "SLA compliance dropped below 80%"
          description: |
            SLA compliance is {{ $value | humanizePercentage }}.
            Multiple unknowns have breached their SLA deadlines.
            Immediate action required.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-breach"

      # Compliance ratio in the band [0.80, 0.95) for 15 minutes. The lower
      # bound keeps this rule from matching values that already satisfy
      # UnknownsSlaBreachCritical.
      - alert: UnknownsSlaBreachWarning
        expr: unknowns_sla_compliance < 0.95 and unknowns_sla_compliance >= 0.80
        for: 15m
        labels:
          severity: warning
          team: security
        annotations:
          summary: "SLA compliance below 95%"
          description: |
            SLA compliance is {{ $value | humanizePercentage }}.
            Some unknowns are approaching or have breached SLA.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-warning"

      # Breach counter grew within the trailing hour. `for: 0m` fires on the
      # first evaluation where the condition holds. increase() extrapolates,
      # so the reported $value can be fractional.
      - alert: UnknownsSlaBreach
        expr: increase(unknowns_sla_breach_total[1h]) > 0
        for: 0m
        labels:
          severity: critical
          team: security
        annotations:
          summary: "Unknown SLA breached"
          description: |
            {{ $value }} unknown(s) have breached SLA in the last hour.
            Check the unknowns queue dashboard for affected entries.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-breach"

      # =============================================================================
      # Queue Depth Alerts
      # =============================================================================

      # More than 5 HOT entries queued continuously for 10 minutes.
      - alert: UnknownsHotQueueHigh
        expr: unknowns_queue_depth_hot > 5
        for: 10m
        labels:
          severity: critical
          team: security
        annotations:
          summary: "High number of HOT unknowns"
          description: |
            {{ $value }} HOT unknowns in queue.
            HOT unknowns have 24-hour SLA and block releases.
            Prioritize resolution immediately.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#hot-queue"

      # HOT queue has been non-empty for a full hour.
      # `for: 1h` is roughly 4% of the 24-hour HOT SLA, so the description must
      # not claim that half of the SLA has elapsed; use `for: 12h` instead if a
      # half-SLA warning is what is wanted.
      - alert: UnknownsHotQueuePresent
        expr: unknowns_queue_depth_hot > 0
        for: 1h
        labels:
          severity: warning
          team: security
        annotations:
          summary: "HOT unknowns present for over 1 hour"
          description: |
            {{ $value }} HOT unknown(s) have been in queue for over 1 hour.
            HOT unknowns have a 24-hour SLA; resolve them before the deadline.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#hot-queue"

      # Sum of the hot, warm and cold depth gauges above 100 for 30 minutes.
      - alert: UnknownsQueueBacklog
        expr: (unknowns_queue_depth_hot + unknowns_queue_depth_warm + unknowns_queue_depth_cold) > 100
        for: 30m
        labels:
          severity: warning
          team: operations
        annotations:
          summary: "Unknowns queue backlog growing"
          description: |
            Total queue depth is {{ $value }}.
            Consider scaling processing capacity or reviewing automation.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#backlog"

      # =============================================================================
      # Processing Alerts
      # =============================================================================

      # More than 10 entries reported as processing for 30 minutes straight.
      - alert: UnknownsStuckProcessing
        expr: greyqueue_processing_count > 10
        for: 30m
        labels:
          severity: warning
          team: operations
        annotations:
          summary: "Many entries stuck in processing"
          description: |
            {{ $value }} entries in Processing status for extended period.
            Check for processing bottlenecks or failures.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#stuck-processing"

      # Timeout counter grew by more than 5 within the trailing hour.
      - alert: UnknownsProcessingTimeout
        expr: increase(greyqueue_timeout_total[1h]) > 5
        for: 0m
        labels:
          severity: warning
          team: operations
        annotations:
          summary: "Processing timeouts occurring"
          description: |
            {{ $value }} processing timeouts in the last hour.
            Entries are being forcefully retried.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#timeouts"

      # Any growth of the watchdog failure counter within the trailing hour.
      - alert: UnknownsProcessingFailures
        expr: increase(greyqueue_watchdog_failed_total[1h]) > 0
        for: 0m
        labels:
          severity: critical
          team: operations
        annotations:
          summary: "Processing failures detected"
          description: |
            {{ $value }} entries moved to Failed status in the last hour.
            Manual intervention may be required.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#failures"

      # =============================================================================
      # Escalation Alerts
      # =============================================================================

      # Escalation counter grew by more than 10 within the trailing hour.
      - alert: UnknownsEscalationRate
        expr: increase(unknowns_escalated_total[1h]) > 10
        for: 0m
        labels:
          severity: warning
          team: security
        annotations:
          summary: "High escalation rate"
          description: |
            {{ $value }} unknowns escalated in the last hour.
            Review escalation criteria or upstream data quality.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#escalations"

      # =============================================================================
      # Service Health Alerts
      # =============================================================================

      # absent() only yields a sample when no series with that name exists, so
      # this fires when BOTH the hot and warm depth gauges are missing for
      # 5 minutes.
      - alert: UnknownsSlaMonitorDown
        expr: absent(unknowns_queue_depth_hot) and absent(unknowns_queue_depth_warm)
        for: 5m
        labels:
          severity: critical
          team: operations
        annotations:
          summary: "Unknowns SLA monitor not reporting"
          description: |
            No metrics received from unknowns SLA monitor.
            Check if the service is running.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#service-down"

      # probe_success for the "unknowns-healthcheck" job has been 0 for
      # 5 minutes.
      # NOTE(review): presumably scraped through a blackbox-style prober;
      # confirm that the scrape job name matches.
      - alert: UnknownsHealthCheckUnhealthy
        expr: probe_success{job="unknowns-healthcheck"} == 0
        for: 5m
        labels:
          severity: critical
          team: operations
        annotations:
          summary: "Unknowns service health check failing"
          description: |
            Health check endpoint returning unhealthy.
            SLA breaches may exist.
          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#health-check"
|
||||
Reference in New Issue
Block a user