Files
git.stella-ops.org/devops/observability/grafana/dashboards/unknowns-queue-dashboard.json

362 lines
9.3 KiB
JSON

{
"__inputs": [],
"annotations": {
"list": []
},
"description": "Unknowns Queue monitoring dashboard - Sprint SPRINT_20260118_018_Unknowns_queue_enhancement (UQ-007)",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"title": "Queue Overview",
"type": "row",
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"collapsed": false
},
{
"title": "Total Queue Depth",
"type": "stat",
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 },
"targets": [
{
"expr": "sum(unknowns_queue_depth_hot + unknowns_queue_depth_warm + unknowns_queue_depth_cold)",
"legendFormat": "Total"
}
],
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "green" },
{ "value": 50, "color": "yellow" },
{ "value": 100, "color": "red" }
]
}
}
}
},
{
"title": "HOT Unknowns",
"type": "stat",
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 },
"targets": [
{
"expr": "unknowns_queue_depth_hot",
"legendFormat": "HOT"
}
],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "green" },
{ "value": 1, "color": "orange" },
{ "value": 5, "color": "red" }
]
}
}
}
},
{
"title": "WARM Unknowns",
"type": "stat",
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 },
"targets": [
{
"expr": "unknowns_queue_depth_warm",
"legendFormat": "WARM"
}
],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "green" },
{ "value": 10, "color": "yellow" },
{ "value": 25, "color": "orange" }
]
}
}
}
},
{
"title": "COLD Unknowns",
"type": "stat",
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 },
"targets": [
{
"expr": "unknowns_queue_depth_cold",
"legendFormat": "COLD"
}
]
},
{
"title": "SLA Compliance",
"type": "gauge",
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 },
"targets": [
{
"expr": "unknowns_sla_compliance * 100",
"legendFormat": "Compliance %"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "red" },
{ "value": 80, "color": "yellow" },
{ "value": 95, "color": "green" }
]
}
}
}
},
{
"title": "In Processing",
"type": "stat",
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 },
"targets": [
{
"expr": "greyqueue_processing_count",
"legendFormat": "Processing"
}
],
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "green" },
{ "value": 5, "color": "yellow" },
{ "value": 10, "color": "red" }
]
}
}
}
},
{
"title": "Queue Depth Over Time",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 },
"targets": [
{
"expr": "unknowns_queue_depth_hot",
"legendFormat": "HOT"
},
{
"expr": "unknowns_queue_depth_warm",
"legendFormat": "WARM"
},
{
"expr": "unknowns_queue_depth_cold",
"legendFormat": "COLD"
}
],
"fieldConfig": {
"defaults": {
"custom": {
"lineWidth": 2,
"fillOpacity": 20
}
},
"overrides": [
{ "matcher": { "id": "byName", "options": "HOT" }, "properties": [{ "id": "color", "value": { "fixedColor": "red" } }] },
{ "matcher": { "id": "byName", "options": "WARM" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange" } }] },
{ "matcher": { "id": "byName", "options": "COLD" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue" } }] }
]
}
},
{
"title": "SLA Compliance Over Time",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 },
"targets": [
{
"expr": "unknowns_sla_compliance * 100",
"legendFormat": "SLA Compliance %"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 80, "color": "yellow" },
{ "value": 95, "color": "green" }
]
}
}
}
},
{
"title": "Operations",
"type": "row",
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 },
"collapsed": false
},
{
"title": "State Transitions (Rate)",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 },
"targets": [
{
"expr": "rate(unknowns_state_transitions_total[5m])",
"legendFormat": "{{from_state}} → {{to_state}}"
}
]
},
{
"title": "Processing Time (p95)",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 },
"targets": [
{
"expr": "histogram_quantile(0.95, rate(unknowns_processing_time_seconds_bucket[5m]))",
"legendFormat": "p95 Processing Time"
}
],
"fieldConfig": {
"defaults": {
"unit": "s"
}
}
},
{
"title": "Escalations & Failures",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 },
"targets": [
{
"expr": "rate(unknowns_escalated_total[1h])",
"legendFormat": "Escalations"
},
{
"expr": "rate(unknowns_demoted_total[1h])",
"legendFormat": "Demotions"
},
{
"expr": "rate(unknowns_expired_total[1h])",
"legendFormat": "Expired"
},
{
"expr": "rate(greyqueue_watchdog_failed_total[1h])",
"legendFormat": "Failed"
}
]
},
{
"title": "Resolution Time by Band (p50)",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 },
"targets": [
{
"expr": "histogram_quantile(0.50, rate(unknowns_resolution_time_hours_bucket{band=\"hot\"}[1h]))",
"legendFormat": "HOT (p50)"
},
{
"expr": "histogram_quantile(0.50, rate(unknowns_resolution_time_hours_bucket{band=\"warm\"}[1h]))",
"legendFormat": "WARM (p50)"
},
{
"expr": "histogram_quantile(0.50, rate(unknowns_resolution_time_hours_bucket{band=\"cold\"}[1h]))",
"legendFormat": "COLD (p50)"
}
],
"fieldConfig": {
"defaults": {
"unit": "h"
}
}
},
{
"title": "Watchdog Metrics",
"type": "row",
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 },
"collapsed": false
},
{
"title": "Stuck & Timeout Events",
"type": "timeseries",
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 31 },
"targets": [
{
"expr": "rate(greyqueue_stuck_total[1h]) * 3600",
"legendFormat": "Stuck (per hour)"
},
{
"expr": "rate(greyqueue_timeout_total[1h]) * 3600",
"legendFormat": "Timeouts (per hour)"
},
{
"expr": "rate(greyqueue_watchdog_retry_total[1h]) * 3600",
"legendFormat": "Forced Retries (per hour)"
}
]
},
{
"title": "Currently Processing",
"type": "stat",
"gridPos": { "h": 6, "w": 6, "x": 12, "y": 31 },
"targets": [
{
"expr": "greyqueue_processing_count",
"legendFormat": "In Processing"
}
]
},
{
"title": "SLA Breaches (Last 24h)",
"type": "stat",
"gridPos": { "h": 6, "w": 6, "x": 18, "y": 31 },
"targets": [
{
"expr": "increase(unknowns_sla_breach_total[24h])",
"legendFormat": "Breaches (24h)"
}
],
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "green" },
{ "value": 1, "color": "red" }
]
}
}
}
}
],
"refresh": "30s",
"schemaVersion": 38,
"style": "dark",
"tags": ["unknowns", "security", "sla"],
"templating": {
"list": []
},
"time": {
"from": "now-6h",
"to": "now"
},
"title": "Unknowns Queue Dashboard",
"uid": "unknowns-queue-dashboard",
"version": 1
}