doctor enhancements, setup enhancements, ui functionality and design consolidation, test project fixes, product advisory attestation/rekor and delta verification enhancements
This commit is contained in:
@@ -0,0 +1,361 @@
|
||||
{
|
||||
"__inputs": [],
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"description": "Unknowns Queue monitoring dashboard - Sprint SPRINT_20260118_018_Unknowns_queue_enhancement (UQ-007)",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"title": "Queue Overview",
|
||||
"type": "row",
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"title": "Total Queue Depth",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(unknowns_queue_depth_hot + unknowns_queue_depth_warm + unknowns_queue_depth_cold)",
|
||||
"legendFormat": "Total"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "value": 0, "color": "green" },
|
||||
{ "value": 50, "color": "yellow" },
|
||||
{ "value": 100, "color": "red" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "HOT Unknowns",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "unknowns_queue_depth_hot",
|
||||
"legendFormat": "HOT"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "value": 0, "color": "green" },
|
||||
{ "value": 1, "color": "orange" },
|
||||
{ "value": 5, "color": "red" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "WARM Unknowns",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "unknowns_queue_depth_warm",
|
||||
"legendFormat": "WARM"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "value": 0, "color": "green" },
|
||||
{ "value": 10, "color": "yellow" },
|
||||
{ "value": 25, "color": "orange" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "COLD Unknowns",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "unknowns_queue_depth_cold",
|
||||
"legendFormat": "COLD"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "SLA Compliance",
|
||||
"type": "gauge",
|
||||
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "unknowns_sla_compliance * 100",
|
||||
"legendFormat": "Compliance %"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "value": 0, "color": "red" },
|
||||
{ "value": 80, "color": "yellow" },
|
||||
{ "value": 95, "color": "green" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Stuck Processing",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "greyqueue_processing_count",
|
||||
"legendFormat": "Processing"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "value": 0, "color": "green" },
|
||||
{ "value": 5, "color": "yellow" },
|
||||
{ "value": 10, "color": "red" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Queue Depth Over Time",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "unknowns_queue_depth_hot",
|
||||
"legendFormat": "HOT"
|
||||
},
|
||||
{
|
||||
"expr": "unknowns_queue_depth_warm",
|
||||
"legendFormat": "WARM"
|
||||
},
|
||||
{
|
||||
"expr": "unknowns_queue_depth_cold",
|
||||
"legendFormat": "COLD"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 20
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "HOT" }, "properties": [{ "id": "color", "value": { "fixedColor": "red" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "WARM" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "COLD" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue" } }] }
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "SLA Compliance Over Time",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "unknowns_sla_compliance * 100",
|
||||
"legendFormat": "SLA Compliance %"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "value": 0, "color": "red" },
{ "value": 80, "color": "yellow" },
|
||||
{ "value": 95, "color": "green" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Operations",
|
||||
"type": "row",
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 },
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"title": "State Transitions (Rate)",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(unknowns_state_transitions_total[5m])",
|
||||
"legendFormat": "{{from_state}} → {{to_state}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Processing Time (p95)",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 },
|
||||
"targets": [
|
||||
{
  "expr": "histogram_quantile(0.95, sum by (le) (rate(unknowns_processing_time_seconds_bucket[5m])))",
  "legendFormat": "p95 Processing Time"
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Escalations & Failures",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(unknowns_escalated_total[1h])",
|
||||
"legendFormat": "Escalations"
|
||||
},
|
||||
{
|
||||
"expr": "rate(unknowns_demoted_total[1h])",
|
||||
"legendFormat": "Demotions"
|
||||
},
|
||||
{
|
||||
"expr": "rate(unknowns_expired_total[1h])",
|
||||
"legendFormat": "Expired"
|
||||
},
|
||||
{
|
||||
"expr": "rate(greyqueue_watchdog_failed_total[1h])",
|
||||
"legendFormat": "Failed"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Resolution Time by Band",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 },
|
||||
"targets": [
|
||||
{
  "expr": "histogram_quantile(0.50, sum by (le) (rate(unknowns_resolution_time_hours_bucket{band=\"hot\"}[1h])))",
  "legendFormat": "HOT (p50)"
},
{
  "expr": "histogram_quantile(0.50, sum by (le) (rate(unknowns_resolution_time_hours_bucket{band=\"warm\"}[1h])))",
  "legendFormat": "WARM (p50)"
},
{
  "expr": "histogram_quantile(0.50, sum by (le) (rate(unknowns_resolution_time_hours_bucket{band=\"cold\"}[1h])))",
  "legendFormat": "COLD (p50)"
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "h"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Watchdog Metrics",
|
||||
"type": "row",
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 },
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"title": "Stuck & Timeout Events",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 31 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(greyqueue_stuck_total[1h]) * 3600",
|
||||
"legendFormat": "Stuck (per hour)"
|
||||
},
|
||||
{
|
||||
"expr": "rate(greyqueue_timeout_total[1h]) * 3600",
|
||||
"legendFormat": "Timeouts (per hour)"
|
||||
},
|
||||
{
|
||||
"expr": "rate(greyqueue_watchdog_retry_total[1h]) * 3600",
|
||||
"legendFormat": "Forced Retries (per hour)"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Currently Processing",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 6, "w": 6, "x": 12, "y": 31 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "greyqueue_processing_count",
|
||||
"legendFormat": "In Processing"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "SLA Breaches Today",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 6, "w": 6, "x": 18, "y": 31 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "increase(unknowns_sla_breach_total[24h])",
|
||||
"legendFormat": "Breaches (24h)"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "value": 0, "color": "green" },
|
||||
{ "value": 1, "color": "red" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 38,
|
||||
"style": "dark",
|
||||
"tags": ["unknowns", "security", "sla"],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-6h",
|
||||
"to": "now"
|
||||
},
|
||||
"title": "Unknowns Queue Dashboard",
|
||||
"uid": "unknowns-queue-dashboard",
|
||||
"version": 1
|
||||
}
|
||||
186
devops/observability/prometheus/rules/unknowns-queue-alerts.yaml
Normal file
186
devops/observability/prometheus/rules/unknowns-queue-alerts.yaml
Normal file
@@ -0,0 +1,186 @@
|
||||
# Unknowns Queue Alert Rules
|
||||
# Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement (UQ-007)
|
||||
#
|
||||
# Deploy to Prometheus/Alertmanager
|
||||
|
||||
groups:
|
||||
- name: unknowns-queue
|
||||
interval: 1m
|
||||
rules:
|
||||
# =============================================================================
|
||||
# SLA Alerts
|
||||
# =============================================================================
|
||||
|
||||
- alert: UnknownsSlaBreachCritical
|
||||
expr: unknowns_sla_compliance < 0.80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
team: security
|
||||
annotations:
|
||||
summary: "SLA compliance dropped below 80%"
|
||||
description: |
|
||||
SLA compliance is {{ $value | humanizePercentage }}.
|
||||
Multiple unknowns have breached their SLA deadlines.
|
||||
Immediate action required.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-breach"
|
||||
|
||||
- alert: UnknownsSlaBreachWarning
|
||||
expr: unknowns_sla_compliance < 0.95 and unknowns_sla_compliance >= 0.80
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
team: security
|
||||
annotations:
|
||||
summary: "SLA compliance below 95%"
|
||||
description: |
|
||||
SLA compliance is {{ $value | humanizePercentage }}.
|
||||
Some unknowns are approaching or have breached SLA.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-warning"
|
||||
|
||||
- alert: UnknownsSlaBreach
|
||||
expr: increase(unknowns_sla_breach_total[1h]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
team: security
|
||||
annotations:
|
||||
summary: "Unknown SLA breached"
|
||||
description: |
|
||||
{{ $value }} unknown(s) have breached SLA in the last hour.
|
||||
Check the unknowns queue dashboard for affected entries.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-breach"
|
||||
|
||||
# =============================================================================
|
||||
# Queue Depth Alerts
|
||||
# =============================================================================
|
||||
|
||||
- alert: UnknownsHotQueueHigh
|
||||
expr: unknowns_queue_depth_hot > 5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
team: security
|
||||
annotations:
|
||||
summary: "High number of HOT unknowns"
|
||||
description: |
|
||||
{{ $value }} HOT unknowns in queue.
|
||||
HOT unknowns have 24-hour SLA and block releases.
|
||||
Prioritize resolution immediately.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#hot-queue"
|
||||
|
||||
- alert: UnknownsHotQueuePresent
|
||||
expr: unknowns_queue_depth_hot > 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
team: security
|
||||
annotations:
|
||||
summary: "HOT unknowns present for over 1 hour"
|
||||
description: |
|
||||
{{ $value }} HOT unknown(s) have been in queue for over 1 hour.
|
||||
HOT unknowns have a 24-hour SLA; resolve them before the deadline is breached.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#hot-queue"
|
||||
|
||||
- alert: UnknownsQueueBacklog
|
||||
expr: (unknowns_queue_depth_hot + unknowns_queue_depth_warm + unknowns_queue_depth_cold) > 100
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
team: operations
|
||||
annotations:
|
||||
summary: "Unknowns queue backlog growing"
|
||||
description: |
|
||||
Total queue depth is {{ $value }}.
|
||||
Consider scaling processing capacity or reviewing automation.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#backlog"
|
||||
|
||||
# =============================================================================
|
||||
# Processing Alerts
|
||||
# =============================================================================
|
||||
|
||||
- alert: UnknownsStuckProcessing
|
||||
expr: greyqueue_processing_count > 10
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
team: operations
|
||||
annotations:
|
||||
summary: "Many entries stuck in processing"
|
||||
description: |
|
||||
{{ $value }} entries in Processing status for extended period.
|
||||
Check for processing bottlenecks or failures.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#stuck-processing"
|
||||
|
||||
- alert: UnknownsProcessingTimeout
|
||||
expr: increase(greyqueue_timeout_total[1h]) > 5
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
team: operations
|
||||
annotations:
|
||||
summary: "Processing timeouts occurring"
|
||||
description: |
|
||||
{{ $value }} processing timeouts in the last hour.
|
||||
Entries are being forcefully retried.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#timeouts"
|
||||
|
||||
- alert: UnknownsProcessingFailures
|
||||
expr: increase(greyqueue_watchdog_failed_total[1h]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
team: operations
|
||||
annotations:
|
||||
summary: "Processing failures detected"
|
||||
description: |
|
||||
{{ $value }} entries moved to Failed status in the last hour.
|
||||
Manual intervention may be required.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#failures"
|
||||
|
||||
# =============================================================================
|
||||
# Escalation Alerts
|
||||
# =============================================================================
|
||||
|
||||
- alert: UnknownsEscalationRate
|
||||
expr: increase(unknowns_escalated_total[1h]) > 10
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
team: security
|
||||
annotations:
|
||||
summary: "High escalation rate"
|
||||
description: |
|
||||
{{ $value }} unknowns escalated in the last hour.
|
||||
Review escalation criteria or upstream data quality.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#escalations"
|
||||
|
||||
# =============================================================================
|
||||
# Service Health Alerts
|
||||
# =============================================================================
|
||||
|
||||
- alert: UnknownsSlaMonitorDown
|
||||
expr: absent(unknowns_queue_depth_hot) and absent(unknowns_queue_depth_warm)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
team: operations
|
||||
annotations:
|
||||
summary: "Unknowns SLA monitor not reporting"
|
||||
description: |
|
||||
No metrics received from unknowns SLA monitor.
|
||||
Check if the service is running.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#service-down"
|
||||
|
||||
- alert: UnknownsHealthCheckUnhealthy
|
||||
expr: probe_success{job="unknowns-healthcheck"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
team: operations
|
||||
annotations:
|
||||
summary: "Unknowns service health check failing"
|
||||
description: |
|
||||
Health check endpoint returning unhealthy.
|
||||
SLA breaches may exist.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#health-check"
|
||||
Reference in New Issue
Block a user