doctor enhancements, setup enhancements, UI functionality and design consolidation, test project fixes, product advisory attestation/Rekor and delta verification enhancements

This commit is contained in:
master
2026-01-19 09:02:59 +02:00
parent 8c4bf54aed
commit 17419ba7c4
809 changed files with 170738 additions and 12244 deletions

View File

@@ -0,0 +1,139 @@
-- -----------------------------------------------------------------------------
-- V20260119_001__Add_UnderReview_Escalated_Rejected_States.sql
-- Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement
-- Task: UQ-005 - Migration for existing entries (map to new states)
-- Description: Adds new state machine states and required columns
-- -----------------------------------------------------------------------------
-- Assignment and escalation tracking columns backing the UnderReview and
-- Escalated states. All four are nullable; IF NOT EXISTS keeps re-runs safe.
ALTER TABLE grey_queue_entries
    ADD COLUMN IF NOT EXISTS assignee          VARCHAR(255),
    ADD COLUMN IF NOT EXISTS assigned_at       TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS escalated_at      TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS escalation_reason TEXT;
-- Add the new enum values to grey_queue_status.
-- ADD VALUE IF NOT EXISTS is idempotent by itself, so neither a pg_enum probe
-- nor a catch-all exception handler is needed. A genuine failure (for example
-- a missing type or a missing anchor label such as 'retrying') now aborts the
-- migration instead of being swallowed and resurfacing later as a confusing
-- error in the statements that depend on these values.
-- NOTE(review): PostgreSQL does not allow a newly added enum value to be used
-- inside the transaction that added it. The partial index further down filters
-- on 'under_review' and 'escalated', so confirm the migration runner commits
-- these statements before that index is created.
ALTER TYPE grey_queue_status ADD VALUE IF NOT EXISTS 'under_review' AFTER 'retrying';
ALTER TYPE grey_queue_status ADD VALUE IF NOT EXISTS 'escalated' AFTER 'under_review';
ALTER TYPE grey_queue_status ADD VALUE IF NOT EXISTS 'rejected' AFTER 'resolved';
-- Indexes for the review / escalation query patterns. All three are partial
-- indexes, so rows that do not satisfy the predicate are not indexed.

-- Entries by reviewer; rows without an assignee are left out.
CREATE INDEX IF NOT EXISTS idx_grey_queue_assignee
    ON grey_queue_entries (assignee)
    WHERE assignee IS NOT NULL;

-- Entries by (status, reviewer), restricted to the two review states.
CREATE INDEX IF NOT EXISTS idx_grey_queue_status_assignee
    ON grey_queue_entries (status, assignee)
    WHERE status IN ('under_review', 'escalated');

-- Escalated entries, newest escalation first.
CREATE INDEX IF NOT EXISTS idx_grey_queue_escalated_at
    ON grey_queue_entries (escalated_at DESC)
    WHERE escalated_at IS NOT NULL;
-- Audit table for state transitions: one row per status change of a
-- grey_queue_entries row. from_state / to_state hold the status as text.
CREATE TABLE IF NOT EXISTS grey_queue_state_transitions (
    id               UUID         PRIMARY KEY DEFAULT gen_random_uuid(),
    entry_id         UUID         NOT NULL REFERENCES grey_queue_entries(id),
    tenant_id        VARCHAR(128) NOT NULL,
    from_state       VARCHAR(32)  NOT NULL,
    to_state         VARCHAR(32)  NOT NULL,
    transitioned_by  VARCHAR(255),
    reason           TEXT,
    transitioned_at  TIMESTAMPTZ  NOT NULL DEFAULT NOW(),
    metadata         JSONB
);

-- History of a single entry.
CREATE INDEX IF NOT EXISTS idx_grey_queue_transitions_entry
    ON grey_queue_state_transitions (entry_id);

-- Per-tenant timeline, most recent transition first.
CREATE INDEX IF NOT EXISTS idx_grey_queue_transitions_tenant_time
    ON grey_queue_state_transitions (tenant_id, transitioned_at DESC);
-- Trigger function: writes one audit row to grey_queue_state_transitions
-- whenever an update changes an entry's status. Updates that leave the status
-- untouched pass through without writing anything. Always returns NEW.
CREATE OR REPLACE FUNCTION record_grey_queue_transition()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $fn$
BEGIN
    -- Guard clause: nothing to audit when the status did not change
    -- (IS NOT DISTINCT FROM treats two NULLs as equal).
    IF OLD.status IS NOT DISTINCT FROM NEW.status THEN
        RETURN NEW;
    END IF;

    INSERT INTO grey_queue_state_transitions (
        entry_id,
        tenant_id,
        from_state,
        to_state,
        transitioned_by,
        transitioned_at
    )
    VALUES (
        NEW.id,
        NEW.tenant_id,
        CAST(OLD.status AS text),
        CAST(NEW.status AS text),
        -- The row's assignee is recorded as the actor; the current database
        -- role is the fallback when no assignee is set.
        COALESCE(NEW.assignee, current_user),
        NOW()
    );

    RETURN NEW;
END;
$fn$;
-- (Re)create the row-level trigger. CREATE TRIGGER has no IF NOT EXISTS form,
-- so the trigger is dropped first to keep the migration re-runnable.
DROP TRIGGER IF EXISTS trg_grey_queue_state_transition
    ON grey_queue_entries;

CREATE TRIGGER trg_grey_queue_state_transition
    AFTER UPDATE
    ON grey_queue_entries
    FOR EACH ROW
    EXECUTE FUNCTION record_grey_queue_transition();
-- Per-tenant summary: one row per tenant_id with a count for each status
-- (including the new review states) plus the overall total.
-- NOTE(review): CREATE OR REPLACE VIEW only allows new columns to be appended
-- after the existing ones; if an earlier version of this view already exists
-- with a different column order, confirm this statement still applies cleanly.
CREATE OR REPLACE VIEW grey_queue_summary AS
SELECT
    tenant_id,
    COUNT(*) FILTER (WHERE status = 'pending')      AS pending_count,
    COUNT(*) FILTER (WHERE status = 'processing')   AS processing_count,
    COUNT(*) FILTER (WHERE status = 'retrying')     AS retrying_count,
    COUNT(*) FILTER (WHERE status = 'under_review') AS under_review_count,
    COUNT(*) FILTER (WHERE status = 'escalated')    AS escalated_count,
    COUNT(*) FILTER (WHERE status = 'resolved')     AS resolved_count,
    COUNT(*) FILTER (WHERE status = 'rejected')     AS rejected_count,
    COUNT(*) FILTER (WHERE status = 'failed')       AS failed_count,
    COUNT(*) FILTER (WHERE status = 'expired')      AS expired_count,
    COUNT(*) FILTER (WHERE status = 'dismissed')    AS dismissed_count,
    COUNT(*)                                        AS total_count
FROM grey_queue_entries
GROUP BY tenant_id;
-- Catalog documentation for the columns added by this migration.
COMMENT ON COLUMN grey_queue_entries.assignee          IS 'Assignee for entries in UnderReview state (Sprint UQ-005)';
COMMENT ON COLUMN grey_queue_entries.assigned_at       IS 'When the entry was assigned for review (Sprint UQ-005)';
COMMENT ON COLUMN grey_queue_entries.escalated_at      IS 'When the entry was escalated to security team (Sprint UQ-005)';
COMMENT ON COLUMN grey_queue_entries.escalation_reason IS 'Reason for escalation (Sprint UQ-005)';

View File

@@ -0,0 +1,130 @@
-- Migration: Add diff_id column to scanner layers table
-- Sprint: SPRINT_025_Scanner_layer_manifest_infrastructure
-- Task: TASK-025-03
-- Extend scanner.layers with the diffID and the time it was computed.
-- VARCHAR(71) fits "sha256:" (7 chars) followed by 64 hex digits.
ALTER TABLE scanner.layers
    ADD COLUMN IF NOT EXISTS diff_id                 VARCHAR(71),
    ADD COLUMN IF NOT EXISTS diff_id_computed_at_utc TIMESTAMP;

-- Fast lookup by diffID; rows without a diffID are not indexed.
CREATE INDEX IF NOT EXISTS idx_layers_diff_id
    ON scanner.layers (diff_id)
    WHERE diff_id IS NOT NULL;
-- Create image_layers junction table if it doesn't exist.
-- Tracks which layers belong to which images, with the layer's position
-- (layer_index) inside the image.
-- NOTE(review): the primary key is (image_reference, layer_digest), whereas
-- scanner_base_image_layers keys on (image_reference, layer_index); confirm an
-- image can never list the same layer digest at two positions.
CREATE TABLE IF NOT EXISTS scanner.image_layers (
    image_reference VARCHAR(512) NOT NULL,
    layer_digest    VARCHAR(71)  NOT NULL,
    layer_index     INT          NOT NULL,
    -- The column is TIMESTAMP (no time zone) and is named *_utc, so the default
    -- must be converted explicitly: a bare NOW() would store the session's
    -- local wall-clock time whenever the session TimeZone is not UTC.
    created_at_utc  TIMESTAMP    NOT NULL DEFAULT (NOW() AT TIME ZONE 'UTC'),
    PRIMARY KEY (image_reference, layer_digest)
);

-- Reverse lookup: every image that contains a given layer.
CREATE INDEX IF NOT EXISTS idx_image_layers_digest
    ON scanner.image_layers (layer_digest);
-- Cache of resolved diffIDs: maps a layer digest to its diffID, one row per
-- layer digest.
CREATE TABLE IF NOT EXISTS scanner.scanner_diffid_cache (
    layer_digest VARCHAR(71)  PRIMARY KEY,
    diff_id      VARCHAR(71)  NOT NULL,
    created_at   TIMESTAMPTZ  NOT NULL DEFAULT NOW()
);
-- Base image fingerprints for layer reuse detection: a header row per known
-- base image plus its ordered list of layer diffIDs.
CREATE TABLE IF NOT EXISTS scanner.scanner_base_image_fingerprints (
    image_reference VARCHAR(512) PRIMARY KEY,
    layer_count     INTEGER      NOT NULL,
    registered_at   TIMESTAMPTZ  NOT NULL DEFAULT NOW(),
    detection_count BIGINT       NOT NULL DEFAULT 0
);

-- Layers of a fingerprinted base image, keyed by position. Rows are removed
-- together with their parent fingerprint (ON DELETE CASCADE).
CREATE TABLE IF NOT EXISTS scanner.scanner_base_image_layers (
    image_reference VARCHAR(512) NOT NULL
        REFERENCES scanner.scanner_base_image_fingerprints(image_reference)
        ON DELETE CASCADE,
    layer_index     INTEGER      NOT NULL,
    diff_id         VARCHAR(71)  NOT NULL,
    PRIMARY KEY (image_reference, layer_index)
);

-- Find the base images that contain a given diffID.
CREATE INDEX IF NOT EXISTS idx_base_image_layers_diff_id
    ON scanner.scanner_base_image_layers (diff_id);
-- Manifest snapshots for IOciManifestSnapshotService. At most one snapshot is
-- kept per manifest digest (UNIQUE); id has no default, so callers supply it.
CREATE TABLE IF NOT EXISTS scanner.manifest_snapshots (
    id               UUID         PRIMARY KEY,
    image_reference  VARCHAR(512) NOT NULL,
    registry         VARCHAR(256) NOT NULL,
    repository       VARCHAR(256) NOT NULL,
    tag              VARCHAR(128),
    manifest_digest  VARCHAR(71)  NOT NULL,
    config_digest    VARCHAR(71)  NOT NULL,
    media_type       VARCHAR(128) NOT NULL,
    layers           JSONB        NOT NULL,
    diff_ids         JSONB        NOT NULL,
    platform         JSONB,
    total_size       BIGINT       NOT NULL,
    captured_at      TIMESTAMPTZ  NOT NULL,
    snapshot_version VARCHAR(32),
    UNIQUE (manifest_digest)
);

-- Lookup by full image reference.
CREATE INDEX IF NOT EXISTS idx_manifest_snapshots_image_ref
    ON scanner.manifest_snapshots (image_reference);

-- Lookup by registry + repository.
CREATE INDEX IF NOT EXISTS idx_manifest_snapshots_repository
    ON scanner.manifest_snapshots (registry, repository);

-- Most recently captured snapshots first.
CREATE INDEX IF NOT EXISTS idx_manifest_snapshots_captured_at
    ON scanner.manifest_snapshots (captured_at DESC);
-- Layer scan history for reuse detection (TASK-025-04); keyed by diffID, so
-- there is a single row per layer.
CREATE TABLE IF NOT EXISTS scanner.layer_scans (
    diff_id         VARCHAR(71)  PRIMARY KEY,
    scanned_at      TIMESTAMPTZ  NOT NULL,
    finding_count   INTEGER,
    scanned_by      VARCHAR(128) NOT NULL,
    scanner_version VARCHAR(64)
);

-- Layer reuse statistics; a newly inserted row starts with reuse_count = 1.
CREATE TABLE IF NOT EXISTS scanner.layer_reuse_counts (
    diff_id       VARCHAR(71)  PRIMARY KEY,
    reuse_count   INTEGER      NOT NULL DEFAULT 1,
    first_seen_at TIMESTAMPTZ  NOT NULL DEFAULT NOW()
);

-- Most reused layers first.
CREATE INDEX IF NOT EXISTS idx_layer_reuse_counts_count
    ON scanner.layer_reuse_counts (reuse_count DESC);
-- Catalog documentation for the layer-manifest objects created above.
COMMENT ON COLUMN scanner.layers.diff_id IS
    'Uncompressed layer content hash (sha256:hex64). Immutable once computed.';
COMMENT ON TABLE scanner.scanner_diffid_cache IS
    'Cache of layer digest to diffID mappings. Layer digests are immutable so cache entries never expire.';
COMMENT ON TABLE scanner.scanner_base_image_fingerprints IS
    'Known base image fingerprints for layer reuse detection.';
COMMENT ON TABLE scanner.manifest_snapshots IS
    'Point-in-time captures of OCI image manifests for delta scanning.';
COMMENT ON TABLE scanner.layer_scans IS
    'History of layer scans for deduplication. One entry per diffID.';
COMMENT ON TABLE scanner.layer_reuse_counts IS
    'Counts of how many times each layer appears across images.';
-- Layer SBOM CAS for per-layer SBOM storage (TASK-026-02).
-- One row per (diff_id, format) pair.
-- NOTE(review): the content column comment says the payload is gzip-compressed
-- while the compressed flag can be FALSE — confirm which one is authoritative.
CREATE TABLE IF NOT EXISTS scanner.layer_sbom_cas (
    diff_id          VARCHAR(71)  NOT NULL,
    format           VARCHAR(20)  NOT NULL,
    content          BYTEA        NOT NULL,
    size_bytes       BIGINT       NOT NULL,
    compressed       BOOLEAN      NOT NULL DEFAULT TRUE,
    created_at       TIMESTAMPTZ  NOT NULL DEFAULT NOW(),
    last_accessed_at TIMESTAMPTZ  NOT NULL DEFAULT NOW(),
    PRIMARY KEY (diff_id, format)
);

-- Supports scans ordered or filtered by last access time.
CREATE INDEX IF NOT EXISTS idx_layer_sbom_cas_last_accessed
    ON scanner.layer_sbom_cas (last_accessed_at);

-- Supports filtering by SBOM format.
CREATE INDEX IF NOT EXISTS idx_layer_sbom_cas_format
    ON scanner.layer_sbom_cas (format);

COMMENT ON TABLE scanner.layer_sbom_cas IS
    'Content-addressable storage for per-layer SBOMs. Keyed by diffID (immutable).';
COMMENT ON COLUMN scanner.layer_sbom_cas.content IS
    'Compressed (gzip) SBOM content.';
COMMENT ON COLUMN scanner.layer_sbom_cas.last_accessed_at IS
    'For TTL-based eviction of cold entries.';

View File

@@ -0,0 +1,246 @@
# Tetragon Agent DaemonSet for Stella Ops
# Sprint: SPRINT_20260118_019_Infra_tetragon_integration
# Task: TASK-019-007 - Create Kubernetes deployment extending existing manifests
#
# Deploys the Stella Ops Tetragon agent alongside the existing agent framework.
# Follows existing DaemonSet patterns from devops/helm/
# DaemonSet running the tetragon-agent container in the stella-ops namespace.
# Pods are selected by the app.kubernetes.io/name label and rolled one at a
# time (maxUnavailable: 1).
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: stella-ops-tetragon-agent
namespace: stella-ops
labels:
app.kubernetes.io/name: stella-ops-tetragon-agent
app.kubernetes.io/component: runtime-instrumentation
app.kubernetes.io/part-of: stella-ops
spec:
selector:
matchLabels:
app.kubernetes.io/name: stella-ops-tetragon-agent
updateStrategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 1
template:
metadata:
labels:
app.kubernetes.io/name: stella-ops-tetragon-agent
app.kubernetes.io/component: runtime-instrumentation
# Annotation-based Prometheus discovery pointing at the metrics port (8080).
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
spec:
serviceAccountName: stella-ops-tetragon-agent
# The pod shares the host PID namespace but keeps its own network namespace.
hostPID: true
hostNetwork: false
# Tolerate the master / control-plane NoSchedule taints.
tolerations:
- key: node-role.kubernetes.io/master
effect: NoSchedule
- key: node-role.kubernetes.io/control-plane
effect: NoSchedule
containers:
- name: tetragon-agent
# NOTE(review): the mutable "latest" tag is combined with IfNotPresent, so a
# node that already holds a cached image will not pull a newer build —
# consider pinning a version or digest.
image: stellaops/tetragon-agent:latest
imagePullPolicy: IfNotPresent
# NOTE(review): privileged: true is set together with an explicit capability
# list — confirm whether both are required.
securityContext:
privileged: true
capabilities:
add:
- SYS_ADMIN
- NET_ADMIN
- BPF
- PERFMON
# Named ports; the Service in this file targets them by name.
ports:
- name: metrics
containerPort: 8080
protocol: TCP
- name: health
containerPort: 8081
protocol: TCP
env:
# Node and pod identity injected through the downward API.
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: STELLA_API_URL
valueFrom:
configMapKeyRef:
name: stella-ops-tetragon-config
key: api-url
- name: STELLA_AGENT_ID
valueFrom:
fieldRef:
fieldPath: metadata.name
# NOTE(review): with hostNetwork: false, "localhost" is this pod's own network
# namespace — confirm a Tetragon gRPC endpoint is reachable at this address.
- name: TETRAGON_GRPC_ADDRESS
value: "localhost:54321"
# Optional: the pod still starts when the log-level key is missing.
- name: LOG_LEVEL
valueFrom:
configMapKeyRef:
name: stella-ops-tetragon-config
key: log-level
optional: true
volumeMounts:
- name: tetragon-config
mountPath: /etc/tetragon
readOnly: true
- name: agent-certs
mountPath: /etc/stella-ops/certs
readOnly: true
# The BPF filesystem mount is the only one not marked readOnly.
- name: bpf
mountPath: /sys/fs/bpf
- name: proc
mountPath: /host/proc
readOnly: true
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
# Both probes use the health port (8081).
livenessProbe:
httpGet:
path: /healthz
port: 8081
initialDelaySeconds: 10
periodSeconds: 30
readinessProbe:
httpGet:
path: /ready
port: 8081
initialDelaySeconds: 5
periodSeconds: 10
volumes:
# Backed by the stella-ops-tetragon-policy ConfigMap defined in this file.
- name: tetragon-config
configMap:
name: stella-ops-tetragon-policy
# Optional secret: the pod still starts when the secret does not exist.
- name: agent-certs
secret:
secretName: stella-ops-agent-certs
optional: true
# Host paths: /sys/fs/bpf is created when absent, /proc must already exist.
- name: bpf
hostPath:
path: /sys/fs/bpf
type: DirectoryOrCreate
- name: proc
hostPath:
path: /proc
type: Directory
---
# ServiceAccount used by the DaemonSet (serviceAccountName) and bound to the
# ClusterRole through the ClusterRoleBinding in this file.
apiVersion: v1
kind: ServiceAccount
metadata:
name: stella-ops-tetragon-agent
namespace: stella-ops
labels:
app.kubernetes.io/name: stella-ops-tetragon-agent
---
# Cluster-wide read-only permissions for the agent: every rule grants only
# get/list (plus watch, except on nodes).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: stella-ops-tetragon-agent
labels:
app.kubernetes.io/name: stella-ops-tetragon-agent
rules:
# Read pods for container correlation
- apiGroups: [""]
resources: ["pods", "namespaces"]
verbs: ["get", "list", "watch"]
# Read nodes for host information
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "list"]
# Read Tetragon CRDs
- apiGroups: ["cilium.io"]
resources: ["tracingpolicies", "tracingpoliciesnamespaced"]
verbs: ["get", "list", "watch"]
---
# Grants the stella-ops-tetragon-agent ClusterRole to the ServiceAccount of the
# same name in the stella-ops namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: stella-ops-tetragon-agent
labels:
app.kubernetes.io/name: stella-ops-tetragon-agent
subjects:
- kind: ServiceAccount
name: stella-ops-tetragon-agent
namespace: stella-ops
roleRef:
kind: ClusterRole
name: stella-ops-tetragon-agent
apiGroup: rbac.authorization.k8s.io
---
# Agent settings. The DaemonSet in this file reads only api-url
# (STELLA_API_URL) and log-level (LOG_LEVEL, optional) from this ConfigMap.
# NOTE(review): how the remaining keys reach the agent is not shown here.
apiVersion: v1
kind: ConfigMap
metadata:
name: stella-ops-tetragon-config
namespace: stella-ops
labels:
app.kubernetes.io/name: stella-ops-tetragon-agent
data:
api-url: "http://stella-ops-signals.stella-ops.svc.cluster.local:8080"
log-level: "info"
aggregation-window: "60s"
buffer-size: "10000"
min-confidence: "0.5"
# Privacy settings
redact-arguments: "true"
symbol-id-only-mode: "false"
# Allowed namespaces (comma-separated, empty = all)
allowed-namespaces: "stella-ops-workloads,default"
---
# Local policy configuration. The DaemonSet mounts this ConfigMap read-only at
# /etc/tetragon through its tetragon-config volume. Everything under
# policy.yaml is a literal block scalar, i.e. file content rather than
# Kubernetes fields.
apiVersion: v1
kind: ConfigMap
metadata:
name: stella-ops-tetragon-policy
namespace: stella-ops
labels:
app.kubernetes.io/name: stella-ops-tetragon-agent
data:
policy.yaml: |
# Reference the TracingPolicy defined in stella-ops-tracing-policy.yaml
# This ConfigMap can contain additional local policy configurations
policyRef: stella-ops-runtime-capture
enableStackTraces: true
stackTraceSize: 16
filterNamespaces:
- stella-ops-workloads
---
# Headless Service (clusterIP: None) in front of the agent pods. Ports are
# forwarded to the container ports named "metrics" and "health".
apiVersion: v1
kind: Service
metadata:
name: stella-ops-tetragon-agent
namespace: stella-ops
labels:
app.kubernetes.io/name: stella-ops-tetragon-agent
spec:
type: ClusterIP
clusterIP: None # Headless for DaemonSet
ports:
- name: metrics
port: 8080
targetPort: metrics
- name: health
port: 8081
targetPort: health
selector:
app.kubernetes.io/name: stella-ops-tetragon-agent
---
# ServiceMonitor for Prometheus Operator (optional)
# Scrapes the "metrics" port of Services labelled
# app.kubernetes.io/name=stella-ops-tetragon-agent at /metrics every 30s.
# NOTE(review): the DaemonSet pod template also carries prometheus.io/scrape
# annotations — confirm both discovery paths are intended.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: stella-ops-tetragon-agent
namespace: stella-ops
labels:
app.kubernetes.io/name: stella-ops-tetragon-agent
spec:
selector:
matchLabels:
app.kubernetes.io/name: stella-ops-tetragon-agent
endpoints:
- port: metrics
interval: 30s
path: /metrics

View File

@@ -0,0 +1,125 @@
# Tetragon TracingPolicy for Stella Ops Runtime Instrumentation
# Sprint: SPRINT_20260118_019_Infra_tetragon_integration
# Task: TASK-019-001 - Define Tetragon TracingPolicy for stack capture
#
# This policy captures process execution, syscalls, and stack traces for
# runtime reachability validation. Integrates with existing Signals infrastructure.
# Hooks sys_execve, sys_openat and sys_connect via kprobes and
# sched:sched_process_exec via a tracepoint.
# NOTE(review): Tetragon documents TracingPolicy as cluster-wide and
# TracingPolicyNamespaced as the namespaced kind — confirm metadata.namespace
# is intended on this resource.
apiVersion: cilium.io/v1alpha1
kind: TracingPolicy
metadata:
name: stella-ops-runtime-capture
namespace: stella-ops
labels:
app.kubernetes.io/name: stella-ops
app.kubernetes.io/component: runtime-instrumentation
spec:
# Process execution events
kprobes:
- call: "sys_execve"
syscall: true
return: false
# NOTE(review): execve's second argument is a pointer array (argv) — confirm
# that "string" is a valid type for index 1.
args:
- index: 0
type: "string" # filename
- index: 1
type: "string" # argv[0]
# NOTE(review): Tetragon's matchNamespaces selector is documented as filtering
# on Linux namespaces (Pid, Net, Mnt, ...) with a values list; confirm that a
# Kubernetes namespace name is accepted here (this applies to every selector
# in this policy). Also confirm matchLabels and returnArgAction: Post against
# the CRD schema.
selectors:
- matchNamespaces:
- namespace: stella-ops-workloads
operator: In
matchLabels:
- key: "stella-ops.io/instrumented"
operator: Exists
returnArgAction: Post
# Security-relevant syscalls for reachability validation
- call: "sys_openat"
syscall: true
args:
- index: 0
type: "int" # dirfd
- index: 1
type: "string" # pathname
- index: 2
type: "int" # flags
# Two selector entries: one on namespace, one on pathname prefixes under
# /etc/, /proc/ and /sys/.
selectors:
- matchNamespaces:
- namespace: stella-ops-workloads
operator: In
- matchArgs:
- index: 1
operator: "Prefix"
values:
- "/etc/"
- "/proc/"
- "/sys/"
# NOTE(review): returnArg is declared but this hook does not set
# "return: true" — confirm the return value is actually captured.
returnArg:
index: 0
type: "int"
- call: "sys_connect"
syscall: true
args:
- index: 0
type: "int" # sockfd
- index: 1
type: "sock" # addr struct
selectors:
- matchNamespaces:
- namespace: stella-ops-workloads
operator: In
# NOTE(review): same as sys_openat — returnArg without "return: true".
returnArg:
index: 0
type: "int"
# Tracepoints for additional coverage
tracepoints:
- subsystem: "sched"
event: "sched_process_exec"
args:
- index: 0
type: "string" # filename
selectors:
- matchNamespaces:
- namespace: stella-ops-workloads
operator: In
# Stack trace configuration
# NOTE(review): confirm these keys against the TracingPolicy CRD; upstream
# documents spec.options as a list of name/value entries and stack traces as
# flags on a Post action.
options:
# Enable kernel + userspace stack traces
stackTraces: true
# Capture both kernel and user stacks
stackTraceSize: 16
# Symbol resolution for userspace
symbols: true
---
# Companion TracingPolicy for library loading
# Attaches uprobes to the glibc and musl dynamic loaders.
# NOTE(review): both loader paths are x86_64-specific; other architectures or
# distro layouts would need their own entries. The matchNamespaces value is a
# Kubernetes namespace name — confirm the selector accepts that.
apiVersion: cilium.io/v1alpha1
kind: TracingPolicy
metadata:
name: stella-ops-library-capture
namespace: stella-ops
spec:
# Capture dynamic library loading
uprobes:
- path: "/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2"
symbols:
- "_dl_map_object"
args:
- index: 0
type: "string" # library name
selectors:
- matchNamespaces:
- namespace: stella-ops-workloads
operator: In
# Alternative for musl-based containers
# No args are declared for this probe, so only the call itself is recorded.
- path: "/lib/ld-musl-x86_64.so.1"
symbols:
- "__dls3"
selectors:
- matchNamespaces:
- namespace: stella-ops-workloads
operator: In

View File

@@ -0,0 +1,361 @@
{
"__inputs": [],
"annotations": {
"list": []
},
"description": "Unknowns Queue monitoring dashboard - Sprint SPRINT_20260118_018_Unknowns_queue_enhancement (UQ-007)",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"title": "Queue Overview",
"type": "row",
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"collapsed": false
},
{
"title": "Total Queue Depth",
"type": "stat",
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 },
"targets": [
{
"expr": "sum(unknowns_queue_depth_hot + unknowns_queue_depth_warm + unknowns_queue_depth_cold)",
"legendFormat": "Total"
}
],
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "green" },
{ "value": 50, "color": "yellow" },
{ "value": 100, "color": "red" }
]
}
}
}
},
{
"title": "HOT Unknowns",
"type": "stat",
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 },
"targets": [
{
"expr": "unknowns_queue_depth_hot",
"legendFormat": "HOT"
}
],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "green" },
{ "value": 1, "color": "orange" },
{ "value": 5, "color": "red" }
]
}
}
}
},
{
"title": "WARM Unknowns",
"type": "stat",
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 },
"targets": [
{
"expr": "unknowns_queue_depth_warm",
"legendFormat": "WARM"
}
],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "green" },
{ "value": 10, "color": "yellow" },
{ "value": 25, "color": "orange" }
]
}
}
}
},
{
"title": "COLD Unknowns",
"type": "stat",
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 },
"targets": [
{
"expr": "unknowns_queue_depth_cold",
"legendFormat": "COLD"
}
]
},
{
"title": "SLA Compliance",
"type": "gauge",
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 },
"targets": [
{
"expr": "unknowns_sla_compliance * 100",
"legendFormat": "Compliance %"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "red" },
{ "value": 80, "color": "yellow" },
{ "value": 95, "color": "green" }
]
}
}
}
},
{
"title": "Stuck Processing",
"type": "stat",
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 },
"targets": [
{
"expr": "greyqueue_processing_count",
"legendFormat": "Processing"
}
],
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "green" },
{ "value": 5, "color": "yellow" },
{ "value": 10, "color": "red" }
]
}
}
}
},
{
"title": "Queue Depth Over Time",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 },
"targets": [
{
"expr": "unknowns_queue_depth_hot",
"legendFormat": "HOT"
},
{
"expr": "unknowns_queue_depth_warm",
"legendFormat": "WARM"
},
{
"expr": "unknowns_queue_depth_cold",
"legendFormat": "COLD"
}
],
"fieldConfig": {
"defaults": {
"custom": {
"lineWidth": 2,
"fillOpacity": 20
}
},
"overrides": [
{ "matcher": { "id": "byName", "options": "HOT" }, "properties": [{ "id": "color", "value": { "fixedColor": "red" } }] },
{ "matcher": { "id": "byName", "options": "WARM" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange" } }] },
{ "matcher": { "id": "byName", "options": "COLD" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue" } }] }
]
}
},
{
"title": "SLA Compliance Over Time",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 },
"targets": [
{
"expr": "unknowns_sla_compliance * 100",
"legendFormat": "SLA Compliance %"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 80, "color": "yellow" },
{ "value": 95, "color": "green" }
]
}
}
}
},
{
"title": "Operations",
"type": "row",
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 },
"collapsed": false
},
{
"title": "State Transitions (Rate)",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 },
"targets": [
{
"expr": "rate(unknowns_state_transitions_total[5m])",
"legendFormat": "{{from_state}} → {{to_state}}"
}
]
},
{
"title": "Processing Time (p95)",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 },
"targets": [
{
"expr": "histogram_quantile(0.95, rate(unknowns_processing_time_seconds_bucket[5m]))",
"legendFormat": "p95 Processing Time"
}
],
"fieldConfig": {
"defaults": {
"unit": "s"
}
}
},
{
"title": "Escalations & Failures",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 },
"targets": [
{
"expr": "rate(unknowns_escalated_total[1h])",
"legendFormat": "Escalations"
},
{
"expr": "rate(unknowns_demoted_total[1h])",
"legendFormat": "Demotions"
},
{
"expr": "rate(unknowns_expired_total[1h])",
"legendFormat": "Expired"
},
{
"expr": "rate(greyqueue_watchdog_failed_total[1h])",
"legendFormat": "Failed"
}
]
},
{
"title": "Resolution Time by Band",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 },
"targets": [
{
"expr": "histogram_quantile(0.50, rate(unknowns_resolution_time_hours_bucket{band=\"hot\"}[1h]))",
"legendFormat": "HOT (p50)"
},
{
"expr": "histogram_quantile(0.50, rate(unknowns_resolution_time_hours_bucket{band=\"warm\"}[1h]))",
"legendFormat": "WARM (p50)"
},
{
"expr": "histogram_quantile(0.50, rate(unknowns_resolution_time_hours_bucket{band=\"cold\"}[1h]))",
"legendFormat": "COLD (p50)"
}
],
"fieldConfig": {
"defaults": {
"unit": "h"
}
}
},
{
"title": "Watchdog Metrics",
"type": "row",
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 },
"collapsed": false
},
{
"title": "Stuck & Timeout Events",
"type": "timeseries",
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 31 },
"targets": [
{
"expr": "rate(greyqueue_stuck_total[1h]) * 3600",
"legendFormat": "Stuck (per hour)"
},
{
"expr": "rate(greyqueue_timeout_total[1h]) * 3600",
"legendFormat": "Timeouts (per hour)"
},
{
"expr": "rate(greyqueue_watchdog_retry_total[1h]) * 3600",
"legendFormat": "Forced Retries (per hour)"
}
]
},
{
"title": "Currently Processing",
"type": "stat",
"gridPos": { "h": 6, "w": 6, "x": 12, "y": 31 },
"targets": [
{
"expr": "greyqueue_processing_count",
"legendFormat": "In Processing"
}
]
},
{
"title": "SLA Breaches Today",
"type": "stat",
"gridPos": { "h": 6, "w": 6, "x": 18, "y": 31 },
"targets": [
{
"expr": "increase(unknowns_sla_breach_total[24h])",
"legendFormat": "Breaches (24h)"
}
],
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "green" },
{ "value": 1, "color": "red" }
]
}
}
}
}
],
"refresh": "30s",
"schemaVersion": 38,
"style": "dark",
"tags": ["unknowns", "security", "sla"],
"templating": {
"list": []
},
"time": {
"from": "now-6h",
"to": "now"
},
"title": "Unknowns Queue Dashboard",
"uid": "unknowns-queue-dashboard",
"version": 1
}

View File

@@ -0,0 +1,186 @@
# Unknowns Queue Alert Rules
# Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement (UQ-007)
#
# Deploy to Prometheus/Alertmanager
# Single rule group, evaluated every minute.
groups:
- name: unknowns-queue
interval: 1m
rules:
# =============================================================================
# SLA Alerts
# =============================================================================
# Critical once the compliance ratio has stayed below 0.80 for 5 minutes.
- alert: UnknownsSlaBreachCritical
expr: unknowns_sla_compliance < 0.80
for: 5m
labels:
severity: critical
team: security
annotations:
summary: "SLA compliance dropped below 80%"
description: |
SLA compliance is {{ $value | humanizePercentage }}.
Multiple unknowns have breached their SLA deadlines.
Immediate action required.
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-breach"
# Warning band [0.80, 0.95) held for 15 minutes; the lower bound keeps this
# rule from firing alongside the critical rule above.
- alert: UnknownsSlaBreachWarning
expr: unknowns_sla_compliance < 0.95 and unknowns_sla_compliance >= 0.80
for: 15m
labels:
severity: warning
team: security
annotations:
summary: "SLA compliance below 95%"
description: |
SLA compliance is {{ $value | humanizePercentage }}.
Some unknowns are approaching or have breached SLA.
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-warning"
# Fires immediately (for: 0m) when the breach counter grew during the last hour.
# NOTE(review): increase() extrapolates, so {{ $value }} may not be a whole number.
- alert: UnknownsSlaBreach
expr: increase(unknowns_sla_breach_total[1h]) > 0
for: 0m
labels:
severity: critical
team: security
annotations:
summary: "Unknown SLA breached"
description: |
{{ $value }} unknown(s) have breached SLA in the last hour.
Check the unknowns queue dashboard for affected entries.
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-breach"
# =============================================================================
# Queue Depth Alerts
# =============================================================================
# Critical when more than 5 HOT unknowns have been queued for 10 minutes.
- alert: UnknownsHotQueueHigh
expr: unknowns_queue_depth_hot > 5
for: 10m
labels:
severity: critical
team: security
annotations:
summary: "High number of HOT unknowns"
description: |
{{ $value }} HOT unknowns in queue.
HOT unknowns have 24-hour SLA and block releases.
Prioritize resolution immediately.
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#hot-queue"
# Warning when the HOT queue has been non-empty for a full hour. The
# description states the elapsed time that the rule actually measures (1h of
# the 24-hour HOT SLA), not a percentage the expression cannot establish.
- alert: UnknownsHotQueuePresent
expr: unknowns_queue_depth_hot > 0
for: 1h
labels:
severity: warning
team: security
annotations:
summary: "HOT unknowns present for over 1 hour"
description: |
{{ $value }} HOT unknown(s) have been in queue for over 1 hour.
More than 1 hour of the 24-hour SLA has elapsed.
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#hot-queue"
# Warning when the combined HOT + WARM + COLD depth has exceeded 100 for
# 30 minutes.
- alert: UnknownsQueueBacklog
expr: (unknowns_queue_depth_hot + unknowns_queue_depth_warm + unknowns_queue_depth_cold) > 100
for: 30m
labels:
severity: warning
team: operations
annotations:
summary: "Unknowns queue backlog growing"
description: |
Total queue depth is {{ $value }}.
Consider scaling processing capacity or reviewing automation.
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#backlog"
# =============================================================================
# Processing Alerts
# =============================================================================
# Warning when more than 10 entries have been counted as processing for
# 30 minutes.
# NOTE(review): the gauge counts entries currently processing; it does not by
# itself show that the same entries are stuck.
- alert: UnknownsStuckProcessing
expr: greyqueue_processing_count > 10
for: 30m
labels:
severity: warning
team: operations
annotations:
summary: "Many entries stuck in processing"
description: |
{{ $value }} entries in Processing status for extended period.
Check for processing bottlenecks or failures.
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#stuck-processing"
# Fires immediately when more than 5 timeouts were counted in the last hour.
- alert: UnknownsProcessingTimeout
expr: increase(greyqueue_timeout_total[1h]) > 5
for: 0m
labels:
severity: warning
team: operations
annotations:
summary: "Processing timeouts occurring"
description: |
{{ $value }} processing timeouts in the last hour.
Entries are being forcefully retried.
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#timeouts"
# Fires immediately on any increase of the watchdog failure counter in the
# last hour.
- alert: UnknownsProcessingFailures
expr: increase(greyqueue_watchdog_failed_total[1h]) > 0
for: 0m
labels:
severity: critical
team: operations
annotations:
summary: "Processing failures detected"
description: |
{{ $value }} entries moved to Failed status in the last hour.
Manual intervention may be required.
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#failures"
# =============================================================================
# Escalation Alerts
# =============================================================================
# Fires immediately when more than 10 escalations were counted in the last hour.
- alert: UnknownsEscalationRate
expr: increase(unknowns_escalated_total[1h]) > 10
for: 0m
labels:
severity: warning
team: security
annotations:
summary: "High escalation rate"
description: |
{{ $value }} unknowns escalated in the last hour.
Review escalation criteria or upstream data quality.
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#escalations"
# =============================================================================
# Service Health Alerts
# =============================================================================
# Critical when both the HOT and the WARM depth series have been absent for
# 5 minutes; a single missing series does not trigger this rule.
- alert: UnknownsSlaMonitorDown
expr: absent(unknowns_queue_depth_hot) and absent(unknowns_queue_depth_warm)
for: 5m
labels:
severity: critical
team: operations
annotations:
summary: "Unknowns SLA monitor not reporting"
description: |
No metrics received from unknowns SLA monitor.
Check if the service is running.
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#service-down"
# Critical when the unknowns-healthcheck probe has reported failure for
# 5 minutes.
# NOTE(review): presumably a blackbox-style probe job named
# "unknowns-healthcheck" exists in the scrape config — verify.
- alert: UnknownsHealthCheckUnhealthy
expr: probe_success{job="unknowns-healthcheck"} == 0
for: 5m
labels:
severity: critical
team: operations
annotations:
summary: "Unknowns service health check failing"
description: |
Health check endpoint returning unhealthy.
SLA breaches may exist.
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#health-check"