Doctor enhancements, setup enhancements, UI functionality and design consolidation, test project fixes, product advisory attestation/Rekor and delta verification enhancements
This commit is contained in:
@@ -0,0 +1,139 @@
|
||||
-- -----------------------------------------------------------------------------
|
||||
-- V20260119_001__Add_UnderReview_Escalated_Rejected_States.sql
|
||||
-- Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement
|
||||
-- Task: UQ-005 - Migration for existing entries (map to new states)
|
||||
-- Description: Adds new state machine states and required columns
|
||||
-- -----------------------------------------------------------------------------
|
||||
|
||||
-- New columns backing the UnderReview and Escalated states.
-- None of them carries a NOT NULL constraint, so existing rows need no backfill.
-- IF NOT EXISTS keeps the statement safe to re-run.
ALTER TABLE grey_queue_entries
    ADD COLUMN IF NOT EXISTS assignee          VARCHAR(255),
    ADD COLUMN IF NOT EXISTS assigned_at       TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS escalated_at      TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS escalation_reason TEXT;
|
||||
|
||||
-- Add the new labels to the grey_queue_status enum.
--
-- ADD VALUE IF NOT EXISTS is idempotent on its own, so no pg_enum lookup is
-- needed. The statements are deliberately NOT wrapped in a DO block with a
-- catch-all "EXCEPTION WHEN others" handler: that would also hide real failures
-- (missing enum type, missing neighbour label such as 'retrying', insufficient
-- privilege) and let the migration continue without the new states.
--
-- PostgreSQL caveats:
--   * Before PostgreSQL 12, ADD VALUE cannot run inside a transaction block.
--   * From PostgreSQL 12 on it can, but the new labels cannot be used until the
--     transaction that added them has committed. Later statements in this
--     script (a partial index and the summary view) reference these labels.
--     NOTE(review): confirm the migration runner does not execute this script
--     in a single transaction, or move the enum change into its own migration.
ALTER TYPE grey_queue_status ADD VALUE IF NOT EXISTS 'under_review' AFTER 'retrying';
ALTER TYPE grey_queue_status ADD VALUE IF NOT EXISTS 'escalated' AFTER 'under_review';
ALTER TYPE grey_queue_status ADD VALUE IF NOT EXISTS 'rejected' AFTER 'resolved';
|
||||
|
||||
-- Partial indexes for the new query patterns. Each one only covers the rows
-- matched by its WHERE predicate.

-- Entries that currently have an assignee.
CREATE INDEX IF NOT EXISTS idx_grey_queue_assignee
    ON grey_queue_entries USING btree (assignee)
    WHERE assignee IS NOT NULL;

-- Entries in the under_review or escalated state, by (status, assignee).
CREATE INDEX IF NOT EXISTS idx_grey_queue_status_assignee
    ON grey_queue_entries USING btree (status, assignee)
    WHERE status IN ('under_review', 'escalated');

-- Entries with an escalation timestamp, newest first.
CREATE INDEX IF NOT EXISTS idx_grey_queue_escalated_at
    ON grey_queue_entries USING btree (escalated_at DESC)
    WHERE escalated_at IS NOT NULL;
|
||||
|
||||
-- Audit table for state transitions: one row per recorded status change.
CREATE TABLE IF NOT EXISTS grey_queue_state_transitions (
    id              UUID         DEFAULT gen_random_uuid() PRIMARY KEY,
    -- No ON DELETE action is given, so the default (NO ACTION) applies.
    entry_id        UUID         NOT NULL REFERENCES grey_queue_entries (id),
    tenant_id       VARCHAR(128) NOT NULL,
    from_state      VARCHAR(32)  NOT NULL,
    to_state        VARCHAR(32)  NOT NULL,
    transitioned_by VARCHAR(255),
    reason          TEXT,
    transitioned_at TIMESTAMPTZ  DEFAULT NOW() NOT NULL,
    metadata        JSONB
);

-- Lookup of all transitions for a single entry.
CREATE INDEX IF NOT EXISTS idx_grey_queue_transitions_entry
    ON grey_queue_state_transitions USING btree (entry_id);

-- Per-tenant history, newest first.
CREATE INDEX IF NOT EXISTS idx_grey_queue_transitions_tenant_time
    ON grey_queue_state_transitions USING btree (tenant_id, transitioned_at DESC);
|
||||
|
||||
-- Trigger function: writes one audit row into grey_queue_state_transitions
-- whenever a row's status differs between OLD and NEW. Always returns NEW.
CREATE OR REPLACE FUNCTION record_grey_queue_transition()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $$
BEGIN
    -- Nothing to record when the status is unchanged (NULL-safe comparison).
    IF OLD.status IS NOT DISTINCT FROM NEW.status THEN
        RETURN NEW;
    END IF;

    INSERT INTO grey_queue_state_transitions
        (entry_id, tenant_id, from_state, to_state, transitioned_by, transitioned_at)
    VALUES (
        NEW.id,
        NEW.tenant_id,
        OLD.status::text,
        NEW.status::text,
        -- Uses the row's assignee when set, otherwise the current database user.
        COALESCE(NEW.assignee, current_user),
        NOW()
    );

    RETURN NEW;
END;
$$;
|
||||
|
||||
-- (Re)create the trigger. CREATE TRIGGER has no IF NOT EXISTS form, so any
-- existing trigger is dropped first to keep the migration re-runnable.
DROP TRIGGER IF EXISTS trg_grey_queue_state_transition ON grey_queue_entries;

-- The WHEN clause repeats the status check made inside
-- record_grey_queue_transition(), so updates that leave status untouched never
-- invoke the function. The audit rows written are the same as without it.
CREATE TRIGGER trg_grey_queue_state_transition
    AFTER UPDATE ON grey_queue_entries
    FOR EACH ROW
    WHEN (OLD.status IS DISTINCT FROM NEW.status)
    EXECUTE FUNCTION record_grey_queue_transition();
|
||||
|
||||
-- Per-tenant entry counts by status, including the under_review, escalated and
-- rejected states.
-- NOTE(review): CREATE OR REPLACE VIEW only allows new columns to be appended
-- after the existing ones, with existing names and order unchanged. If an
-- earlier grey_queue_summary places resolved_count directly after
-- retrying_count, this statement will be rejected -- confirm against the
-- previous view definition.
CREATE OR REPLACE VIEW grey_queue_summary AS
SELECT
    tenant_id,
    COUNT(*) FILTER (WHERE status = 'pending')      AS pending_count,
    COUNT(*) FILTER (WHERE status = 'processing')   AS processing_count,
    COUNT(*) FILTER (WHERE status = 'retrying')     AS retrying_count,
    COUNT(*) FILTER (WHERE status = 'under_review') AS under_review_count,
    COUNT(*) FILTER (WHERE status = 'escalated')    AS escalated_count,
    COUNT(*) FILTER (WHERE status = 'resolved')     AS resolved_count,
    COUNT(*) FILTER (WHERE status = 'rejected')     AS rejected_count,
    COUNT(*) FILTER (WHERE status = 'failed')       AS failed_count,
    COUNT(*) FILTER (WHERE status = 'expired')      AS expired_count,
    COUNT(*) FILTER (WHERE status = 'dismissed')    AS dismissed_count,
    COUNT(*)                                        AS total_count
FROM grey_queue_entries
GROUP BY tenant_id;
|
||||
|
||||
-- Catalog comments documenting the columns added by this migration.
COMMENT ON COLUMN grey_queue_entries.assignee
    IS 'Assignee for entries in UnderReview state (Sprint UQ-005)';

COMMENT ON COLUMN grey_queue_entries.assigned_at
    IS 'When the entry was assigned for review (Sprint UQ-005)';

COMMENT ON COLUMN grey_queue_entries.escalated_at
    IS 'When the entry was escalated to security team (Sprint UQ-005)';

COMMENT ON COLUMN grey_queue_entries.escalation_reason
    IS 'Reason for escalation (Sprint UQ-005)';
|
||||
130
devops/database/migrations/V20260119__scanner_layer_diffid.sql
Normal file
130
devops/database/migrations/V20260119__scanner_layer_diffid.sql
Normal file
@@ -0,0 +1,130 @@
|
||||
-- Migration: Add diff_id column to scanner layers table
|
||||
-- Sprint: SPRINT_025_Scanner_layer_manifest_infrastructure
|
||||
-- Task: TASK-025-03
|
||||
|
||||
-- Add diff_id ("sha256:" + 64 hex characters = 71 chars) and the time it was
-- computed to scanner.layers. Both columns are nullable.
ALTER TABLE scanner.layers
    ADD COLUMN IF NOT EXISTS diff_id                 VARCHAR(71),
    ADD COLUMN IF NOT EXISTS diff_id_computed_at_utc TIMESTAMP;

-- Fast lookups by diff_id; rows without a diff_id are left out of the index.
CREATE INDEX IF NOT EXISTS idx_layers_diff_id
    ON scanner.layers USING btree (diff_id)
    WHERE diff_id IS NOT NULL;
|
||||
|
||||
-- Junction table recording which layers belong to which images.
-- NOTE(review): the primary key is (image_reference, layer_digest), so one
-- image cannot list the same layer digest at two layer_index positions --
-- confirm this matches how callers insert rows.
CREATE TABLE IF NOT EXISTS scanner.image_layers (
    image_reference VARCHAR(512) NOT NULL,
    layer_digest    VARCHAR(71)  NOT NULL,
    layer_index     INT          NOT NULL,
    -- The column is TIMESTAMP (without time zone). A bare NOW() would be
    -- converted using the session TimeZone setting and store local wall-clock
    -- time, so the default pins the conversion to UTC to match the column name.
    created_at_utc  TIMESTAMP    NOT NULL DEFAULT (NOW() AT TIME ZONE 'UTC'),
    PRIMARY KEY (image_reference, layer_digest)
);

-- Reverse lookup: find the images that contain a given layer.
CREATE INDEX IF NOT EXISTS idx_image_layers_digest
    ON scanner.image_layers (layer_digest);
|
||||
|
||||
-- Cache of resolved diffIDs, keyed by layer digest (one row per digest).
CREATE TABLE IF NOT EXISTS scanner.scanner_diffid_cache (
    layer_digest VARCHAR(71) PRIMARY KEY,
    diff_id      VARCHAR(71) NOT NULL,
    created_at   TIMESTAMPTZ DEFAULT NOW() NOT NULL
);
|
||||
|
||||
-- Base image fingerprints used for layer reuse detection: one header row per
-- image reference plus one row per layer position.
CREATE TABLE IF NOT EXISTS scanner.scanner_base_image_fingerprints (
    image_reference VARCHAR(512) PRIMARY KEY,
    layer_count     INT          NOT NULL,
    registered_at   TIMESTAMPTZ  DEFAULT NOW() NOT NULL,
    detection_count BIGINT       DEFAULT 0 NOT NULL
);

CREATE TABLE IF NOT EXISTS scanner.scanner_base_image_layers (
    -- Layer rows are removed together with their fingerprint row.
    image_reference VARCHAR(512) NOT NULL
        REFERENCES scanner.scanner_base_image_fingerprints (image_reference)
        ON DELETE CASCADE,
    layer_index     INT          NOT NULL,
    diff_id         VARCHAR(71)  NOT NULL,
    PRIMARY KEY (image_reference, layer_index)
);

-- Find base images that contain a given diff_id.
CREATE INDEX IF NOT EXISTS idx_base_image_layers_diff_id
    ON scanner.scanner_base_image_layers USING btree (diff_id);
|
||||
|
||||
-- Manifest snapshots table for IOciManifestSnapshotService.
-- id has no default, so the writer must supply it; manifest_digest is unique.
CREATE TABLE IF NOT EXISTS scanner.manifest_snapshots (
    id               UUID         PRIMARY KEY,
    image_reference  VARCHAR(512) NOT NULL,
    registry         VARCHAR(256) NOT NULL,
    repository       VARCHAR(256) NOT NULL,
    tag              VARCHAR(128),
    manifest_digest  VARCHAR(71)  NOT NULL,
    config_digest    VARCHAR(71)  NOT NULL,
    media_type       VARCHAR(128) NOT NULL,
    layers           JSONB        NOT NULL,
    diff_ids         JSONB        NOT NULL,
    platform         JSONB,
    total_size       BIGINT       NOT NULL,
    captured_at      TIMESTAMPTZ  NOT NULL,
    snapshot_version VARCHAR(32),
    UNIQUE (manifest_digest)
);

-- Lookup by image reference.
CREATE INDEX IF NOT EXISTS idx_manifest_snapshots_image_ref
    ON scanner.manifest_snapshots USING btree (image_reference);

-- Lookup by registry and repository.
CREATE INDEX IF NOT EXISTS idx_manifest_snapshots_repository
    ON scanner.manifest_snapshots USING btree (registry, repository);

-- Most recent captures first.
CREATE INDEX IF NOT EXISTS idx_manifest_snapshots_captured_at
    ON scanner.manifest_snapshots USING btree (captured_at DESC);
|
||||
|
||||
-- Layer scan history for reuse detection (TASK-025-04).
-- diff_id is the primary key, so there is at most one row per diff_id.
CREATE TABLE IF NOT EXISTS scanner.layer_scans (
    diff_id         VARCHAR(71)  PRIMARY KEY,
    scanned_at      TIMESTAMPTZ  NOT NULL,
    finding_count   INT,
    scanned_by      VARCHAR(128) NOT NULL,
    scanner_version VARCHAR(64)
);
|
||||
|
||||
-- Layer reuse counts for statistics; a new row starts with reuse_count = 1.
CREATE TABLE IF NOT EXISTS scanner.layer_reuse_counts (
    diff_id       VARCHAR(71) PRIMARY KEY,
    reuse_count   INT         DEFAULT 1 NOT NULL,
    first_seen_at TIMESTAMPTZ DEFAULT NOW() NOT NULL
);

-- Highest reuse counts first.
CREATE INDEX IF NOT EXISTS idx_layer_reuse_counts_count
    ON scanner.layer_reuse_counts USING btree (reuse_count DESC);
|
||||
|
||||
-- Catalog comments for the scanner layer/diffID objects.
COMMENT ON COLUMN scanner.layers.diff_id
    IS 'Uncompressed layer content hash (sha256:hex64). Immutable once computed.';

COMMENT ON TABLE scanner.scanner_diffid_cache
    IS 'Cache of layer digest to diffID mappings. Layer digests are immutable so cache entries never expire.';

COMMENT ON TABLE scanner.scanner_base_image_fingerprints
    IS 'Known base image fingerprints for layer reuse detection.';

COMMENT ON TABLE scanner.manifest_snapshots
    IS 'Point-in-time captures of OCI image manifests for delta scanning.';

COMMENT ON TABLE scanner.layer_scans
    IS 'History of layer scans for deduplication. One entry per diffID.';

COMMENT ON TABLE scanner.layer_reuse_counts
    IS 'Counts of how many times each layer appears across images.';
|
||||
|
||||
-- Layer SBOM CAS for per-layer SBOM storage (TASK-026-02).
-- One row per (diff_id, format) pair.
CREATE TABLE IF NOT EXISTS scanner.layer_sbom_cas (
    diff_id          VARCHAR(71) NOT NULL,
    format           VARCHAR(20) NOT NULL,
    content          BYTEA       NOT NULL,
    size_bytes       BIGINT      NOT NULL,
    compressed       BOOLEAN     DEFAULT TRUE NOT NULL,
    created_at       TIMESTAMPTZ DEFAULT NOW() NOT NULL,
    last_accessed_at TIMESTAMPTZ DEFAULT NOW() NOT NULL,
    PRIMARY KEY (diff_id, format)
);

-- Supports scans ordered or filtered by last access time.
CREATE INDEX IF NOT EXISTS idx_layer_sbom_cas_last_accessed
    ON scanner.layer_sbom_cas USING btree (last_accessed_at);

-- Supports filtering by SBOM format.
CREATE INDEX IF NOT EXISTS idx_layer_sbom_cas_format
    ON scanner.layer_sbom_cas USING btree (format);

COMMENT ON TABLE scanner.layer_sbom_cas
    IS 'Content-addressable storage for per-layer SBOMs. Keyed by diffID (immutable).';

COMMENT ON COLUMN scanner.layer_sbom_cas.content
    IS 'Compressed (gzip) SBOM content.';

COMMENT ON COLUMN scanner.layer_sbom_cas.last_accessed_at
    IS 'For TTL-based eviction of cold entries.';
|
||||
@@ -0,0 +1,246 @@
|
||||
# Tetragon Agent DaemonSet for Stella Ops
|
||||
# Sprint: SPRINT_20260118_019_Infra_tetragon_integration
|
||||
# Task: TASK-019-007 - Create Kubernetes deployment extending existing manifests
|
||||
#
|
||||
# Deploys the Stella Ops Tetragon agent alongside the existing agent framework.
|
||||
# Follows existing DaemonSet patterns from devops/helm/
|
||||
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: stella-ops-tetragon-agent
|
||||
namespace: stella-ops
|
||||
labels:
|
||||
app.kubernetes.io/name: stella-ops-tetragon-agent
|
||||
app.kubernetes.io/component: runtime-instrumentation
|
||||
app.kubernetes.io/part-of: stella-ops
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: stella-ops-tetragon-agent
|
||||
updateStrategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxUnavailable: 1
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: stella-ops-tetragon-agent
|
||||
app.kubernetes.io/component: runtime-instrumentation
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "8080"
|
||||
prometheus.io/path: "/metrics"
|
||||
spec:
|
||||
serviceAccountName: stella-ops-tetragon-agent
|
||||
hostPID: true
|
||||
hostNetwork: false
|
||||
tolerations:
|
||||
- key: node-role.kubernetes.io/master
|
||||
effect: NoSchedule
|
||||
- key: node-role.kubernetes.io/control-plane
|
||||
effect: NoSchedule
|
||||
containers:
|
||||
- name: tetragon-agent
|
||||
image: stellaops/tetragon-agent:latest
|
||||
imagePullPolicy: IfNotPresent
|
||||
securityContext:
|
||||
privileged: true
|
||||
capabilities:
|
||||
add:
|
||||
- SYS_ADMIN
|
||||
- NET_ADMIN
|
||||
- BPF
|
||||
- PERFMON
|
||||
ports:
|
||||
- name: metrics
|
||||
containerPort: 8080
|
||||
protocol: TCP
|
||||
- name: health
|
||||
containerPort: 8081
|
||||
protocol: TCP
|
||||
env:
|
||||
- name: NODE_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: spec.nodeName
|
||||
- name: STELLA_API_URL
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
name: stella-ops-tetragon-config
|
||||
key: api-url
|
||||
- name: STELLA_AGENT_ID
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.name
|
||||
- name: TETRAGON_GRPC_ADDRESS
|
||||
value: "localhost:54321"
|
||||
- name: LOG_LEVEL
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
name: stella-ops-tetragon-config
|
||||
key: log-level
|
||||
optional: true
|
||||
volumeMounts:
|
||||
- name: tetragon-config
|
||||
mountPath: /etc/tetragon
|
||||
readOnly: true
|
||||
- name: agent-certs
|
||||
mountPath: /etc/stella-ops/certs
|
||||
readOnly: true
|
||||
- name: bpf
|
||||
mountPath: /sys/fs/bpf
|
||||
- name: proc
|
||||
mountPath: /host/proc
|
||||
readOnly: true
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: 8081
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 30
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /ready
|
||||
port: 8081
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
volumes:
|
||||
- name: tetragon-config
|
||||
configMap:
|
||||
name: stella-ops-tetragon-policy
|
||||
- name: agent-certs
|
||||
secret:
|
||||
secretName: stella-ops-agent-certs
|
||||
optional: true
|
||||
- name: bpf
|
||||
hostPath:
|
||||
path: /sys/fs/bpf
|
||||
type: DirectoryOrCreate
|
||||
- name: proc
|
||||
hostPath:
|
||||
path: /proc
|
||||
type: Directory
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: stella-ops-tetragon-agent
|
||||
namespace: stella-ops
|
||||
labels:
|
||||
app.kubernetes.io/name: stella-ops-tetragon-agent
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: stella-ops-tetragon-agent
|
||||
labels:
|
||||
app.kubernetes.io/name: stella-ops-tetragon-agent
|
||||
rules:
|
||||
# Read pods and namespaces for container correlation
|
||||
- apiGroups: [""]
|
||||
resources: ["pods", "namespaces"]
|
||||
verbs: ["get", "list", "watch"]
|
||||
# Read nodes for host information
|
||||
- apiGroups: [""]
|
||||
resources: ["nodes"]
|
||||
verbs: ["get", "list"]
|
||||
# Read Tetragon CRDs
|
||||
- apiGroups: ["cilium.io"]
|
||||
resources: ["tracingpolicies", "tracingpoliciesnamespaced"]
|
||||
verbs: ["get", "list", "watch"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: stella-ops-tetragon-agent
|
||||
labels:
|
||||
app.kubernetes.io/name: stella-ops-tetragon-agent
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: stella-ops-tetragon-agent
|
||||
namespace: stella-ops
|
||||
roleRef:
|
||||
kind: ClusterRole
|
||||
name: stella-ops-tetragon-agent
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: stella-ops-tetragon-config
|
||||
namespace: stella-ops
|
||||
labels:
|
||||
app.kubernetes.io/name: stella-ops-tetragon-agent
|
||||
data:
|
||||
api-url: "http://stella-ops-signals.stella-ops.svc.cluster.local:8080"
|
||||
log-level: "info"
|
||||
aggregation-window: "60s"
|
||||
buffer-size: "10000"
|
||||
min-confidence: "0.5"
|
||||
# Privacy settings
|
||||
redact-arguments: "true"
|
||||
symbol-id-only-mode: "false"
|
||||
# Allowed namespaces (comma-separated, empty = all)
|
||||
allowed-namespaces: "stella-ops-workloads,default"
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: stella-ops-tetragon-policy
|
||||
namespace: stella-ops
|
||||
labels:
|
||||
app.kubernetes.io/name: stella-ops-tetragon-agent
|
||||
data:
|
||||
policy.yaml: |
|
||||
# Reference the TracingPolicy defined in stella-ops-tracing-policy.yaml
|
||||
# This ConfigMap can contain additional local policy configurations
|
||||
policyRef: stella-ops-runtime-capture
|
||||
enableStackTraces: true
|
||||
stackTraceSize: 16
|
||||
filterNamespaces:
|
||||
- stella-ops-workloads
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: stella-ops-tetragon-agent
|
||||
namespace: stella-ops
|
||||
labels:
|
||||
app.kubernetes.io/name: stella-ops-tetragon-agent
|
||||
spec:
|
||||
type: ClusterIP
|
||||
clusterIP: None # Headless for DaemonSet
|
||||
ports:
|
||||
- name: metrics
|
||||
port: 8080
|
||||
targetPort: metrics
|
||||
- name: health
|
||||
port: 8081
|
||||
targetPort: health
|
||||
selector:
|
||||
app.kubernetes.io/name: stella-ops-tetragon-agent
|
||||
---
|
||||
# ServiceMonitor for Prometheus Operator (optional)
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: stella-ops-tetragon-agent
|
||||
namespace: stella-ops
|
||||
labels:
|
||||
app.kubernetes.io/name: stella-ops-tetragon-agent
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: stella-ops-tetragon-agent
|
||||
endpoints:
|
||||
- port: metrics
|
||||
interval: 30s
|
||||
path: /metrics
|
||||
125
devops/manifests/tetragon/stella-ops-tracing-policy.yaml
Normal file
125
devops/manifests/tetragon/stella-ops-tracing-policy.yaml
Normal file
@@ -0,0 +1,125 @@
|
||||
# Tetragon TracingPolicy for Stella Ops Runtime Instrumentation
|
||||
# Sprint: SPRINT_20260118_019_Infra_tetragon_integration
|
||||
# Task: TASK-019-001 - Define Tetragon TracingPolicy for stack capture
|
||||
#
|
||||
# This policy captures process execution, syscalls, and stack traces for
|
||||
# runtime reachability validation. Integrates with existing Signals infrastructure.
|
||||
|
||||
apiVersion: cilium.io/v1alpha1
|
||||
kind: TracingPolicy
|
||||
metadata:
|
||||
name: stella-ops-runtime-capture
|
||||
namespace: stella-ops
|
||||
labels:
|
||||
app.kubernetes.io/name: stella-ops
|
||||
app.kubernetes.io/component: runtime-instrumentation
|
||||
spec:
|
||||
# Process execution events
|
||||
kprobes:
|
||||
- call: "sys_execve"
|
||||
syscall: true
|
||||
return: false
|
||||
args:
|
||||
- index: 0
|
||||
type: "string" # filename
|
||||
- index: 1
|
||||
type: "string" # argv[0]
|
||||
selectors:
|
||||
- matchNamespaces:
|
||||
- namespace: stella-ops-workloads
|
||||
operator: In
|
||||
matchLabels:
|
||||
- key: "stella-ops.io/instrumented"
|
||||
operator: Exists
|
||||
returnArgAction: Post
|
||||
|
||||
# Security-relevant syscalls for reachability validation
|
||||
- call: "sys_openat"
|
||||
syscall: true
|
||||
args:
|
||||
- index: 0
|
||||
type: "int" # dirfd
|
||||
- index: 1
|
||||
type: "string" # pathname
|
||||
- index: 2
|
||||
type: "int" # flags
|
||||
selectors:
|
||||
- matchNamespaces:
|
||||
- namespace: stella-ops-workloads
|
||||
operator: In
|
||||
- matchArgs:
|
||||
- index: 1
|
||||
operator: "Prefix"
|
||||
values:
|
||||
- "/etc/"
|
||||
- "/proc/"
|
||||
- "/sys/"
|
||||
returnArg:
|
||||
index: 0
|
||||
type: "int"
|
||||
|
||||
- call: "sys_connect"
|
||||
syscall: true
|
||||
args:
|
||||
- index: 0
|
||||
type: "int" # sockfd
|
||||
- index: 1
|
||||
type: "sock" # addr struct
|
||||
selectors:
|
||||
- matchNamespaces:
|
||||
- namespace: stella-ops-workloads
|
||||
operator: In
|
||||
returnArg:
|
||||
index: 0
|
||||
type: "int"
|
||||
|
||||
# Tracepoints for additional coverage
|
||||
tracepoints:
|
||||
- subsystem: "sched"
|
||||
event: "sched_process_exec"
|
||||
args:
|
||||
- index: 0
|
||||
type: "string" # filename
|
||||
selectors:
|
||||
- matchNamespaces:
|
||||
- namespace: stella-ops-workloads
|
||||
operator: In
|
||||
|
||||
# Stack trace configuration
|
||||
options:
|
||||
# Enable kernel + userspace stack traces
|
||||
stackTraces: true
|
||||
# Capture both kernel and user stacks
|
||||
stackTraceSize: 16
|
||||
# Symbol resolution for userspace
|
||||
symbols: true
|
||||
|
||||
---
|
||||
# Companion TracingPolicy for library loading
|
||||
apiVersion: cilium.io/v1alpha1
|
||||
kind: TracingPolicy
|
||||
metadata:
|
||||
name: stella-ops-library-capture
|
||||
namespace: stella-ops
|
||||
spec:
|
||||
# Capture dynamic library loading
|
||||
uprobes:
|
||||
- path: "/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2"
|
||||
symbols:
|
||||
- "_dl_map_object"
|
||||
args:
|
||||
- index: 0
|
||||
type: "string" # library name
|
||||
selectors:
|
||||
- matchNamespaces:
|
||||
- namespace: stella-ops-workloads
|
||||
operator: In
|
||||
|
||||
# Alternative for musl-based containers
|
||||
- path: "/lib/ld-musl-x86_64.so.1"
|
||||
symbols:
|
||||
- "__dls3"
|
||||
selectors:
|
||||
- matchNamespaces:
|
||||
- namespace: stella-ops-workloads
|
||||
operator: In
|
||||
@@ -0,0 +1,361 @@
|
||||
{
|
||||
"__inputs": [],
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"description": "Unknowns Queue monitoring dashboard - Sprint SPRINT_20260118_018_Unknowns_queue_enhancement (UQ-007)",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"title": "Queue Overview",
|
||||
"type": "row",
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"title": "Total Queue Depth",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(unknowns_queue_depth_hot + unknowns_queue_depth_warm + unknowns_queue_depth_cold)",
|
||||
"legendFormat": "Total"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "value": 0, "color": "green" },
|
||||
{ "value": 50, "color": "yellow" },
|
||||
{ "value": 100, "color": "red" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "HOT Unknowns",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "unknowns_queue_depth_hot",
|
||||
"legendFormat": "HOT"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "value": 0, "color": "green" },
|
||||
{ "value": 1, "color": "orange" },
|
||||
{ "value": 5, "color": "red" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "WARM Unknowns",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "unknowns_queue_depth_warm",
|
||||
"legendFormat": "WARM"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "value": 0, "color": "green" },
|
||||
{ "value": 10, "color": "yellow" },
|
||||
{ "value": 25, "color": "orange" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "COLD Unknowns",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "unknowns_queue_depth_cold",
|
||||
"legendFormat": "COLD"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "SLA Compliance",
|
||||
"type": "gauge",
|
||||
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "unknowns_sla_compliance * 100",
|
||||
"legendFormat": "Compliance %"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "value": 0, "color": "red" },
|
||||
{ "value": 80, "color": "yellow" },
|
||||
{ "value": 95, "color": "green" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Stuck Processing",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "greyqueue_processing_count",
|
||||
"legendFormat": "Processing"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "value": 0, "color": "green" },
|
||||
{ "value": 5, "color": "yellow" },
|
||||
{ "value": 10, "color": "red" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Queue Depth Over Time",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "unknowns_queue_depth_hot",
|
||||
"legendFormat": "HOT"
|
||||
},
|
||||
{
|
||||
"expr": "unknowns_queue_depth_warm",
|
||||
"legendFormat": "WARM"
|
||||
},
|
||||
{
|
||||
"expr": "unknowns_queue_depth_cold",
|
||||
"legendFormat": "COLD"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 20
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "HOT" }, "properties": [{ "id": "color", "value": { "fixedColor": "red" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "WARM" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "COLD" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue" } }] }
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "SLA Compliance Over Time",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "unknowns_sla_compliance * 100",
|
||||
"legendFormat": "SLA Compliance %"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "value": 80, "color": "yellow" },
|
||||
{ "value": 95, "color": "green" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Operations",
|
||||
"type": "row",
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 },
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"title": "State Transitions (Rate)",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(unknowns_state_transitions_total[5m])",
|
||||
"legendFormat": "{{from_state}} → {{to_state}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Processing Time (p95)",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, rate(unknowns_processing_time_seconds_bucket[5m]))",
|
||||
"legendFormat": "p95 Processing Time"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Escalations & Failures",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(unknowns_escalated_total[1h])",
|
||||
"legendFormat": "Escalations"
|
||||
},
|
||||
{
|
||||
"expr": "rate(unknowns_demoted_total[1h])",
|
||||
"legendFormat": "Demotions"
|
||||
},
|
||||
{
|
||||
"expr": "rate(unknowns_expired_total[1h])",
|
||||
"legendFormat": "Expired"
|
||||
},
|
||||
{
|
||||
"expr": "rate(greyqueue_watchdog_failed_total[1h])",
|
||||
"legendFormat": "Failed"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Resolution Time by Band",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, rate(unknowns_resolution_time_hours_bucket{band=\"hot\"}[1h]))",
|
||||
"legendFormat": "HOT (p50)"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, rate(unknowns_resolution_time_hours_bucket{band=\"warm\"}[1h]))",
|
||||
"legendFormat": "WARM (p50)"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, rate(unknowns_resolution_time_hours_bucket{band=\"cold\"}[1h]))",
|
||||
"legendFormat": "COLD (p50)"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "h"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Watchdog Metrics",
|
||||
"type": "row",
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 },
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"title": "Stuck & Timeout Events",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 31 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(greyqueue_stuck_total[1h]) * 3600",
|
||||
"legendFormat": "Stuck (per hour)"
|
||||
},
|
||||
{
|
||||
"expr": "rate(greyqueue_timeout_total[1h]) * 3600",
|
||||
"legendFormat": "Timeouts (per hour)"
|
||||
},
|
||||
{
|
||||
"expr": "rate(greyqueue_watchdog_retry_total[1h]) * 3600",
|
||||
"legendFormat": "Forced Retries (per hour)"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Currently Processing",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 6, "w": 6, "x": 12, "y": 31 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "greyqueue_processing_count",
|
||||
"legendFormat": "In Processing"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "SLA Breaches Today",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 6, "w": 6, "x": 18, "y": 31 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "increase(unknowns_sla_breach_total[24h])",
|
||||
"legendFormat": "Breaches (24h)"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "value": 0, "color": "green" },
|
||||
{ "value": 1, "color": "red" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 38,
|
||||
"style": "dark",
|
||||
"tags": ["unknowns", "security", "sla"],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-6h",
|
||||
"to": "now"
|
||||
},
|
||||
"title": "Unknowns Queue Dashboard",
|
||||
"uid": "unknowns-queue-dashboard",
|
||||
"version": 1
|
||||
}
|
||||
186
devops/observability/prometheus/rules/unknowns-queue-alerts.yaml
Normal file
186
devops/observability/prometheus/rules/unknowns-queue-alerts.yaml
Normal file
@@ -0,0 +1,186 @@
|
||||
# Unknowns Queue Alert Rules
|
||||
# Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement (UQ-007)
|
||||
#
|
||||
# Load into Prometheus via rule_files; firing alerts are then routed by Alertmanager
|
||||
|
||||
groups:
|
||||
- name: unknowns-queue
|
||||
interval: 1m
|
||||
rules:
|
||||
# =============================================================================
|
||||
# SLA Alerts
|
||||
# =============================================================================
|
||||
|
||||
- alert: UnknownsSlaBreachCritical
|
||||
expr: unknowns_sla_compliance < 0.80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
team: security
|
||||
annotations:
|
||||
summary: "SLA compliance dropped below 80%"
|
||||
description: |
|
||||
SLA compliance is {{ $value | humanizePercentage }}.
|
||||
Multiple unknowns have breached their SLA deadlines.
|
||||
Immediate action required.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-breach"
|
||||
|
||||
- alert: UnknownsSlaBreachWarning
|
||||
expr: unknowns_sla_compliance < 0.95 and unknowns_sla_compliance >= 0.80
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
team: security
|
||||
annotations:
|
||||
summary: "SLA compliance below 95%"
|
||||
description: |
|
||||
SLA compliance is {{ $value | humanizePercentage }}.
|
||||
Some unknowns are approaching or have breached SLA.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-warning"
|
||||
|
||||
- alert: UnknownsSlaBreach
|
||||
expr: increase(unknowns_sla_breach_total[1h]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
team: security
|
||||
annotations:
|
||||
summary: "Unknown SLA breached"
|
||||
description: |
|
||||
{{ $value }} unknown(s) have breached SLA in the last hour.
|
||||
Check the unknowns queue dashboard for affected entries.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-breach"
|
||||
|
||||
# =============================================================================
|
||||
# Queue Depth Alerts
|
||||
# =============================================================================
|
||||
|
||||
- alert: UnknownsHotQueueHigh
|
||||
expr: unknowns_queue_depth_hot > 5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
team: security
|
||||
annotations:
|
||||
summary: "High number of HOT unknowns"
|
||||
description: |
|
||||
{{ $value }} HOT unknowns in queue.
|
||||
HOT unknowns have 24-hour SLA and block releases.
|
||||
Prioritize resolution immediately.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#hot-queue"
|
||||
|
||||
- alert: UnknownsHotQueuePresent
|
||||
expr: unknowns_queue_depth_hot > 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
team: security
|
||||
annotations:
|
||||
summary: "HOT unknowns present for over 1 hour"
|
||||
description: |
|
||||
{{ $value }} HOT unknown(s) have been in queue for over 1 hour.
|
||||
Roughly 4% (1 of 24 hours) of the 24-hour SLA has elapsed.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#hot-queue"
|
||||
|
||||
- alert: UnknownsQueueBacklog
|
||||
expr: (unknowns_queue_depth_hot + unknowns_queue_depth_warm + unknowns_queue_depth_cold) > 100
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
team: operations
|
||||
annotations:
|
||||
summary: "Unknowns queue backlog growing"
|
||||
description: |
|
||||
Total queue depth is {{ $value }}.
|
||||
Consider scaling processing capacity or reviewing automation.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#backlog"
|
||||
|
||||
# =============================================================================
|
||||
# Processing Alerts
|
||||
# =============================================================================
|
||||
|
||||
- alert: UnknownsStuckProcessing
|
||||
expr: greyqueue_processing_count > 10
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
team: operations
|
||||
annotations:
|
||||
summary: "Many entries stuck in processing"
|
||||
description: |
|
||||
{{ $value }} entries in Processing status for extended period.
|
||||
Check for processing bottlenecks or failures.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#stuck-processing"
|
||||
|
||||
- alert: UnknownsProcessingTimeout
|
||||
expr: increase(greyqueue_timeout_total[1h]) > 5
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
team: operations
|
||||
annotations:
|
||||
summary: "Processing timeouts occurring"
|
||||
description: |
|
||||
{{ $value }} processing timeouts in the last hour.
|
||||
Entries are being forcefully retried.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#timeouts"
|
||||
|
||||
- alert: UnknownsProcessingFailures
|
||||
expr: increase(greyqueue_watchdog_failed_total[1h]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
team: operations
|
||||
annotations:
|
||||
summary: "Processing failures detected"
|
||||
description: |
|
||||
{{ $value }} entries moved to Failed status in the last hour.
|
||||
Manual intervention may be required.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#failures"
|
||||
|
||||
# =============================================================================
|
||||
# Escalation Alerts
|
||||
# =============================================================================
|
||||
|
||||
- alert: UnknownsEscalationRate
|
||||
expr: increase(unknowns_escalated_total[1h]) > 10
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
team: security
|
||||
annotations:
|
||||
summary: "High escalation rate"
|
||||
description: |
|
||||
{{ $value }} unknowns escalated in the last hour.
|
||||
Review escalation criteria or upstream data quality.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#escalations"
|
||||
|
||||
# =============================================================================
|
||||
# Service Health Alerts
|
||||
# =============================================================================
|
||||
|
||||
- alert: UnknownsSlaMonitorDown
|
||||
expr: absent(unknowns_queue_depth_hot) and absent(unknowns_queue_depth_warm)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
team: operations
|
||||
annotations:
|
||||
summary: "Unknowns SLA monitor not reporting"
|
||||
description: |
|
||||
No metrics received from unknowns SLA monitor.
|
||||
Check if the service is running.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#service-down"
|
||||
|
||||
- alert: UnknownsHealthCheckUnhealthy
|
||||
expr: probe_success{job="unknowns-healthcheck"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
team: operations
|
||||
annotations:
|
||||
summary: "Unknowns service health check failing"
|
||||
description: |
|
||||
Health check endpoint returning unhealthy.
|
||||
SLA breaches may exist.
|
||||
runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#health-check"
|
||||
Reference in New Issue
Block a user