Doctor enhancements, setup enhancements, UI functionality and design consolidation, test project fixes, product advisory attestation/Rekor and delta verification enhancements

2026-01-19 09:02:59 +02:00
parent 8c4bf54aed
commit 17419ba7c4
809 changed files with 170738 additions and 12244 deletions
--- a/devops/database/migrations/V20260119_001__Add_UnderReview_Escalated_Rejected_States.sql
+++ b/devops/database/migrations/V20260119_001__Add_UnderReview_Escalated_Rejected_States.sql
@@ -0,0 +1,139 @@
+-- -----------------------------------------------------------------------------
+-- V20260119_001__Add_UnderReview_Escalated_Rejected_States.sql
+-- Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement
+-- Task: UQ-005 - Migration for existing entries (map to new states)
+-- Description: Adds new state machine states and required columns
+-- -----------------------------------------------------------------------------
+
+-- Add new columns for UnderReview and Escalated states
+ALTER TABLE grey_queue_entries
+ADD COLUMN IF NOT EXISTS assignee VARCHAR(255) NULL,
+ADD COLUMN IF NOT EXISTS assigned_at TIMESTAMPTZ NULL,
+ADD COLUMN IF NOT EXISTS escalated_at TIMESTAMPTZ NULL,
+ADD COLUMN IF NOT EXISTS escalation_reason TEXT NULL;
+
+-- Add new enum values to grey_queue_status.
+-- ADD VALUE IF NOT EXISTS is idempotent on its own, so no pg_enum lookup or
+-- catch-all exception handler is needed; a real failure (missing type, missing
+-- neighbour label) now aborts the migration instead of being silently ignored.
+-- NOTE(review): PostgreSQL does not allow an enum value added inside a
+-- transaction block to be used before that transaction commits, and the partial
+-- index and summary view later in this file reference the new labels. Confirm
+-- the migration runner does not wrap this whole file in a single transaction.
+ALTER TYPE grey_queue_status ADD VALUE IF NOT EXISTS 'under_review' AFTER 'retrying';
+ALTER TYPE grey_queue_status ADD VALUE IF NOT EXISTS 'escalated' AFTER 'under_review';
+ALTER TYPE grey_queue_status ADD VALUE IF NOT EXISTS 'rejected' AFTER 'resolved';
+
+-- Add indexes for new query patterns
+CREATE INDEX IF NOT EXISTS idx_grey_queue_assignee 
+    ON grey_queue_entries(assignee) 
+    WHERE assignee IS NOT NULL;
+
+CREATE INDEX IF NOT EXISTS idx_grey_queue_status_assignee 
+    ON grey_queue_entries(status, assignee) 
+    WHERE status IN ('under_review', 'escalated');
+
+CREATE INDEX IF NOT EXISTS idx_grey_queue_escalated_at 
+    ON grey_queue_entries(escalated_at DESC) 
+    WHERE escalated_at IS NOT NULL;
+
+-- Add audit table for state transitions (written by the trigger defined below)
+CREATE TABLE IF NOT EXISTS grey_queue_state_transitions (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    entry_id UUID NOT NULL REFERENCES grey_queue_entries(id),
+    tenant_id VARCHAR(128) NOT NULL,
+    from_state VARCHAR(32) NOT NULL,
+    to_state VARCHAR(32) NOT NULL,
+    transitioned_by VARCHAR(255),
+    reason TEXT,
+    transitioned_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    metadata JSONB
+);
+
+CREATE INDEX IF NOT EXISTS idx_grey_queue_transitions_entry 
+    ON grey_queue_state_transitions(entry_id);
+
+CREATE INDEX IF NOT EXISTS idx_grey_queue_transitions_tenant_time 
+    ON grey_queue_state_transitions(tenant_id, transitioned_at DESC);
+
+-- Trigger function: writes one audit row to grey_queue_state_transitions
+-- whenever a row's status changes. Always returns NEW.
+CREATE OR REPLACE FUNCTION record_grey_queue_transition()
+RETURNS TRIGGER
+LANGUAGE plpgsql
+AS $$
+BEGIN
+    -- Nothing to audit when the status is unchanged (NULL-safe comparison).
+    IF OLD.status IS NOT DISTINCT FROM NEW.status THEN
+        RETURN NEW;
+    END IF;
+
+    -- transitioned_by is the row's assignee when set, otherwise the database
+    -- role executing the statement.
+    INSERT INTO grey_queue_state_transitions
+        (entry_id, tenant_id, from_state, to_state, transitioned_by, transitioned_at)
+    VALUES
+        (NEW.id, NEW.tenant_id, OLD.status::text, NEW.status::text,
+         COALESCE(NEW.assignee, current_user), NOW());
+
+    RETURN NEW;
+END;
+$$;
+
+-- Recreate the trigger (dropped first so the migration can be re-run)
+DROP TRIGGER IF EXISTS trg_grey_queue_state_transition ON grey_queue_entries;
+CREATE TRIGGER trg_grey_queue_state_transition
+    AFTER UPDATE ON grey_queue_entries
+    FOR EACH ROW
+    EXECUTE FUNCTION record_grey_queue_transition();
+
+-- Per-tenant entry counts, one column per status plus a grand total;
+-- now includes the under_review, escalated and rejected states.
+-- NOTE(review): CREATE OR REPLACE VIEW may only append columns to an existing
+-- view. If grey_queue_summary already exists with a different column order,
+-- this statement will fail - verify against the previous definition.
+CREATE OR REPLACE VIEW grey_queue_summary AS
+SELECT
+    tenant_id,
+    COUNT(*) FILTER (WHERE status = 'pending')      AS pending_count,
+    COUNT(*) FILTER (WHERE status = 'processing')   AS processing_count,
+    COUNT(*) FILTER (WHERE status = 'retrying')     AS retrying_count,
+    COUNT(*) FILTER (WHERE status = 'under_review') AS under_review_count,
+    COUNT(*) FILTER (WHERE status = 'escalated')    AS escalated_count,
+    COUNT(*) FILTER (WHERE status = 'resolved')     AS resolved_count,
+    COUNT(*) FILTER (WHERE status = 'rejected')     AS rejected_count,
+    COUNT(*) FILTER (WHERE status = 'failed')       AS failed_count,
+    COUNT(*) FILTER (WHERE status = 'expired')      AS expired_count,
+    COUNT(*) FILTER (WHERE status = 'dismissed')    AS dismissed_count,
+    COUNT(*)                                        AS total_count
+FROM grey_queue_entries
+GROUP BY tenant_id;
+
+-- Comment for documentation
+COMMENT ON COLUMN grey_queue_entries.assignee IS 
+    'Assignee for entries in UnderReview state (Sprint UQ-005)';
+COMMENT ON COLUMN grey_queue_entries.assigned_at IS 
+    'When the entry was assigned for review (Sprint UQ-005)';
+COMMENT ON COLUMN grey_queue_entries.escalated_at IS 
+    'When the entry was escalated to security team (Sprint UQ-005)';
+COMMENT ON COLUMN grey_queue_entries.escalation_reason IS 
+    'Reason for escalation (Sprint UQ-005)';
--- a/devops/database/migrations/V20260119__scanner_layer_diffid.sql
+++ b/devops/database/migrations/V20260119__scanner_layer_diffid.sql
@@ -0,0 +1,130 @@
+-- Migration: Add diff_id column to scanner layers table
+-- Sprint: SPRINT_025_Scanner_layer_manifest_infrastructure
+-- Task: TASK-025-03
+
+-- Add diff_id column to layers table (sha256:64hex = 71 chars)
+ALTER TABLE scanner.layers
+ADD COLUMN IF NOT EXISTS diff_id VARCHAR(71);
+
+-- Add timestamp for when diffID was computed
+ALTER TABLE scanner.layers
+ADD COLUMN IF NOT EXISTS diff_id_computed_at_utc TIMESTAMP;
+
+-- Create index on diff_id for fast lookups
+CREATE INDEX IF NOT EXISTS idx_layers_diff_id
+ON scanner.layers (diff_id)
+WHERE diff_id IS NOT NULL;
+
+-- Create image_layers junction table if it doesn't exist
+-- This tracks which layers belong to which images
+CREATE TABLE IF NOT EXISTS scanner.image_layers (
+    image_reference VARCHAR(512) NOT NULL,
+    layer_digest VARCHAR(71) NOT NULL,
+    layer_index INT NOT NULL,
+    -- The column is a zone-less TIMESTAMP, so a bare NOW() would be stored in
+    -- the session's TimeZone; convert explicitly so the value really is UTC.
+    created_at_utc TIMESTAMP NOT NULL DEFAULT (NOW() AT TIME ZONE 'UTC'),
+    PRIMARY KEY (image_reference, layer_digest)
+);
+
+CREATE INDEX IF NOT EXISTS idx_image_layers_digest
+ON scanner.image_layers (layer_digest);
+
+-- DiffID cache table for resolved diffIDs
+CREATE TABLE IF NOT EXISTS scanner.scanner_diffid_cache (
+    layer_digest VARCHAR(71) PRIMARY KEY,
+    diff_id VARCHAR(71) NOT NULL,
+    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
+);
+
+-- Base image fingerprint tables for layer reuse detection
+CREATE TABLE IF NOT EXISTS scanner.scanner_base_image_fingerprints (
+    image_reference VARCHAR(512) PRIMARY KEY,
+    layer_count INT NOT NULL,
+    registered_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    detection_count BIGINT NOT NULL DEFAULT 0
+);
+
+CREATE TABLE IF NOT EXISTS scanner.scanner_base_image_layers (
+    image_reference VARCHAR(512) NOT NULL REFERENCES scanner.scanner_base_image_fingerprints(image_reference) ON DELETE CASCADE,
+    layer_index INT NOT NULL,
+    diff_id VARCHAR(71) NOT NULL,
+    PRIMARY KEY (image_reference, layer_index)
+);
+
+CREATE INDEX IF NOT EXISTS idx_base_image_layers_diff_id
+ON scanner.scanner_base_image_layers (diff_id);
+
+-- Manifest snapshots table for IOciManifestSnapshotService
+CREATE TABLE IF NOT EXISTS scanner.manifest_snapshots (
+    id UUID PRIMARY KEY,
+    image_reference VARCHAR(512) NOT NULL,
+    registry VARCHAR(256) NOT NULL,
+    repository VARCHAR(256) NOT NULL,
+    tag VARCHAR(128),
+    manifest_digest VARCHAR(71) NOT NULL,
+    config_digest VARCHAR(71) NOT NULL,
+    media_type VARCHAR(128) NOT NULL,
+    layers JSONB NOT NULL,
+    diff_ids JSONB NOT NULL,
+    platform JSONB,
+    total_size BIGINT NOT NULL,
+    captured_at TIMESTAMPTZ NOT NULL,
+    snapshot_version VARCHAR(32),
+    UNIQUE (manifest_digest)
+);
+
+CREATE INDEX IF NOT EXISTS idx_manifest_snapshots_image_ref
+ON scanner.manifest_snapshots (image_reference);
+
+CREATE INDEX IF NOT EXISTS idx_manifest_snapshots_repository
+ON scanner.manifest_snapshots (registry, repository);
+
+CREATE INDEX IF NOT EXISTS idx_manifest_snapshots_captured_at
+ON scanner.manifest_snapshots (captured_at DESC);
+
+-- Layer scan history for reuse detection (TASK-025-04)
+CREATE TABLE IF NOT EXISTS scanner.layer_scans (
+    diff_id VARCHAR(71) PRIMARY KEY,
+    scanned_at TIMESTAMPTZ NOT NULL,
+    finding_count INT,
+    scanned_by VARCHAR(128) NOT NULL,
+    scanner_version VARCHAR(64)
+);
+
+-- Layer reuse counts for statistics
+CREATE TABLE IF NOT EXISTS scanner.layer_reuse_counts (
+    diff_id VARCHAR(71) PRIMARY KEY,
+    reuse_count INT NOT NULL DEFAULT 1,
+    first_seen_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
+);
+
+CREATE INDEX IF NOT EXISTS idx_layer_reuse_counts_count
+ON scanner.layer_reuse_counts (reuse_count DESC);
+
+COMMENT ON COLUMN scanner.layers.diff_id IS 'Uncompressed layer content hash (sha256:hex64). Immutable once computed.';
+COMMENT ON TABLE scanner.scanner_diffid_cache IS 'Cache of layer digest to diffID mappings. Layer digests are immutable so cache entries never expire.';
+COMMENT ON TABLE scanner.scanner_base_image_fingerprints IS 'Known base image fingerprints for layer reuse detection.';
+COMMENT ON TABLE scanner.manifest_snapshots IS 'Point-in-time captures of OCI image manifests for delta scanning.';
+COMMENT ON TABLE scanner.layer_scans IS 'History of layer scans for deduplication. One entry per diffID.';
+COMMENT ON TABLE scanner.layer_reuse_counts IS 'Counts of how many times each layer appears across images.';
+
+-- Layer SBOM CAS for per-layer SBOM storage (TASK-026-02)
+CREATE TABLE IF NOT EXISTS scanner.layer_sbom_cas (
+    diff_id VARCHAR(71) NOT NULL,
+    format VARCHAR(20) NOT NULL,
+    content BYTEA NOT NULL,
+    size_bytes BIGINT NOT NULL,
+    compressed BOOLEAN NOT NULL DEFAULT TRUE,
+    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    last_accessed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    PRIMARY KEY (diff_id, format)
+);
+
+CREATE INDEX IF NOT EXISTS idx_layer_sbom_cas_last_accessed
+ON scanner.layer_sbom_cas (last_accessed_at);
+
+CREATE INDEX IF NOT EXISTS idx_layer_sbom_cas_format
+ON scanner.layer_sbom_cas (format);
+
+COMMENT ON TABLE scanner.layer_sbom_cas IS 'Content-addressable storage for per-layer SBOMs. Keyed by diffID (immutable).';
+COMMENT ON COLUMN scanner.layer_sbom_cas.content IS 'Compressed (gzip) SBOM content.';
+COMMENT ON COLUMN scanner.layer_sbom_cas.last_accessed_at IS 'For TTL-based eviction of cold entries.';
--- a/devops/manifests/tetragon/stella-ops-tetragon-agent-daemonset.yaml
+++ b/devops/manifests/tetragon/stella-ops-tetragon-agent-daemonset.yaml
@@ -0,0 +1,246 @@
+# Tetragon Agent DaemonSet for Stella Ops
+# Sprint: SPRINT_20260118_019_Infra_tetragon_integration
+# Task: TASK-019-007 - Create Kubernetes deployment extending existing manifests
+#
+# Deploys the Stella Ops Tetragon agent alongside the existing agent framework.
+# Follows existing DaemonSet patterns from devops/helm/
+
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: stella-ops-tetragon-agent
+  namespace: stella-ops
+  labels:
+    app.kubernetes.io/name: stella-ops-tetragon-agent
+    app.kubernetes.io/component: runtime-instrumentation
+    app.kubernetes.io/part-of: stella-ops
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: stella-ops-tetragon-agent
+  updateStrategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxUnavailable: 1
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: stella-ops-tetragon-agent
+        app.kubernetes.io/component: runtime-instrumentation
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "8080"
+        prometheus.io/path: "/metrics"
+    spec:
+      serviceAccountName: stella-ops-tetragon-agent
+      hostPID: true
+      hostNetwork: false  # NOTE(review): pod-local networking; TETRAGON_GRPC_ADDRESS below uses "localhost", which is then this pod, not the node - confirm
+      tolerations:
+        - key: node-role.kubernetes.io/master
+          effect: NoSchedule
+        - key: node-role.kubernetes.io/control-plane
+          effect: NoSchedule
+      containers:
+        - name: tetragon-agent
+          image: stellaops/tetragon-agent:latest  # NOTE(review): mutable tag; consider pinning a version or digest
+          imagePullPolicy: IfNotPresent  # NOTE(review): combined with ":latest", nodes keep whatever image they cached first
+          securityContext:
+            privileged: true  # NOTE(review): privileged presumably already grants every capability listed below - confirm which of the two is intended
+            capabilities:
+              add:
+                - SYS_ADMIN
+                - NET_ADMIN
+                - BPF
+                - PERFMON
+          ports:
+            - name: metrics
+              containerPort: 8080
+              protocol: TCP
+            - name: health
+              containerPort: 8081
+              protocol: TCP
+          env:
+            - name: NODE_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+            - name: STELLA_API_URL
+              valueFrom:
+                configMapKeyRef:
+                  name: stella-ops-tetragon-config
+                  key: api-url
+            - name: STELLA_AGENT_ID
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.name
+            - name: TETRAGON_GRPC_ADDRESS
+              value: "localhost:54321"  # NOTE(review): this pod sets hostNetwork: false and defines a single container, so "localhost" is pod-local - confirm a Tetragon gRPC server is reachable here
+            - name: LOG_LEVEL
+              valueFrom:
+                configMapKeyRef:
+                  name: stella-ops-tetragon-config
+                  key: log-level
+                  optional: true
+          volumeMounts:
+            - name: tetragon-config
+              mountPath: /etc/tetragon
+              readOnly: true
+            - name: agent-certs
+              mountPath: /etc/stella-ops/certs
+              readOnly: true
+            - name: bpf
+              mountPath: /sys/fs/bpf
+            - name: proc
+              mountPath: /host/proc
+              readOnly: true
+          resources:
+            requests:
+              cpu: 100m
+              memory: 128Mi
+            limits:
+              cpu: 500m
+              memory: 512Mi
+          livenessProbe:
+            httpGet:
+              path: /healthz
+              port: 8081
+            initialDelaySeconds: 10
+            periodSeconds: 30
+          readinessProbe:
+            httpGet:
+              path: /ready
+              port: 8081
+            initialDelaySeconds: 5
+            periodSeconds: 10
+      volumes:
+        - name: tetragon-config
+          configMap:
+            name: stella-ops-tetragon-policy
+        - name: agent-certs
+          secret:
+            secretName: stella-ops-agent-certs
+            optional: true
+        - name: bpf
+          hostPath:
+            path: /sys/fs/bpf
+            type: DirectoryOrCreate
+        - name: proc
+          hostPath:
+            path: /proc
+            type: Directory
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: stella-ops-tetragon-agent
+  namespace: stella-ops
+  labels:
+    app.kubernetes.io/name: stella-ops-tetragon-agent
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: stella-ops-tetragon-agent
+  labels:
+    app.kubernetes.io/name: stella-ops-tetragon-agent
+rules:
+  # Read pods for container correlation
+  - apiGroups: [""]
+    resources: ["pods", "namespaces"]
+    verbs: ["get", "list", "watch"]
+  # Read nodes for host information
+  - apiGroups: [""]
+    resources: ["nodes"]
+    verbs: ["get", "list"]
+  # Read Tetragon CRDs
+  - apiGroups: ["cilium.io"]
+    resources: ["tracingpolicies", "tracingpoliciesnamespaced"]
+    verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: stella-ops-tetragon-agent
+  labels:
+    app.kubernetes.io/name: stella-ops-tetragon-agent
+subjects:
+  - kind: ServiceAccount
+    name: stella-ops-tetragon-agent
+    namespace: stella-ops
+roleRef:
+  kind: ClusterRole
+  name: stella-ops-tetragon-agent
+  apiGroup: rbac.authorization.k8s.io
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: stella-ops-tetragon-config
+  namespace: stella-ops
+  labels:
+    app.kubernetes.io/name: stella-ops-tetragon-agent
+data:
+  api-url: "http://stella-ops-signals.stella-ops.svc.cluster.local:8080"
+  log-level: "info"
+  aggregation-window: "60s"
+  buffer-size: "10000"
+  min-confidence: "0.5"
+  # Privacy settings
+  redact-arguments: "true"
+  symbol-id-only-mode: "false"
+  # Allowed namespaces (comma-separated, empty = all)
+  allowed-namespaces: "stella-ops-workloads,default"
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: stella-ops-tetragon-policy
+  namespace: stella-ops
+  labels:
+    app.kubernetes.io/name: stella-ops-tetragon-agent
+data:
+  policy.yaml: |
+    # Reference the TracingPolicy defined in stella-ops-tracing-policy.yaml
+    # This ConfigMap can contain additional local policy configurations
+    policyRef: stella-ops-runtime-capture
+    enableStackTraces: true
+    stackTraceSize: 16
+    filterNamespaces:
+      - stella-ops-workloads
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: stella-ops-tetragon-agent
+  namespace: stella-ops
+  labels:
+    app.kubernetes.io/name: stella-ops-tetragon-agent
+spec:
+  type: ClusterIP
+  clusterIP: None  # Headless for DaemonSet
+  ports:
+    - name: metrics
+      port: 8080
+      targetPort: metrics
+    - name: health
+      port: 8081
+      targetPort: health
+  selector:
+    app.kubernetes.io/name: stella-ops-tetragon-agent
+---
+# ServiceMonitor for Prometheus Operator (optional)
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: stella-ops-tetragon-agent
+  namespace: stella-ops
+  labels:
+    app.kubernetes.io/name: stella-ops-tetragon-agent
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: stella-ops-tetragon-agent
+  endpoints:
+    - port: metrics
+      interval: 30s
+      path: /metrics
--- a/devops/manifests/tetragon/stella-ops-tracing-policy.yaml
+++ b/devops/manifests/tetragon/stella-ops-tracing-policy.yaml
@@ -0,0 +1,125 @@
+# Tetragon TracingPolicy for Stella Ops Runtime Instrumentation
+# Sprint: SPRINT_20260118_019_Infra_tetragon_integration
+# Task: TASK-019-001 - Define Tetragon TracingPolicy for stack capture
+#
+# This policy captures process execution, syscalls, and stack traces for
+# runtime reachability validation. Integrates with existing Signals infrastructure.
+
+apiVersion: cilium.io/v1alpha1
+kind: TracingPolicy  # NOTE(review): TracingPolicy is presumably cluster-scoped in Tetragon (TracingPolicyNamespaced is the namespaced kind) - confirm
+metadata:
+  name: stella-ops-runtime-capture
+  namespace: stella-ops  # NOTE(review): likely has no effect on a cluster-scoped kind - verify
+  labels:
+    app.kubernetes.io/name: stella-ops
+    app.kubernetes.io/component: runtime-instrumentation
+spec:
+  # Process execution events
+  kprobes:
+    - call: "sys_execve"
+      syscall: true
+      return: false
+      args:
+        - index: 0
+          type: "string"  # filename
+        - index: 1
+          type: "string"  # argv[0]
+      selectors:
+        - matchNamespaces:  # NOTE(review): Tetragon documents matchNamespaces as a Linux-namespace filter (Pid, Mnt, Net, ... with values), not a Kubernetes namespace filter - verify
+            - namespace: stella-ops-workloads
+              operator: In
+          matchLabels:  # NOTE(review): not a selector field I can find in the Tetragon docs; pod label filtering is normally spec.podSelector - confirm
+            - key: "stella-ops.io/instrumented"
+              operator: Exists
+      returnArgAction: Post  # NOTE(review): "return" is false above, so there may be no return value for this to act on - confirm it is needed
+
+    # Security-relevant syscalls for reachability validation
+    - call: "sys_openat"
+      syscall: true
+      args:
+        - index: 0
+          type: "int"     # dirfd
+        - index: 1
+          type: "string"  # pathname
+        - index: 2
+          type: "int"     # flags
+      selectors:
+        - matchNamespaces:
+            - namespace: stella-ops-workloads
+              operator: In
+        - matchArgs:
+            - index: 1
+              operator: "Prefix"
+              values:
+                - "/etc/"
+                - "/proc/"
+                - "/sys/"
+      returnArg:
+        index: 0
+        type: "int"
+
+    - call: "sys_connect"
+      syscall: true
+      args:
+        - index: 0
+          type: "int"     # sockfd
+        - index: 1
+          type: "sock"    # addr struct
+      selectors:
+        - matchNamespaces:
+            - namespace: stella-ops-workloads
+              operator: In
+      returnArg:
+        index: 0
+        type: "int"
+
+  # Tracepoints for additional coverage
+  tracepoints:
+    - subsystem: "sched"
+      event: "sched_process_exec"
+      args:
+        - index: 0
+          type: "string"  # filename
+      selectors:
+        - matchNamespaces:
+            - namespace: stella-ops-workloads
+              operator: In
+
+  # Stack trace configuration
+  options:  # NOTE(review): Tetragon documents spec.options as a list of name/value pairs and enables stack traces per selector via matchActions (kernelStackTrace/userStackTrace) - confirm this map form is honored
+    # Enable kernel + userspace stack traces
+    stackTraces: true
+    # Capture both kernel and user stacks
+    stackTraceSize: 16
+    # Symbol resolution for userspace
+    symbols: true
+
+---
+# Companion TracingPolicy for library loading
+apiVersion: cilium.io/v1alpha1
+kind: TracingPolicy
+metadata:
+  name: stella-ops-library-capture
+  namespace: stella-ops
+spec:
+  # Capture dynamic library loading
+  uprobes:
+    - path: "/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2"
+      symbols:
+        - "_dl_map_object"
+      args:
+        - index: 0
+          type: "string"  # library name
+      selectors:
+        - matchNamespaces:
+            - namespace: stella-ops-workloads
+              operator: In
+
+  # Alternative for musl-based containers.
+  # The entry is indented to the same level as the glibc entry above so that it
+  # parses as a second item of the uprobes list.
+    - path: "/lib/ld-musl-x86_64.so.1"
+      symbols:
+        - "__dls3"
+      selectors:
+        - matchNamespaces:
+            - namespace: stella-ops-workloads
+              operator: In
--- a/devops/observability/grafana/dashboards/unknowns-queue-dashboard.json
+++ b/devops/observability/grafana/dashboards/unknowns-queue-dashboard.json
@@ -0,0 +1,361 @@
+{
+  "__inputs": [],
+  "annotations": {
+    "list": []
+  },
+  "description": "Unknowns Queue monitoring dashboard - Sprint SPRINT_20260118_018_Unknowns_queue_enhancement (UQ-007)",
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 0,
+  "id": null,
+  "links": [],
+  "liveNow": false,
+  "panels": [
+    {
+      "title": "Queue Overview",
+      "type": "row",
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
+      "collapsed": false
+    },
+    {
+      "title": "Total Queue Depth",
+      "type": "stat",
+      "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 },
+      "targets": [
+        {
+          "expr": "sum(unknowns_queue_depth_hot) + sum(unknowns_queue_depth_warm) + sum(unknowns_queue_depth_cold)",
+          "legendFormat": "Total"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": { "mode": "thresholds" },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": 0, "color": "green" },
+              { "value": 50, "color": "yellow" },
+              { "value": 100, "color": "red" }
+            ]
+          }
+        }
+      }
+    },
+    {
+      "title": "HOT Unknowns",
+      "type": "stat",
+      "gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 },
+      "targets": [
+        {
+          "expr": "unknowns_queue_depth_hot",
+          "legendFormat": "HOT"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": { "mode": "thresholds" },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": 0, "color": "green" },
+              { "value": 1, "color": "orange" },
+              { "value": 5, "color": "red" }
+            ]
+          }
+        }
+      }
+    },
+    {
+      "title": "WARM Unknowns",
+      "type": "stat",
+      "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 },
+      "targets": [
+        {
+          "expr": "unknowns_queue_depth_warm",
+          "legendFormat": "WARM"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": { "mode": "thresholds" },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": 0, "color": "green" },
+              { "value": 10, "color": "yellow" },
+              { "value": 25, "color": "orange" }
+            ]
+          }
+        }
+      }
+    },
+    {
+      "title": "COLD Unknowns",
+      "type": "stat",
+      "gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 },
+      "targets": [
+        {
+          "expr": "unknowns_queue_depth_cold",
+          "legendFormat": "COLD"
+        }
+      ]
+    },
+    {
+      "title": "SLA Compliance",
+      "type": "gauge",
+      "gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 },
+      "targets": [
+        {
+          "expr": "unknowns_sla_compliance * 100",
+          "legendFormat": "Compliance %"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "min": 0,
+          "max": 100,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": 0, "color": "red" },
+              { "value": 80, "color": "yellow" },
+              { "value": 95, "color": "green" }
+            ]
+          }
+        }
+      }
+    },
+    {
+      "title": "Stuck Processing",
+      "type": "stat",
+      "gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 },
+      "targets": [
+        {
+          "expr": "greyqueue_processing_count",
+          "legendFormat": "Processing"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": 0, "color": "green" },
+              { "value": 5, "color": "yellow" },
+              { "value": 10, "color": "red" }
+            ]
+          }
+        }
+      }
+    },
+    {
+      "title": "Queue Depth Over Time",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 },
+      "targets": [
+        {
+          "expr": "unknowns_queue_depth_hot",
+          "legendFormat": "HOT"
+        },
+        {
+          "expr": "unknowns_queue_depth_warm",
+          "legendFormat": "WARM"
+        },
+        {
+          "expr": "unknowns_queue_depth_cold",
+          "legendFormat": "COLD"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "lineWidth": 2,
+            "fillOpacity": 20
+          }
+        },
+        "overrides": [
+          { "matcher": { "id": "byName", "options": "HOT" }, "properties": [{ "id": "color", "value": { "fixedColor": "red" } }] },
+          { "matcher": { "id": "byName", "options": "WARM" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange" } }] },
+          { "matcher": { "id": "byName", "options": "COLD" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue" } }] }
+        ]
+      }
+    },
+    {
+      "title": "SLA Compliance Over Time",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 },
+      "targets": [
+        {
+          "expr": "unknowns_sla_compliance * 100",
+          "legendFormat": "SLA Compliance %"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "min": 0,
+          "max": 100,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": 0, "color": "red" },
+              { "value": 80, "color": "yellow" },
+              { "value": 95, "color": "green" }
+            ]
+          }
+        }
+      }
+    },
+    {
+      "title": "Operations",
+      "type": "row",
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 },
+      "collapsed": false
+    },
+    {
+      "title": "State Transitions (Rate)",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 },
+      "targets": [
+        {
+          "expr": "rate(unknowns_state_transitions_total[5m])",
+          "legendFormat": "{{from_state}} → {{to_state}}"
+        }
+      ]
+    },
+    {
+      "title": "Processing Time (p95)",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 },
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, rate(unknowns_processing_time_seconds_bucket[5m]))",
+          "legendFormat": "p95 Processing Time"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s"
+        }
+      }
+    },
+    {
+      "title": "Escalations & Failures",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 },
+      "targets": [
+        {
+          "expr": "rate(unknowns_escalated_total[1h])",
+          "legendFormat": "Escalations"
+        },
+        {
+          "expr": "rate(unknowns_demoted_total[1h])",
+          "legendFormat": "Demotions"
+        },
+        {
+          "expr": "rate(unknowns_expired_total[1h])",
+          "legendFormat": "Expired"
+        },
+        {
+          "expr": "rate(greyqueue_watchdog_failed_total[1h])",
+          "legendFormat": "Failed"
+        }
+      ]
+    },
+    {
+      "title": "Resolution Time by Band",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 },
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.50, rate(unknowns_resolution_time_hours_bucket{band=\"hot\"}[1h]))",
+          "legendFormat": "HOT (p50)"
+        },
+        {
+          "expr": "histogram_quantile(0.50, rate(unknowns_resolution_time_hours_bucket{band=\"warm\"}[1h]))",
+          "legendFormat": "WARM (p50)"
+        },
+        {
+          "expr": "histogram_quantile(0.50, rate(unknowns_resolution_time_hours_bucket{band=\"cold\"}[1h]))",
+          "legendFormat": "COLD (p50)"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "h"
+        }
+      }
+    },
+    {
+      "title": "Watchdog Metrics",
+      "type": "row",
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 },
+      "collapsed": false
+    },
+    {
+      "title": "Stuck & Timeout Events",
+      "type": "timeseries",
+      "gridPos": { "h": 6, "w": 12, "x": 0, "y": 31 },
+      "targets": [
+        {
+          "expr": "rate(greyqueue_stuck_total[1h]) * 3600",
+          "legendFormat": "Stuck (per hour)"
+        },
+        {
+          "expr": "rate(greyqueue_timeout_total[1h]) * 3600",
+          "legendFormat": "Timeouts (per hour)"
+        },
+        {
+          "expr": "rate(greyqueue_watchdog_retry_total[1h]) * 3600",
+          "legendFormat": "Forced Retries (per hour)"
+        }
+      ]
+    },
+    {
+      "title": "Currently Processing",
+      "type": "stat",
+      "gridPos": { "h": 6, "w": 6, "x": 12, "y": 31 },
+      "targets": [
+        {
+          "expr": "greyqueue_processing_count",
+          "legendFormat": "In Processing"
+        }
+      ]
+    },
+    {
+      "title": "SLA Breaches Today",
+      "type": "stat",
+      "gridPos": { "h": 6, "w": 6, "x": 18, "y": 31 },
+      "targets": [
+        {
+          "expr": "increase(unknowns_sla_breach_total[24h])",
+          "legendFormat": "Breaches (24h)"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": 0, "color": "green" },
+              { "value": 1, "color": "red" }
+            ]
+          }
+        }
+      }
+    }
+  ],
+  "refresh": "30s",
+  "schemaVersion": 38,
+  "style": "dark",
+  "tags": ["unknowns", "security", "sla"],
+  "templating": {
+    "list": []
+  },
+  "time": {
+    "from": "now-6h",
+    "to": "now"
+  },
+  "title": "Unknowns Queue Dashboard",
+  "uid": "unknowns-queue-dashboard",
+  "version": 1
+}
--- a/devops/observability/prometheus/rules/unknowns-queue-alerts.yaml
+++ b/devops/observability/prometheus/rules/unknowns-queue-alerts.yaml
@@ -0,0 +1,186 @@
+# Unknowns Queue Alert Rules
+# Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement (UQ-007)
+# 
+# Deploy as a Prometheus rule file; firing alerts are routed by Alertmanager.
+
+groups:
+  - name: unknowns-queue
+    interval: 1m
+    rules:
+      # =============================================================================
+      # SLA Alerts
+      # =============================================================================
+      
+      - alert: UnknownsSlaBreachCritical
+        expr: unknowns_sla_compliance < 0.80
+        for: 5m
+        labels:
+          severity: critical
+          team: security
+        annotations:
+          summary: "SLA compliance dropped below 80%"
+          description: |
+            SLA compliance is {{ $value | humanizePercentage }}.
+            Multiple unknowns have breached their SLA deadlines.
+            Immediate action required.
+          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-breach"
+          
+      - alert: UnknownsSlaBreachWarning
+        expr: unknowns_sla_compliance < 0.95 and unknowns_sla_compliance >= 0.80
+        for: 15m
+        labels:
+          severity: warning
+          team: security
+        annotations:
+          summary: "SLA compliance below 95%"
+          description: |
+            SLA compliance is {{ $value | humanizePercentage }}.
+            Some unknowns are approaching or have breached SLA.
+          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-warning"
+          
+      - alert: UnknownsSlaBreach
+        expr: increase(unknowns_sla_breach_total[1h]) > 0
+        for: 0m
+        labels:
+          severity: critical
+          team: security
+        annotations:
+          summary: "Unknown SLA breached"
+          description: |
+            {{ $value }} unknown(s) have breached SLA in the last hour.
+            Check the unknowns queue dashboard for affected entries.
+          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#sla-breach"
+
+      # =============================================================================
+      # Queue Depth Alerts
+      # =============================================================================
+      
+      - alert: UnknownsHotQueueHigh
+        expr: unknowns_queue_depth_hot > 5
+        for: 10m
+        labels:
+          severity: critical
+          team: security
+        annotations:
+          summary: "High number of HOT unknowns"
+          description: |
+            {{ $value }} HOT unknowns in queue.
+            HOT unknowns have 24-hour SLA and block releases.
+            Prioritize resolution immediately.
+          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#hot-queue"
+          
+      - alert: UnknownsHotQueuePresent
+        expr: unknowns_queue_depth_hot > 0
+        for: 1h
+        labels:
+          severity: warning
+          team: security
+        annotations:
+          summary: "HOT unknowns present for over 1 hour"
+          description: |
+            {{ $value }} HOT unknown(s) have been in queue for over 1 hour.
+            At least 1 hour of the 24-hour SLA has already elapsed.
+          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#hot-queue"
+          
+      - alert: UnknownsQueueBacklog
+        expr: (unknowns_queue_depth_hot + unknowns_queue_depth_warm + unknowns_queue_depth_cold) > 100
+        for: 30m
+        labels:
+          severity: warning
+          team: operations
+        annotations:
+          summary: "Unknowns queue backlog growing"
+          description: |
+            Total queue depth is {{ $value }}.
+            Consider scaling processing capacity or reviewing automation.
+          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#backlog"
+
+      # =============================================================================
+      # Processing Alerts
+      # =============================================================================
+      
+      - alert: UnknownsStuckProcessing
+        expr: greyqueue_processing_count > 10
+        for: 30m
+        labels:
+          severity: warning
+          team: operations
+        annotations:
+          summary: "Many entries stuck in processing"
+          description: |
+            {{ $value }} entries in Processing status for an extended period.
+            Check for processing bottlenecks or failures.
+          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#stuck-processing"
+          
+      - alert: UnknownsProcessingTimeout
+        expr: increase(greyqueue_timeout_total[1h]) > 5
+        for: 0m
+        labels:
+          severity: warning
+          team: operations
+        annotations:
+          summary: "Processing timeouts occurring"
+          description: |
+            {{ $value }} processing timeouts in the last hour.
+            Entries are being forcefully retried.
+          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#timeouts"
+          
+      - alert: UnknownsProcessingFailures
+        expr: increase(greyqueue_watchdog_failed_total[1h]) > 0
+        for: 0m
+        labels:
+          severity: critical
+          team: operations
+        annotations:
+          summary: "Processing failures detected"
+          description: |
+            {{ $value }} entries moved to Failed status in the last hour.
+            Manual intervention may be required.
+          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#failures"
+
+      # =============================================================================
+      # Escalation Alerts
+      # =============================================================================
+      
+      - alert: UnknownsEscalationRate
+        expr: increase(unknowns_escalated_total[1h]) > 10
+        for: 0m
+        labels:
+          severity: warning
+          team: security
+        annotations:
+          summary: "High escalation rate"
+          description: |
+            {{ $value }} unknowns escalated in the last hour.
+            Review escalation criteria or upstream data quality.
+          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#escalations"
+
+      # =============================================================================
+      # Service Health Alerts
+      # =============================================================================
+      
+      - alert: UnknownsSlaMonitorDown
+        expr: absent(unknowns_queue_depth_hot) and absent(unknowns_queue_depth_warm)
+        for: 5m
+        labels:
+          severity: critical
+          team: operations
+        annotations:
+          summary: "Unknowns SLA monitor not reporting"
+          description: |
+            No metrics received from unknowns SLA monitor.
+            Check if the service is running.
+          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#service-down"
+          
+      - alert: UnknownsHealthCheckUnhealthy
+        expr: probe_success{job="unknowns-healthcheck"} == 0
+        for: 5m
+        labels:
+          severity: critical
+          team: operations
+        annotations:
+          summary: "Unknowns service health check failing"
+          description: |
+            Health check endpoint returning unhealthy.
+            SLA breaches may exist.
+          runbook_url: "https://docs.stella-ops.org/operations/unknowns-queue-runbook#health-check"