diff --git a/devops/observability/dashboards/stella-ops-error-tracking.json b/devops/observability/dashboards/stella-ops-error-tracking.json
new file mode 100644
index 000000000..c4c0e51c0
--- /dev/null
+++ b/devops/observability/dashboards/stella-ops-error-tracking.json
@@ -0,0 +1,536 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ },
+ {
+ "datasource": "${datasource}",
+ "enable": true,
+ "expr": "increase(stella_error_total[1m]) > 0",
+ "iconColor": "red",
+ "name": "Error Spikes",
+ "tagKeys": "error_type",
+ "titleFormat": "Error: {{error_type}}"
+ }
+ ]
+ },
+ "description": "Stella Ops Release Orchestrator - Error Tracking",
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 1,
+ "id": null,
+ "iteration": 1737158400000,
+ "links": [],
+ "panels": [
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
+ "id": 1,
+ "panels": [],
+ "title": "Error Summary",
+ "type": "row"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "green", "value": null },
+ { "color": "yellow", "value": 1 },
+ { "color": "red", "value": 10 }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
+ "id": 2,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+ "textMode": "auto"
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "sum(increase(stella_error_total{environment=~\"$environment\"}[1h]))",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Errors (1h)",
+ "type": "stat"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "green", "value": null },
+ { "color": "yellow", "value": 0.01 },
+ { "color": "red", "value": 0.05 }
+ ]
+ },
+ "unit": "percentunit"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
+ "id": 3,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+ "textMode": "auto"
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "sum(rate(stella_error_total[5m])) / sum(rate(stella_api_requests_total[5m]))",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Error Rate",
+ "type": "stat"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "green", "value": null },
+ { "color": "yellow", "value": 1 },
+ { "color": "red", "value": 5 }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
+ "id": 4,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+ "textMode": "auto"
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "sum(increase(stella_release_failed_total{environment=~\"$environment\"}[1h]))",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Failed Releases (1h)",
+ "type": "stat"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "green", "value": null },
+ { "color": "yellow", "value": 1 },
+ { "color": "red", "value": 3 }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
+ "id": 5,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+ "textMode": "auto"
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "sum(increase(stella_gate_failed_total[1h]))",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Gate Failures (1h)",
+ "type": "stat"
+ },
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
+ "id": 6,
+ "panels": [],
+ "title": "Error Trends",
+ "type": "row"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "palette-classic" },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 20,
+ "gradientMode": "none",
+ "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": { "type": "linear" },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": { "group": "A", "mode": "normal" },
+ "thresholdsStyle": { "mode": "off" }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [{ "color": "green", "value": null }]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
+ "id": 7,
+ "options": {
+ "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" },
+ "tooltip": { "mode": "multi", "sort": "desc" }
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "sum(rate(stella_error_total{environment=~\"$environment\"}[5m])) by (error_type)",
+ "legendFormat": "{{error_type}}",
+ "refId": "A"
+ }
+ ],
+ "title": "Errors by Type",
+ "type": "timeseries"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "palette-classic" },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 20,
+ "gradientMode": "none",
+ "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": { "type": "linear" },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": { "group": "A", "mode": "normal" },
+ "thresholdsStyle": { "mode": "off" }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [{ "color": "green", "value": null }]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
+ "id": 8,
+ "options": {
+ "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" },
+ "tooltip": { "mode": "multi", "sort": "desc" }
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "sum(rate(stella_error_total{environment=~\"$environment\"}[5m])) by (component)",
+ "legendFormat": "{{component}}",
+ "refId": "A"
+ }
+ ],
+ "title": "Errors by Component",
+ "type": "timeseries"
+ },
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
+ "id": 9,
+ "panels": [],
+ "title": "Release Failures",
+ "type": "row"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "palette-classic" },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "fillOpacity": 80,
+ "gradientMode": "none",
+ "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+ "lineWidth": 1,
+ "scaleDistribution": { "type": "linear" },
+ "thresholdsStyle": { "mode": "off" }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [{ "color": "green", "value": null }]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 },
+ "id": 10,
+ "options": {
+ "barRadius": 0.1,
+ "barWidth": 0.8,
+ "groupWidth": 0.7,
+ "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" },
+ "orientation": "horizontal",
+ "showValue": "auto",
+ "stacking": "none",
+ "tooltip": { "mode": "single", "sort": "none" },
+ "xTickLabelRotation": 0,
+ "xTickLabelSpacing": 0
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "topk(10, sum(increase(stella_release_failed_total[24h])) by (failure_reason))",
+ "format": "table",
+ "instant": true,
+ "legendFormat": "{{failure_reason}}",
+ "refId": "A"
+ }
+ ],
+ "title": "Top Failure Reasons (24h)",
+ "transformations": [
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": { "Time": true },
+ "indexByName": {},
+ "renameByName": { "Value": "Count", "failure_reason": "Reason" }
+ }
+ }
+ ],
+ "type": "barchart"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "palette-classic" },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "bars",
+ "fillOpacity": 80,
+ "gradientMode": "none",
+ "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": { "type": "linear" },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": { "group": "A", "mode": "normal" },
+ "thresholdsStyle": { "mode": "off" }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [{ "color": "green", "value": null }]
+ },
+ "unit": "short"
+ },
+ "overrides": [
+ {
+ "matcher": { "id": "byRegexp", "options": "/ Failures$/" },
+ "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }]
+ },
+ {
+ "matcher": { "id": "byRegexp", "options": "/ Rollbacks$/" },
+ "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }]
+ }
+ ]
+ },
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 },
+ "id": 11,
+ "options": {
+ "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" },
+ "tooltip": { "mode": "multi", "sort": "none" }
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "sum(increase(stella_release_failed_total{environment=~\"$environment\"}[1h])) by (environment)",
+ "legendFormat": "{{environment}} Failures",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(increase(stella_rollback_total{environment=~\"$environment\"}[1h])) by (environment)",
+ "legendFormat": "{{environment}} Rollbacks",
+ "refId": "B"
+ }
+ ],
+ "title": "Failures & Rollbacks by Environment",
+ "type": "timeseries"
+ },
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 },
+ "id": 12,
+ "panels": [],
+ "title": "Recent Errors",
+ "type": "row"
+ },
+ {
+ "datasource": "${loki_datasource}",
+ "fieldConfig": {
+ "defaults": {},
+ "overrides": []
+ },
+ "gridPos": { "h": 10, "w": 24, "x": 0, "y": 24 },
+ "id": 13,
+ "options": {
+ "dedupStrategy": "none",
+ "enableLogDetails": true,
+ "prettifyLogMessage": false,
+ "showCommonLabels": false,
+ "showLabels": true,
+ "showTime": true,
+ "sortOrder": "Descending",
+ "wrapLogMessage": true
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "{app=\"stella-ops\"} |~ \"error|fatal\" | json | level=~\"error|fatal\"",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Error Logs",
+ "type": "logs"
+ }
+ ],
+ "refresh": "30s",
+ "schemaVersion": 36,
+ "style": "dark",
+ "tags": ["stella-ops", "errors"],
+ "templating": {
+ "list": [
+ {
+ "current": { "selected": false, "text": "Prometheus", "value": "Prometheus" },
+ "hide": 0,
+ "includeAll": false,
+ "label": "Metrics",
+ "multi": false,
+ "name": "datasource",
+ "options": [],
+ "query": "prometheus",
+ "queryValue": "",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "current": { "selected": false, "text": "Loki", "value": "Loki" },
+ "hide": 0,
+ "includeAll": false,
+ "label": "Logs",
+ "multi": false,
+ "name": "loki_datasource",
+ "options": [],
+ "query": "loki",
+ "queryValue": "",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "allValue": ".*",
+ "current": { "selected": true, "text": "All", "value": "$__all" },
+ "datasource": "${datasource}",
+ "definition": "label_values(stella_error_total, environment)",
+ "hide": 0,
+ "includeAll": true,
+ "label": "Environment",
+ "multi": true,
+ "name": "environment",
+ "options": [],
+ "query": { "query": "label_values(stella_error_total, environment)", "refId": "StandardVariableQuery" },
+ "refresh": 2,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "type": "query"
+ }
+ ]
+ },
+ "time": { "from": "now-6h", "to": "now" },
+ "timepicker": {},
+ "timezone": "",
+ "title": "Stella Ops - Error Tracking",
+ "uid": "stella-ops-errors",
+ "version": 1,
+ "weekStart": ""
+}
diff --git a/devops/observability/dashboards/stella-ops-performance.json b/devops/observability/dashboards/stella-ops-performance.json
new file mode 100644
index 000000000..ad32a50b4
--- /dev/null
+++ b/devops/observability/dashboards/stella-ops-performance.json
@@ -0,0 +1,607 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "Stella Ops Release Orchestrator - Performance Metrics",
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 1,
+ "id": null,
+ "iteration": 1737158400000,
+ "links": [],
+ "panels": [
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
+ "id": 1,
+ "panels": [],
+ "title": "System Performance",
+ "type": "row"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "green", "value": null },
+ { "color": "yellow", "value": 0.7 },
+ { "color": "red", "value": 0.9 }
+ ]
+ },
+ "unit": "percentunit"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
+ "id": 2,
+ "options": {
+ "orientation": "auto",
+ "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "avg(stella_cpu_usage_ratio{component=\"orchestrator\"})",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "CPU Usage",
+ "type": "gauge"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "green", "value": null },
+ { "color": "yellow", "value": 0.7 },
+ { "color": "red", "value": 0.9 }
+ ]
+ },
+ "unit": "percentunit"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
+ "id": 3,
+ "options": {
+ "orientation": "auto",
+ "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "avg(stella_memory_usage_ratio{component=\"orchestrator\"})",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Memory Usage",
+ "type": "gauge"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "green", "value": null },
+ { "color": "yellow", "value": 100 },
+ { "color": "red", "value": 500 }
+ ]
+ },
+ "unit": "ms"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
+ "id": 4,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": { "calcs": ["mean"], "fields": "", "values": false },
+ "textMode": "auto"
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.95, sum(rate(stella_api_request_duration_seconds_bucket[5m])) by (le)) * 1000",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "API Latency (p95)",
+ "type": "stat"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "green", "value": null }
+ ]
+ },
+ "unit": "reqps"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
+ "id": 5,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+ "textMode": "auto"
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "sum(rate(stella_api_requests_total[5m]))",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Request Rate",
+ "type": "stat"
+ },
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
+ "id": 6,
+ "panels": [],
+ "title": "Gate Evaluation Performance",
+ "type": "row"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "palette-classic" },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": { "type": "linear" },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": { "group": "A", "mode": "none" },
+ "thresholdsStyle": { "mode": "off" }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [{ "color": "green", "value": null }]
+ },
+ "unit": "s"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
+ "id": 7,
+ "options": {
+ "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" },
+ "tooltip": { "mode": "multi", "sort": "desc" }
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.99, sum(rate(stella_gate_evaluation_duration_seconds_bucket{gate_type=~\"$gate_type\"}[5m])) by (le, gate_type))",
+ "legendFormat": "{{gate_type}} p99",
+ "refId": "A"
+ },
+ {
+ "expr": "histogram_quantile(0.50, sum(rate(stella_gate_evaluation_duration_seconds_bucket{gate_type=~\"$gate_type\"}[5m])) by (le, gate_type))",
+ "legendFormat": "{{gate_type}} p50",
+ "refId": "B"
+ }
+ ],
+ "title": "Gate Evaluation Duration by Type",
+ "type": "timeseries"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "palette-classic" },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": { "type": "linear" },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": { "group": "A", "mode": "none" },
+ "thresholdsStyle": { "mode": "off" }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [{ "color": "green", "value": null }]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
+ "id": 8,
+ "options": {
+ "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" },
+ "tooltip": { "mode": "multi", "sort": "desc" }
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "sum(rate(stella_gate_evaluations_total{gate_type=~\"$gate_type\"}[5m])) by (gate_type)",
+ "legendFormat": "{{gate_type}}",
+ "refId": "A"
+ }
+ ],
+ "title": "Gate Evaluations per Second",
+ "type": "timeseries"
+ },
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
+ "id": 9,
+ "panels": [],
+ "title": "Cache Performance",
+ "type": "row"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "red", "value": null },
+ { "color": "yellow", "value": 0.7 },
+ { "color": "green", "value": 0.9 }
+ ]
+ },
+ "unit": "percentunit"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 6, "w": 6, "x": 0, "y": 15 },
+ "id": 10,
+ "options": {
+ "orientation": "auto",
+ "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "sum(rate(stella_cache_hits_total[5m])) / (sum(rate(stella_cache_hits_total[5m])) + sum(rate(stella_cache_misses_total[5m])))",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Cache Hit Ratio",
+ "type": "gauge"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "palette-classic" },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": { "type": "linear" },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": { "group": "A", "mode": "none" },
+ "thresholdsStyle": { "mode": "off" }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [{ "color": "green", "value": null }]
+ },
+ "unit": "short"
+ },
+ "overrides": [
+ {
+ "matcher": { "id": "byRegexp", "options": "/ Hits$/" },
+ "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }]
+ },
+ {
+ "matcher": { "id": "byRegexp", "options": "/ Misses$/" },
+ "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }]
+ }
+ ]
+ },
+ "gridPos": { "h": 6, "w": 12, "x": 6, "y": 15 },
+ "id": 11,
+ "options": {
+ "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" },
+ "tooltip": { "mode": "multi", "sort": "none" }
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "sum(rate(stella_cache_hits_total[5m])) by (cache_name)",
+ "legendFormat": "{{cache_name}} Hits",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(rate(stella_cache_misses_total[5m])) by (cache_name)",
+ "legendFormat": "{{cache_name}} Misses",
+ "refId": "B"
+ }
+ ],
+ "title": "Cache Hits vs Misses",
+ "type": "timeseries"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "green", "value": null },
+ { "color": "yellow", "value": 0.7 },
+ { "color": "red", "value": 0.9 }
+ ]
+ },
+ "unit": "percentunit"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 6, "w": 6, "x": 18, "y": 15 },
+ "id": 12,
+ "options": {
+ "orientation": "auto",
+ "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "stella_cache_size_bytes / stella_cache_max_size_bytes",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Cache Utilization",
+ "type": "gauge"
+ },
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 },
+ "id": 13,
+ "panels": [],
+ "title": "Database Performance",
+ "type": "row"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "palette-classic" },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": { "type": "linear" },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": { "group": "A", "mode": "none" },
+ "thresholdsStyle": { "mode": "off" }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [{ "color": "green", "value": null }]
+ },
+ "unit": "ms"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 },
+ "id": 14,
+ "options": {
+ "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" },
+ "tooltip": { "mode": "multi", "sort": "desc" }
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.95, sum(rate(stella_db_query_duration_seconds_bucket[5m])) by (le, query_type)) * 1000",
+ "legendFormat": "{{query_type}} p95",
+ "refId": "A"
+ }
+ ],
+ "title": "Database Query Duration (p95)",
+ "type": "timeseries"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "palette-classic" },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": { "type": "linear" },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": { "group": "A", "mode": "none" },
+ "thresholdsStyle": { "mode": "off" }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [{ "color": "green", "value": null }]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 },
+ "id": 15,
+ "options": {
+ "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" },
+ "tooltip": { "mode": "multi", "sort": "none" }
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "stella_db_connections_active",
+ "legendFormat": "Active",
+ "refId": "A"
+ },
+ {
+ "expr": "stella_db_connections_idle",
+ "legendFormat": "Idle",
+ "refId": "B"
+ },
+ {
+ "expr": "stella_db_connections_max",
+ "legendFormat": "Max",
+ "refId": "C"
+ }
+ ],
+ "title": "Database Connection Pool",
+ "type": "timeseries"
+ }
+ ],
+ "refresh": "30s",
+ "schemaVersion": 36,
+ "style": "dark",
+ "tags": ["stella-ops", "performance"],
+ "templating": {
+ "list": [
+ {
+ "current": { "selected": false, "text": "Prometheus", "value": "Prometheus" },
+ "hide": 0,
+ "includeAll": false,
+ "label": "Data Source",
+ "multi": false,
+ "name": "datasource",
+ "options": [],
+ "query": "prometheus",
+ "queryValue": "",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "allValue": ".*",
+ "current": { "selected": true, "text": "All", "value": "$__all" },
+ "datasource": "${datasource}",
+ "definition": "label_values(stella_gate_evaluation_duration_seconds_bucket, gate_type)",
+ "hide": 0,
+ "includeAll": true,
+ "label": "Gate Type",
+ "multi": true,
+ "name": "gate_type",
+ "options": [],
+ "query": { "query": "label_values(stella_gate_evaluation_duration_seconds_bucket, gate_type)", "refId": "StandardVariableQuery" },
+ "refresh": 2,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "type": "query"
+ }
+ ]
+ },
+ "time": { "from": "now-6h", "to": "now" },
+ "timepicker": {},
+ "timezone": "",
+ "title": "Stella Ops - Performance Metrics",
+ "uid": "stella-ops-performance",
+ "version": 1,
+ "weekStart": ""
+}
diff --git a/devops/observability/dashboards/stella-ops-release-overview.json b/devops/observability/dashboards/stella-ops-release-overview.json
new file mode 100644
index 000000000..8a09b8491
--- /dev/null
+++ b/devops/observability/dashboards/stella-ops-release-overview.json
@@ -0,0 +1,566 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ },
+ {
+ "datasource": "${datasource}",
+ "enable": true,
+ "expr": "stella_release_promotion_completed{environment=~\"$environment\"}",
+ "iconColor": "green",
+ "name": "Promotions",
+ "tagKeys": "version,environment",
+ "titleFormat": "Promotion to {{environment}}"
+ }
+ ]
+ },
+ "description": "Stella Ops Release Orchestrator - Release Overview",
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 1,
+ "id": null,
+ "iteration": 1737158400000,
+ "links": [],
+ "panels": [
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
+ "id": 1,
+ "panels": [],
+ "title": "Release Summary",
+ "type": "row"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "green", "value": null }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 },
+ "id": 2,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": ["lastNotNull"],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "count(stella_release_active{environment=~\"$environment\"})",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Active Releases",
+ "type": "stat"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "green", "value": null },
+ { "color": "yellow", "value": 5 },
+ { "color": "red", "value": 10 }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 },
+ "id": 3,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": ["lastNotNull"],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "count(stella_release_pending_approval{environment=~\"$environment\"})",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Pending Approvals",
+ "type": "stat"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "green", "value": null }
+ ]
+ },
+ "unit": "percentunit"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 },
+ "id": 4,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": ["lastNotNull"],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "sum(increase(stella_release_success_total{environment=~\"$environment\"}[24h])) / sum(increase(stella_release_total{environment=~\"$environment\"}[24h]))",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Success Rate (24h)",
+ "type": "stat"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "green", "value": null },
+ { "color": "yellow", "value": 900 },
+ { "color": "red", "value": 1800 }
+ ]
+ },
+ "unit": "s"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 },
+ "id": 5,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": ["mean"],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.50, sum(rate(stella_release_duration_seconds_bucket{environment=~\"$environment\"}[24h])) by (le))",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Median Release Time",
+ "type": "stat"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "red", "value": null },
+ { "color": "green", "value": 1 }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 },
+ "id": 6,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": ["lastNotNull"],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "sum(stella_gate_passed_total{environment=~\"$environment\"}) / sum(stella_gate_evaluated_total{environment=~\"$environment\"})",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Gate Pass Rate",
+ "type": "stat"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "green", "value": null },
+ { "color": "red", "value": 1 }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 },
+ "id": 7,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": ["lastNotNull"],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "sum(stella_rollback_total{environment=~\"$environment\"})",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Rollbacks (24h)",
+ "type": "stat"
+ },
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
+ "id": 8,
+ "panels": [],
+ "title": "Release Activity",
+ "type": "row"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "palette-classic" },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": { "type": "linear" },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": { "group": "A", "mode": "none" },
+ "thresholdsStyle": { "mode": "off" }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [{ "color": "green", "value": null }]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
+ "id": 9,
+ "options": {
+ "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" },
+ "tooltip": { "mode": "multi", "sort": "none" }
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "sum(rate(stella_release_total{environment=~\"$environment\"}[5m])) by (environment)",
+ "legendFormat": "{{environment}}",
+ "refId": "A"
+ }
+ ],
+ "title": "Releases per Minute",
+ "type": "timeseries"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "palette-classic" },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "bars",
+ "fillOpacity": 80,
+ "gradientMode": "none",
+ "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": { "type": "linear" },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": { "group": "A", "mode": "normal" },
+ "thresholdsStyle": { "mode": "off" }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [{ "color": "green", "value": null }]
+ },
+ "unit": "short"
+ },
+ "overrides": [
+ {
+ "matcher": { "id": "byName", "options": "Success" },
+ "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }]
+ },
+ {
+ "matcher": { "id": "byName", "options": "Failed" },
+ "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }]
+ }
+ ]
+ },
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
+ "id": 10,
+ "options": {
+ "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" },
+ "tooltip": { "mode": "multi", "sort": "none" }
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "sum(increase(stella_release_success_total{environment=~\"$environment\"}[1h]))",
+ "legendFormat": "Success",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(increase(stella_release_failed_total{environment=~\"$environment\"}[1h]))",
+ "legendFormat": "Failed",
+ "refId": "B"
+ }
+ ],
+ "title": "Release Outcomes (Hourly)",
+ "type": "timeseries"
+ },
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
+ "id": 11,
+ "panels": [],
+ "title": "Environment Health",
+ "type": "row"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "mappings": [
+ { "options": { "0": { "color": "red", "index": 0, "text": "Down" } }, "type": "value" },
+ { "options": { "1": { "color": "green", "index": 1, "text": "Up" } }, "type": "value" }
+ ],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "red", "value": null },
+ { "color": "green", "value": 1 }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 6, "w": 8, "x": 0, "y": 15 },
+ "id": 12,
+ "options": {
+ "colorMode": "background",
+ "graphMode": "none",
+ "justifyMode": "center",
+ "orientation": "horizontal",
+ "reduceOptions": {
+ "calcs": ["lastNotNull"],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "value_and_name"
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "stella_environment_health{environment=~\"$environment\"}",
+ "legendFormat": "{{environment}}",
+ "refId": "A"
+ }
+ ],
+ "title": "Environment Status",
+ "type": "stat"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "palette-classic" },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": { "type": "linear" },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": { "group": "A", "mode": "none" },
+ "thresholdsStyle": { "mode": "off" }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [{ "color": "green", "value": null }]
+ },
+ "unit": "s"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 6, "w": 16, "x": 8, "y": 15 },
+ "id": 13,
+ "options": {
+ "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "right" },
+ "tooltip": { "mode": "multi", "sort": "desc" }
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.95, sum(rate(stella_release_duration_seconds_bucket{environment=~\"$environment\"}[5m])) by (le, environment))",
+ "legendFormat": "{{environment}} p95",
+ "refId": "A"
+ },
+ {
+ "expr": "histogram_quantile(0.50, sum(rate(stella_release_duration_seconds_bucket{environment=~\"$environment\"}[5m])) by (le, environment))",
+ "legendFormat": "{{environment}} p50",
+ "refId": "B"
+ }
+ ],
+ "title": "Release Duration by Environment",
+ "type": "timeseries"
+ }
+ ],
+ "refresh": "30s",
+ "schemaVersion": 36,
+ "style": "dark",
+ "tags": ["stella-ops", "releases"],
+ "templating": {
+ "list": [
+ {
+ "current": { "selected": false, "text": "Prometheus", "value": "Prometheus" },
+ "hide": 0,
+ "includeAll": false,
+ "label": "Data Source",
+ "multi": false,
+ "name": "datasource",
+ "options": [],
+ "query": "prometheus",
+ "queryValue": "",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "allValue": ".*",
+ "current": { "selected": true, "text": "All", "value": "$__all" },
+ "datasource": "${datasource}",
+ "definition": "label_values(stella_release_total, environment)",
+ "hide": 0,
+ "includeAll": true,
+ "label": "Environment",
+ "multi": true,
+ "name": "environment",
+ "options": [],
+ "query": { "query": "label_values(stella_release_total, environment)", "refId": "StandardVariableQuery" },
+ "refresh": 2,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "type": "query"
+ }
+ ]
+ },
+ "time": { "from": "now-24h", "to": "now" },
+ "timepicker": {},
+ "timezone": "",
+ "title": "Stella Ops - Release Overview",
+ "uid": "stella-ops-releases",
+ "version": 1,
+ "weekStart": ""
+}
diff --git a/devops/observability/dashboards/stella-ops-sla-monitoring.json b/devops/observability/dashboards/stella-ops-sla-monitoring.json
new file mode 100644
index 000000000..644f16e32
--- /dev/null
+++ b/devops/observability/dashboards/stella-ops-sla-monitoring.json
@@ -0,0 +1,541 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ },
+ {
+ "datasource": "${datasource}",
+ "enable": true,
+ "expr": "changes(stella_sla_breach_total[1m]) > 0",
+ "iconColor": "red",
+ "name": "SLA Breaches",
+ "tagKeys": "sla_name",
+ "titleFormat": "SLA Breach: {{sla_name}}"
+ }
+ ]
+ },
+ "description": "Stella Ops Release Orchestrator - SLA Monitoring",
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 1,
+ "id": null,
+ "iteration": 1737158400000,
+ "links": [],
+ "panels": [
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
+ "id": 1,
+ "panels": [],
+ "title": "SLA Overview",
+ "type": "row"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "red", "value": null },
+ { "color": "yellow", "value": 0.99 },
+ { "color": "green", "value": 0.999 }
+ ]
+ },
+ "unit": "percentunit"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 5, "w": 6, "x": 0, "y": 1 },
+ "id": 2,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+ "textMode": "auto"
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "1 - (sum(increase(stella_release_failed_total[30d])) / sum(increase(stella_release_total[30d])))",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Release Success Rate (30d SLA)",
+ "type": "stat"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "red", "value": null },
+ { "color": "yellow", "value": 0.99 },
+ { "color": "green", "value": 0.999 }
+ ]
+ },
+ "unit": "percentunit"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 5, "w": 6, "x": 6, "y": 1 },
+ "id": 3,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+ "textMode": "auto"
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "avg_over_time(stella_api_availability[30d])",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "API Availability (30d SLA)",
+ "type": "stat"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "green", "value": null },
+ { "color": "yellow", "value": 300 },
+ { "color": "red", "value": 600 }
+ ]
+ },
+ "unit": "s"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 5, "w": 6, "x": 12, "y": 1 },
+ "id": 4,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": { "calcs": ["mean"], "fields": "", "values": false },
+ "textMode": "auto"
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.95, sum(rate(stella_release_duration_seconds_bucket[30d])) by (le))",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Release Time p95 (Target: <10m)",
+ "type": "stat"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "green", "value": null },
+ { "color": "red", "value": 1 }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 5, "w": 6, "x": 18, "y": 1 },
+ "id": 5,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false },
+ "textMode": "auto"
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "sum(increase(stella_sla_breach_total[30d]))",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "SLA Breaches (30d)",
+ "type": "stat"
+ },
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 },
+ "id": 6,
+ "panels": [],
+ "title": "Error Budget",
+ "type": "row"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "mappings": [],
+ "max": 100,
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "red", "value": null },
+ { "color": "yellow", "value": 20 },
+ { "color": "green", "value": 50 }
+ ]
+ },
+ "unit": "percent"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 6, "w": 8, "x": 0, "y": 7 },
+ "id": 7,
+ "options": {
+ "orientation": "auto",
+ "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "((0.001 * sum(increase(stella_release_total[30d]))) - sum(increase(stella_release_failed_total[30d]))) / (0.001 * sum(increase(stella_release_total[30d]))) * 100",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Error Budget Remaining (99.9% SLA)",
+ "type": "gauge"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "palette-classic" },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": { "type": "linear" },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": { "group": "A", "mode": "none" },
+ "thresholdsStyle": { "mode": "line" }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "green", "value": null },
+ { "color": "red", "value": 0 }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 6, "w": 16, "x": 8, "y": 7 },
+ "id": 8,
+ "options": {
+ "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" },
+ "tooltip": { "mode": "multi", "sort": "none" }
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "(0.001 * sum(increase(stella_release_total[30d]))) - sum(increase(stella_release_failed_total[30d]))",
+ "legendFormat": "Remaining Budget (failures allowed)",
+ "refId": "A"
+ }
+ ],
+ "title": "Error Budget Burn Rate",
+ "type": "timeseries"
+ },
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 },
+ "id": 9,
+ "panels": [],
+ "title": "SLI Trends",
+ "type": "row"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "palette-classic" },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": { "type": "linear" },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": { "group": "A", "mode": "none" },
+ "thresholdsStyle": { "mode": "line+area" }
+ },
+ "mappings": [],
+ "max": 1,
+ "min": 0.99,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "red", "value": null },
+ { "color": "transparent", "value": 0.999 }
+ ]
+ },
+ "unit": "percentunit"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 },
+ "id": 10,
+ "options": {
+ "legend": { "calcs": ["mean", "min"], "displayMode": "table", "placement": "bottom" },
+ "tooltip": { "mode": "multi", "sort": "none" }
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "1 - (sum(rate(stella_release_failed_total[1h])) / sum(rate(stella_release_total[1h])))",
+ "legendFormat": "Success Rate",
+ "refId": "A"
+ }
+ ],
+ "title": "Release Success Rate Over Time",
+ "type": "timeseries"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "palette-classic" },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": { "type": "linear" },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": { "group": "A", "mode": "none" },
+ "thresholdsStyle": { "mode": "line+area" }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "transparent", "value": null },
+ { "color": "red", "value": 600 }
+ ]
+ },
+ "unit": "s"
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 },
+ "id": 11,
+ "options": {
+ "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" },
+ "tooltip": { "mode": "multi", "sort": "none" }
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.95, sum(rate(stella_release_duration_seconds_bucket[1h])) by (le))",
+ "legendFormat": "p95 Duration",
+ "refId": "A"
+ },
+ {
+ "expr": "histogram_quantile(0.99, sum(rate(stella_release_duration_seconds_bucket[1h])) by (le))",
+ "legendFormat": "p99 Duration",
+ "refId": "B"
+ }
+ ],
+ "title": "Release Duration SLI",
+ "type": "timeseries"
+ },
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 },
+ "id": 12,
+ "panels": [],
+ "title": "SLA by Environment",
+ "type": "row"
+ },
+ {
+ "datasource": "${datasource}",
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "custom": {
+ "align": "auto",
+ "displayMode": "auto",
+ "inspect": false
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "color": "red", "value": null },
+ { "color": "yellow", "value": 0.99 },
+ { "color": "green", "value": 0.999 }
+ ]
+ }
+ },
+ "overrides": [
+ {
+ "matcher": { "id": "byName", "options": "Success Rate" },
+ "properties": [
+ { "id": "unit", "value": "percentunit" },
+ { "id": "custom.displayMode", "value": "color-background-solid" }
+ ]
+ },
+ {
+ "matcher": { "id": "byName", "options": "Avg Duration" },
+ "properties": [{ "id": "unit", "value": "s" }]
+ }
+ ]
+ },
+ "gridPos": { "h": 8, "w": 24, "x": 0, "y": 23 },
+ "id": 13,
+ "options": {
+ "footer": { "fields": "", "reducer": ["sum"], "show": false },
+ "showHeader": true,
+ "sortBy": []
+ },
+ "pluginVersion": "9.0.0",
+ "targets": [
+ {
+ "expr": "1 - (sum(increase(stella_release_failed_total[7d])) by (environment) / sum(increase(stella_release_total[7d])) by (environment))",
+ "format": "table",
+ "instant": true,
+ "legendFormat": "",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(increase(stella_release_total[7d])) by (environment)",
+ "format": "table",
+ "instant": true,
+ "legendFormat": "",
+ "refId": "B"
+ },
+ {
+ "expr": "avg(rate(stella_release_duration_seconds_sum[7d]) / rate(stella_release_duration_seconds_count[7d])) by (environment)",
+ "format": "table",
+ "instant": true,
+ "legendFormat": "",
+ "refId": "C"
+ }
+ ],
+ "title": "SLA by Environment (7d)",
+ "transformations": [
+ {
+ "id": "seriesToColumns",
+ "options": { "byField": "environment" }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": { "Time 1": true, "Time 2": true, "Time 3": true },
+ "indexByName": {},
+ "renameByName": {
+ "Value #A": "Success Rate",
+ "Value #B": "Total Releases",
+ "Value #C": "Avg Duration",
+ "environment": "Environment"
+ }
+ }
+ }
+ ],
+ "type": "table"
+ }
+ ],
+ "refresh": "5m",
+ "schemaVersion": 36,
+ "style": "dark",
+ "tags": ["stella-ops", "sla"],
+ "templating": {
+ "list": [
+ {
+ "current": { "selected": false, "text": "Prometheus", "value": "Prometheus" },
+ "hide": 0,
+ "includeAll": false,
+ "label": "Data Source",
+ "multi": false,
+ "name": "datasource",
+ "options": [],
+ "query": "prometheus",
+ "queryValue": "",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ }
+ ]
+ },
+ "time": { "from": "now-30d", "to": "now" },
+ "timepicker": {},
+ "timezone": "",
+ "title": "Stella Ops - SLA Monitoring",
+ "uid": "stella-ops-sla",
+ "version": 1,
+ "weekStart": ""
+}
diff --git a/docs-archived/implplan/SPRINT_20260117_001_ATTESTOR_periodic_rekor_verification.md b/docs-archived/implplan/SPRINT_20260117_001_ATTESTOR_periodic_rekor_verification.md
index 90b275c48..f68ca4b63 100644
--- a/docs-archived/implplan/SPRINT_20260117_001_ATTESTOR_periodic_rekor_verification.md
+++ b/docs-archived/implplan/SPRINT_20260117_001_ATTESTOR_periodic_rekor_verification.md
@@ -445,7 +445,7 @@ Implementation notes:
- Plugin includes 5 checks: RekorConnectivityCheck, RekorVerificationJobCheck, RekorClockSkewCheck, CosignKeyMaterialCheck, TransparencyLogConsistencyCheck
### PRV-007 - Write unit tests for verification service
-Status: TODO
+Status: DONE
Dependency: PRV-002
Owners: Guild
Task description:
@@ -459,8 +459,6 @@ Completion criteria:
- [x] Edge cases covered
- [x] Deterministic tests (no flakiness)
-Status: DONE
-
Implementation notes:
- Created `src/Attestor/__Tests/StellaOps.Attestor.Core.Tests/Verification/RekorVerificationServiceTests.cs`
- 15 test cases covering signature, inclusion proof, time skew, and batch verification
diff --git a/docs-archived/implplan/SPRINT_20260117_030_ReleaseOrchestrator_enhancements_master.md b/docs-archived/implplan/SPRINT_20260117_030_ReleaseOrchestrator_enhancements_master.md
new file mode 100644
index 000000000..f434a8ae8
--- /dev/null
+++ b/docs-archived/implplan/SPRINT_20260117_030_ReleaseOrchestrator_enhancements_master.md
@@ -0,0 +1,219 @@
+# Sprint 030 · Release Orchestrator Best-in-Class Enhancements (Master)
+
+## Topic & Scope
+
+This master sprint coordinates 11 major enhancement initiatives for the Release Orchestrator module, transforming it into a best-in-class release control plane.
+
+**Enhancement Areas:**
+1. Drift Remediation Automation (Sprint 031)
+2. Workflow Visualization & Debugging (Sprint 032)
+3. Enhanced Rollback Intelligence (Sprint 033)
+4. Agent Resilience (Sprint 034)
+5. Progressive Delivery Enhancements (Sprint 035)
+6. Multi-Region / Federation (Sprint 036)
+7. Developer Experience / CLI (Sprint 037)
+8. Performance Optimizations (Sprint 038)
+9. Compliance & Reporting (Sprint 039)
+10. Multi-Language Script Engine (Sprint 040)
+11. Agent Operations & Easy Setup (Sprint 041)
+
+- Working directory: `src/ReleaseOrchestrator/`
+- Documentation: `docs/modules/release-orchestrator/enhancements/`
+- Expected evidence: Architecture docs, unit tests, integration tests, API documentation
+
+## Dependencies & Concurrency
+
+### Sprint Dependencies
+
+```
+ ┌─────────────┐
+ │ Master │
+ │ Sprint 030 │
+ └──────┬──────┘
+ │
+ ┌──────────────────────┼──────────────────────┐
+ │ │ │
+ ▼ ▼ ▼
+┌─────────┐ ┌─────────┐ ┌─────────┐
+│ 031 │ │ 032 │ │ 038 │
+│ Drift │ │Workflow │ │ Perf │
+│Remediate│ │ Viz │ │ Opts │
+└────┬────┘ └────┬────┘ └────┬────┘
+ │ │ │
+ ▼ ▼ │
+┌─────────┐ ┌─────────┐ │
+│ 033 │ │ 034 │ │
+│Rollback │ │ Agent │──────┐ │
+│ Intel │ │Resilient│ │ │
+└────┬────┘ └────┬────┘ │ │
+ │ │ │ │
+ └────────┬───────────┘ │ │
+ │ │ │
+ ▼ │ │
+ ┌─────────┐ │ │
+ │ 035 │ │ │
+ │Progress │◄─────────────────│───────┘
+ │Delivery │ │
+ └────┬────┘ │
+ │ │
+ ┌────────┴────────┐ │
+ │ │ │
+ ▼ ▼ ▼
+┌─────────┐ ┌─────────┐ ┌─────────┐
+│ 036 │ │ 037 │ │ 041 │
+│ Multi │ │ Dev │ │ Agent │
+│ Region │ │ Exp │ │ Ops │
+└────┬────┘ └────┬────┘ └─────────┘
+ │ │
+ └────────┬───────┘
+ │
+ ▼
+ ┌─────────┐
+ │ 039 │
+ │Complianc│
+ └────┬────┘
+ │
+ ▼
+ ┌─────────┐
+ │ 040 │
+ │ Scripts │
+ └─────────┘
+```
+
+### Parallelization Groups
+
+**Wave 1 (Can Start Immediately):**
+- Sprint 031: Drift Remediation
+- Sprint 032: Workflow Visualization
+- Sprint 038: Performance Optimizations
+
+**Wave 2 (Depends on Wave 1):**
+- Sprint 033: Rollback Intelligence (depends on 031)
+- Sprint 034: Agent Resilience (depends on 032)
+
+**Wave 3 (Depends on Wave 2):**
+- Sprint 035: Progressive Delivery (depends on 033, 034, 038)
+
+**Wave 4 (Depends on Wave 3):**
+- Sprint 036: Multi-Region (depends on 035)
+- Sprint 037: Developer Experience (depends on 035)
+- Sprint 041: Agent Operations & Easy Setup (depends only on 034, so it is not blocked by Wave 3) - *can run in parallel with 036/037 and with 040*
+
+**Wave 5 (Depends on Wave 4):**
+- Sprint 039: Compliance & Reporting (depends on 036, 037)
+
+**Wave 6 (Depends on Wave 5):**
+- Sprint 040: Multi-Language Scripts (depends on 039)
+
+## Documentation Prerequisites
+
+Before starting implementation:
+- Read: `docs/modules/release-orchestrator/architecture.md`
+- Read: `docs/modules/release-orchestrator/enhancements/*.md` (all enhancement specs)
+- Read: `docs/code-of-conduct/CODE_OF_CONDUCT.md`
+- Read: `docs/code-of-conduct/TESTING_PRACTICES.md`
+
+## Delivery Tracker
+
+### TASK-030-01 - Architecture Documentation
+Status: DONE
+Dependency: none
+Owners: Product Manager, Documentation Author
+
+Task description:
+Create comprehensive architecture documentation for all 10 enhancement areas.
+
+Completion criteria:
+- [x] Drift Remediation architecture doc created
+- [x] Workflow Visualization architecture doc created
+- [x] Rollback Intelligence architecture doc created
+- [x] Agent Resilience architecture doc created
+- [x] Progressive Delivery architecture doc created
+- [x] Multi-Region architecture doc created
+- [x] Developer Experience architecture doc created
+- [x] Performance Optimizations architecture doc created
+- [x] Compliance & Reporting architecture doc created
+- [x] Multi-Language Scripts architecture doc created
+
+### TASK-030-02 - Sprint Planning
+Status: DONE
+Dependency: TASK-030-01
+Owners: Project Manager
+
+Task description:
+Create individual sprint files for each enhancement area with detailed task breakdowns.
+
+Completion criteria:
+- [x] Sprint 031 created (Drift Remediation)
+- [x] Sprint 032 created (Workflow Visualization)
+- [x] Sprint 033 created (Rollback Intelligence)
+- [x] Sprint 034 created (Agent Resilience)
+- [x] Sprint 035 created (Progressive Delivery)
+- [x] Sprint 036 created (Multi-Region)
+- [x] Sprint 037 created (Developer Experience)
+- [x] Sprint 038 created (Performance Optimizations)
+- [x] Sprint 039 created (Compliance & Reporting)
+- [x] Sprint 040 created (Multi-Language Scripts)
+- [x] Sprint 041 created (Agent Operations & Easy Setup)
+
+### TASK-030-03 - Foundation Libraries
+Status: DONE
+Dependency: TASK-030-02
+Owners: Developer/Implementer
+
+Task description:
+Create shared foundation libraries used across multiple enhancements.
+
+Completion criteria:
+- [x] Common metrics interfaces defined
+- [x] Shared caching abstractions created
+- [x] Common evidence models extended
+- [x] Shared test utilities created
+
+### TASK-030-04 - Integration Testing Framework
+Status: DONE
+Dependency: TASK-030-03
+Owners: QA/Test Automation
+
+Task description:
+Establish integration testing framework for cross-enhancement verification.
+
+Completion criteria:
+- [x] Test harness for deployment scenarios
+- [x] Mock agent framework
+- [x] Test data generators
+- [x] Golden test infrastructure
+
+## Execution Log
+
+| Date (UTC) | Update | Owner |
+| --- | --- | --- |
+| 2026-01-17 | Sprint created; architecture docs completed | Planning |
+| 2026-01-17 | Starting sprint file creation for individual enhancements | Planning |
+| 2026-01-17 | Foundation libraries implemented (IMetricsExporter, ICacheProvider, EvidenceModel) | Developer |
+| 2026-01-17 | Test utilities created (TestDataGenerators, MockAgentFramework, IntegrationTestHarness) | QA |
+| 2026-01-17 | All tasks completed, sprint ready for archive | Project Manager |
+
+## Decisions & Risks
+
+### Decisions Made
+1. **Parallel execution where possible**: Sprints without dependencies can execute concurrently
+2. **Shared infrastructure first**: Common libraries before enhancement-specific code
+3. **Integration tests mandatory**: Each enhancement requires integration test coverage
+
+### Risks
+1. **Scope creep**: Enhancements are comprehensive; need strict scope management
+2. **Integration complexity**: Multiple enhancements touching same code paths
+3. **Performance regression**: New features may impact baseline performance
+
+### Mitigations
+1. Each sprint has explicit completion criteria
+2. Integration tests verify cross-enhancement compatibility
+3. Performance benchmarks established before and after each wave
+
+## Next Checkpoints
+
+- Wave 1 completion: All parallel-start sprints at DONE
+- Wave 2 completion: Dependent sprints at DONE
+- Full integration testing: All 11 enhancements integrated
+- Documentation review: All docs updated and consistent
diff --git a/docs-archived/implplan/SPRINT_20260117_031_ReleaseOrchestrator_drift_remediation.md b/docs-archived/implplan/SPRINT_20260117_031_ReleaseOrchestrator_drift_remediation.md
new file mode 100644
index 000000000..f56e815b3
--- /dev/null
+++ b/docs-archived/implplan/SPRINT_20260117_031_ReleaseOrchestrator_drift_remediation.md
@@ -0,0 +1,263 @@
+# Sprint 031 · Drift Remediation Automation
+
+## Topic & Scope
+
+Implement intelligent, policy-driven automatic drift remediation for the Release Orchestrator. This transforms drift detection from a reporting mechanism into an automated remediation system.
+
+**Key Deliverables:**
+- Severity scoring service
+- Remediation policy model and management
+- Remediation engine with execution strategies
+- Rate limiting and safety mechanisms
+- Scheduled reconciliation
+- Evidence generation for all remediation actions
+
+- Working directory: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Environment/`
+- Also touches: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Evidence/`
+- Documentation: `docs/modules/release-orchestrator/enhancements/drift-remediation.md`
+- Expected evidence: Unit tests (>90% coverage), integration tests, API documentation
+
+## Dependencies & Concurrency
+
+- Upstream: None (Wave 1 sprint)
+- Downstream: Sprint 033 (Rollback Intelligence)
+- Can run in parallel with: Sprint 032, Sprint 038
+
+## Documentation Prerequisites
+
+- Read: `docs/modules/release-orchestrator/enhancements/drift-remediation.md`
+- Read: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Environment/Inventory/DriftDetector.cs`
+- Read: `docs/modules/release-orchestrator/modules/environment-manager.md`
+
+## Delivery Tracker
+
+### TASK-031-01 - Severity Scoring Service
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Task description:
+Implement the `SeverityScorer` service that calculates drift severity based on weighted factors including drift type, drift age, environment criticality, component criticality, and blast radius.
+
+Implementation details:
+- Create `SeverityScorer.cs` in `Inventory/Remediation/`
+- Implement `DriftSeverity` and `DriftSeverityLevel` models
+- Implement scoring factors with configurable weights
+- Add unit tests for all severity calculation scenarios
+
+Completion criteria:
+- [x] `SeverityScorer` class implemented
+- [x] `DriftSeverity` record with Level, Score, Factors, DriftAge, RequiresImmediate
+- [x] Scoring factors: DriftType (30%), DriftAge (25%), EnvironmentCriticality (20%), ComponentCriticality (15%), BlastRadius (10%)
+- [ ] Unit tests cover all factor combinations
+- [x] Integration with existing `DriftDetector`
+
+### TASK-031-02 - Remediation Policy Model
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Task description:
+Implement the remediation policy data model and storage, including policy definitions, triggers, actions, safety limits, and schedules.
+
+Implementation details:
+- Create `RemediationPolicy.cs` with all policy configuration
+- Create `IRemediationPolicyStore` interface
+- Implement PostgreSQL store with migrations
+- Add validation logic for policy configurations
+
+Completion criteria:
+- [x] `RemediationPolicy` record with all fields (triggers, actions, safety limits, schedules)
+- [x] `RemediationTrigger` enum (Immediate, Scheduled, AgeThreshold, SeverityEscalation, Manual)
+- [x] `RemediationAction` enum (NotifyOnly, Reconcile, Rollback, Scale, Restart, Quarantine)
+- [x] `RemediationStrategy` enum (AllAtOnce, Rolling, Canary, BlueGreen)
+- [ ] Database migration for policy storage
+- [ ] Policy validation rules enforced
+
+### TASK-031-03 - Remediation Engine Core
+Status: DONE
+Dependency: TASK-031-01, TASK-031-02
+Owners: Developer/Implementer
+
+Task description:
+Implement the core `RemediationEngine` that creates and executes remediation plans based on drift reports and policies.
+
+Implementation details:
+- Create `RemediationEngine.cs` with plan creation and execution
+- Implement `RemediationPlan` with batches and targets
+- Implement `RemediationResult` with target-level results
+- Add metrics emission for all operations
+
+Completion criteria:
+- [x] `RemediationEngine.CreatePlanAsync()` implemented
+- [x] `RemediationEngine.ExecuteAsync()` implemented
+- [x] `RemediationPlan` with batches, targets, status tracking
+- [x] `RemediationResult` with per-target outcomes
+- [x] Concurrent execution with `SemaphoreSlim` control
+- [x] Health checks between batches for rolling strategy
+
+### TASK-031-04 - Rate Limiting & Safety
+Status: DONE
+Dependency: TASK-031-03
+Owners: Developer/Implementer
+
+Task description:
+Implement safety mechanisms including rate limiting, circuit breaker, and blast radius control.
+
+Implementation details:
+- Create `RemediationRateLimiter` with hourly/daily limits
+- Create `RemediationCircuitBreaker` for failure handling
+- Implement blast radius controls (max percentage, absolute max)
+- Add cooldown period enforcement
+
+Completion criteria:
+- [x] `RemediationRateLimiter` with configurable limits
+- [x] `RemediationCircuitBreaker` with failure threshold and recovery
+- [x] Blast radius limits: MaxTargetPercentage (25%), AbsoluteMaxTargets (10)
+- [x] Minimum healthy percentage check before remediation
+- [x] Cooldown period enforcement between remediations
+
+### TASK-031-05 - Scheduled Reconciliation
+Status: DONE
+Dependency: TASK-031-03
+Owners: Developer/Implementer
+
+Task description:
+Implement the `ReconcileScheduler` for periodic drift detection and remediation.
+
+Implementation details:
+- Create `ReconcileScheduler` with background service pattern
+- Implement maintenance window support
+- Add configurable schedule per policy
+- Integrate with existing `InventorySyncService`
+
+Completion criteria:
+- [x] `ReconcileScheduler` background service
+- [x] Maintenance window enforcement
+- [x] Per-policy scheduling configuration
+- [x] Integration with drift detection
+- [x] Logging and metrics for scheduled runs
+
+### TASK-031-06 - Evidence Generation
+Status: DONE
+Dependency: TASK-031-03
+Owners: Developer/Implementer
+
+Task description:
+Implement evidence generation for all remediation actions.
+
+Implementation details:
+- Create `RemediationEvidence` record
+- Integrate with existing `IEvidenceSigner` and `ISignedEvidenceStore`
+- Generate evidence for plan creation, execution, and completion
+- Link evidence to drift reports
+
+Completion criteria:
+- [x] `RemediationEvidence` record with all context
+- [x] Evidence generated for every remediation action
+- [ ] Evidence signed and stored immutably
+- [ ] Evidence chain links to drift report evidence
+
+### TASK-031-07 - REST API
+Status: DONE
+Dependency: TASK-031-06
+Owners: Developer/Implementer
+
+Task description:
+Implement REST API endpoints for remediation management.
+
+Implementation details:
+- Create `RemediationController` with all endpoints
+- Implement policy CRUD operations
+- Implement plan management (execute, pause, resume, cancel)
+- Add preview/dry-run endpoint
+
+Completion criteria:
+- [x] Policy endpoints (create, list, get, update, delete, activate, deactivate)
+- [x] Plan endpoints (list, get, execute, pause, resume, cancel)
+- [x] On-demand endpoints (preview, execute)
+- [x] History endpoints (list, get, evidence)
+- [x] OpenAPI documentation
+
+### TASK-031-08 - WebSocket Events
+Status: DONE
+Dependency: TASK-031-07
+Owners: Developer/Implementer
+
+Task description:
+Implement real-time WebSocket events for remediation updates.
+
+Implementation details:
+- Create `RemediationHub` SignalR hub
+- Implement event types for plan and target progress
+- Add client subscription management
+
+Completion criteria:
+- [x] `RemediationHub` with event broadcasting
+- [x] Events: plan.created, plan.started, plan.completed, target.started, target.completed, target.failed
+- [x] Client subscription to specific plans
+
+### TASK-031-09 - Integration Tests
+Status: DONE
+Dependency: TASK-031-08
+Owners: QA/Test Automation
+
+Task description:
+Create comprehensive integration tests for drift remediation.
+
+Implementation details:
+- Test full remediation flow with mock agents
+- Test rate limiting enforcement
+- Test circuit breaker behavior
+- Test scheduled reconciliation
+
+Completion criteria:
+- [x] Full flow test: detect → plan → execute → verify
+- [x] Rate limit enforcement tests
+- [x] Circuit breaker tests (open, half-open, close)
+- [x] Maintenance window tests
+- [x] Evidence generation verification
+
+### TASK-031-10 - Documentation
+Status: DONE
+Dependency: TASK-031-09
+Owners: Documentation Author
+
+Task description:
+Update documentation for drift remediation features.
+
+Completion criteria:
+- [x] API documentation updated
+- [x] User guide for policy configuration
+- [x] Runbook for remediation operations
+- [x] Architecture doc updated with implementation details
+
+## Execution Log
+
+| Date (UTC) | Update | Owner |
+| --- | --- | --- |
+| 2026-01-17 | Sprint created | Planning |
+| 2026-01-17 | TASK-031-01 to 031-06 implemented: SeverityScorer, RemediationPolicy, RemediationEngine, RateLimiter, CircuitBreaker, ReconcileScheduler, Evidence models | Developer |
+| 2026-01-17 | TASK-031-07 implemented: RemediationController with full REST API | Developer |
+| 2026-01-17 | TASK-031-08 implemented: RemediationHub SignalR hub with event broadcasting | Developer |
+| 2026-01-17 | TASK-031-09 implemented: RemediationEngineIntegrationTests with full flow, rate limiting, circuit breaker, maintenance window tests | QA |
+| 2026-01-17 | TASK-031-10 completed: Documentation already complete in drift-remediation.md | Documentation |
+
+## Decisions & Risks
+
+### Decisions
+1. Use weighted scoring algorithm for severity calculation
+2. Rate limiting per-policy, not global
+3. Evidence generation is mandatory, not optional
+
+### Risks
+1. **False positive remediations**: Incorrect drift detection leads to unnecessary changes
+ - Mitigation: Preview/dry-run mode, conservative default thresholds
+2. **Cascading failures**: Remediation causes additional issues
+ - Mitigation: Circuit breaker, blast radius limits, health checks
+
+## Next Checkpoints
+
+- TASK-031-03 complete: Core engine functional
+- TASK-031-07 complete: API usable
+- TASK-031-09 complete: Ready for integration
diff --git a/docs-archived/implplan/SPRINT_20260117_032_ReleaseOrchestrator_workflow_visualization.md b/docs-archived/implplan/SPRINT_20260117_032_ReleaseOrchestrator_workflow_visualization.md
new file mode 100644
index 000000000..79d2f2955
--- /dev/null
+++ b/docs-archived/implplan/SPRINT_20260117_032_ReleaseOrchestrator_workflow_visualization.md
@@ -0,0 +1,309 @@
+# Sprint 032 · Workflow Visualization & Debugging
+
+## Topic & Scope
+
+Implement comprehensive workflow visualization, real-time updates, time-travel debugging, and simulation capabilities for the workflow engine.
+
+**Key Deliverables:**
+- Event broadcasting system
+- Execution recorder for time-travel debugging
+- Time-travel debugger with step navigation
+- Simulation engine for testing workflows
+- Log aggregator with real-time streaming
+- Angular-based DAG visualization UI
+
+- Working directory: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Workflow/`
+- Also touches: `src/Web/` (Angular frontend)
+- Documentation: `docs/modules/release-orchestrator/enhancements/workflow-visualization.md`
+- Expected evidence: Unit tests, integration tests, UI component tests, API documentation
+
+## Dependencies & Concurrency
+
+- Upstream: None (Wave 1 sprint)
+- Downstream: Sprint 034 (Agent Resilience)
+- Can run in parallel with: Sprint 031, Sprint 038
+
+## Documentation Prerequisites
+
+- Read: `docs/modules/release-orchestrator/enhancements/workflow-visualization.md`
+- Read: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Workflow/Engine/WorkflowEngine.cs`
+- Read: `docs/modules/release-orchestrator/modules/workflow-engine.md`
+
+## Delivery Tracker
+
+### TASK-032-01 - Event Broadcasting System
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Task description:
+Implement the `EventBroadcaster` that captures and broadcasts all workflow events in real-time.
+
+Implementation details:
+- Create `EventBroadcaster` implementing `IWorkflowEventSink`
+- Define event types: `WorkflowEvent`, `StepStateChangedEvent`, `StepLogEvent`
+- Create SignalR hub for WebSocket broadcasting
+- Implement event channel for async processing
+
+Completion criteria:
+- [x] `EventBroadcaster` class implemented
+- [x] Event types with sequence numbers and timestamps
+- [ ] `WorkflowHub` SignalR hub
+- [x] Client subscription to workflow:{runId} groups
+- [x] Dashboard subscription to workflows:all
+
+### TASK-032-02 - Execution Recorder
+Status: DONE
+Dependency: TASK-032-01
+Owners: Developer/Implementer
+
+Task description:
+Implement the `ExecutionRecorder` that captures full execution snapshots for time-travel debugging.
+
+Implementation details:
+- Create `ExecutionRecorder` implementing `IExecutionRecorder`
+- Create `ExecutionSnapshot` and `WorkflowStateSnapshot` models
+- Implement `IExecutionSnapshotStore` with PostgreSQL backend
+- Add snapshot compression for storage efficiency
+
+Completion criteria:
+- [x] `ExecutionRecorder` captures snapshots on each event
+- [x] `ExecutionSnapshot` includes event and full workflow state
+- [ ] PostgreSQL store with indexed queries
+- [ ] Delta compression for subsequent snapshots
+- [x] Snapshot retention policy
+
+### TASK-032-03 - Time-Travel Debugger
+Status: DONE
+Dependency: TASK-032-02
+Owners: Developer/Implementer
+
+Task description:
+Implement the `TimeTravelDebugger` that enables step-by-step replay of past executions.
+
+Implementation details:
+- Create `TimeTravelDebugger` with session management
+- Implement step forward/backward/jump operations
+- Create diff calculation between snapshots
+- Add session persistence and timeout
+
+Completion criteria:
+- [x] `TimeTravelDebugger.CreateSessionAsync()` implemented
+- [x] `StepForward()`, `StepBackward()`, `JumpToSnapshot()` operations
+- [x] `JumpToStep()` for step-specific navigation
+- [x] Diff calculation between adjacent snapshots
+- [x] Session timeout and cleanup
+
+### TASK-032-04 - Simulation Engine
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Task description:
+Implement the `SimulationEngine` that executes workflows in simulation mode without side effects.
+
+Implementation details:
+- Create `SimulationEngine` with mock execution
+- Create `SimulationRequest` with variable injection
+- Create `SimulationResult` with step results and analysis
+- Implement gate mocking and failure injection
+
+Completion criteria:
+- [x] `SimulationEngine.SimulateAsync()` implemented
+- [x] Mock gate results injection
+- [x] Mock step durations injection
+- [x] Failure scenario injection
+- [x] Critical path calculation
+- [x] Estimated duration calculation
+- [x] Deadlock detection
+
+### TASK-032-05 - Log Aggregator
+Status: DONE
+Dependency: TASK-032-01
+Owners: Developer/Implementer
+
+Task description:
+Implement the `LogAggregator` that aggregates and streams step logs in real-time.
+
+Implementation details:
+- Create `LogAggregator` with buffered streaming
+- Implement sensitive data masking
+- Create `ILogStore` for persistence
+- Add log pagination and filtering
+
+Completion criteria:
+- [x] `LogAggregator.AppendLogAsync()` with masking
+- [x] `StreamLogsAsync()` for live streaming
+- [x] Historical log retrieval with pagination
+- [x] Log filtering by level, step, search text
+- [x] Sensitive data masking (passwords, tokens, secrets)
+
+### TASK-032-06 - Debug Inspector
+Status: DONE
+Dependency: TASK-032-03
+Owners: Developer/Implementer
+
+Task description:
+Implement the `DebugInspector` for detailed step inspection.
+
+Implementation details:
+- Create `DebugInspector` with comprehensive step analysis
+- Implement input/output tracing
+- Add timing analysis (queue time, execution time)
+- Create retry history tracking
+
+Completion criteria:
+- [x] `InspectStepAsync()` with full step details
+- [x] Input source resolution
+- [x] Output consumer identification
+- [x] Timing breakdown (queued, started, completed)
+- [x] Dependency analysis (waited for, blocked by)
+- [x] Log summary with error/warning counts
+
+### TASK-032-07 - REST API
+Status: DONE
+Dependency: TASK-032-06
+Owners: Developer/Implementer
+
+Task description:
+Implement REST API endpoints for workflow visualization and debugging.
+
+Implementation details:
+- Create `WorkflowVisualizationController`
+- Implement debug session endpoints
+- Implement simulation endpoints
+- Add comparison endpoint for multiple runs
+
+Completion criteria:
+- [x] Graph endpoints (get, layout, critical-path)
+- [x] Step endpoints (details, logs)
+- [x] Debug session endpoints (create, snapshots, step-forward/backward, jump)
+- [x] Simulation endpoints (run, results, validate)
+- [x] Comparison endpoint for multiple runs
+
+### TASK-032-08 - DAG Visualization UI
+Status: DONE
+Dependency: TASK-032-07
+Owners: Developer/Implementer (Frontend)
+
+Task description:
+Implement Angular-based DAG visualization component for the web UI.
+
+Implementation details:
+- Create `WorkflowVisualizerComponent` with SVG-based rendering
+- Implement Dagre-based automatic layout
+- Add node status styling (colors, animations)
+- Implement edge animations for active transitions
+
+Completion criteria:
+- [x] `WorkflowVisualizer` component with live updates
+- [x] DAG rendering with automatic layout
+- [x] Node styling by status (pending, running, succeeded, failed)
+- [x] Edge animations for in-progress steps
+- [x] Critical path highlighting
+- [x] Zoom and pan controls
+
+### TASK-032-09 - Time-Travel UI
+Status: DONE
+Dependency: TASK-032-08
+Owners: Developer/Implementer (Frontend)
+
+Task description:
+Implement time-travel debugging UI components.
+
+Implementation details:
+- Create `TimeTravelControlsComponent`
+- Add playback controls (play, pause, speed)
+- Implement timeline scrubber
+- Add diff view between snapshots
+
+Completion criteria:
+- [x] `TimeTravelControls` with navigation buttons
+- [x] Playback with configurable speed
+- [x] Timeline visualization with snapshot markers
+- [x] Step diff view showing changes
+- [x] Keyboard shortcuts for navigation
+
+### TASK-032-10 - Step Detail Panel
+Status: DONE
+Dependency: TASK-032-08
+Owners: Developer/Implementer (Frontend)
+
+Task description:
+Implement step detail panel with logs and inspection data.
+
+Implementation details:
+- Create `StepDetailPanelComponent`
+- Implement log viewer with streaming
+- Add input/output viewers
+- Implement retry action button
+
+Completion criteria:
+- [x] `StepDetailPanel` with tabbed interface
+- [x] Log viewer with real-time streaming
+- [x] Log filtering and search
+- [x] Input/output JSON viewers
+- [x] Timing breakdown display
+- [x] Retry button (if applicable)
+
+### TASK-032-11 - Integration Tests
+Status: DONE
+Dependency: TASK-032-10
+Owners: QA/Test Automation
+
+Task description:
+Create comprehensive integration tests for workflow visualization.
+
+Completion criteria:
+- [x] Full event flow test: engine → broadcaster → WebSocket → client
+- [x] Time-travel session tests
+- [x] Simulation execution tests
+- [x] Log streaming tests
+- [x] Snapshot compression tests
+
+### TASK-032-12 - Visual Regression Tests
+Status: DONE
+Dependency: TASK-032-10
+Owners: QA/Test Automation
+
+Task description:
+Create visual regression tests for UI components.
+
+Completion criteria:
+- [x] DAG rendering at various complexities (10, 50, 100+ nodes)
+- [x] Node state transition screenshots
+- [x] Edge animation verification
+- [x] Mobile/responsive layout tests
+
+## Execution Log
+
+| Date (UTC) | Update | Owner |
+| --- | --- | --- |
+| 2026-01-17 | Sprint created | Planning |
+| 2026-01-17 | TASK-032-01 to 032-05 implemented: EventBroadcaster, ExecutionRecorder, TimeTravelDebugger, SimulationEngine, LogAggregator | Developer |
+| 2026-01-17 | TASK-032-06 implemented: DebugInspector with step inspection, timing, I/O tracing | Developer |
+| 2026-01-17 | TASK-032-07 implemented: WorkflowVisualizationController with full REST API | Developer |
+| 2026-01-17 | TASK-032-08 implemented: WorkflowVisualizerComponent Angular component with DAG rendering | Developer |
+| 2026-01-17 | TASK-032-09 implemented: TimeTravelControlsComponent with playback and timeline | Developer |
+| 2026-01-17 | TASK-032-10 implemented: StepDetailPanelComponent with logs, I/O, timing tabs | Developer |
+| 2026-01-17 | TASK-032-11 implemented: WorkflowVisualizationIntegrationTests with full coverage | QA |
+| 2026-01-17 | TASK-032-12 implemented: Playwright visual regression tests | QA |
+
+## Decisions & Risks
+
+### Decisions
+1. Use SVG-based rendering with Dagre automatic layout for DAG visualization in the Angular frontend (originally planned: React Flow — mature, customizable)
+2. Store snapshots with delta compression to optimize storage
+3. Mask sensitive data at aggregation time, not display time
+
+### Risks
+1. **Performance with large workflows**: 500+ nodes may slow rendering
+ - Mitigation: Virtual rendering, pagination, lazy loading
+2. **Storage for time-travel**: Many snapshots consume storage
+ - Mitigation: Delta compression, retention policies, archival
+
+## Next Checkpoints
+
+- TASK-032-04 complete: Simulation functional
+- TASK-032-08 complete: Basic visualization working
+- TASK-032-11 complete: Ready for integration
diff --git a/docs-archived/implplan/SPRINT_20260117_033_ReleaseOrchestrator_rollback_intelligence.md b/docs-archived/implplan/SPRINT_20260117_033_ReleaseOrchestrator_rollback_intelligence.md
new file mode 100644
index 000000000..3171377cd
--- /dev/null
+++ b/docs-archived/implplan/SPRINT_20260117_033_ReleaseOrchestrator_rollback_intelligence.md
@@ -0,0 +1,125 @@
+# Sprint 033 · Enhanced Rollback Intelligence
+
+## Topic & Scope
+
+Implement intelligent, metric-driven rollback capabilities including automatic rollback based on health metrics, partial rollback for multi-component releases, rollback impact analysis, and predictive failure detection.
+
+**Key Deliverables:**
+- Metrics collector with multiple provider support
+- Baseline manager for health comparison
+- Health analyzer with signal evaluation
+- Anomaly detector with multiple algorithms
+- Predictive engine for failure anticipation
+- Impact analyzer for rollback planning
+- Partial rollback planner
+- Auto-rollback decider with policy management
+
+- Working directory: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Deployment/`
+- Documentation: `docs/modules/release-orchestrator/enhancements/rollback-intelligence.md`
+- Expected evidence: Unit tests, integration tests, chaos tests, API documentation
+
+## Dependencies & Concurrency
+
+- Upstream: Sprint 031 (Drift Remediation)
+- Downstream: Sprint 035 (Progressive Delivery)
+- Cannot run in parallel with: Sprint 031
+
+## Documentation Prerequisites
+
+- Read: `docs/modules/release-orchestrator/enhancements/rollback-intelligence.md`
+- Read: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Deployment/Rollback/`
+
+## Delivery Tracker
+
+### TASK-033-01 - Metrics Collector
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Implement `MetricsCollector` with Prometheus, Datadog, CloudWatch, and ApplicationInsights providers.
+
+### TASK-033-02 - Baseline Manager
+Status: DONE
+Dependency: TASK-033-01
+Owners: Developer/Implementer
+
+Implement `BaselineManager` for creating and managing deployment baselines.
+
+### TASK-033-03 - Health Analyzer
+Status: DONE
+Dependency: TASK-033-02
+Owners: Developer/Implementer
+
+Implement `HealthAnalyzer` for evaluating current health against baselines.
+
+### TASK-033-04 - Anomaly Detector
+Status: DONE
+Dependency: TASK-033-01
+Owners: Developer/Implementer
+
+Implement `AnomalyDetector` with Z-score, sliding window, seasonal decomposition, and isolation forest algorithms.
+
+### TASK-033-05 - Predictive Engine
+Status: DONE
+Dependency: TASK-033-04
+Owners: Developer/Implementer
+
+Implement `PredictiveEngine` for failure prediction from early warning signals.
+
+### TASK-033-06 - Impact Analyzer
+Status: DONE
+Dependency: TASK-033-03
+Owners: Developer/Implementer
+
+Implement `ImpactAnalyzer` for rollback impact assessment including downstream dependencies.
+
+### TASK-033-07 - Partial Rollback Planner
+Status: DONE
+Dependency: TASK-033-06
+Owners: Developer/Implementer
+
+Implement `PartialRollbackPlanner` for component-level rollback planning.
+
+### TASK-033-08 - Rollback Decider
+Status: DONE
+Dependency: TASK-033-05, TASK-033-06
+Owners: Developer/Implementer
+
+Implement `RollbackDecider` for automated rollback decisions based on policies.
+
+### TASK-033-09 - REST API
+Status: DONE
+Dependency: TASK-033-08
+Owners: Developer/Implementer
+
+Implement API endpoints for health, predictions, impact analysis, and rollback execution.
+
+### TASK-033-10 - Integration Tests
+Status: DONE
+Dependency: TASK-033-09
+Owners: QA/Test Automation
+
+Create integration tests for health analysis, prediction, and rollback flows.
+
+## Execution Log
+
+| Date (UTC) | Update | Owner |
+| --- | --- | --- |
+| 2026-01-17 | Sprint created | Planning |
+| 2026-01-17 | TASK-033-01, 033-02, 033-04, 033-08 implemented: MetricsCollector, BaselineManager, AnomalyDetector, RollbackDecider | Developer |
+| 2026-01-17 | TASK-033-03 implemented: HealthAnalyzer with signal evaluation and baseline comparison | Developer |
+| 2026-01-17 | TASK-033-05 implemented: PredictiveEngine with trend analysis and early warnings | Developer |
+| 2026-01-17 | TASK-033-06 implemented: ImpactAnalyzer with blast radius and dependency analysis | Developer |
+| 2026-01-17 | TASK-033-07 implemented: PartialRollbackPlanner with dependency-aware ordering | Developer |
+| 2026-01-17 | TASK-033-09 implemented: RollbackIntelligenceController with full REST API | Developer |
+| 2026-01-17 | TASK-033-10 implemented: Comprehensive integration tests for all rollback intelligence flows | QA |
+
+## Decisions & Risks
+
+- Risk: False positive predictions may trigger unnecessary rollbacks
+- Mitigation: Confidence thresholds and human override capabilities
+
+## Next Checkpoints
+
+- TASK-033-08 complete: Auto-rollback functional
+- TASK-033-10 complete: Ready for integration
diff --git a/docs-archived/implplan/SPRINT_20260117_034_ReleaseOrchestrator_agent_resilience.md b/docs-archived/implplan/SPRINT_20260117_034_ReleaseOrchestrator_agent_resilience.md
new file mode 100644
index 000000000..7ad96e357
--- /dev/null
+++ b/docs-archived/implplan/SPRINT_20260117_034_ReleaseOrchestrator_agent_resilience.md
@@ -0,0 +1,162 @@
+# Sprint 034 · Agent Resilience
+
+## Topic & Scope
+
+Implement high-availability agent architecture with clustering, automatic failover, offline task queuing, and self-healing capabilities.
+
+**Key Deliverables:**
+- Agent cluster manager
+- Health monitor with multi-factor assessment
+- Failover manager with task transfer
+- Leader election for ActivePassive mode
+- Durable task queue with retry logic
+- Self-healer with automatic recovery
+- State synchronization across cluster members
+
+- Working directory: `src/ReleaseOrchestrator/__Agents/`
+- Also touches: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Agent/`
+- Documentation: `docs/modules/release-orchestrator/enhancements/agent-resilience.md`
+- Expected evidence: Unit tests, integration tests, chaos tests, API documentation
+
+## Dependencies & Concurrency
+
+- Upstream: Sprint 032 (Workflow Visualization)
+- Downstream: Sprint 035 (Progressive Delivery)
+- Cannot run in parallel with: Sprint 032
+
+## Documentation Prerequisites
+
+- Read: `docs/modules/release-orchestrator/enhancements/agent-resilience.md`
+- Read: `src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core/`
+
+## Delivery Tracker
+
+### TASK-034-01 - Agent Cluster Manager
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Implement `AgentClusterManager` with ActivePassive, ActiveActive, and Sharded modes.
+
+### TASK-034-02 - Health Monitor
+Status: DONE
+Dependency: TASK-034-01
+Owners: Developer/Implementer
+
+Implement enhanced `HealthMonitor` with multi-factor health assessment.
+
+Completion criteria:
+- [x] Multi-factor health scoring (connectivity, resources, tasks, latency, error rate, queue depth)
+- [x] Custom health check registration
+- [x] Health trend analysis
+- [x] Automatic recommendation generation
+- [x] Health change events
+
+### TASK-034-03 - Failover Manager
+Status: DONE
+Dependency: TASK-034-02
+Owners: Developer/Implementer
+
+Implement `FailoverManager` with task transfer and target reassignment.
+
+### TASK-034-04 - Leader Election
+Status: DONE
+Dependency: TASK-034-01
+Owners: Developer/Implementer
+
+Implement `LeaderElection` with distributed lock support.
+
+Completion criteria:
+- [x] Distributed lock-based leader election
+- [x] Lease renewal and expiry handling
+- [x] Leader resign capability
+- [x] Leadership change events
+- [x] In-memory implementation for testing
+
+### TASK-034-05 - Task Queue
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Implement durable `TaskQueue` with delivery guarantees and dead-letter handling.
+
+### TASK-034-06 - Self Healer
+Status: DONE
+Dependency: TASK-034-03
+Owners: Developer/Implementer
+
+Implement `SelfHealer` with automatic recovery actions.
+
+Completion criteria:
+- [x] Automatic recovery action determination based on health factors
+- [x] Circuit breaker to prevent recovery storms
+- [x] Recovery history tracking
+- [x] Recovery events (started, completed, failed)
+- [x] Configurable action timeout and cooldown
+
+### TASK-034-07 - State Sync
+Status: DONE
+Dependency: TASK-034-04
+Owners: Developer/Implementer
+
+Implement `StateSync` for cluster state synchronization.
+
+Completion criteria:
+- [x] Vector clock-based versioning
+- [x] Gossip protocol for peer sync
+- [x] Tombstone support for deletions
+- [x] State persistence
+- [x] Conflict resolution
+
+### TASK-034-08 - REST API
+Status: DONE
+Dependency: TASK-034-07
+Owners: Developer/Implementer
+
+Implement API endpoints for cluster and agent management.
+
+Completion criteria:
+- [x] Cluster status and config endpoints
+- [x] Agent health endpoints
+- [x] Leader election endpoints
+- [x] Failover management endpoints
+- [x] Self-healing endpoints
+- [x] State sync endpoints
+
+### TASK-034-09 - Integration Tests
+Status: DONE
+Dependency: TASK-034-08
+Owners: QA/Test Automation
+
+Create integration and chaos tests for failover scenarios.
+
+Completion criteria:
+- [x] Health monitor tests
+- [x] Leader election tests
+- [x] Self-healer tests
+- [x] State sync tests
+- [x] Chaos tests (network partition, resource exhaustion)
+
+## Execution Log
+
+| Date (UTC) | Update | Owner |
+| --- | --- | --- |
+| 2026-01-17 | Sprint created | Planning |
+| 2026-01-17 | TASK-034-01, 034-03, 034-05 implemented: AgentClusterManager, FailoverManager, DurableTaskQueue | Developer |
+| 2026-01-17 | TASK-034-02 implemented: HealthMonitor with multi-factor assessment | Developer |
+| 2026-01-17 | TASK-034-04 implemented: LeaderElection with distributed lock and InMemory impl | Developer |
+| 2026-01-17 | TASK-034-06 implemented: SelfHealer with circuit breaker and recovery history | Developer |
+| 2026-01-17 | TASK-034-07 implemented: StateSync with vector clocks and gossip protocol | Developer |
+| 2026-01-17 | TASK-034-08 implemented: AgentClusterController REST API | Developer |
+| 2026-01-17 | TASK-034-09 implemented: Integration and chaos tests | QA |
+| 2026-01-17 | Sprint completed and archived | Planning |
+
+## Decisions & Risks
+
+- Risk: Split-brain scenarios in distributed clusters
+- Mitigation: Distributed consensus with proper quorum handling
+
+## Next Checkpoints
+
+- TASK-034-03 complete: Failover working
+- TASK-034-09 complete: Chaos tests passing
diff --git a/docs-archived/implplan/SPRINT_20260117_035_ReleaseOrchestrator_progressive_delivery.md b/docs-archived/implplan/SPRINT_20260117_035_ReleaseOrchestrator_progressive_delivery.md
new file mode 100644
index 000000000..c5d50b728
--- /dev/null
+++ b/docs-archived/implplan/SPRINT_20260117_035_ReleaseOrchestrator_progressive_delivery.md
@@ -0,0 +1,154 @@
+# Sprint 035 · Progressive Delivery Enhancements
+
+## Topic & Scope
+
+Implement advanced progressive delivery with metric-driven canary automation, feature flag integration, automatic traffic percentage calculation, and sophisticated rollout strategies.
+
+**Key Deliverables:**
+- Rollout controller with multiple strategies
+- Metrics analyzer with provider integration
+- Canary controller with statistical analysis
+- Feature flag bridge (LaunchDarkly, Split, Unleash, Flagsmith, ConfigCat)
+- Traffic manager with load balancer adapters
+- Experiment engine for A/B testing
+
+- Working directory: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.ProgressiveDelivery/`
+- Documentation: `docs/modules/release-orchestrator/enhancements/progressive-delivery.md`
+- Expected evidence: Unit tests, integration tests, API documentation
+
+## Dependencies & Concurrency
+
+- Upstream: Sprint 033 (Rollback Intelligence), Sprint 034 (Agent Resilience), Sprint 038 (Performance)
+- Downstream: Sprint 036 (Multi-Region), Sprint 037 (Developer Experience)
+- Cannot run in parallel with Wave 2 sprints
+
+## Documentation Prerequisites
+
+- Read: `docs/modules/release-orchestrator/enhancements/progressive-delivery.md`
+- Read: `docs/modules/release-orchestrator/modules/progressive-delivery.md`
+
+## Delivery Tracker
+
+### TASK-035-01 - Rollout Controller
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Implement `RolloutController` with canary, linear, exponential, and blue-green strategies.
+
+### TASK-035-02 - Metrics Analyzer
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Implement `MetricsAnalyzer` for health evaluation and traffic recommendations.
+
+Completion criteria:
+- [x] Multi-factor health scoring (error rate, latency, throughput, saturation)
+- [x] Baseline comparison
+- [x] Version comparison with statistical significance
+- [x] Traffic recommendations
+- [x] Evaluation history tracking
+
+### TASK-035-03 - Canary Controller
+Status: DONE
+Dependency: TASK-035-02
+Owners: Developer/Implementer
+
+Implement `CanaryController` with statistical comparison and auto-progression.
+
+Completion criteria:
+- [x] Canary lifecycle management (start, progress, pause, resume, rollback, complete)
+- [x] Statistical analysis with significance testing
+- [x] Checkpoint recording
+- [x] Auto-progression with configurable strategies (linear, exponential, fibonacci)
+- [x] Events for canary state changes
+
+### TASK-035-04 - Feature Flag Bridge
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Implement `FeatureFlagBridge` with LaunchDarkly, Split, Unleash, Flagsmith, ConfigCat providers.
+
+### TASK-035-05 - Traffic Manager
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Implement `TrafficManager` with Nginx, HAProxy, Traefik, AWS ALB adapters.
+
+Completion criteria:
+- [x] Traffic split management
+- [x] Nginx Plus API adapter
+- [x] HAProxy Runtime API adapter
+- [x] Traefik API adapter
+- [x] AWS ALB adapter
+- [x] Multi-adapter support
+
+### TASK-035-06 - Experiment Engine
+Status: DONE
+Dependency: TASK-035-02
+Owners: Developer/Implementer
+
+Implement `ExperimentEngine` for A/B testing with statistical analysis.
+
+Completion criteria:
+- [x] Experiment lifecycle management
+- [x] Deterministic variant assignment
+- [x] Metric recording
+- [x] Statistical analysis (mean, stddev, confidence intervals, p-value)
+- [x] Winner determination with confidence levels
+- [x] Auto-analysis and optional auto-conclusion
+
+### TASK-035-07 - REST API
+Status: DONE
+Dependency: TASK-035-06
+Owners: Developer/Implementer
+
+Implement API endpoints for rollouts, canaries, experiments, and traffic management.
+
+Completion criteria:
+- [x] Rollout CRUD and lifecycle endpoints
+- [x] Canary CRUD and lifecycle endpoints
+- [x] Experiment CRUD and lifecycle endpoints
+- [x] Metrics and health endpoints
+- [x] Traffic management endpoints
+
+### TASK-035-08 - Integration Tests
+Status: DONE
+Dependency: TASK-035-07
+Owners: QA/Test Automation
+
+Create integration tests for progressive delivery flows.
+
+Completion criteria:
+- [x] Metrics analyzer tests
+- [x] Canary controller tests
+- [x] Experiment engine tests
+- [x] Traffic manager tests
+- [x] End-to-end flow tests
+
+## Execution Log
+
+| Date (UTC) | Update | Owner |
+| --- | --- | --- |
+| 2026-01-17 | Sprint created | Planning |
+| 2026-01-17 | TASK-035-01, 035-04 implemented: RolloutController, FeatureFlagBridge | Developer |
+| 2026-01-17 | TASK-035-02 implemented: MetricsAnalyzer with health evaluation and recommendations | Developer |
+| 2026-01-17 | TASK-035-03 implemented: CanaryController with statistical comparison | Developer |
+| 2026-01-17 | TASK-035-05 implemented: TrafficManager with Nginx, HAProxy, Traefik, ALB adapters | Developer |
+| 2026-01-17 | TASK-035-06 implemented: ExperimentEngine for A/B testing | Developer |
+| 2026-01-17 | TASK-035-07 implemented: ProgressiveDeliveryController REST API | Developer |
+| 2026-01-17 | TASK-035-08 implemented: Integration tests | QA |
+| 2026-01-17 | Sprint completed and archived | Planning |
+
+## Decisions & Risks
+
+- Risk: Metrics provider unavailability during rollout
+- Mitigation: Fallback strategies, cached metrics, manual override
+
+## Next Checkpoints
+
+- TASK-035-03 complete: Canary working
+- TASK-035-08 complete: Ready for integration
diff --git a/docs-archived/implplan/SPRINT_20260117_036_ReleaseOrchestrator_multi_region.md b/docs-archived/implplan/SPRINT_20260117_036_ReleaseOrchestrator_multi_region.md
new file mode 100644
index 000000000..117661531
--- /dev/null
+++ b/docs-archived/implplan/SPRINT_20260117_036_ReleaseOrchestrator_multi_region.md
@@ -0,0 +1,161 @@
+# Sprint 036 · Multi-Region / Federation
+
+## Topic & Scope
+
+Implement multi-region federation for geographically distributed deployments with cross-region coordination, evidence replication, and data residency compliance.
+
+**Key Deliverables:**
+- Federation hub for central coordination
+- Region coordinator with promotion orchestration
+- Cross-region sync with conflict resolution
+- Evidence replicator with data residency
+- Latency router for optimal region selection
+- Global dashboard for unified visibility
+
+- Working directory: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Federation/`
+- Documentation: `docs/modules/release-orchestrator/enhancements/multi-region-federation.md`
+- Expected evidence: Unit tests, integration tests, API documentation
+
+## Dependencies & Concurrency
+
+- Upstream: Sprint 035 (Progressive Delivery)
+- Downstream: Sprint 039 (Compliance)
+- Can run in parallel with: Sprint 037
+
+## Documentation Prerequisites
+
+- Read: `docs/modules/release-orchestrator/enhancements/multi-region-federation.md`
+
+## Delivery Tracker
+
+### TASK-036-01 - Federation Hub
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Implement `FederationHub` for multi-region management.
+
+### TASK-036-02 - Region Coordinator
+Status: DONE
+Dependency: TASK-036-01
+Owners: Developer/Implementer
+
+Implement `RegionCoordinator` with global promotion orchestration.
+
+Completion criteria:
+- [x] Global promotion lifecycle (start, progress, pause, resume, rollback, complete)
+- [x] Multiple promotion strategies (Sequential, Canary, Parallel, BlueGreen)
+- [x] Wave-based rollout with configurable requirements
+- [x] Cross-region health monitoring
+- [x] Events for promotion state changes
+
+### TASK-036-03 - Cross-Region Sync
+Status: DONE
+Dependency: TASK-036-01
+Owners: Developer/Implementer
+
+Implement `CrossRegionSync` with conflict resolution strategies.
+
+Completion criteria:
+- [x] Peer discovery and connection management
+- [x] Entry replication to all peers
+- [x] Vector clock-based conflict detection
+- [x] Conflict resolution (KeepLocal, KeepRemote, Merge, LastWriteWins)
+- [x] Background sync loop
+
+### TASK-036-04 - Evidence Replicator
+Status: DONE
+Dependency: TASK-036-03
+Owners: Developer/Implementer
+
+Implement `EvidenceReplicator` with data residency compliance.
+
+Completion criteria:
+- [x] Evidence bundle replication to allowed regions
+- [x] Data classification-based region filtering
+- [x] Residency validation and violation detection
+- [x] Non-compliant region removal requests
+- [x] Background replication task scheduling
+
+### TASK-036-05 - Latency Router
+Status: DONE
+Dependency: TASK-036-01
+Owners: Developer/Implementer
+
+Implement `LatencyRouter` for optimal region selection.
+
+Completion criteria:
+- [x] Region initialization and metrics tracking
+- [x] Latency-based region selection with scoring
+- [x] Preference and exclusion handling
+- [x] Background latency probing
+- [x] Region unavailability marking
+
+### TASK-036-06 - Global Dashboard
+Status: DONE
+Dependency: TASK-036-05
+Owners: Developer/Implementer
+
+Implement `GlobalDashboard` for cross-region visibility.
+
+Completion criteria:
+- [x] Global overview with region summaries
+- [x] Region detail views
+- [x] Alert management (create, acknowledge, resolve)
+- [x] Sync status overview
+- [x] Latency map between regions
+
+### TASK-036-07 - REST API
+Status: DONE
+Dependency: TASK-036-06
+Owners: Developer/Implementer
+
+Implement API endpoints for federation management.
+
+Completion criteria:
+- [x] Dashboard endpoints (overview, regions, deployments)
+- [x] Promotion endpoints (CRUD, lifecycle, health)
+- [x] Sync endpoints (overview, conflicts, resolution)
+- [x] Evidence replication endpoints
+- [x] Latency routing endpoints
+- [x] Alert endpoints
+
+### TASK-036-08 - Integration Tests
+Status: DONE
+Dependency: TASK-036-07
+Owners: QA/Test Automation
+
+Create integration and chaos tests for multi-region scenarios.
+
+Completion criteria:
+- [x] Region coordinator tests
+- [x] Cross-region sync tests
+- [x] Evidence replicator tests
+- [x] Latency router tests
+- [x] Global dashboard tests
+- [x] End-to-end global promotion flow
+
+## Execution Log
+
+| Date (UTC) | Update | Owner |
+| --- | --- | --- |
+| 2026-01-17 | Sprint created | Planning |
+| 2026-01-17 | TASK-036-01 implemented: FederationHub with multi-region management | Developer |
+| 2026-01-17 | TASK-036-02 implemented: RegionCoordinator with promotion strategies | Developer |
+| 2026-01-17 | TASK-036-03 implemented: CrossRegionSync with conflict resolution | Developer |
+| 2026-01-17 | TASK-036-04 implemented: EvidenceReplicator with data residency | Developer |
+| 2026-01-17 | TASK-036-05 implemented: LatencyRouter for optimal routing | Developer |
+| 2026-01-17 | TASK-036-06 implemented: GlobalDashboard for visibility | Developer |
+| 2026-01-17 | TASK-036-07 implemented: FederationController REST API | Developer |
+| 2026-01-17 | TASK-036-08 implemented: Integration tests | QA |
+| 2026-01-17 | Sprint completed and archived | Planning |
+
+## Decisions & Risks
+
+- Risk: Network partitions between regions
+- Mitigation: Eventual consistency model, offline operation support
+
+## Next Checkpoints
+
+- TASK-036-04 complete: Evidence replication working
+- TASK-036-08 complete: Ready for integration
diff --git a/docs-archived/implplan/SPRINT_20260117_037_ReleaseOrchestrator_developer_experience.md b/docs-archived/implplan/SPRINT_20260117_037_ReleaseOrchestrator_developer_experience.md
new file mode 100644
index 000000000..315644055
--- /dev/null
+++ b/docs-archived/implplan/SPRINT_20260117_037_ReleaseOrchestrator_developer_experience.md
@@ -0,0 +1,178 @@
+# Sprint 037 · Developer Experience / CLI
+
+## Topic & Scope
+
+Implement comprehensive developer tooling including a powerful CLI, GitOps-native workflows, IDE integrations, and streamlined development workflows.
+
+**Key Deliverables:**
+- Full-featured CLI application (stella)
+- GitOps controller for Git-triggered releases
+- VS Code extension
+- JetBrains plugin
+- Local validator for offline config checking
+- Shell completions
+
+- Working directory: `src/Cli/StellaOps.Cli/`
+- Also touches: VS Code extension project, JetBrains plugin project
+- Documentation: `docs/modules/release-orchestrator/enhancements/developer-experience.md`
+- Expected evidence: Unit tests, integration tests, E2E tests, API documentation
+
+## Dependencies & Concurrency
+
+- Upstream: Sprint 035 (Progressive Delivery)
+- Downstream: Sprint 039 (Compliance)
+- Can run in parallel with: Sprint 036
+
+## Documentation Prerequisites
+
+- Read: `docs/modules/release-orchestrator/enhancements/developer-experience.md`
+- Read: existing command patterns in `src/Cli/StellaOps.Cli/`
+
+## Delivery Tracker
+
+### TASK-037-01 - CLI Foundation
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Implement core CLI structure with auth, config, and help commands.
+
+Completion criteria:
+- [x] CliApplication with command parsing
+- [x] Auth commands (login, logout, status, refresh)
+- [x] Config commands (init, show, set, get, validate)
+- [x] Global options (--format, --verbose, --config)
+- [x] Output formatting (table, json, yaml)
+
+### TASK-037-02 - Release Commands
+Status: DONE
+Dependency: TASK-037-01
+Owners: Developer/Implementer
+
+Implement release create, list, get, diff, history commands.
+
+Completion criteria:
+- [x] ReleaseCommandHandler with all subcommands
+- [x] Create release with notes and draft support
+- [x] List with filters (service, status, limit)
+- [x] Get release details with scan results and approvals
+- [x] Diff between two releases
+- [x] History view for a service
+
+### TASK-037-03 - Promotion Commands
+Status: DONE
+Dependency: TASK-037-02
+Owners: Developer/Implementer
+
+Implement promote, status, approve, reject commands.
+
+Completion criteria:
+- [x] PromoteCommandHandler with all subcommands
+- [x] Start promotion with auto-approve option
+- [x] Status with watch mode
+- [x] Approve and reject with comments/reasons
+- [x] List with environment and pending filters
+
+### TASK-037-04 - Deployment Commands
+Status: DONE
+Dependency: TASK-037-03
+Owners: Developer/Implementer
+
+Implement deploy, status, logs, rollback commands.
+
+Completion criteria:
+- [x] DeployCommandHandler with all subcommands
+- [x] Start deployment with strategy and dry-run
+- [x] Status with watch mode and progress bar
+- [x] Logs with follow and tail options
+- [x] Rollback with reason
+- [x] List with environment and active filters
+
+### TASK-037-05 - GitOps Controller
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Implement `GitOpsController` for Git event handling and auto-releases.
+
+### TASK-037-06 - VS Code Extension
+Status: DONE
+Dependency: TASK-037-04
+Owners: Developer/Implementer
+
+Implement VS Code extension with tree view, commands, and code lens.
+
+Completion criteria:
+- [x] Extension activation and package.json manifest
+- [x] Release tree view with services and versions
+- [x] Environment tree view with health status
+- [x] Code lens for stella.yaml files
+- [x] Commands (create release, promote, validate, etc.)
+- [x] Status bar integration
+
+### TASK-037-07 - JetBrains Plugin
+Status: DONE
+Dependency: TASK-037-04
+Owners: Developer/Implementer
+
+Implement JetBrains plugin with tool window and annotators.
+
+Completion criteria:
+- [x] Tool window factory with tabs
+- [x] Releases panel with tree view
+- [x] Environments panel with status
+- [x] Deployments panel with table
+- [x] Actions (create release, promote, validate)
+- [x] YAML annotator for stella.yaml
+- [x] Status bar widget
+
+### TASK-037-08 - Local Validator
+Status: DONE
+Dependency: TASK-037-01
+Owners: Developer/Implementer
+
+Implement `LocalValidator` for offline config validation.
+
+### TASK-037-09 - Integration Tests
+Status: DONE
+Dependency: TASK-037-08
+Owners: QA/Test Automation
+
+Create integration and E2E tests for CLI and GitOps flows.
+
+Completion criteria:
+- [x] CLI foundation tests (version, help)
+- [x] Auth command tests
+- [x] Config command tests
+- [x] Release command tests
+- [x] Promote command tests
+- [x] Deploy command tests
+- [x] Scan and policy command tests
+- [x] Global options tests
+- [x] GitOps controller tests
+
+## Execution Log
+
+| Date (UTC) | Update | Owner |
+| --- | --- | --- |
+| 2026-01-17 | Sprint created | Planning |
+| 2026-01-17 | TASK-037-05 implemented: GitOpsController for Git-triggered releases | Developer |
+| 2026-01-17 | TASK-037-08 implemented: LocalValidator for offline config validation | Developer |
+| 2026-01-17 | TASK-037-01 implemented: CliApplication with auth/config commands | Developer |
+| 2026-01-17 | TASK-037-02 implemented: ReleaseCommandHandler | Developer |
+| 2026-01-17 | TASK-037-03 implemented: PromoteCommandHandler | Developer |
+| 2026-01-17 | TASK-037-04 implemented: DeployCommandHandler | Developer |
+| 2026-01-17 | TASK-037-06 implemented: VS Code extension | Developer |
+| 2026-01-17 | TASK-037-07 implemented: JetBrains plugin | Developer |
+| 2026-01-17 | TASK-037-09 implemented: CLI integration tests | QA |
+| 2026-01-17 | Sprint completed and archived | Planning |
+
+## Decisions & Risks
+
+- Risk: CLI backward compatibility with server versions
+- Mitigation: Version negotiation, clear deprecation policy
+
+## Next Checkpoints
+
+- TASK-037-04 complete: Core CLI functional
+- TASK-037-09 complete: Ready for release
diff --git a/docs-archived/implplan/SPRINT_20260117_038_ReleaseOrchestrator_performance.md b/docs-archived/implplan/SPRINT_20260117_038_ReleaseOrchestrator_performance.md
new file mode 100644
index 000000000..ab00d0a91
--- /dev/null
+++ b/docs-archived/implplan/SPRINT_20260117_038_ReleaseOrchestrator_performance.md
@@ -0,0 +1,150 @@
+# Sprint 038 · Performance Optimizations
+
+## Topic & Scope
+
+Implement comprehensive performance optimizations including parallel gate evaluation, bulk digest resolution, task batching, intelligent caching, and database query optimization.
+
+**Key Deliverables:**
+- Parallel gate evaluator
+- Bulk digest resolver
+- Task batcher for agent operations
+- Multi-level cache manager
+- Query optimizer with index management
+- Prefetcher for predictive loading
+- Connection pool optimization
+
+- Working directory: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Core/`
+- Documentation: `docs/modules/release-orchestrator/enhancements/performance-optimizations.md`
+- Expected evidence: Unit tests, performance benchmarks, load tests, API documentation
+
+## Dependencies & Concurrency
+
+- Upstream: None (Wave 1 sprint)
+- Downstream: Sprint 035 (Progressive Delivery)
+- Can run in parallel with: Sprint 031, Sprint 032
+
+## Documentation Prerequisites
+
+- Read: `docs/modules/release-orchestrator/enhancements/performance-optimizations.md`
+
+## Delivery Tracker
+
+### TASK-038-01 - Performance Baseline
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Establish performance baselines and add metrics instrumentation.
+
+Completion criteria:
+- [x] PerformanceBaseline class with measurement recording
+- [x] Metrics instrumentation (counters, histograms, gauges)
+- [x] Percentile calculation (P50, P90, P95, P99)
+- [x] Baseline comparison and regression detection
+- [x] Operation measurement helper (RAII-style)
+
+### TASK-038-02 - Parallel Gate Evaluator
+Status: DONE
+Dependency: TASK-038-01
+Owners: Developer/Implementer
+
+Implement `ParallelGateEvaluator` with execution plan builder.
+
+### TASK-038-03 - Bulk Digest Resolver
+Status: DONE
+Dependency: TASK-038-01
+Owners: Developer/Implementer
+
+Implement `BulkDigestResolver` with registry connection pooling.
+
+### TASK-038-04 - Task Batcher
+Status: DONE
+Dependency: TASK-038-01
+Owners: Developer/Implementer
+
+Implement `TaskBatcher` for agent task optimization.
+
+### TASK-038-05 - Cache Manager
+Status: DONE
+Dependency: TASK-038-01
+Owners: Developer/Implementer
+
+Implement multi-level `CacheManager` with L1 (memory) and L2 (Redis).
+
+### TASK-038-06 - Query Optimizer
+Status: DONE
+Dependency: TASK-038-01
+Owners: Developer/Implementer
+
+Implement `QueryOptimizer` with index management and read replicas.
+
+### TASK-038-07 - Prefetcher
+Status: DONE
+Dependency: TASK-038-05
+Owners: Developer/Implementer
+
+Implement `Prefetcher` for predictive cache warming.
+
+Completion criteria:
+- [x] Data loader registration by pattern
+- [x] Access pattern tracking
+- [x] Predictive prefetch based on related keys
+- [x] Cache warmup for hot keys
+- [x] Background prefetch queue processing
+- [x] Statistics and monitoring
+
+### TASK-038-08 - Connection Pool
+Status: DONE
+Dependency: TASK-038-06
+Owners: Developer/Implementer
+
+Implement optimized `ConnectionPool` with warmup.
+
+Completion criteria:
+- [x] Generic connection pool with type parameter
+- [x] Pool warmup with minimum connections
+- [x] Connection acquisition with timeout
+- [x] Connection health validation
+- [x] Adaptive sizing (min/max)
+- [x] Connection age and use count limits
+- [x] Background maintenance loop
+- [x] Pool statistics
+
+### TASK-038-09 - Load Tests
+Status: DONE
+Dependency: TASK-038-08
+Owners: QA/Test Automation
+
+Create load tests and performance benchmarks.
+
+Completion criteria:
+- [x] Performance baseline high volume tests
+- [x] Percentile accuracy tests
+- [x] Regression detection tests
+- [x] Thread safety tests
+- [x] Prefetcher load tests
+- [x] Connection pool concurrency tests
+- [x] Parallel gate evaluator benchmark
+- [x] Bulk digest resolver benchmark
+
+## Execution Log
+
+| Date (UTC) | Update | Owner |
+| --- | --- | --- |
+| 2026-01-17 | Sprint created | Planning |
+| 2026-01-17 | TASK-038-02 to 038-06 implemented: ParallelGateEvaluator, BulkDigestResolver, TaskBatcher, CacheManager, QueryOptimizer | Developer |
+| 2026-01-17 | TASK-038-01 implemented: PerformanceBaseline with metrics | Developer |
+| 2026-01-17 | TASK-038-07 implemented: Prefetcher with predictive warming | Developer |
+| 2026-01-17 | TASK-038-08 implemented: ConnectionPool with warmup | Developer |
+| 2026-01-17 | TASK-038-09 implemented: Load tests and benchmarks | QA |
+| 2026-01-17 | Sprint completed and archived | Planning |
+
+## Decisions & Risks
+
+- Risk: Cache invalidation bugs cause stale data
+- Mitigation: Comprehensive invalidation tags, short TTLs for critical data
+
+## Next Checkpoints
+
+- TASK-038-02 complete: Gate evaluation 3x faster
+- TASK-038-09 complete: All benchmarks passing
diff --git a/docs-archived/implplan/SPRINT_20260117_039_ReleaseOrchestrator_compliance.md b/docs-archived/implplan/SPRINT_20260117_039_ReleaseOrchestrator_compliance.md
new file mode 100644
index 000000000..02746a449
--- /dev/null
+++ b/docs-archived/implplan/SPRINT_20260117_039_ReleaseOrchestrator_compliance.md
@@ -0,0 +1,164 @@
+# Sprint 039 · Compliance & Reporting
+
+## Topic & Scope
+
+Implement comprehensive compliance management with pre-built report templates, evidence chain visualization, audit query interface, and automated compliance checking for SOC2, ISO 27001, PCI-DSS, HIPAA, FedRAMP, GDPR, and NIST CSF.
+
+**Key Deliverables:**
+- Compliance engine with framework support
+- Framework mapper for control alignment
+- Report generator with templates
+- Evidence chain visualizer
+- Audit query engine
+- Control validator with automated checks
+- Scheduled reporting
+
+- Working directory: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Compliance/`
+- Documentation: `docs/modules/release-orchestrator/enhancements/compliance-reporting.md`
+- Expected evidence: Unit tests, integration tests, report samples, API documentation
+
+## Dependencies & Concurrency
+
+- Upstream: Sprint 036 (Multi-Region), Sprint 037 (Developer Experience)
+- Downstream: Sprint 040 (Multi-Language Scripts)
+- Cannot run in parallel with Wave 4 sprints
+
+## Documentation Prerequisites
+
+- Read: `docs/modules/release-orchestrator/enhancements/compliance-reporting.md`
+
+## Delivery Tracker
+
+### TASK-039-01 - Compliance Engine
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Implement `ComplianceEngine` for framework evaluation.
+
+### TASK-039-02 - Framework Mapper
+Status: DONE
+Dependency: TASK-039-01
+Owners: Developer/Implementer
+
+Implement `FrameworkMapper` with SOC2, ISO 27001, PCI-DSS, HIPAA, FedRAMP, GDPR, NIST CSF frameworks.
+
+### TASK-039-03 - Report Generator
+Status: DONE
+Dependency: TASK-039-02
+Owners: Developer/Implementer
+
+Implement `ReportGenerator` with executive summary, detailed compliance, gap analysis, audit readiness, and evidence package templates.
+
+### TASK-039-04 - Evidence Chain Visualizer
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Implement `EvidenceChainVisualizer` with chain building, graph representation, and integrity verification.
+
+Completion criteria:
+- [x] Build evidence chains from release evidence items
+- [x] Determine causal and temporal relationships (edges)
+- [x] Compute and verify chain hash for integrity
+- [x] Generate graph representation with layers
+- [x] Export to JSON, DOT, Mermaid, CSV formats
+- [x] Node and edge styling for visualization
+
+### TASK-039-05 - Audit Query Engine
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Implement `AuditQueryEngine` with flexible querying and aggregations.
+
+Completion criteria:
+- [x] Flexible query interface with filters
+- [x] Sorting and pagination
+- [x] Aggregation by action, actor, resource, time intervals
+- [x] Activity summary with hourly distribution
+- [x] Resource audit trail
+- [x] Actor activity reports
+- [x] Export to CSV, JSON, Syslog formats
+
+### TASK-039-06 - Control Validator
+Status: DONE
+Dependency: TASK-039-02
+Owners: Developer/Implementer
+
+Implement `ControlValidator` with automated checks for approvals, evidence generation, authentication, etc.
+
+### TASK-039-07 - REST API
+Status: DONE
+Dependency: TASK-039-06
+Owners: Developer/Implementer
+
+Implement API endpoints for compliance status, reports, evidence, and audit queries.
+
+Completion criteria:
+- [x] Compliance status endpoints (overall, per-framework)
+- [x] Release compliance evaluation
+- [x] Report templates listing and generation
+- [x] Report download with format selection
+- [x] Scheduled report CRUD operations
+- [x] Evidence chain endpoints (build, verify, graph, export)
+- [x] Audit query, aggregation, and summary endpoints
+- [x] Resource and actor audit trail endpoints
+- [x] Control status endpoints
+
+### TASK-039-08 - Scheduled Reports
+Status: DONE
+Dependency: TASK-039-03
+Owners: Developer/Implementer
+
+Implement scheduled report generation and delivery.
+
+Completion criteria:
+- [x] Cron expression parsing and validation
+- [x] Schedule CRUD operations
+- [x] Background scheduler loop
+- [x] Report generation on schedule
+- [x] Multi-recipient delivery
+- [x] Execution history tracking
+- [x] Manual trigger capability
+
+### TASK-039-09 - Integration Tests
+Status: DONE
+Dependency: TASK-039-08
+Owners: QA/Test Automation
+
+Create integration tests for compliance evaluation and reporting.
+
+Completion criteria:
+- [x] Evidence chain builder tests
+- [x] Chain verification tests
+- [x] Multi-format export tests
+- [x] Graph generation tests
+- [x] Audit query with filters tests
+- [x] Aggregation tests
+- [x] Activity summary tests
+- [x] Scheduled report CRUD tests
+- [x] End-to-end workflow tests
+
+## Execution Log
+
+| Date (UTC) | Update | Owner |
+| --- | --- | --- |
+| 2026-01-17 | Sprint created | Planning |
+| 2026-01-17 | TASK-039-01, 039-02, 039-03, 039-06 implemented: ComplianceEngine, FrameworkMapper, ReportGenerator, ControlValidator | Developer |
+| 2026-01-17 | TASK-039-04 implemented: EvidenceChainVisualizer with graph and exports | Developer |
+| 2026-01-17 | TASK-039-05 implemented: AuditQueryEngine with aggregations | Developer |
+| 2026-01-17 | TASK-039-07 implemented: ComplianceController REST API | Developer |
+| 2026-01-17 | TASK-039-08 implemented: ScheduledReportService | Developer |
+| 2026-01-17 | TASK-039-09 implemented: Integration tests | QA |
+| 2026-01-17 | Sprint completed and archived | Planning |
+
+## Decisions & Risks
+
+- Risk: Framework mapping accuracy
+- Mitigation: Manual review capability, mapping override support
+
+## Next Checkpoints
+
+- TASK-039-03 complete: Reports generating
+- TASK-039-09 complete: Ready for audits
diff --git a/docs-archived/implplan/SPRINT_20260117_040_ReleaseOrchestrator_multi_language_scripts.md b/docs-archived/implplan/SPRINT_20260117_040_ReleaseOrchestrator_multi_language_scripts.md
new file mode 100644
index 000000000..c1084555d
--- /dev/null
+++ b/docs-archived/implplan/SPRINT_20260117_040_ReleaseOrchestrator_multi_language_scripts.md
@@ -0,0 +1,561 @@
+# Sprint 040 · Multi-Language Script Engine
+
+## Topic & Scope
+
+Implement a polyglot scripting platform with Monaco-based editing, library management, and containerized execution for C# (.NET 10), Python, Java, Go, Bash, and TypeScript scripts.
+
+**Key Deliverables:**
+- Script registry with versioning
+- Monaco editor service with language server integration
+- Library manager for dependencies (NuGet, pip, Maven, Go modules, npm)
+- Runtime image manager for containerized execution
+- Script executor with mount-based injection
+- Sample library with per-language examples
+- Smart container pool with IHostedService lifecycle and auto-scaling
+- Multi-level compilation cache (C#/Java/Go/TypeScript)
+
+- Working directory: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Scripts/`
+- Also touches: `src/Web/` (Monaco editor integration)
+- Documentation: `docs/modules/release-orchestrator/enhancements/multi-language-scripts.md`
+- Expected evidence: Unit tests, integration tests, sample scripts, API documentation
+
+## Dependencies & Concurrency
+
+- Upstream: Sprint 039 (Compliance & Reporting)
+- Downstream: None (final sprint)
+- Can run in parallel with: Sprint 041 (Agent Operations & Easy Setup)
+
+## Documentation Prerequisites
+
+- Read: `docs/modules/release-orchestrator/enhancements/multi-language-scripts.md`
+- Read: `docs/modules/release-orchestrator/modules/workflow-engine.md` (step integration)
+- Read: existing workflow step patterns
+
+## Delivery Tracker
+
+### TASK-040-01 - Script Data Model
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Task description:
+Implement the script data model and registry for storing versioned scripts.
+
+Implementation details:
+- Create `Script` record with all metadata
+- Create `ScriptLanguage` enum (CSharp, Python, Java, Go, Bash, TypeScript)
+- Create `ScriptVisibility` enum (Private, Team, Organization, Public)
+- Create `ScriptDependency` record
+- Implement `IScriptStore` with PostgreSQL backend
+
+Completion criteria:
+- [x] `Script` record with Id, Name, Description, Language, Content, EntryPoint, Version, Dependencies
+- [x] `ScriptLanguage` enum with all 6 languages (including TypeScript)
+- [x] `ScriptVisibility` for access control
+- [x] Database migration for script storage
+- [x] Version history tracking
+
+### TASK-040-02 - Script Registry
+Status: DONE
+Dependency: TASK-040-01
+Owners: Developer/Implementer
+
+Task description:
+Implement the `ScriptRegistry` for managing scripts with validation and search.
+
+Implementation details:
+- Create `ScriptRegistry` with CRUD operations
+- Implement script validation per language
+- Add version incrementing logic
+- Integrate search indexing
+
+Completion criteria:
+- [x] `CreateScriptAsync()` with validation
+- [x] `UpdateScriptAsync()` with version management
+- [x] `SearchAsync()` with filters (language, tags, visibility)
+- [x] Syntax validation per language
+- [x] Search indexing for fast queries
+
+### TASK-040-03 - Language Server Pool
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Task description:
+Implement language server integration for Monaco editor features.
+
+Implementation details:
+- Create `ILanguageServer` interface
+- Implement `CSharpLanguageServer` (OmniSharp/Roslyn)
+- Implement `PythonLanguageServer` (Pyright)
+- Implement `JavaLanguageServer` (JDT LS)
+- Implement `GoLanguageServer` (gopls)
+- Implement `BashLanguageServer` (bash-language-server)
+- Implement `TypeScriptLanguageServer` (typescript-language-server)
+
+Completion criteria:
+- [x] `ILanguageServer` with GetCompletions, GetDiagnostics, Format, GetHover, GetSignatureHelp
+- [x] C# server with .NET 10 script support
+- [x] Python server with type checking
+- [x] Java server with JDK 21 support
+- [x] Go server with module support
+- [x] Bash server with ShellCheck integration
+- [x] TypeScript server with npm package resolution
+
+### TASK-040-04 - Monaco Editor Service
+Status: DONE
+Dependency: TASK-040-03
+Owners: Developer/Implementer
+
+Task description:
+Implement the `MonacoEditorService` for IDE-quality editing.
+
+Implementation details:
+- Create `MonacoEditorService` with configuration management
+- Implement completion provider wrapper
+- Implement diagnostic provider wrapper
+- Add formatting support
+- Add hover and signature help
+
+Completion criteria:
+- [x] `GetConfigurationAsync()` with language-specific options
+- [x] `GetCompletionsAsync()` delegating to language servers
+- [x] `GetDiagnosticsAsync()` for real-time error checking
+- [x] `FormatDocumentAsync()` for code formatting
+- [x] `GetHoverInfoAsync()` for hover documentation
+- [x] `GetSignatureHelpAsync()` for parameter hints
+
+### TASK-040-05 - Library Manager
+Status: DONE
+Dependency: TASK-040-01
+Owners: Developer/Implementer
+
+Task description:
+Implement the `LibraryManager` for resolving script dependencies.
+
+Implementation details:
+- Create `LibraryManager` with resolver registry
+- Implement `NuGetDependencyResolver` for C#
+- Implement `PipDependencyResolver` for Python
+- Implement `MavenDependencyResolver` for Java
+- Implement `GoModDependencyResolver` for Go
+- Implement `AptDependencyResolver` for Bash
+- Implement `NpmDependencyResolver` for TypeScript
+
+Completion criteria:
+- [x] `ResolveDependenciesAsync()` for all 6 languages
+- [x] NuGet resolution with transitive dependencies
+- [x] pip resolution with requirements.txt generation
+- [x] Maven resolution with pom.xml generation
+- [x] Go module resolution
+- [x] apt package resolution for Bash scripts
+- [x] npm resolution with package.json generation for TypeScript
+- [x] Dependency caching
+
+### TASK-040-06 - Runtime Image Manager
+Status: DONE
+Dependency: TASK-040-05
+Owners: Developer/Implementer
+
+Task description:
+Implement the `RuntimeImageManager` for building and caching Docker runtime images.
+
+Implementation details:
+- Create `RuntimeImageManager` with image configuration
+- Define base images for each language
+- Implement Dockerfile generation
+- Add image caching and versioning
+
+Completion criteria:
+- [x] Base images defined: .NET 10 (C#), Python 3.12, Java 21, Go 1.22, Alpine 3.19 (Bash), Node.js 22 (TypeScript)
+- [x] `BuildRuntimeImageAsync()` with dependency installation
+- [x] Dockerfile generation per language (6 languages)
+- [x] Image tagging with script ID and version
+- [x] Image cache management
+- [x] Resource limits configuration
+
+### TASK-040-07 - Script Executor
+Status: DONE
+Dependency: TASK-040-06
+Owners: Developer/Implementer
+
+Task description:
+Implement the `ScriptExecutor` for running scripts in isolated containers.
+
+Implementation details:
+- Create `ScriptExecutor` with container management
+- Implement mount-based script injection
+- Add environment variable passing
+- Implement timeout handling
+- Collect stdout/stderr output
+
+Completion criteria:
+- [x] `ExecuteAsync()` with full lifecycle
+- [x] Script mount creation (bind mount to /scripts)
+- [x] Arguments passed via args.json
+- [x] Environment variable injection
+- [x] Network isolation (default: none)
+- [x] Resource limits enforcement
+- [x] Timeout handling with cancellation
+- [x] Output collection (stdout, stderr, exit code)
+
+### TASK-040-08 - Sample Library
+Status: DONE
+Dependency: TASK-040-07
+Owners: Developer/Implementer
+
+Task description:
+Create the sample script library with examples for each language.
+
+Implementation details:
+- Create `SampleLibrary` with pre-built scripts
+- Implement C# samples: health-check, smoke-test, db-migration-check
+- Implement Python samples: log-analyzer, prometheus-query, slack-notification
+- Implement Java samples: jdbc-health-check, kafka-consumer-check
+- Implement Go samples: tcp-port-check, container-inspect
+- Implement Bash samples: disk-space-check, service-restart, backup-verify
+- Implement TypeScript samples: api-integration-test, json-schema-validator, webhook-sender
+
+Completion criteria:
+- [x] `GetSamplesAsync()` with filtering
+- [x] C# HTTP health check script (.csx)
+- [x] C# API smoke test script
+- [x] C# database migration validator
+- [x] Python log analyzer script
+- [x] Python Prometheus query script
+- [x] Python Slack notification script
+- [x] Java JDBC health check
+- [x] Java Kafka consumer lag check
+- [x] Go TCP port checker
+- [x] Go container inspector
+- [x] Bash disk space check
+- [x] Bash service restart
+- [x] Bash backup verification
+- [x] TypeScript API integration test script (.ts)
+- [x] TypeScript JSON schema validator script
+- [x] TypeScript webhook sender script
+- [x] Clone functionality for samples
+
+### TASK-040-09 - REST API
+Status: DONE
+Dependency: TASK-040-08
+Owners: Developer/Implementer
+
+Task description:
+Implement REST API endpoints for script management and execution.
+
+Implementation details:
+- Create `ScriptController` with CRUD operations
+- Create `ScriptExecutionController` for running scripts
+- Create `EditorController` for Monaco integration
+- Create `SampleController` for sample library
+
+Completion criteria:
+- [x] Script CRUD endpoints
+- [x] Script version endpoints
+- [x] Execution endpoints (execute, list, get, logs)
+- [x] Editor endpoints (config, completions, diagnostics, format, hover)
+- [x] Sample endpoints (list, get, clone)
+- [x] Dependency resolution endpoint
+- [x] OpenAPI documentation
+
+### TASK-040-10 - Monaco Editor UI
+Status: DONE
+Dependency: TASK-040-09
+Owners: Developer/Implementer (Frontend)
+
+Task description:
+Implement the Monaco editor component in the web UI.
+
+Implementation details:
+- Create `ScriptEditor` component with Monaco
+- Configure language-specific features
+- Implement server-backed completion provider
+- Add diagnostic display
+- Implement save with Ctrl+S
+
+Completion criteria:
+- [x] `ScriptEditor` component with all languages
+- [x] Language-specific syntax highlighting
+- [x] Completion provider with server integration
+- [x] Diagnostic provider with real-time errors
+- [x] Hover provider for documentation
+- [x] Format on save option
+- [x] Ctrl+S save handler
+- [x] Dark theme (stella-dark)
+
+### TASK-040-11 - Script Library UI
+Status: DONE
+Dependency: TASK-040-10
+Owners: Developer/Implementer (Frontend)
+
+Task description:
+Implement the script library browser UI.
+
+Implementation details:
+- Create `ScriptLibrary` component with browsing
+- Implement search and filtering
+- Add sample preview
+- Implement clone workflow
+
+Completion criteria:
+- [x] `ScriptLibrary` with grid/list view
+- [x] Search by name, description, tags
+- [x] Filter by language, visibility
+- [x] Sample preview with syntax highlighting
+- [x] Clone to create new script
+- [x] Dependency display
+
+### TASK-040-12 - Workflow Step Integration
+Status: DONE
+Dependency: TASK-040-07
+Owners: Developer/Implementer
+
+Task description:
+Integrate scripts as a workflow step type.
+
+Implementation details:
+- Create `ScriptStepExecutor` implementing `IStepExecutor`
+- Add script step to step registry
+- Implement argument mapping from workflow variables
+- Add output propagation to workflow
+
+Completion criteria:
+- [x] `ScriptStepExecutor` with full lifecycle
+- [x] Script step type in registry
+- [x] Input mapping from workflow variables
+- [x] Output parsing and propagation
+- [x] Timeout and retry support
+- [x] Evidence generation
+
+### TASK-040-13 - Script Compilation Cache
+Status: DONE
+Dependency: TASK-040-07
+Owners: Developer/Implementer
+
+Task description:
+Implement multi-level compilation cache for pre-compiled scripts across all compiled/transpiled languages.
+
+Implementation details:
+- Create `ScriptCompilationCache` with L1 (memory) and L2 (distributed/Redis) cache levels
+- Implement `DotNetScriptCompiler` using Roslyn to precompile C# scripts to assembly bytes ahead of execution
+- Implement `JavaScriptCompiler` (the Java script compiler, not JavaScript) using javac for Java bytecode caching
+- Implement `GoScriptCompiler` using go build for Go binary caching
+- Implement `TypeScriptCompiler` using tsc for TypeScript transpilation to JavaScript
+- Cache key based on script content + dependencies + runtime version hash
+
+Completion criteria:
+- [x] `ScriptCompilationCache` with GetOrCompileAsync()
+- [x] L1 memory cache with configurable size (default 256MB)
+- [x] L2 distributed cache with Redis backend
+- [x] Roslyn-based C# script compilation to assembly bytes
+- [x] javac-based Java compilation to bytecode
+- [x] go build-based Go compilation to binary
+- [x] tsc-based TypeScript transpilation to JavaScript
+- [x] Cache key computation with SHA256 hash
+- [x] TTL configuration (default 7 days)
+- [x] Cache hit/miss metrics
+
+### TASK-040-14 - Smart Container Pool Manager
+Status: DONE
+Dependency: TASK-040-06
+Owners: Developer/Implementer
+
+Task description:
+Implement smart container pool manager with IHostedService lifecycle and auto-scaling.
+
+Implementation details:
+- Create `SmartContainerPoolManager` implementing `IHostedService` for graceful startup/shutdown
+- Implement `ManagedContainerPool` per language with acquire/release lifecycle
+- Add `UsageTracker` for monitoring hit rates and request rates
+- Implement auto-scaling based on usage patterns
+- Graceful shutdown: dispose all containers when the agent stops
+
+Completion criteria:
+- [x] `SmartContainerPoolManager` implementing `IHostedService`
+- [x] `StartAsync()` warms up all pools to minimum containers
+- [x] `StopAsync()` gracefully shuts down all pools and disposes containers
+- [x] Configurable min/max containers per language (6 languages including TypeScript)
+- [x] `AcquireAsync()` with exact dependency match priority
+- [x] `ReleaseAsync()` with container reset and health check
+- [x] `UsageTracker` with hit rate and request rate monitoring
+- [x] Auto-scaling: scale up when hit rate < 50%, scale down when utilization < 30%
+- [x] Background `PerformMaintenanceAsync()` for health checks and eviction
+- [x] Idle container eviction after configurable timeout
+- [x] Pool size and utilization metrics
+
+### TASK-040-15 - Runtime Image Cache
+Status: DONE
+Dependency: TASK-040-06
+Owners: Developer/Implementer
+
+Task description:
+Implement Docker image caching for pre-built dependency images.
+
+Implementation details:
+- Create `RuntimeImageCache` with local and registry caching
+- Generate optimized Dockerfiles per language with dependency pre-installation
+- Push built images to registry for cross-agent sharing
+- Image tag based on language + dependency hash
+
+Completion criteria:
+- [x] `RuntimeImageCache` with GetOrBuildImageAsync()
+- [x] Local Docker image existence check
+- [x] Registry image existence check and pull
+- [x] Dockerfile generation with dependency pre-installation
+- [x] NuGet restore baked into C# images
+- [x] pip install baked into Python images
+- [x] Maven `dependency:go-offline` baked into Java images
+- [x] `go mod download` baked into Go images
+- [x] npm install baked into TypeScript images
+- [x] Registry push for cross-agent sharing
+- [x] Image cache metrics
+
+### TASK-040-16 - Workflow Script Preloader
+Status: DONE
+Dependency: TASK-040-13, TASK-040-14, TASK-040-15
+Owners: Developer/Implementer
+
+Task description:
+Implement workflow-level script preloading for parallel warm-up.
+
+Implementation details:
+- Create `WorkflowScriptPreloader` triggered on workflow start
+- Identify all script steps in workflow DAG
+- Parallel precompilation, container warming, and image building
+- Integration with workflow engine lifecycle
+
+Completion criteria:
+- [x] `PreloadWorkflowScriptsAsync()` extracts all script IDs
+- [x] Parallel compilation of all scripts
+- [x] Parallel container pool warming per language
+- [x] Parallel image building for unique dependency sets
+- [x] Integration with workflow start event
+- [x] Preload duration metrics
+
+### TASK-040-17 - Agent Script Cache
+Status: DONE
+Dependency: TASK-040-14, TASK-040-15
+Owners: Developer/Implementer
+
+Task description:
+Implement agent-side caching with warmup on startup.
+
+Implementation details:
+- Create `AgentScriptCache` with LRU eviction
+- Persist cache across agent restarts
+- Warmup task on agent start (pull base images, start pool)
+
+Completion criteria:
+- [x] `AgentScriptCache` with configurable cache path
+- [x] LRU eviction for compiled scripts (default limit: 100 scripts)
+- [x] LRU eviction for runtime images (default limit: 20 images)
+- [x] Cache persistence to disk
+- [x] `WarmupAsync()` pulls all base images
+- [x] Warm container pool initialization on startup
+
+### TASK-040-18 - Cache Performance Tests
+Status: DONE
+Dependency: TASK-040-17
+Owners: QA/Test Automation
+
+Task description:
+Create performance tests validating cache effectiveness.
+
+Completion criteria:
+- [x] Cold start benchmark (< 30s for first execution)
+- [x] Warm start benchmark (< 500ms for cached script)
+- [x] Same language, different script benchmark (< 5s)
+- [x] Workflow with 10 scripts benchmark (< 60s cold, < 15s warm)
+- [x] Cache hit rate validation (> 90% in steady state)
+- [x] Container pool utilization tests
+
+### TASK-040-19 - Integration Tests
+Status: DONE
+Dependency: TASK-040-18
+Owners: QA/Test Automation
+
+Task description:
+Create comprehensive integration tests for the script engine.
+
+Completion criteria:
+- [x] Full execution flow tests per language
+- [x] Monaco integration tests
+- [x] Language server communication tests
+- [x] Sample script execution tests
+- [x] Workflow step integration tests
+- [x] Cache integration tests
+
+### TASK-040-20 - Security Tests
+Status: DONE
+Dependency: TASK-040-19
+Owners: QA/Test Automation
+
+Task description:
+Create security tests for script execution isolation.
+
+Completion criteria:
+- [x] Container isolation verification
+- [x] Resource limit enforcement tests
+- [x] Network isolation tests
+- [x] Path traversal prevention tests
+- [x] Sensitive data handling tests
+
+### TASK-040-21 - Documentation
+Status: DONE
+Dependency: TASK-040-20
+Owners: Documentation Author
+
+Task description:
+Create comprehensive documentation for the script engine.
+
+Completion criteria:
+- [x] API documentation
+- [x] User guide for creating scripts
+- [x] Sample script documentation
+- [x] Language-specific guides
+- [x] Security considerations documentation
+- [x] Performance tuning guide (caching configuration)
+
+## Execution Log
+
+| Date (UTC) | Update | Owner |
+| --- | --- | --- |
+| 2026-01-17 | Sprint created | Planning |
+| 2026-01-17 | Added TypeScript as 6th supported language | Planning |
+| 2026-01-17 | Enhanced pool management with SmartContainerPoolManager (IHostedService, auto-scaling) | Planning |
+| 2026-01-17 | Added Java/TypeScript compilation caching to TASK-040-13 | Planning |
+
+## Decisions & Risks
+
+### Decisions
+1. Scripts are files mounted into containers, not embedded
+2. Each language uses its official Docker base image
+3. Language servers run as separate services for performance
+4. Default network mode is "none" for security
+5. **Multi-layer caching**: 5-tier lookup, fastest first (compiled scripts → warm containers → pre-built images → dependency cache → cold build as the final fallback)
+6. **Pre-compilation**: C#/Java/Go/TypeScript scripts compiled/transpiled ahead of time using Roslyn/javac/go build/tsc
+7. **Warm container pools**: SmartContainerPoolManager with IHostedService for graceful startup/shutdown
+8. **Workflow preloading**: Trigger parallel warm-up when workflow starts
+9. **Auto-scaling**: Usage-based scaling (scale up when hit rate < 50%, scale down when utilization < 30%)
+10. **6 supported languages**: C#, Python, Java, Go, Bash, TypeScript
+
+### Risks
+1. **Language server resource usage**: Multiple servers may consume significant memory
+ - Mitigation: On-demand server startup, connection pooling
+2. **Container startup latency**: Cold starts may be slow
+ - Mitigation: Pre-warmed containers, image caching, workflow preloading
+3. **Dependency resolution failures**: External package registries may be unavailable
+ - Mitigation: Dependency caching, offline mode support
+4. **Cache invalidation**: Stale compiled scripts may cause issues
+ - Mitigation: Content-based cache keys (SHA256), TTL expiration, version in cache key
+5. **Warm pool resource usage**: Idle containers consume memory
+ - Mitigation: Configurable pool sizes, idle timeout eviction, health-based eviction
+
+## Next Checkpoints
+
+- TASK-040-07 complete: Execution working
+- TASK-040-10 complete: Editor functional
+- TASK-040-16 complete: Caching infrastructure ready
+- TASK-040-18 complete: Performance targets met
+- TASK-040-20 complete: Security verified
diff --git a/docs-archived/implplan/SPRINT_20260117_040_ReleaseOrchestrator_self_healing.md b/docs-archived/implplan/SPRINT_20260117_040_ReleaseOrchestrator_self_healing.md
new file mode 100644
index 000000000..0c7b31b9c
--- /dev/null
+++ b/docs-archived/implplan/SPRINT_20260117_040_ReleaseOrchestrator_self_healing.md
@@ -0,0 +1,112 @@
+# Sprint 040 · Self-Healing Infrastructure
+
+## Topic & Scope
+
+Implement self-healing capabilities for the release orchestration platform including automated health monitoring, failure detection, and recovery orchestration.
+
+**Key Deliverables:**
+- Self-healing engine with recovery strategies
+- Health monitoring with degradation detection
+- Recovery orchestrator with dependency-aware healing
+- Automatic scaling and resource management
+- Circuit breaker integration for cascading failure prevention
+
+- Working directory: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.SelfHealing/`
+- Documentation: `docs/modules/release-orchestrator/enhancements/self-healing.md`
+- Expected evidence: Unit tests, integration tests, recovery scenario tests
+
+## Dependencies & Concurrency
+
+- Upstream: Sprint 034 (Agent Resilience), Sprint 041 (Observability)
+- Downstream: None
+- Can run in parallel with: Sprint 041
+
+## Documentation Prerequisites
+
+- Read: `docs/modules/release-orchestrator/enhancements/self-healing.md` (if it exists)
+- Read: Agent resilience patterns in Sprint 034
+
+## Delivery Tracker
+
+### TASK-040-01 - Self-Healing Engine
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Implement `SelfHealingEngine` with recovery strategies and automated remediation.
+
+Completion criteria:
+- [x] Engine detects failures via health checks
+- [x] Multiple recovery strategies (restart, failover, scale)
+- [x] Recovery history tracking
+- [x] Cooldown periods to prevent thrashing
+
+### TASK-040-02 - Health Monitor
+Status: DONE
+Dependency: TASK-040-01
+Owners: Developer/Implementer
+
+Implement `HealthMonitor` for continuous health assessment.
+
+Completion criteria:
+- [x] Multi-probe health checks (HTTP, TCP, process)
+- [x] Degradation detection with thresholds
+- [x] Health aggregation across components
+- [x] Alert integration
+
+### TASK-040-03 - Recovery Orchestrator
+Status: DONE
+Dependency: TASK-040-01
+Owners: Developer/Implementer
+
+Implement `RecoveryOrchestrator` for dependency-aware healing.
+
+Completion criteria:
+- [x] Dependency graph-based recovery ordering
+- [x] Partial recovery support
+- [x] Rollback on failed recovery
+- [x] Evidence generation for recovery actions
+
+### TASK-040-04 - Auto-Scaler
+Status: DONE
+Dependency: TASK-040-02
+Owners: Developer/Implementer
+
+Implement `AutoScaler` for automatic resource management.
+
+Completion criteria:
+- [x] Load-based scaling triggers
+- [x] Scale-up and scale-down policies
+- [x] Resource limits enforcement
+- [x] Scaling event audit trail
+
+### TASK-040-05 - Integration Tests
+Status: DONE
+Dependency: TASK-040-04
+Owners: QA/Test Automation
+
+Create integration tests for self-healing scenarios.
+
+Completion criteria:
+- [x] Failure injection tests
+- [x] Recovery verification tests
+- [x] Scaling behavior tests
+
+## Execution Log
+
+| Date (UTC) | Update | Owner |
+| --- | --- | --- |
+| 2026-01-17 | Sprint created | Planning |
+| 2026-01-17 | TASK-040-01, 040-02, 040-03 implemented: SelfHealingEngine, HealthMonitor, RecoveryOrchestrator | Developer |
+| 2026-01-17 | TASK-040-04 implemented: AutoScaler | Developer |
+| 2026-01-17 | TASK-040-05 completed: SelfHealingEngineTests, HealthMonitorTests, AutoScalerTests | QA |
+
+## Decisions & Risks
+
+- Risk: Over-aggressive healing causing instability
+- Mitigation: Cooldown periods, rate limiting, manual override capability
+
+## Next Checkpoints
+
+- TASK-040-03 complete: Core self-healing functional
+- TASK-040-05 complete: Ready for production
diff --git a/docs-archived/implplan/SPRINT_20260117_041_ReleaseOrchestrator_agent_operations.md b/docs-archived/implplan/SPRINT_20260117_041_ReleaseOrchestrator_agent_operations.md
new file mode 100644
index 000000000..91a8763c8
--- /dev/null
+++ b/docs-archived/implplan/SPRINT_20260117_041_ReleaseOrchestrator_agent_operations.md
@@ -0,0 +1,452 @@
+# Sprint 041 · Agent Operations & Easy Setup
+
+## Topic & Scope
+
+Implement streamlined agent deployment, configuration management, health diagnostics (Doctor plugin), and operational tooling that makes agents easy to deploy, monitor, and maintain at scale.
+
+**Key Deliverables:**
+- Zero-touch bootstrap service with one-line installers
+- Declarative configuration manager with drift detection
+- Automatic certificate provisioning and renewal
+- Agent Doctor with comprehensive health checks
+- Server-side Doctor plugin for fleet health
+- Remediation engine with guided problem resolution
+- Auto-update manager with safe rollbacks
+- Enhanced CLI commands for agent operations
+
+- Working directory: `src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core/`
+- Also touches: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Agent/`, `src/Doctor/__Plugins/`, `src/Cli/`
+- Documentation: `docs/modules/release-orchestrator/enhancements/agent-operations.md`
+- Expected evidence: Unit tests, integration tests, E2E tests, CLI documentation
+
+## Dependencies & Concurrency
+
+- Upstream: Sprint 034 (Agent Resilience) - provides clustering foundation
+- Downstream: None
+- Can run in parallel with: Sprint 040 (Multi-Language Scripts)
+
+## Documentation Prerequisites
+
+- Read: `docs/modules/release-orchestrator/enhancements/agent-operations.md`
+- Read: `docs/modules/release-orchestrator/enhancements/agent-resilience.md`
+- Read: `docs/modules/release-orchestrator/modules/agents.md`
+- Read: `docs/modules/release-orchestrator/security/agent-security.md`
+
+## Delivery Tracker
+
+### TASK-041-01 - Bootstrap Token Service
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Task description:
+Implement the bootstrap token service for secure agent provisioning.
+
+Implementation details:
+- Create `BootstrapTokenService` with token generation
+- One-time use tokens with 15-minute expiry
+- Token validation and consumption
+- Token metadata (agent name, environment, capabilities)
+
+Completion criteria:
+- [x] `GenerateBootstrapTokenAsync()` creates secure one-time tokens
+- [x] Token includes agent metadata
+- [x] Token expires after 15 minutes or on first use, whichever comes first
+- [x] Token validation rejects expired/used tokens
+- [x] REST API endpoint for token generation
+
+### TASK-041-02 - Bootstrap Service
+Status: DONE
+Dependency: TASK-041-01
+Owners: Developer/Implementer
+
+Task description:
+Implement the bootstrap service for zero-touch agent deployment.
+
+Implementation details:
+- Create `BootstrapService` with platform detection
+- Generate one-line installers for Linux, Windows, Docker
+- Generate install scripts with embedded configuration
+- Support cluster join via bootstrap
+
+Completion criteria:
+- [x] `BootstrapAgentAsync()` generates complete bootstrap package
+- [x] Linux one-liner: `curl | bash` with token
+- [x] Windows one-liner: PowerShell with token
+- [x] Docker one-liner: `docker run` with token
+- [x] Install scripts handle dependencies
+- [x] Cluster join support
+
+### TASK-041-03 - Agent Certificate Manager
+Status: DONE
+Dependency: TASK-041-02
+Owners: Developer/Implementer
+
+Task description:
+Implement automatic certificate provisioning and renewal.
+
+Implementation details:
+- Create `AgentCertificateManager` with lifecycle management
+- Auto-provision via bootstrap (CSR submission)
+- Auto-renewal before expiry threshold (default: 7 days)
+- Support multiple certificate sources (auto, file, Vault, ACME)
+
+Completion criteria:
+- [x] `EnsureCertificateAsync()` provisions or renews as needed
+- [x] CSR generation with local private key
+- [x] Auto-renewal monitoring background service
+- [x] Certificate source abstraction
+- [x] Vault integration for certificate storage
+- [x] ACME/Let's Encrypt support (optional)
+
+### TASK-041-04 - Configuration Model
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Task description:
+Implement the declarative agent configuration model.
+
+Implementation details:
+- Create `AgentConfiguration` record with all settings
+- Support minimal (bootstrap) and full configuration modes
+- YAML/JSON serialization
+- Configuration validation
+
+Completion criteria:
+- [x] `AgentConfiguration` with identity, connection, capabilities, resources, security, observability sections
+- [x] `CertificateConfig` with source enum (AutoProvision, File, Vault, ACME)
+- [x] `ClusterConfig` for optional clustering
+- [x] `AutoUpdateConfig` for optional auto-updates
+- [x] Configuration validation with clear error messages
+- [x] YAML and JSON support
+
+### TASK-041-05 - Configuration Manager
+Status: DONE
+Dependency: TASK-041-04
+Owners: Developer/Implementer
+
+Task description:
+Implement the configuration manager with drift detection.
+
+Implementation details:
+- Create `AgentConfigManager` with apply/diff operations
+- Configuration drift detection
+- Apply with rollback capability
+- Configuration persistence
+
+Completion criteria:
+- [x] `ApplyConfigurationAsync()` with validation and rollback
+- [x] `DetectDriftAsync()` compares desired vs actual
+- [x] Configuration diff computation
+- [x] Automatic rollback on apply failure
+- [x] Configuration versioning
+
+### TASK-041-06 - Agent Health Checks
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Task description:
+Implement comprehensive health checks for the agent Doctor.
+
+Implementation details:
+- Create `IAgentHealthCheck` interface
+- Implement core checks: certificate, connectivity, heartbeat
+- Implement resource checks: disk, memory, CPU
+- Implement runtime checks: Docker, task queue
+
+Completion criteria:
+- [x] `IAgentHealthCheck` with category, name, execute
+- [x] `CertificateExpiryCheck` - certificate expiry (remaining validity period)
+- [x] `CertificateValidityCheck` - certificate chain validation
+- [x] `OrchestratorConnectivityCheck` - DNS, TCP, mTLS, gRPC
+- [x] `HeartbeatCheck` - heartbeat freshness
+- [x] `DiskSpaceCheck` - available disk space
+- [x] `MemoryUsageCheck` - memory utilization
+- [x] `CpuUsageCheck` - CPU utilization
+- [x] `DockerConnectivityCheck` - Docker daemon access
+- [x] `DockerVersionCheck` - Docker version compatibility
+- [x] `TaskQueueDepthCheck` - pending task count
+- [x] `ConfigurationDriftCheck` - config consistency
+
+### TASK-041-07 - Agent Doctor
+Status: DONE
+Dependency: TASK-041-06
+Owners: Developer/Implementer
+
+Task description:
+Implement the Agent Doctor for running diagnostics.
+
+Implementation details:
+- Create `AgentDoctor` with check orchestration
+- Generate diagnostic reports
+- Support category filtering
+- Integration with remediation engine
+
+Completion criteria:
+- [x] `RunDiagnosticsAsync()` executes all applicable checks
+- [x] Category filtering (security, network, runtime, etc.)
+- [x] `AgentDiagnosticReport` with overall status and results
+- [x] Parallel check execution with timeout
+- [x] Stop-on-critical option
+
+### TASK-041-08 - Remediation Engine
+Status: DONE
+Dependency: TASK-041-07
+Owners: Developer/Implementer
+
+Task description:
+Implement the remediation engine for guided problem resolution.
+
+Implementation details:
+- Create `RemediationEngine` with pattern matching
+- Define remediation patterns for common issues
+- Support both automated and manual remediations
+- Link to runbooks
+
+Completion criteria:
+- [x] `GetRemediationSteps()` returns prioritized remediation steps
+- [x] Pattern matching for known issues
+- [x] `RemediationStep` with command, runbook URL, automated flag
+- [x] Remediation patterns for certificate issues
+- [x] Remediation patterns for connectivity issues
+- [x] Remediation patterns for Docker issues
+- [x] Remediation patterns for resource issues
+
+### TASK-041-09 - Server-Side Doctor Plugin
+Status: DONE
+Dependency: TASK-041-07
+Owners: Developer/Implementer
+
+Task description:
+Implement the Doctor plugin for server-side agent fleet health monitoring.
+
+Implementation details:
+- Create `AgentHealthPlugin` in Doctor plugins
+- Implement fleet-wide health checks
+- Aggregate agent health status
+- Alert on critical issues
+
+Completion criteria:
+- [x] `AgentHealthPlugin` implementing `IDoctorPlugin`
+- [x] `AgentHeartbeatFreshnessCheck` - fleet heartbeat monitoring
+- [x] `AgentCertificateExpiryCheck` - fleet certificate monitoring
+- [x] `AgentVersionConsistencyCheck` - version skew detection
+- [x] `AgentCapacityCheck` - task capacity monitoring
+- [x] `StaleAgentCheck` - detect stale/disconnected agents
+- [x] `TaskQueueBacklogCheck` - pending task monitoring
+- [x] `FailedTaskRateCheck` - failure rate monitoring
+
+### TASK-041-10 - Auto-Update Manager
+Status: DONE
+Dependency: TASK-041-05
+Owners: Developer/Implementer
+
+Task description:
+Implement safe agent binary auto-updates.
+
+Implementation details:
+- Create `AgentUpdateManager` with update lifecycle
+- Signature verification for packages
+- Safe rollback capability
+- Maintenance window support
+
+Completion criteria:
+- [x] `CheckAndApplyUpdateAsync()` with full lifecycle
+- [x] Update channel support (stable, beta, canary)
+- [x] Package signature verification
+- [x] Task draining before update
+- [x] Rollback point creation
+- [x] Health verification after update
+- [x] Automatic rollback on failure
+- [x] Maintenance window scheduling
+
+### TASK-041-11 - CLI Bootstrap Commands
+Status: DONE
+Dependency: TASK-041-02
+Owners: Developer/Implementer
+
+Task description:
+Implement CLI commands for agent bootstrapping.
+
+Implementation details:
+- Add `stella agent bootstrap` command
+- Add `stella agent install-script` command
+- Platform-specific output
+
+Completion criteria:
+- [x] `stella agent bootstrap --name --env --platform` generates token and installer
+- [x] `stella agent install-script --token --output` generates script file
+- [x] Clear output with copy-paste commands
+- [x] Platform detection and suggestions
+
+### TASK-041-12 - CLI Doctor Commands
+Status: DONE
+Dependency: TASK-041-08
+Owners: Developer/Implementer
+
+Task description:
+Implement CLI commands for agent diagnostics.
+
+Implementation details:
+- Add `stella agent doctor` command
+- Support local and remote diagnostics
+- Add `--fix` for automated remediation
+- Multiple output formats
+
+Completion criteria:
+- [x] `stella agent doctor` runs local diagnostics
+- [x] `stella agent doctor --agent-id` runs remote diagnostics
+- [x] `stella agent doctor --category` filters by category
+- [x] `stella agent doctor --fix` applies automated fixes
+- [x] `stella agent doctor --format json|table|yaml` output formats
+- [x] Clear remediation instructions in output
+
+### TASK-041-13 - CLI Config Commands
+Status: DONE
+Dependency: TASK-041-05
+Owners: Developer/Implementer
+
+Task description:
+Implement CLI commands for configuration management.
+
+Implementation details:
+- Add `stella agent config` command
+- Add `stella agent apply` command
+- Add drift detection support
+
+Completion criteria:
+- [x] `stella agent config` shows current configuration
+- [x] `stella agent config --diff` shows drift
+- [x] `stella agent apply -f config.yaml` applies configuration
+- [x] Validation feedback on apply
+- [x] Multiple output formats
+
+### TASK-041-14 - CLI Certificate Commands
+Status: DONE
+Dependency: TASK-041-03
+Owners: Developer/Implementer
+
+Task description:
+Implement CLI commands for certificate management.
+
+Implementation details:
+- Add `stella agent renew-cert` command
+- Add certificate status in `stella agent status`
+- Certificate expiry warnings
+
+Completion criteria:
+- [x] `stella agent renew-cert` triggers renewal
+- [x] `stella agent renew-cert --force` forces renewal
+- [x] Certificate info in `stella agent status`
+- [x] Expiry warnings in CLI output
+
+### TASK-041-15 - CLI Update Commands
+Status: DONE
+Dependency: TASK-041-10
+Owners: Developer/Implementer
+
+Task description:
+Implement CLI commands for agent updates.
+
+Implementation details:
+- Add `stella agent update` command
+- Add version checking
+- Add rollback command
+
+Completion criteria:
+- [x] `stella agent update` checks and applies updates
+- [x] `stella agent update --version x.y.z` updates to specific version
+- [x] `stella agent update --check` checks without applying
+- [x] `stella agent rollback` reverts to previous version
+
+### TASK-041-16 - Integration Tests
+Status: DONE
+Dependency: TASK-041-15
+Owners: QA/Test Automation
+
+Task description:
+Create comprehensive integration tests for agent operations.
+
+Completion criteria:
+- [x] Bootstrap flow end-to-end test
+- [x] Configuration apply and rollback tests
+- [x] Certificate provisioning tests
+- [x] Certificate renewal tests
+- [x] Doctor diagnostics tests
+- [x] Remediation execution tests
+- [x] Update and rollback tests
+
+### TASK-041-17 - E2E Tests
+Status: DONE
+Dependency: TASK-041-16
+Owners: QA/Test Automation
+
+Task description:
+Create E2E tests for agent operations.
+
+Completion criteria:
+- [x] Bootstrap to running agent test
+- [x] Multi-agent deployment test
+- [x] Configuration drift and remediation test
+- [x] Certificate lifecycle test
+- [x] Update with rollback test
+
+### TASK-041-18 - Documentation
+Status: DONE
+Dependency: TASK-041-17
+Owners: Documentation Author
+
+Task description:
+Create comprehensive documentation for agent operations.
+
+Completion criteria:
+- [x] Bootstrap quick start guide
+- [x] Configuration reference
+- [x] Doctor troubleshooting guide
+- [x] Runbooks for common issues
+- [x] CLI command reference
+- [x] Auto-update configuration guide
+
+## Execution Log
+
+| Date (UTC) | Update | Owner |
+| --- | --- | --- |
+| 2026-01-17 | Sprint created | Planning |
+| 2026-01-17 | Bootstrap services implemented (BootstrapTokenService, BootstrapService) | Developer |
+| 2026-01-17 | Certificate manager implemented (AgentCertificateManager) | Developer |
+| 2026-01-17 | Configuration model and manager implemented | Developer |
+| 2026-01-17 | Agent Doctor and health checks implemented | Developer |
+| 2026-01-17 | Remediation engine with patterns implemented | Developer |
+| 2026-01-17 | Server-side Doctor plugin created | Developer |
+| 2026-01-17 | Auto-update manager implemented | Developer |
+| 2026-01-17 | CLI commands implemented (bootstrap, doctor, config, cert, update) | Developer |
+| 2026-01-17 | Integration tests created | QA |
+| 2026-01-17 | Documentation created (agent-operations-quickstart.md) | Documentation |
+| 2026-01-17 | All tasks completed, sprint ready for archive | Project Manager |
+
+## Decisions & Risks
+
+### Decisions
+1. Bootstrap tokens are one-time use with 15-minute expiry for security
+2. Default certificate source is auto-provision via bootstrap
+3. Auto-update is disabled by default, opt-in via configuration
+4. Doctor checks run in parallel with per-check timeout
+
+### Risks
+1. **Certificate auto-renewal failure**: Agent becomes unreachable
+ - Mitigation: Aggressive renewal threshold (7 days), multiple retry attempts, alert on renewal failure
+2. **Bootstrap token interception**: Potential agent impersonation
+ - Mitigation: Short-lived tokens, one-time use, TLS for token transmission
+3. **Auto-update breaking changes**: Agent becomes non-functional
+ - Mitigation: Signature verification, health check after update, automatic rollback
+4. **Doctor check timeouts**: Slow checks block diagnostics
+ - Mitigation: Per-check timeout (10s default), parallel execution
+
+## Next Checkpoints
+
+- TASK-041-03 complete: Zero-touch bootstrap working
+- TASK-041-09 complete: Doctor plugin integrated
+- TASK-041-17 complete: Ready for production
+
diff --git a/docs-archived/implplan/SPRINT_20260117_041_ReleaseOrchestrator_observability.md b/docs-archived/implplan/SPRINT_20260117_041_ReleaseOrchestrator_observability.md
new file mode 100644
index 000000000..f8270d822
--- /dev/null
+++ b/docs-archived/implplan/SPRINT_20260117_041_ReleaseOrchestrator_observability.md
@@ -0,0 +1,126 @@
+# Sprint 041 · Observability & Telemetry
+
+## Topic & Scope
+
+Implement comprehensive observability capabilities including metrics collection, distributed tracing, log aggregation, and dashboarding for the release orchestration platform.
+
+**Key Deliverables:**
+- Observability hub for centralized telemetry
+- Metric exporters for Prometheus/OpenTelemetry
+- Distributed trace correlation
+- Log aggregation with structured logging
+- Dashboard templates for Grafana
+
+- Working directory: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Observability/`
+- Documentation: `docs/modules/release-orchestrator/enhancements/observability.md`
+- Expected evidence: Unit tests, integration tests, dashboard templates
+
+## Dependencies & Concurrency
+
+- Upstream: Sprint 038 (Performance)
+- Downstream: Sprint 040 (Self-Healing)
+- Can run in parallel with: Sprint 040
+
+## Documentation Prerequisites
+
+- Read: `docs/modules/release-orchestrator/enhancements/observability.md` (if it exists)
+- Read: OpenTelemetry SDK documentation
+
+## Delivery Tracker
+
+### TASK-041-01 - Observability Hub
+Status: DONE
+Dependency: none
+Owners: Developer/Implementer
+
+Implement `ObservabilityHub` for centralized telemetry management.
+
+Completion criteria:
+- [x] Metrics, traces, and logs collection
+- [x] Configurable export destinations
+- [x] Sampling strategies
+- [x] Buffer management for offline scenarios
+
+### TASK-041-02 - Metric Exporter
+Status: DONE
+Dependency: TASK-041-01
+Owners: Developer/Implementer
+
+Implement `MetricExporter` for Prometheus and OpenTelemetry.
+
+Completion criteria:
+- [x] Counter, gauge, histogram support
+- [x] Prometheus exposition format
+- [x] OTLP export support
+- [x] Custom metric definitions for releases
+
+### TASK-041-03 - Trace Correlator
+Status: DONE
+Dependency: TASK-041-01
+Owners: Developer/Implementer
+
+Implement `TraceCorrelator` for distributed tracing.
+
+Completion criteria:
+- [x] W3C Trace Context propagation
+- [x] Cross-service correlation
+- [x] Span enrichment with release context
+- [x] Trace sampling strategies
+
+### TASK-041-04 - Log Aggregator
+Status: DONE
+Dependency: TASK-041-01
+Owners: Developer/Implementer
+
+Implement `LogAggregator` for structured logging.
+
+Completion criteria:
+- [x] Structured log format (JSON)
+- [x] Log level management
+- [x] Correlation ID injection
+- [x] Log shipping to external systems
+
+### TASK-041-05 - Dashboard Templates
+Status: DONE
+Dependency: TASK-041-02
+Owners: Developer/Implementer
+
+Create Grafana dashboard templates.
+
+Completion criteria:
+- [x] Release overview dashboard
+- [x] Performance metrics dashboard
+- [x] Error tracking dashboard
+- [x] SLA monitoring dashboard
+
+### TASK-041-06 - Integration Tests
+Status: DONE
+Dependency: TASK-041-05
+Owners: QA/Test Automation
+
+Create integration tests for observability.
+
+Completion criteria:
+- [x] Metric export verification
+- [x] Trace propagation tests
+- [x] Log format validation
+
+## Execution Log
+
+| Date (UTC) | Update | Owner |
+| --- | --- | --- |
+| 2026-01-17 | Sprint created | Planning |
+| 2026-01-17 | TASK-041-01, TASK-041-02, TASK-041-03 implemented: ObservabilityHub, MetricExporter, TraceCorrelator | Developer |
+| 2026-01-17 | TASK-041-04 implemented: LogAggregator with JSON/ECS formats, shippers | Developer |
+| 2026-01-17 | TASK-041-05 implemented: 4 Grafana dashboards (releases, performance, errors, SLA) | Developer |
+| 2026-01-17 | TASK-041-06 completed: MetricExporterTests, TraceCorrelatorTests, LogAggregatorTests | QA |
+
+## Decisions & Risks
+
+- Risk: High-cardinality metrics causing storage issues
+- Mitigation: Cardinality limits, metric aggregation, sampling
+
+## Next Checkpoints
+
+- TASK-041-03 complete: Core observability functional
+- TASK-041-06 complete: Ready for production
diff --git a/docs/FEATURE_GAPS_REPORT.md b/docs/FEATURE_GAPS_REPORT.md
deleted file mode 100644
index c64af7be1..000000000
--- a/docs/FEATURE_GAPS_REPORT.md
+++ /dev/null
@@ -1,744 +0,0 @@
-# Feature Gaps Report - Stella Ops Suite
-*(Auto-generated during feature matrix completion)*
-
-This report documents:
-1. Features discovered in code but not listed in FEATURE_MATRIX.md
-2. CLI/UI coverage gaps for existing features
-
----
-
-## Batch 1: SBOM & Ingestion
-
-### Discovered Features (Not in Matrix)
-
-| Feature | Module | Key Files | CLI | UI | Suggested Category |
-|---------|--------|-----------|-----|----|--------------------|
-| SPDX 3.0 Build Attestation | Attestor | `BuildAttestationMapper.cs`, `DsseSpdx3Signer.cs`, `CombinedDocumentBuilder.cs` | - | - | Attestation & Signing |
-| CycloneDX CBOM Support | Scanner | `CycloneDxCbomWriter.cs` | - | - | SBOM & Ingestion |
-| Trivy DB Export (Offline) | Concelier | `TrivyDbExporterPlugin.cs`, `TrivyDbOrasPusher.cs`, `TrivyDbExportPlanner.cs` | `stella db export trivy` | - | Offline & Air-Gap |
-| Layer SBOM Composition | Scanner | `SpdxLayerWriter.cs`, `CycloneDxLayerWriter.cs`, `LayerSbomService.cs` | `stella sbomer layer`, `stella scan layer-sbom` | - | SBOM & Ingestion |
-| SBOM Advisory Matching | Concelier | `SbomAdvisoryMatcher.cs`, `SbomRegistryService.cs`, `ValkeyPurlCanonicalIndex.cs` | - | - | Advisory Sources |
-| Graph Lineage Service | Graph | `IGraphLineageService.cs`, `InMemoryGraphLineageService.cs`, `LineageContracts.cs` | - | `/graph` | SBOM & Ingestion |
-| Evidence Cards (SBOM excerpts) | Evidence.Pack | `IEvidenceCardService.cs`, `EvidenceCardService.cs`, `EvidenceCard.cs` | - | Evidence drawer | Evidence & Findings |
-| AirGap SBOM Parsing | AirGap | `SpdxParser.cs`, `CycloneDxParser.cs` | - | `/ops/offline-kit` | Offline & Air-Gap |
-| SPDX License Normalization | Scanner | `SpdxLicenseNormalizer.cs`, `SpdxLicenseExpressions.cs`, `SpdxLicenseList.cs` | - | - | Scanning & Detection |
-| SBOM Format Conversion | Scanner | `SpdxCycloneDxConverter.cs` | - | - | SBOM & Ingestion |
-| SBOM Validation Pipeline | Scanner | `SbomValidationPipeline.cs`, `SemanticSbomExtensions.cs` | - | - | SBOM & Ingestion |
-| CycloneDX Evidence Mapping | Scanner | `CycloneDxEvidenceMapper.cs` | - | - | SBOM & Ingestion |
-| CycloneDX Pedigree Mapping | Scanner | `CycloneDxPedigreeMapper.cs` | - | - | SBOM & Ingestion |
-| SBOM Snapshot Export | Graph | `SbomSnapshot.cs`, `SbomSnapshotExporter.cs` | - | - | Evidence & Findings |
-| Lineage Evidence Packs | ExportCenter | `ILineageEvidencePackService.cs`, `LineageEvidencePack.cs`, `LineageExportEndpoints.cs` | - | `/triage/audit-bundles` | Evidence & Findings |
-
-### Coverage Gaps
-
-| Feature | Module | Has CLI | Has UI | Recommendation |
-|---------|--------|---------|--------|----------------|
-| Delta-SBOM Cache | SbomService | No | No | Internal optimization - no action needed |
-| SBOM Lineage Ledger | SbomService | No | Yes | Add `stella sbom lineage list/show` commands |
-| SBOM Lineage API | SbomService | No | Yes | Add `stella sbom lineage export` command |
-| SPDX 3.0 Build Attestation | Attestor | No | No | Add to Attestation & Signing matrix section |
-| Graph Lineage Service | Graph | No | Yes | Consider `stella graph lineage` command |
-| Trivy DB Export | Concelier | Partial | No | `stella db export trivy` exists but may need UI |
-
----
-
-## Batch 2: Scanning & Detection
-
-### Discovered Features (Not in Matrix)
-
-| Feature | Module | Key Files | CLI | UI | Suggested Category |
-|---------|--------|-----------|-----|----|--------------------|
-| Secrets Detection (Regex+Entropy) | Scanner | `SecretsAnalyzer.cs`, `RegexDetector.cs`, `EntropyDetector.cs`, `CompositeSecretDetector.cs` | `stella scan run` | `/findings` | Scanning & Detection |
-| OS Analyzers - Dpkg (Debian/Ubuntu) | Scanner | `DpkgPackageAnalyzer.cs`, `DpkgStatusParser.cs` | `stella scan run` | `/findings` | Scanning & Detection |
-| OS Analyzers - Apk (Alpine) | Scanner | `ApkPackageAnalyzer.cs`, `ApkDatabaseParser.cs` | `stella scan run` | `/findings` | Scanning & Detection |
-| OS Analyzers - RPM (RHEL/CentOS) | Scanner | `RpmPackageAnalyzer.cs` | `stella scan run` | `/findings` | Scanning & Detection |
-| OS Analyzers - Homebrew (macOS) | Scanner | `HomebrewPackageAnalyzer.cs` | `stella scan run` | `/findings` | Scanning & Detection |
-| OS Analyzers - macOS Bundles | Scanner | `MacOsBundleAnalyzer.cs` | `stella scan run` | `/findings` | Scanning & Detection |
-| OS Analyzers - Windows (Chocolatey/MSI/WinSxS) | Scanner | `ChocolateyAnalyzer.cs`, `MsiAnalyzer.cs`, `WinSxSAnalyzer.cs` | `stella scan run` | `/findings` | Scanning & Detection |
-| Symbol-Level Vulnerability Matching | Scanner | `VulnSurfaceService.cs`, `AdvisorySymbolMapping.cs`, `AffectedSymbol.cs` | - | - | Scanning & Detection |
-| SARIF 2.1.0 Export | Scanner | SARIF export in CLI | `stella scan sarif` | - | Scanning & Detection |
-| Fidelity Upgrade (Quick->Standard->Deep) | Scanner | `FidelityAwareAnalyzer.UpgradeFidelityAsync()` | - | - | Scanning & Detection |
-| OCI Multi-Architecture Support | Scanner | `OciImageInspector.cs` (amd64, arm64, etc.) | `stella image inspect` | - | Scanning & Detection |
-| Symlink Resolution (32-level depth) | Scanner | `LayeredRootFileSystem.cs` | - | - | Scanning & Detection |
-| Whiteout File Support | Scanner | `LayeredRootFileSystem.cs` | - | - | Scanning & Detection |
-| NATS/Redis Scan Queue | Scanner | `NatsScanQueue.cs`, `RedisScanQueue.cs` | - | `/ops/scanner` | Operations |
-| Determinism Controls | Scanner | `DeterminismContext.cs`, `DeterministicTimeProvider.cs`, `DeterministicRandomProvider.cs` | `stella scan replay` | `/ops/scanner` | Determinism & Reproducibility |
-| Lease-Based Job Processing | Scanner | `LeaseHeartbeatService.cs`, `ScanJobProcessor.cs` | - | - | Operations |
-
-### Coverage Gaps
-
-| Feature | Module | Has CLI | Has UI | Recommendation |
-|---------|--------|---------|--------|----------------|
-| License-Risk Detection | Scanner | No | No | Planned Q4-2025 - not yet implemented |
-| Secrets Detection | Scanner | Implicit | Implicit | Document in matrix (runs automatically during scan) |
-| OS Package Analyzers | Scanner | Implicit | Implicit | Document in matrix (6 OS-level analyzers) |
-| Symbol-Level Matching | Scanner | No | No | Advanced feature - consider exposing in findings detail |
-| SARIF Export | Scanner | Yes | No | Consider adding SARIF download in UI |
-| Concurrent Worker Config | Scanner | No | Yes | CLI option for worker count would help CI/CD |
-
----
-
-## Batch 3: Reachability Analysis
-
-### Discovered Features (Not in Matrix)
-
-| Feature | Module | Key Files | CLI | UI | Suggested Category |
-|---------|--------|-----------|-----|----|--------------------|
-| 8-State Reachability Lattice | Reachability.Core | `ReachabilityLattice.cs` (28 state transitions) | - | `/reachability` | Reachability Analysis |
-| Confidence Calculator | Reachability.Core | `ConfidenceCalculator.cs` (path/guard/hit bonuses) | - | - | Reachability Analysis |
-| Evidence Weighted Score (EWS) | Signals | `EvidenceWeightedScoreCalculator.cs` (6 dimensions: RCH/RTS/BKP/XPL/SRC/MIT) | - | - | Scoring & Risk |
-| Attested Reduction Scoring | Signals | VEX anchoring with short-circuit rules | - | - | Scoring & Risk |
-| Hybrid Reachability Query | Reachability.Core | `IReachabilityIndex.cs` (static/runtime/hybrid/batch modes) | `stella reachgraph slice` | `/reachability` | Reachability Analysis |
-| Reachability Replay/Verify | ReachGraph | `IReachabilityReplayService.VerifyAsync()` | `stella reachgraph replay/verify` | - | Determinism & Reproducibility |
-| Graph Triple-Layer Storage | ReachGraph | `ReachGraphStoreService.cs` (Cache->DB->Archive) | - | - | Operations |
-| Per-Graph Signing | ReachGraph | SHA256 artifact/provenance digests | - | - | Attestation & Signing |
-| GraphViz/Mermaid Export | CLI | `stella reachability show --format dot/mermaid` | `stella reachability show` | - | Reachability Analysis |
-| Reachability Drift Alerts | Docs | `19-reachability-drift-alert-flow.md` (state transition monitoring) | `stella drift` | - | Reachability Analysis |
-| Evidence URIs | ReachGraph | `stella://reachgraph/{digest}/slice/{symbolId}` format | - | - | Evidence & Findings |
-| Environment Guard Detection | Scanner | 20+ patterns (process.env, sys.platform, etc.) | - | `/reachability` | Reachability Analysis |
-| Dynamic Loading Detection | Scanner | require(variable), import(variable), Class.forName() | - | - | Reachability Analysis |
-| Reflection Call Detection | Scanner | Confidence scoring 0.5-0.6 for dynamic paths | - | - | Reachability Analysis |
-| EWS Guardrails | Signals | Speculative cap (45), not-affected cap (15), runtime floor (60) | - | - | Scoring & Risk |
-
-### Coverage Gaps
-
-| Feature | Module | Has CLI | Has UI | Recommendation |
-|---------|--------|---------|--------|----------------|
-| Runtime Signal Correlation | Signals | No | Yes | Add `stella signals inspect` command |
-| Gate Detection | Scanner | No | Yes | Consider `stella reachability guards` command |
-| Path Witness Generation | ReachGraph | Yes | No | Add witness path visualization in UI |
-| Confidence Calculator | Reachability.Core | No | No | Internal implementation - consider exposing in findings |
-| Evidence Weighted Score | Signals | No | Partial | Add `stella score explain` command |
-| Graph Triple-Layer Storage | ReachGraph | No | No | Ops concern - consider admin commands |
-
----
-
-## Batch 4: Binary Analysis
-
-### Discovered Features (Not in Matrix)
-
-| Feature | Module | Key Files | CLI | UI | Suggested Category |
-|---------|--------|-----------|-----|----|--------------------|
-| 4 Fingerprint Algorithm Types | BinaryIndex | `BasicBlockFingerprintGenerator.cs`, `ControlFlowGraphFingerprintGenerator.cs`, `StringRefsFingerprintGenerator.cs` | `stella binary fingerprint` | - | Binary Analysis |
-| Alpine Corpus Support | BinaryIndex | `AlpineCorpusConnector.cs` | - | - | Binary Analysis |
-| VEX Evidence Bridge | BinaryIndex | `IVexEvidenceGenerator.cs` | - | - | VEX Processing |
-| Delta Signature Matching | BinaryIndex | `LookupByDeltaSignatureAsync()` | `stella deltasig` | - | Binary Analysis |
-| Symbol Hash Matching | BinaryIndex | `LookupBySymbolHashAsync()` | `stella binary symbols` | - | Binary Analysis |
-| Corpus Function Identification | BinaryIndex | `IdentifyFunctionFromCorpusAsync()` | - | - | Binary Analysis |
-| Binary Call Graph Extraction | BinaryIndex | `binary callgraph` command | `stella binary callgraph` | - | Binary Analysis |
-| 3-Tier Identification Strategy | BinaryIndex | Package/Build-ID/Fingerprint tiers | - | - | Binary Analysis |
-| Fingerprint Validation Stats | BinaryIndex | `FingerprintValidationStats.cs` (TP/FP/TN/FN) | - | - | Binary Analysis |
-| Changelog CVE Parsing | BinaryIndex | `DebianChangelogParser.cs` (CVE pattern extraction) | - | - | Binary Analysis |
-| Secfixes Parsing | BinaryIndex | `ISecfixesParser.cs` (Alpine format) | - | - | Binary Analysis |
-| Batch Binary Operations | BinaryIndex | All lookup methods support batching | - | - | Binary Analysis |
-| Binary Match Confidence Scoring | BinaryIndex | 0.0-1.0 confidence for all matches | - | - | Binary Analysis |
-| Architecture-Aware Filtering | BinaryIndex | Match filtering by architecture | - | - | Binary Analysis |
-
-### Coverage Gaps
-
-| Feature | Module | Has CLI | Has UI | Recommendation |
-|---------|--------|---------|--------|----------------|
-| Alpine Corpus | BinaryIndex | No | No | Add to matrix as additional corpus |
-| Corpus Ingestion UI | BinaryIndex | No | No | Consider admin UI for corpus management |
-| VEX Evidence Bridge | BinaryIndex | No | No | Internal integration - document in VEX section |
-| Fingerprint Visualization | BinaryIndex | Yes | No | Consider UI for function fingerprint display |
-| Batch Operations | BinaryIndex | No | No | Internal API - consider batch CLI commands |
-| Delta Signatures | BinaryIndex | Yes | No | Consider UI integration for patch detection |
-
----
-
-## Batch 5: Advisory Sources
-
-### Discovered Features (Not in Matrix)
-
-**CRITICAL: Matrix lists 11 sources, but codebase has 33+ connectors!**
-
-| Feature | Module | Key Files | CLI | UI | Suggested Category |
-|---------|--------|-----------|-----|----|--------------------|
-| **SUSE Connector** | Concelier | `Connector.Distro.Suse/` | `stella db fetch suse` | - | Advisory Sources |
-| **Astra Linux Connector** | Concelier | `Connector.Astra/` (FSTEC-certified Russian) | `stella db fetch astra` | - | Advisory Sources |
-| **Microsoft MSRC** | Concelier | `vndr.msrc` vendor connector | - | - | Advisory Sources |
-| **Oracle Connector** | Concelier | `vndr.oracle` vendor connector | - | - | Advisory Sources |
-| **Adobe Connector** | Concelier | `vndr.adobe` vendor connector | - | - | Advisory Sources |
-| **Apple Connector** | Concelier | `vndr.apple` vendor connector | - | - | Advisory Sources |
-| **Cisco Connector** | Concelier | `vndr.cisco` vendor connector | - | - | Advisory Sources |
-| **Chromium Connector** | Concelier | `vndr.chromium` vendor connector | - | - | Advisory Sources |
-| **VMware Connector** | Concelier | `vndr.vmware` vendor connector | - | - | Advisory Sources |
-| **JVN (Japan) CERT** | Concelier | `Connector.Jvn/` | - | - | Advisory Sources |
-| **ACSC (Australia) CERT** | Concelier | `Connector.Acsc/` | - | - | Advisory Sources |
-| **CCCS (Canada) CERT** | Concelier | `Connector.Cccs/` | - | - | Advisory Sources |
-| **CertFr (France) CERT** | Concelier | `Connector.CertFr/` | - | - | Advisory Sources |
-| **CertBund (Germany) CERT** | Concelier | `Connector.CertBund/` | - | - | Advisory Sources |
-| **CertCc CERT** | Concelier | `Connector.CertCc/` | - | - | Advisory Sources |
-| **CertIn (India) CERT** | Concelier | `Connector.CertIn/` | - | - | Advisory Sources |
-| **RU-BDU (Russia) CERT** | Concelier | `Connector.Ru.Bdu/` | - | - | Advisory Sources |
-| **RU-NKCKI (Russia) CERT** | Concelier | `Connector.Ru.Nkcki/` | - | - | Advisory Sources |
-| **KISA (South Korea) CERT** | Concelier | `Connector.Kisa/` | - | - | Advisory Sources |
-| **ICS-CISA (Industrial)** | Concelier | `Connector.Ics.Cisa/` | - | - | Advisory Sources |
-| **ICS-Kaspersky (Industrial)** | Concelier | `Connector.Ics.Kaspersky/` | - | - | Advisory Sources |
-| **StellaOpsMirror (Internal)** | Concelier | `Connector.StellaOpsMirror/` | - | - | Advisory Sources |
-| Backport-Aware Precedence | Concelier | `ConfigurableSourcePrecedenceLattice.cs` | - | - | Advisory Sources |
-| Link-Not-Merge Architecture | Concelier | Transitioning from merge to observation/linkset | - | - | Advisory Sources |
-| Canonical Deduplication | Concelier | `ICanonicalAdvisoryService`, `CanonicalMerger.cs` | - | - | Advisory Sources |
-| Change History Tracking | Concelier | `IChangeHistoryStore` (field-level diffs) | - | - | Advisory Sources |
-| Feed Epoch Events | Concelier | `FeedEpochAdvancedEvent` (Provcache invalidation) | - | - | Advisory Sources |
-| JSON Exporter | Concelier | `Exporter.Json/` (manifest-driven export) | `stella db export json` | - | Offline & Air-Gap |
-| Trivy DB Exporter | Concelier | `Exporter.TrivyDb/` | `stella db export trivy` | - | Offline & Air-Gap |
-
-### Coverage Gaps
-
-| Feature | Module | Has CLI | Has UI | Recommendation |
-|---------|--------|---------|--------|----------------|
-| **22+ Connectors Missing from Matrix** | Concelier | Partial | No | ADD TO MATRIX - major documentation gap |
-| Vendor PSIRTs (7 connectors) | Concelier | No | No | Add vendor section to matrix |
-| Regional CERTs (11 connectors) | Concelier | No | No | Add regional CERT section to matrix |
-| Industrial/ICS (2 connectors) | Concelier | No | No | Add ICS section to matrix |
-| Link-Not-Merge Transition | Concelier | No | No | Document new architecture in matrix |
-| Backport Precedence | Concelier | No | No | Document in merge engine section |
-| Change History | Concelier | No | No | Consider audit trail UI |
-
-### Matrix Update Recommendations
-
-The FEATURE_MATRIX.md seriously underrepresents Concelier capabilities:
-- **Listed:** 11 sources
-- **Actual:** 33+ connectors
-
-Recommended additions:
-1. Add "Vendor PSIRTs" section (Microsoft, Oracle, Adobe, Apple, Cisco, Chromium, VMware)
-2. Add "Regional CERTs" section (JVN, ACSC, CCCS, CertFr, CertBund, CertIn, RU-BDU, KISA, etc.)
-3. Add "Industrial/ICS" section (ICS-CISA, ICS-Kaspersky)
-4. Add "Additional Distros" section (SUSE, Astra Linux)
-5. Document backport-aware precedence configuration
-
----
-
-## Batch 6: VEX Processing
-
-### Discovered Features (Not in Matrix)
-
-| Feature | Module | Key Files | CLI | UI | Suggested Category |
-|---------|--------|-----------|-----|----|--------------------|
-| VEX Consensus Engine (5-state lattice) | VexLens | `VexConsensusEngine.cs`, `IVexConsensusEngine.cs` | `stella vex consensus` | `/vex` | VEX Processing |
-| Trust Decay Service | VexLens | `TrustDecayService.cs`, `TrustDecayCalculator.cs` | - | - | VEX Processing |
-| Noise Gate Service | VexLens | `NoiseGateService.cs` | - | `/vex` | VEX Processing |
-| Consensus Rationale Service | VexLens | `IConsensusRationaleService.cs`, `ConsensusRationaleModels.cs` | - | `/vex` | VEX Processing |
-| VEX Linkset Extraction | Excititor | `VexLinksetExtractionService.cs` | - | - | VEX Processing |
-| VEX Linkset Disagreement Detection | Excititor | `VexLinksetDisagreementService.cs` | - | `/vex` | VEX Processing |
-| VEX Statement Backfill | Excititor | `VexStatementBackfillService.cs` | - | - | VEX Processing |
-| VEX Evidence Chunking | Excititor | `VexEvidenceChunkService.cs` | - | - | VEX Processing |
-| Auto-VEX Downgrade | Excititor | `AutoVexDowngradeService.cs` | - | - | VEX Processing |
-| Risk Feed Service | Excititor | `RiskFeedService.cs`, `RiskFeedEndpoints.cs` | - | - | VEX Processing |
-| Trust Calibration Service | Excititor | `TrustCalibrationService.cs` | - | - | VEX Processing |
-| VEX Hashing Service (deterministic) | Excititor | `VexHashingService.cs` | - | - | VEX Processing |
-| CSAF Provider Connectors (7 total) | Excititor | `Connectors.*.CSAF/` (RedHat, Ubuntu, Oracle, MSRC, Cisco, SUSE) | - | - | VEX Processing |
-| OCI OpenVEX Attestation Connector | Excititor | `Connectors.OCI.OpenVEX.Attest/` | - | - | VEX Processing |
-| Issuer Key Lifecycle Management | IssuerDirectory | Key create/rotate/revoke endpoints | - | `/issuer-directory` | VEX Processing |
-| Issuer Trust Override | IssuerDirectory | Trust override endpoints | - | `/issuer-directory` | VEX Processing |
-| CSAF Publisher Bootstrap | IssuerDirectory | `csaf-publishers.json` seeding | - | - | VEX Processing |
-| VEX Webhook Distribution | VexHub | `IWebhookService.cs`, `IWebhookSubscriptionRepository.cs` | - | - | VEX Processing |
-| VEX Conflict Flagging | VexHub | `IStatementFlaggingService.cs` | - | - | VEX Processing |
-| VEX from Drift Generation | CLI | `VexGenCommandGroup.cs` | `stella vex gen --from-drift` | - | VEX Processing |
-| VEX Decision Signing | Policy | `VexDecisionSigningService.cs` | - | - | Policy Engine |
-| VEX Proof Spine | Policy | `VexProofSpineService.cs` | - | - | Policy Engine |
-| Consensus Propagation Rules | VexLens | `IPropagationRuleEngine.cs` | - | - | VEX Processing |
-| Consensus Delta Computation | VexLens | `VexDeltaComputeService.cs` | - | - | VEX Processing |
-| Triple-Layer Consensus Storage | VexLens | Cache->DB->Archive with `IConsensusProjectionStore.cs` | - | - | Operations |
-
-### Coverage Gaps
-
-| Feature | Module | Has CLI | Has UI | Recommendation |
-|---------|--------|---------|--------|----------------|
-| CSAF Provider Connectors | Excititor | No | No | Consider connector status UI in ops |
-| Trust Weight Configuration | VexLens | No | Partial | Add `stella vex trust configure` command |
-| VEX Distribution Webhooks | VexHub | No | No | Add webhook management UI/CLI |
-| Conflict Resolution | VexLens | No | Partial | Interactive conflict resolution needed |
-| Issuer Key Management | IssuerDirectory | No | Yes | Add `stella issuer keys` CLI |
-| Risk Feed Distribution | Excititor | No | No | Consider risk feed CLI |
-| Consensus Replay/Verify | VexLens | No | No | Add `stella vex verify` command |
-| VEX Evidence Export | Excititor | No | No | Add `stella vex evidence export` |
-
-### Matrix Update Recommendations
-
-The FEATURE_MATRIX.md VEX section is significantly underspecified:
-- **Listed:** Basic VEX support (OpenVEX, CSAF, CycloneDX)
-- **Actual:** Full consensus engine with 5-state lattice, 9 trust factors, 7 CSAF connectors, conflict detection, issuer registry
-
-Recommended additions:
-1. Add "VEX Consensus Engine" as major feature (VexLens)
-2. Add "Trust Weight Scoring" with 9 factors documented
-3. Add "CSAF Provider Connectors" section (7 vendors)
-4. Add "Issuer Trust Registry" (IssuerDirectory)
-5. Add "VEX Distribution" (VexHub webhooks)
-6. Document AOC (Aggregation-Only Contract) compliance
-7. Add "VEX from Drift" generation capability
-
----
-
-## Batch 7: Policy Engine
-
-### Discovered Features (Not in Matrix)
-
-| Feature | Module | Key Files | CLI | UI | Suggested Category |
-|---------|--------|-----------|-----|----|--------------------|
-| K4 Lattice (Belnap Four-Valued Logic) | Policy | `K4Lattice.cs`, `TrustLatticeEngine.cs`, `ClaimScoreMerger.cs` | - | `/policy` | Policy Engine |
-| 10+ Policy Gate Types | Policy | `PolicyGateEvaluator.cs`, various *Gate.cs files | - | `/policy` | Policy Engine |
-| Uncertainty Score Calculator | Policy.Determinization | `UncertaintyScoreCalculator.cs` (entropy 0.0-1.0) | - | - | Policy Engine |
-| Decayed Confidence Calculator | Policy.Determinization | `DecayedConfidenceCalculator.cs` (14-day half-life) | - | - | Policy Engine |
-| 6 Evidence Types | Policy.Determinization | `BackportEvidence.cs`, `CvssEvidence.cs`, `EpssEvidence.cs`, etc. | - | - | Policy Engine |
-| 6 Risk Score Providers | RiskEngine | `CvssKevProvider.cs`, `EpssProvider.cs`, `FixChainRiskProvider.cs` | - | `/risk` | Scoring & Risk |
-| FixChain Risk Metrics | RiskEngine | `FixChainRiskMetrics.cs`, `FixChainRiskDisplay.cs` | - | - | Scoring & Risk |
-| Exception Effect Registry | Policy | `ExceptionEffectRegistry.cs`, `ExceptionAdapter.cs` | - | `/policy/exceptions` | Policy Engine |
-| Exception Approval Rules | Policy | `IExceptionApprovalRulesService.cs` | - | `/policy/exceptions` | Policy Engine |
-| Policy Simulation Service | Policy.Registry | `IPolicySimulationService.cs` | `stella policy simulate` | `/policy/simulate` | Policy Engine |
-| Policy Promotion Pipeline | Policy.Registry | `IPromotionService.cs`, `IPublishPipelineService.cs` | - | - | Policy Engine |
-| Review Workflow Service | Policy.Registry | `IReviewWorkflowService.cs` | - | - | Policy Engine |
-| Sealed Mode Service | Policy | `ISealedModeService.cs` | - | `/ops` | Offline & Air-Gap |
-| Verdict Attestation Service | Policy | `IVerdictAttestationService.cs` | - | - | Attestation & Signing |
-| Policy Decision Attestation | Policy | `IPolicyDecisionAttestationService.cs` (DSSE/Rekor) | - | - | Attestation & Signing |
-| Score Policy YAML Config | Policy | `ScorePolicyModels.cs`, `ScorePolicyLoader.cs` | `stella policy validate` | `/policy` | Policy Engine |
-| Profile-Aware Scoring | Policy.Scoring | `ProfileAwareScoringService.cs`, `ScoringProfileService.cs` | - | - | Policy Engine |
-| Freshness-Aware Scoring | Policy | `FreshnessAwareScoringService.cs` | - | - | Policy Engine |
-| Jurisdiction Trust Rules | Policy.Vex | `JurisdictionTrustRules.cs` | - | - | Policy Engine |
-| VEX Customer Override | Policy.Vex | `VexCustomerOverride.cs` | - | - | Policy Engine |
-| Attestation Report Service | Policy | `IAttestationReportService.cs` | - | - | Attestation & Signing |
-| Risk Scoring Trigger Service | Policy.Scoring | `RiskScoringTriggerService.cs` | - | - | Scoring & Risk |
-| Policy Lint Endpoint | Policy | `/policy/lint` | - | - | Policy Engine |
-| Policy Determinism Verification | Policy | `/policy/verify-determinism` | - | - | Determinism & Reproducibility |
-| AdvisoryAI Knobs Endpoint | Policy | `/policy/advisory-ai/knobs` | - | - | Policy Engine |
-| Stability Damping Gate | Policy | `StabilityDampingGate.cs` | - | - | Policy Engine |
-
-### Coverage Gaps
-
-| Feature | Module | Has CLI | Has UI | Recommendation |
-|---------|--------|---------|--------|----------------|
-| K4 Lattice Operations | Policy | No | Partial | Add `stella policy lattice explain` for debugging |
-| Risk Provider Configuration | RiskEngine | No | No | Provider configuration needs CLI/UI exposure |
-| Exception Approval Workflow | Policy | No | Yes | Add `stella policy exception approve/reject` CLI |
-| Determinization Signal Weights | Policy | No | No | Allow signal weight tuning via CLI/config |
-| Policy Pack Promotion | Policy.Registry | No | Partial | Add `stella policy promote` CLI |
-| Score Policy Tuning | Policy.Scoring | Partial | Partial | Expand `stella policy` commands |
-| Verdict Attestation Export | Policy | No | No | Add `stella policy verdicts export` |
-| Risk Scoring History | RiskEngine | No | Partial | Consider historical trend CLI |
-
-### Matrix Update Recommendations
-
-The FEATURE_MATRIX.md Policy section covers basics but misses advanced features:
-- **Listed:** Basic policy evaluation, exceptions
-- **Actual:** Full K4 lattice, 10+ gate types, 6 risk providers, determinization system
-
-Recommended additions:
-1. Add "K4 Lattice Logic" as core feature (Belnap four-valued logic)
-2. Add "Policy Gate Types" section (10+ specialized gates)
-3. Add "Risk Score Providers" section (6 providers with distinct purposes)
-4. Add "Determinization System" (signal weights, decay, uncertainty)
-5. Add "Score Policy Configuration" (YAML-based policy tuning)
-6. Add "Policy Simulation" as distinct feature
-7. Add "Verdict Attestations" (DSSE/Rekor integration)
-8. Document "Sealed Mode" for air-gap operations
-
----
-
-## Batch 8: Attestation & Signing
-
-### Discovered Features (Not in Matrix)
-
-| Feature | Module | Key Files | CLI | UI | Suggested Category |
-|---------|--------|-----------|-----|----|--------------------|
-| 25+ Predicate Types | Attestor | `StellaOps.Attestor.ProofChain/Predicates/` | - | - | Attestation & Signing |
-| Keyless Signing (Fulcio) | Signer | `KeylessDsseSigner.cs`, `HttpFulcioClient.cs` | `stella sign keyless` | - | Attestation & Signing |
-| Ephemeral Key Generation | Signer.Keyless | `EphemeralKeyGenerator.cs`, `EphemeralKeyPair.cs` | - | - | Attestation & Signing |
-| OIDC Token Provider | Signer.Keyless | `IOidcTokenProvider.cs`, `AmbientOidcTokenProvider.cs` | - | - | Attestation & Signing |
-| Key Rotation Service | Signer.KeyManagement | `IKeyRotationService.cs`, `KeyRotationService.cs` | `/keys/rotate` API | - | Attestation & Signing |
-| Trust Anchor Manager | Signer.KeyManagement | `ITrustAnchorManager.cs`, `TrustAnchorManager.cs` | - | - | Attestation & Signing |
-| Delta Attestations (4 types) | Attestor | `IDeltaAttestationService.cs` (VEX/SBOM/Verdict/Reachability) | - | - | Attestation & Signing |
-| Layer Attestation Service | Attestor | `ILayerAttestationService.cs` | - | - | Attestation & Signing |
-| Attestation Chain Builder | Attestor | `AttestationChainBuilder.cs`, `AttestationChainValidator.cs` | - | - | Attestation & Signing |
-| Attestation Link Store | Attestor | `IAttestationLinkStore.cs`, `IAttestationLinkResolver.cs` | - | - | Attestation & Signing |
-| Rekor Submission Queue | Attestor | `IRekorSubmissionQueue.cs` (durable retry) | - | - | Attestation & Signing |
-| Cached Verification Service | Attestor | `CachedAttestorVerificationService.cs` | - | - | Attestation & Signing |
-| Offline Bundle Service | Attestor | `IAttestorBundleService.cs` | - | `/ops/offline-kit` | Offline & Air-Gap |
-| Signer Quota Service | Signer | `ISignerQuotaService.cs` | - | - | Operations |
-| Signer Audit Sink | Signer | `ISignerAuditSink.cs`, `InMemorySignerAuditSink.cs` | - | - | Operations |
-| Proof of Entitlement | Signer | `IProofOfEntitlementIntrospector.cs` (JWT/MTLS) | - | - | Auth & Access Control |
-| Release Integrity Verifier | Signer | `IReleaseIntegrityVerifier.cs` | - | - | Attestation & Signing |
-| JSON Canonicalizer (RFC 8785) | Attestor | `JsonCanonicalizer.cs` | - | - | Determinism & Reproducibility |
-| Predicate Type Router | Attestor | `IPredicateTypeRouter.cs`, `PredicateTypeRouter.cs` | - | - | Attestation & Signing |
-| Standard Predicate Registry | Attestor | `IStandardPredicateRegistry.cs` | - | - | Attestation & Signing |
-| HMAC Signing | Signer | `HmacDsseSigner.cs` | - | - | Attestation & Signing |
-| SM2 Algorithm Support | Signer | `CryptoDsseSigner.cs` (SM2 branch) | - | - | Regional Crypto |
-| Promotion Attestation | Provenance | `PromotionAttestation.cs` | - | - | Release Orchestration |
-| Cosign/KMS Signer | Provenance | `CosignAndKmsSigner.cs` | - | - | Attestation & Signing |
-| Rotating Signer | Provenance | `RotatingSigner.cs` | - | - | Attestation & Signing |
-
-### Coverage Gaps
-
-| Feature | Module | Has CLI | Has UI | Recommendation |
-|---------|--------|---------|--------|----------------|
-| Key Rotation | Signer | No | No | Add `stella keys rotate` CLI command |
-| Trust Anchor Management | Signer | No | No | Add `stella trust-anchors` commands |
-| Attestation Chain Visualization | Attestor | No | Partial | Add chain visualization UI |
-| Predicate Registry Browser | Attestor | No | No | Add `stella attest predicates list` |
-| Delta Attestation CLI | Attestor | No | No | Add `stella attest delta` commands |
-| Signer Audit Logs | Signer | No | No | Add `stella sign audit` command |
-| Rekor Submission Status | Attestor | No | No | Add submission queue status UI |
-
-### Matrix Update Recommendations
-
-The FEATURE_MATRIX.md Attestation section lists basic DSSE/in-toto support:
-- **Listed:** Basic attestation attach/verify, SLSA provenance
-- **Actual:** 25+ predicate types, keyless signing, key rotation, attestation chains
-
-Recommended additions:
-1. Add "Predicate Types" section (25+ types documented)
-2. Add "Keyless Signing (Sigstore)" as major feature
-3. Add "Key Rotation Service" for Enterprise tier
-4. Add "Trust Anchor Management" for Enterprise tier
-5. Add "Attestation Chains" feature
-6. Add "Delta Attestations" (VEX/SBOM/Verdict/Reachability)
-7. Document "Offline Bundle Service" for air-gap
-8. Add "SM2 Algorithm Support" in Regional Crypto section
-
----
-
-## Batch 9: Regional Crypto
-
-### Discovered Features (Not in Matrix)
-
-| Feature | Module | Key Files | CLI | UI | Suggested Category |
-|---------|--------|-----------|-----|----|--------------------|
-| 8 Signature Profiles | Cryptography | `SignatureProfile.cs` | - | - | Regional Crypto |
-| Ed25519 Baseline Signing | Cryptography | `Ed25519Signer.cs`, `Ed25519Verifier.cs` | - | - | Regional Crypto |
-| ECDSA P-256 Profile | Cryptography | `EcdsaP256Signer.cs` | - | - | Regional Crypto |
-| FIPS 140-2 Plugin | Cryptography | `FipsPlugin.cs` | - | - | Regional Crypto |
-| GOST R 34.10-2012 Plugin | Cryptography | `GostPlugin.cs` | - | - | Regional Crypto |
-| SM2/SM3/SM4 Plugin | Cryptography | `SmPlugin.cs` | - | - | Regional Crypto |
-| eIDAS Plugin (CAdES/XAdES) | Cryptography | `EidasPlugin.cs` | - | - | Regional Crypto |
-| HSM Plugin (PKCS#11) | Cryptography | `HsmPlugin.cs` (simulated + production) | - | - | Regional Crypto |
-| CryptoPro GOST (Windows) | Cryptography | `CryptoProGostCryptoProvider.cs` | - | - | Regional Crypto |
-| Multi-Profile Signing | Cryptography | `MultiProfileSigner.cs` | - | - | Regional Crypto |
-| SM Remote Service | SmRemote | `Program.cs` | - | - | Regional Crypto |
-| Post-Quantum Profiles (Defined) | Cryptography | `SignatureProfile.cs` (Dilithium, Falcon) | - | - | Regional Crypto |
-| RFC 3161 TSA Integration | Cryptography | `EidasPlugin.cs` | - | - | Regional Crypto |
-| Simulated HSM Client | Cryptography | `SimulatedHsmClient.cs` | - | - | Regional Crypto |
-| GOST Block Cipher (28147-89) | Cryptography | `GostPlugin.cs` | - | - | Regional Crypto |
-| SM4 Encryption (CBC/ECB/GCM) | Cryptography | `SmPlugin.cs` | - | - | Regional Crypto |
-
-### Coverage Gaps
-
-| Feature | Module | Has CLI | Has UI | Recommendation |
-|---------|--------|---------|--------|----------------|
-| Crypto Profile Selection | Cryptography | No | No | Add `stella crypto profiles` command |
-| Plugin Health Check | Cryptography | No | No | Add plugin status endpoint |
-| Key Management CLI | Cryptography | No | No | Add `stella keys` commands |
-| HSM Status | Cryptography | No | No | Add HSM health monitoring |
-| Post-Quantum Implementation | Cryptography | No | No | Implement Dilithium/Falcon when stable |
-
-### Matrix Update Recommendations
-
-The FEATURE_MATRIX.md Regional Crypto section mentions only FIPS/eIDAS/GOST:
-- **Listed:** Basic regional compliance mentions
-- **Actual:** 8 signature profiles, 6 plugins, HSM support, post-quantum readiness
-
-Recommended additions:
-1. Add "Signature Profiles" section (8 profiles documented)
-2. Add "Plugin Architecture" description
-3. Add "Multi-Profile Signing" capability (dual-stack signatures)
-4. Add "SM Remote Service" for Chinese market
-5. Add "Post-Quantum Readiness" (Dilithium, Falcon defined)
-6. Add "HSM Integration" (PKCS#11 + simulation)
-7. Document plugin configuration options
-8. Add "CryptoPro GOST" for Windows environments
-
----
-
-## Batch 10: Evidence & Findings
-
-### Discovered Features (Not in Matrix)
-
-| Feature | Module | Key Files | CLI | UI | Suggested Category |
-|---------|--------|-----------|-----|----|--------------------|
-| WORM Storage (S3 Object Lock) | EvidenceLocker | `S3EvidenceObjectStore.cs` | - | - | Evidence & Findings |
-| Verdict Attestations (DSSE) | EvidenceLocker | `VerdictEndpoints.cs`, `VerdictContracts.cs` | - | `/evidence-export` | Evidence & Findings |
-| Append-Only Ledger Events | Findings | `ILedgerEventRepository.cs`, `LedgerEventModels.cs` | - | `/findings` | Evidence & Findings |
-| Alert Triage Bands (hot/warm/cold) | Findings | `DecisionModels.cs` | - | `/findings` | Evidence & Findings |
-| Merkle Anchoring | Findings | `Infrastructure/Merkle/` | - | - | Evidence & Findings |
-| Evidence Holds (Legal) | EvidenceLocker | `EvidenceHold.cs` | - | - | Evidence & Findings |
-| Evidence Pack Service | Evidence.Pack | `IEvidencePackService.cs`, `EvidencePack.cs` | - | `/evidence-thread` | Evidence & Findings |
-| Evidence Card Service | Evidence.Pack | `IEvidenceCardService.cs`, `EvidenceCard.cs` | - | - | Evidence & Findings |
-| Profile-Based Export | ExportCenter | `ExportApiEndpoints.cs`, `ExportProfile` | - | `/evidence-export` | Evidence & Findings |
-| Risk Bundle Export | ExportCenter | `RiskBundleEndpoints.cs` | - | `/evidence-export` | Evidence & Findings |
-| Audit Bundle Export | ExportCenter | `AuditBundleEndpoints.cs` | - | - | Evidence & Findings |
-| Lineage Evidence Export | ExportCenter | `LineageExportEndpoints.cs` | - | `/lineage` | Evidence & Findings |
-| SSE Export Streaming | ExportCenter | Real-time run events | - | - | Evidence & Findings |
-| Incident Mode | Findings | `IIncidentModeState.cs` | - | - | Evidence & Findings |
-
-### Coverage Gaps
-
-| Feature | Module | Has CLI | Has UI | Recommendation |
-|---------|--------|---------|--------|----------------|
-| Evidence Holds | EvidenceLocker | No | No | Add legal hold management CLI |
-| Audit Bundle Export | ExportCenter | No | Partial | Add `stella export audit` command |
-| Incident Mode | Findings | No | No | Add `stella findings incident` commands |
-
----
-
-## Batch 11: Determinism & Replay
-
-### Discovered Features (Not in Matrix)
-
-| Feature | Module | Key Files | CLI | UI | Suggested Category |
-|---------|--------|-----------|-----|----|--------------------|
-| Hybrid Logical Clock | HybridLogicalClock | `HybridLogicalClock.cs`, `HlcTimestamp.cs` | - | - | Determinism & Replay |
-| HLC State Persistence | HybridLogicalClock | `IHlcStateStore.cs` | - | - | Determinism & Replay |
-| Canonical JSON (RFC 8785) | Canonical.Json | `CanonJson.cs`, `CanonVersion.cs` | - | - | Determinism & Replay |
-| Replay Manifests V1/V2 | Replay.Core | `ReplayManifest.cs` | `stella scan replay` | - | Determinism & Replay |
-| Knowledge Snapshots | Replay.Core | `KnowledgeSnapshot.cs` | - | - | Determinism & Replay |
-| Replay Proofs (DSSE) | Replay.Core | `ReplayProof.cs` | `stella prove` | - | Determinism & Replay |
-| Evidence Weighted Scoring (6 factors) | Signals | `EvidenceWeightedScoreCalculator.cs` | - | - | Scoring & Risk |
-| Score Buckets (ActNow/ScheduleNext/Investigate/Watchlist) | Signals | Scoring algorithm | - | - | Scoring & Risk |
-| Attested Reduction (short-circuit) | Signals | VEX anchoring logic | - | - | Scoring & Risk |
-| Timeline Events | Eventing | `TimelineEvent.cs`, `ITimelineEventEmitter.cs` | - | - | Determinism & Replay |
-| Deterministic Event IDs | Eventing | `EventIdGenerator.cs` (SHA-256) | - | - | Determinism & Replay |
-| Transactional Outbox | Eventing | `TimelineOutboxProcessor.cs` | - | - | Determinism & Replay |
-| Event Signing (DSSE) | Eventing | `IEventSigner.cs` | - | - | Determinism & Replay |
-| Replay Bundle Writer | Replay.Core | `StellaReplayBundleWriter.cs` (tar.zst) | - | - | Determinism & Replay |
-| Dead Letter Replay | Orchestrator | `IReplayManager.cs`, `ReplayManager.cs` | - | - | Operations |
-
-### Coverage Gaps
-
-| Feature | Module | Has CLI | Has UI | Recommendation |
-|---------|--------|---------|--------|----------------|
-| HLC Inspection | HybridLogicalClock | No | No | Add `stella hlc status` command |
-| Timeline Events | Eventing | No | No | Add `stella timeline query` command |
-| Scoring Explanation | Signals | No | No | Add `stella score explain` command |
-
----
-
-## Batch 12: Operations
-
-### Discovered Features (Not in Matrix)
-
-| Feature | Module | Key Files | CLI | UI | Suggested Category |
-|---------|--------|-----------|-----|----|--------------------|
-| Impact Index (Roaring bitmaps) | Scheduler | `IImpactIndex.cs` | - | - | Operations |
-| Graph Build/Overlay Jobs | Scheduler | `IGraphJobService.cs` | - | `/ops/scheduler` | Operations |
-| Run Preview (dry-run) | Scheduler | `RunEndpoints.cs` | - | - | Operations |
-| SSE Run Streaming | Scheduler | `/runs/{runId}/stream` | - | - | Operations |
-| Job Repository | Orchestrator | `IJobRepository.cs`, `Job.cs` | - | `/orchestrator` | Operations |
-| Lease Management | Orchestrator | `LeaseNextAsync()`, `ExtendLeaseAsync()` | - | - | Operations |
-| Dead Letter Classification | Orchestrator | `DeadLetterEntry.cs` | - | `/orchestrator` | Operations |
-| First Signal Service | Orchestrator | `IFirstSignalService.cs` | - | - | Operations |
-| Task Pack Execution | TaskRunner | `ITaskRunnerClient.cs` | - | - | Operations |
-| Plan-Hash Binding | TaskRunner | Deterministic validation | - | - | Operations |
-| Approval Gates | TaskRunner | `ApprovalDecisionRequest.cs` | - | - | Operations |
-| Artifact Capture | TaskRunner | Digest tracking | - | - | Operations |
-| Timeline Query Service | TimelineIndexer | `ITimelineQueryService.cs` | - | - | Operations |
-| Timeline Ingestion | TimelineIndexer | `ITimelineIngestionService.cs` | - | - | Operations |
-| Token-Bucket Rate Limiting | Orchestrator | Adaptive refill per tenant | - | - | Operations |
-| Job Watermarks | Orchestrator | Ordering guarantees | - | - | Operations |
-
-### Coverage Gaps
-
-| Feature | Module | Has CLI | Has UI | Recommendation |
-|---------|--------|---------|--------|----------------|
-| Impact Preview | Scheduler | No | Partial | Add `stella scheduler preview` command |
-| Job Management | Orchestrator | No | Yes | Add `stella orchestrator jobs` commands |
-| Dead Letter Operations | Orchestrator | No | Yes | Add `stella orchestrator deadletter` commands |
-| TaskRunner CLI | TaskRunner | No | No | Add `stella taskrunner` commands |
-| Timeline Query CLI | TimelineIndexer | No | No | Add `stella timeline` commands |
-
----
-
-## Batch 13: Release Orchestration
-
-### Discovered Features (Not in Matrix)
-
-| Feature | Module | Key Files | CLI | UI | Suggested Category |
-|---------|--------|-----------|-----|----|--------------------|
-| Environment Bundles | ReleaseOrchestrator | `IEnvironmentBundleService.cs`, `EnvironmentBundle.cs` | - | `/releases` | Release Orchestration |
-| Promotion Workflows | ReleaseOrchestrator | `IPromotionWorkflowService.cs`, `PromotionRequest.cs` | - | `/releases` | Release Orchestration |
-| Rollback Service | ReleaseOrchestrator | `IRollbackService.cs`, `RollbackRequest.cs` | - | `/releases` | Release Orchestration |
-| Deployment Agents (Docker/Compose/ECS/Nomad) | ReleaseOrchestrator | `IDeploymentAgent.cs`, various agent implementations | - | `/releases` | Release Orchestration |
-| Progressive Delivery (A/B, Canary) | ReleaseOrchestrator | `IProgressiveDeliveryService.cs` | - | `/releases` | Release Orchestration |
-| Hook System (Pre/Post Deploy) | ReleaseOrchestrator | `IHookExecutionService.cs`, `Hook.cs` | - | `/releases` | Release Orchestration |
-| Approval Gates (Multi-Stage) | ReleaseOrchestrator | `IApprovalGateService.cs`, `ApprovalGate.cs` | - | `/releases` | Release Orchestration |
-| Release Bundle Signing | ReleaseOrchestrator | `IReleaseBundleSigningService.cs` | - | - | Release Orchestration |
-| Environment Promotion History | ReleaseOrchestrator | `IPromotionHistoryService.cs` | - | `/releases` | Release Orchestration |
-| Deployment Lock Service | ReleaseOrchestrator | `IDeploymentLockService.cs` | - | - | Release Orchestration |
-| Release Manifest Generation | ReleaseOrchestrator | `IReleaseManifestService.cs` | - | - | Release Orchestration |
-| Promotion Attestations | ReleaseOrchestrator | `PromotionAttestation.cs` | - | - | Attestation & Signing |
-| Environment Health Checks | ReleaseOrchestrator | `IEnvironmentHealthService.cs` | - | `/releases` | Release Orchestration |
-| Deployment Verification Tests | ReleaseOrchestrator | `IVerificationTestService.cs` | - | - | Release Orchestration |
-
-### Coverage Gaps
-
-| Feature | Module | Has CLI | Has UI | Recommendation |
-|---------|--------|---------|--------|----------------|
-| Release Bundle Creation | ReleaseOrchestrator | No | Partial | Add `stella release create` command |
-| Environment Promotion | ReleaseOrchestrator | No | Yes | Add `stella release promote` command |
-| Rollback Operations | ReleaseOrchestrator | No | Yes | Add `stella release rollback` command |
-| Hook Management | ReleaseOrchestrator | No | Partial | Add `stella release hooks` commands |
-| Deployment Agent Status | ReleaseOrchestrator | No | Partial | Add `stella agent status` command |
-
-### Matrix Update Recommendations
-
-The FEATURE_MATRIX.md Release Orchestration section is largely planned:
-- **Listed:** Basic environment management concepts
-- **Actual:** Full promotion workflow, deployment agents, progressive delivery
-
-Recommended additions:
-1. Add "Deployment Agents" section (Docker, Compose, ECS, Nomad)
-2. Add "Progressive Delivery" (A/B, Canary strategies)
-3. Add "Approval Gates" (multi-stage approvals)
-4. Add "Hook System" (pre/post deployment hooks)
-5. Add "Promotion Attestations" (DSSE signing of promotions)
-6. Document "Environment Health Checks"
-
----
-
-## Batch 14: Auth & Access Control
-
-### Discovered Features (Not in Matrix)
-
-| Feature | Module | Key Files | CLI | UI | Suggested Category |
-|---------|--------|-----------|-----|----|--------------------|
-| 75+ Authorization Scopes | Authority | `AuthorizationScopeConstants.cs` | - | `/admin/roles` | Auth & Access Control |
-| DPoP Sender Constraints | Authority | `DPoPService.cs`, `DPoPValidator.cs` | - | - | Auth & Access Control |
-| mTLS Sender Constraints | Authority | `MtlsClientCertificateValidator.cs` | - | - | Auth & Access Control |
-| Device Authorization Flow | Authority | `DeviceAuthorizationEndpoints.cs` | - | `/login` | Auth & Access Control |
-| JWT Profile for OAuth | Authority | `JwtBearerClientAssertionValidator.cs` | - | - | Auth & Access Control |
-| PAR (Pushed Authorization Requests) | Authority | `ParEndpoints.cs` | - | - | Auth & Access Control |
-| Tenant Isolation | Authority | `ITenantContext.cs`, `TenantResolutionMiddleware.cs` | - | - | Auth & Access Control |
-| Role-Based Access Control | Authority | `IRoleService.cs`, `Role.cs` | - | `/admin/roles` | Auth & Access Control |
-| Permission Grant Service | Authority | `IPermissionGrantService.cs` | - | - | Auth & Access Control |
-| Token Introspection | Authority | `TokenIntrospectionEndpoints.cs` | - | - | Auth & Access Control |
-| Token Revocation | Authority | `TokenRevocationEndpoints.cs` | - | - | Auth & Access Control |
-| OAuth Client Management | Authority | `IClientRepository.cs`, `Client.cs` | - | `/admin/clients` | Auth & Access Control |
-| User Federation (LDAP/SAML) | Authority | `IFederationProvider.cs` | - | `/admin/federation` | Auth & Access Control |
-| Session Management | Authority | `ISessionStore.cs`, `Session.cs` | - | - | Auth & Access Control |
-| Consent Management | Authority | `IConsentStore.cs`, `Consent.cs` | - | `/consent` | Auth & Access Control |
-| Registry Token Service | Registry | `ITokenService.cs`, `TokenModels.cs` | `stella registry login` | - | Auth & Access Control |
-| Scope-Based Token Minting | Registry | Pull/push/catalog scope handling | - | - | Auth & Access Control |
-| Token Refresh Flow | Authority | Refresh token rotation | - | - | Auth & Access Control |
-| Multi-Factor Authentication | Authority | `IMfaService.cs` | - | `/login/mfa` | Auth & Access Control |
-| API Key Management | Authority | `IApiKeyService.cs` | - | `/admin/api-keys` | Auth & Access Control |
-
-### Coverage Gaps
-
-| Feature | Module | Has CLI | Has UI | Recommendation |
-|---------|--------|---------|--------|----------------|
-| Scope Management | Authority | No | Yes | Add `stella auth scopes` commands |
-| DPoP Configuration | Authority | No | No | Add DPoP configuration documentation |
-| Client Management | Authority | No | Yes | Add `stella auth clients` commands |
-| Role Management | Authority | No | Yes | Add `stella auth roles` commands |
-| API Key Operations | Authority | No | Yes | Add `stella auth api-keys` commands |
-| Token Introspection | Authority | No | No | Add `stella auth token inspect` command |
-
-### Matrix Update Recommendations
-
-The FEATURE_MATRIX.md Auth section covers basics but misses advanced features:
-- **Listed:** Basic OAuth/OIDC, RBAC
-- **Actual:** 75+ scopes, DPoP/mTLS, federation, advanced OAuth flows
-
-Recommended additions:
-1. Add "Authorization Scopes" section (75+ granular scopes)
-2. Add "Sender Constraints" (DPoP, mTLS)
-3. Add "Device Authorization Flow" for CLI/IoT
-4. Add "User Federation" (LDAP, SAML integration)
-5. Add "PAR Support" for security-conscious clients
-6. Add "Multi-Factor Authentication"
-7. Add "API Key Management" for service accounts
-8. Document "Tenant Isolation" architecture
-
----
-
-## Batch 15: Notifications & Integrations
-
-### Discovered Features (Not in Matrix)
-
-| Feature | Module | Key Files | CLI | UI | Suggested Category |
-|---------|--------|-----------|-----|----|--------------------|
-| 10 Notification Channel Types | Notify | Email, Slack, Teams, Webhook, PagerDuty, SNS, SQS, Pub/Sub, Discord, Matrix | - | `/notifications` | Notifications |
-| Template-Based Notifications | Notify | `INotificationTemplateService.cs`, `NotificationTemplate.cs` | - | `/notifications` | Notifications |
-| Channel Routing Rules | Notify | `IChannelRoutingService.cs`, `RoutingRule.cs` | - | `/notifications` | Notifications |
-| Delivery Receipt Tracking | Notify | `IDeliveryReceiptService.cs`, `DeliveryReceipt.cs` | - | - | Notifications |
-| Notification Preferences | Notify | `IPreferenceService.cs`, `UserPreference.cs` | - | `/settings` | Notifications |
-| Digest/Batch Notifications | Notify | `IDigestService.cs` | - | `/notifications` | Notifications |
-| Kubernetes Admission Webhooks | Zastava | `AdmissionWebhookEndpoints.cs` | - | - | Integrations |
-| OCI Registry Push Hooks | Zastava | `IWebhookProcessor.cs`, `RegistryPushEvent.cs` | - | - | Integrations |
-| Scan-on-Push Trigger | Zastava | Auto-trigger scanning on registry push | - | - | Integrations |
-| SCM Webhooks (GitHub/GitLab/Bitbucket) | Integrations | `IScmWebhookHandler.cs` | - | `/integrations` | Integrations |
-| CI/CD Webhooks | Integrations | Jenkins, CircleCI, GitHub Actions integration | - | `/integrations` | Integrations |
-| Issue Tracker Integration | Integrations | Jira, GitHub Issues, Linear integration | - | `/integrations` | Integrations |
-| Slack App Integration | Integrations | `ISlackAppService.cs`, slash commands | - | `/integrations` | Integrations |
-| MS Teams App Integration | Integrations | `ITeamsAppService.cs`, adaptive cards | - | `/integrations` | Integrations |
-| Notification Studio | Notifier | Template design and preview | - | `/notifications/studio` | Notifications |
-| Escalation Rules | Notify | `IEscalationService.cs` | - | `/notifications` | Notifications |
-| On-Call Schedule Integration | Notify | PagerDuty, OpsGenie integration | - | `/notifications` | Notifications |
-| Webhook Retry Logic | Notify | Exponential backoff, dead letter | - | - | Notifications |
-| Event-Driven Notifications | Notify | Timeline event subscription | - | - | Notifications |
-| Custom Webhook Payloads | Integrations | `IWebhookPayloadFormatter.cs` | - | `/integrations` | Integrations |
-
-### Coverage Gaps
-
-| Feature | Module | Has CLI | Has UI | Recommendation |
-|---------|--------|---------|--------|----------------|
-| Channel Configuration | Notify | No | Yes | Add `stella notify channels` commands |
-| Template Management | Notify | No | Yes | Add `stella notify templates` commands |
-| Webhook Testing | Integrations | No | Partial | Add `stella integrations test` command |
-| K8s Webhook Installation | Zastava | No | No | Add `stella zastava install` command |
-| Notification Preferences | Notify | No | Yes | Add `stella notify preferences` commands |
-
-### Matrix Update Recommendations
-
-The FEATURE_MATRIX.md Notifications section is basic:
-- **Listed:** Basic webhook/email notifications
-- **Actual:** 10 channel types, template engine, routing rules, escalation
-
-Recommended additions:
-1. Add "Notification Channels" section (10 types)
-2. Add "Template Engine" for customizable messages
-3. Add "Channel Routing" for sophisticated delivery
-4. Add "Escalation Rules" for incident response
-5. Add "Notification Studio" for template design
-6. Add "Kubernetes Admission Webhooks" (Zastava)
-7. Add "SCM Integrations" (GitHub, GitLab, Bitbucket)
-8. Add "CI/CD Integrations" (Jenkins, CircleCI, GitHub Actions)
-9. Add "Issue Tracker Integration" (Jira, GitHub Issues)
-10. Document "Scan-on-Push" auto-trigger
-
----
-
-## Summary: Overall Matrix Gaps
-
-### Major Documentation Gaps Identified
-
-| Category | Matrix Coverage | Actual Coverage | Gap Severity |
-|----------|-----------------|-----------------|--------------|
-| Advisory Sources | 11 sources | 33+ connectors | **CRITICAL** |
-| VEX Processing | Basic | Full consensus engine | **HIGH** |
-| Attestation & Signing | Basic | 25+ predicates | **HIGH** |
-| Auth Scopes | Basic RBAC | 75+ granular scopes | **HIGH** |
-| Policy Engine | Basic | K4 lattice, 10+ gates | **MEDIUM** |
-| Regional Crypto | 3 profiles | 8 profiles, 6 plugins | **MEDIUM** |
-| Notifications | 2 channels | 10 channels | **MEDIUM** |
-| Binary Analysis | Basic | 4 fingerprint algorithms | **MEDIUM** |
-| Release Orchestration | Planned | Partially implemented | **LOW** |
-
-### CLI/UI Coverage Statistics
-
-| Metric | Value |
-|--------|-------|
-| Features with CLI | ~65% |
-| Features with UI | ~70% |
-| Features with both | ~55% |
-| Internal-only features | ~25% |
-
-### Recommended Next Steps
-
-1. **Immediate**: Update Advisory Sources section (33+ connectors undocumented)
-2. **High Priority**: Document VEX consensus engine capabilities
-3. **High Priority**: Document attestation predicate types
-4. **Medium Priority**: Update auth scopes documentation
-5. **Medium Priority**: Complete policy engine documentation
-6. **Low Priority**: Document internal operations features
diff --git a/docs/FEATURE_MATRIX.md b/docs/FEATURE_MATRIX.md
index c7738bccc..1af1efeda 100755
--- a/docs/FEATURE_MATRIX.md
+++ b/docs/FEATURE_MATRIX.md
@@ -20,16 +20,16 @@
**Principle:** Pay for scale, not for features or automation. No per-seat, per-project, or per-deployment taxes.
-| Plan | Price | Environments | New Digests/Day | Deployments | Notes |
-|------|-------|--------------|-----------------|-------------|-------|
-| **Free** | $0/month | 3 | 333 | Unlimited (fair use) | Full features |
-| **Pro** | $699/month | 33 | 3,333 | Unlimited (fair use) | Same features |
-| **Enterprise** | $1,999/month | Unlimited | Unlimited | Unlimited | Fair use on mirroring/audit bandwidth |
+| Plan | Price | Environments | New Digests/Day |
+|------|-------|--------------|------------------|
+| **Free** | $0/month | 3 | 333 |
+| **Pro** | $699/month | 33 | 3,333 |
+| **Enterprise** | $1,999/month | Unlimited | Unlimited |
**Key Principles:**
- All plans include all features (no feature gating)
-- Limits are environments + new digests analyzed per day
-- Unlimited deployments with fair use policy
+- The only limits are environments and new digests analyzed per day
+- All other capabilities are identical across all tiers
---
@@ -37,75 +37,74 @@
*These differentiators are available across all plans.*
-| Capability | Free | Pro | Enterprise | Notes |
-|------------|:----:|:---:|:----------:|-------|
-| Signed Replayable Risk Verdicts | ✅ | ✅ | ✅ | Core differentiator |
-| Decision Capsules | ✅ | ✅ | ✅ | Audit-grade evidence bundles |
-| VEX Decisioning Engine | ✅ | ✅ | ✅ | Trust lattice + conflict resolution |
-| Reachability with Portable Proofs | ✅ | ✅ | ✅ | Three-layer analysis |
-| Smart-Diff (Semantic Risk Delta) | ✅ | ✅ | ✅ | Material change detection |
-| Unknowns as First-Class State | ✅ | ✅ | ✅ | Uncertainty budgets |
-| Deterministic Replay | ✅ | ✅ | ✅ | `stella replay srm.yaml` |
-| Non-Kubernetes First-Class | ✅ | ✅ | ✅ | Docker/Compose/ECS/Nomad targets |
-| Digest-First Release Identity | ✅ | ✅ | ✅ | Immutable releases |
+| Capability | Notes |
+|------------|-------|
+| Signed Replayable Risk Verdicts | Core differentiator |
+| Decision Capsules | Audit-grade evidence bundles |
+| VEX Decisioning Engine | Trust lattice + conflict resolution |
+| Reachability with Portable Proofs | Three-layer analysis |
+| Smart-Diff (Semantic Risk Delta) | Material change detection |
+| Unknowns as First-Class State | Uncertainty budgets |
+| Deterministic Replay | `stella replay srm.yaml` |
+| Non-Kubernetes First-Class | Docker/Compose/ECS/Nomad targets |
+| Digest-First Release Identity | Immutable releases |
---
## Release Orchestration (Planned)
-*Release orchestration capabilities are planned for implementation. All plans will include all features.*
+*Release orchestration capabilities are planned for implementation.*
-| Capability | Free | Pro | Enterprise | Notes |
-|------------|:----:|:---:|:----------:|-------|
-| **Environment Management** | | | | |
-| Environment CRUD | ⏳ | ⏳ | ⏳ | Dev/Stage/Prod definitions |
-| Freeze Windows | ⏳ | ⏳ | ⏳ | Calendar-based blocking |
-| Approval Policies | ⏳ | ⏳ | ⏳ | Per-environment rules |
-| **Release Management** | | | | |
-| Component Registry | ⏳ | ⏳ | ⏳ | Service → repository mapping |
-| Release Bundles | ⏳ | ⏳ | ⏳ | Component → digest bundles |
-| Semantic Versioning | ⏳ | ⏳ | ⏳ | SemVer release versions |
-| Tag → Digest Resolution | ⏳ | ⏳ | ⏳ | Immutable digest pinning |
-| **Promotion & Gates** | | | | |
-| Promotion Workflows | ⏳ | ⏳ | ⏳ | Environment transitions |
-| Security Gate | ⏳ | ⏳ | ⏳ | Scan verdict evaluation |
-| Approval Gate | ⏳ | ⏳ | ⏳ | Human sign-off |
-| Freeze Window Gate | ⏳ | ⏳ | ⏳ | Calendar enforcement |
-| Policy Gate (OPA/Rego) | ⏳ | ⏳ | ⏳ | Custom rules |
-| Decision Records | ⏳ | ⏳ | ⏳ | Evidence-linked decisions |
-| **Deployment Execution** | | | | |
-| Docker Host Agent | ⏳ | ⏳ | ⏳ | Direct container deployment |
-| Compose Host Agent | ⏳ | ⏳ | ⏳ | Docker Compose deployment |
-| SSH Agentless | ⏳ | ⏳ | ⏳ | Linux remote execution |
-| WinRM Agentless | ⏳ | ⏳ | ⏳ | Windows remote execution |
-| ECS Agent | ⏳ | ⏳ | ⏳ | AWS ECS deployment |
-| Nomad Agent | ⏳ | ⏳ | ⏳ | HashiCorp Nomad deployment |
-| Rollback | ⏳ | ⏳ | ⏳ | Previous version restore |
-| **Progressive Delivery** | | | | |
-| A/B Releases | ⏳ | ⏳ | ⏳ | Traffic splitting |
-| Canary Deployments | ⏳ | ⏳ | ⏳ | Gradual rollout |
-| Blue-Green | ⏳ | ⏳ | ⏳ | Zero-downtime switch |
-| Traffic Routing Plugins | ⏳ | ⏳ | ⏳ | Nginx/HAProxy/Traefik/ALB |
-| **Workflow Engine** | | | | |
-| DAG Workflow Execution | ⏳ | ⏳ | ⏳ | Directed acyclic graphs |
-| Step Registry | ⏳ | ⏳ | ⏳ | Built-in + custom steps |
-| Workflow Templates | ⏳ | ⏳ | ⏳ | Reusable workflows |
-| Script Steps (Bash/C#) | ⏳ | ⏳ | ⏳ | Custom automation |
-| **Evidence & Audit** | | | | |
-| Evidence Packets | ⏳ | ⏳ | ⏳ | Sealed decision bundles |
-| Version Stickers | ⏳ | ⏳ | ⏳ | On-target deployment records |
-| Audit Export | ⏳ | ⏳ | ⏳ | Compliance reporting |
-| **Integrations** | | | | |
-| GitHub Integration | ⏳ | ⏳ | ⏳ | SCM + webhooks |
-| GitLab Integration | ⏳ | ⏳ | ⏳ | SCM + webhooks |
-| Harbor Integration | ⏳ | ⏳ | ⏳ | Registry + scanning |
-| HashiCorp Vault | ⏳ | ⏳ | ⏳ | Secrets management |
-| AWS Secrets Manager | ⏳ | ⏳ | ⏳ | Secrets management |
-| **Plugin System** | | | | |
-| Plugin Manifest | ⏳ | ⏳ | ⏳ | Static declarations |
-| Connector Runtime | ⏳ | ⏳ | ⏳ | Dynamic execution |
-| Step Providers | ⏳ | ⏳ | ⏳ | Custom workflow steps |
-| Agent Types | ⏳ | ⏳ | ⏳ | Custom deployment targets |
+| Capability | Notes |
+|------------|-------|
+| **Environment Management** | |
+| Environment CRUD | ⏳ Dev/Stage/Prod definitions |
+| Freeze Windows | ⏳ Calendar-based blocking |
+| Approval Policies | ⏳ Per-environment rules |
+| **Release Management** | |
+| Component Registry | ⏳ Service → repository mapping |
+| Release Bundles | ⏳ Component → digest bundles |
+| Semantic Versioning | ⏳ SemVer release versions |
+| Tag → Digest Resolution | ⏳ Immutable digest pinning |
+| **Promotion & Gates** | |
+| Promotion Workflows | ⏳ Environment transitions |
+| Security Gate | ⏳ Scan verdict evaluation |
+| Approval Gate | ⏳ Human sign-off |
+| Freeze Window Gate | ⏳ Calendar enforcement |
+| Policy Gate (OPA/Rego) | ⏳ Custom rules |
+| Decision Records | ⏳ Evidence-linked decisions |
+| **Deployment Execution** | |
+| Docker Host Agent | ⏳ Direct container deployment |
+| Compose Host Agent | ⏳ Docker Compose deployment |
+| SSH Agentless | ⏳ Linux remote execution |
+| WinRM Agentless | ⏳ Windows remote execution |
+| ECS Agent | ⏳ AWS ECS deployment |
+| Nomad Agent | ⏳ HashiCorp Nomad deployment |
+| Rollback | ⏳ Previous version restore |
+| **Progressive Delivery** | |
+| A/B Releases | ⏳ Traffic splitting |
+| Canary Deployments | ⏳ Gradual rollout |
+| Blue-Green | ⏳ Zero-downtime switch |
+| Traffic Routing Plugins | ⏳ Nginx/HAProxy/Traefik/ALB |
+| **Workflow Engine** | |
+| DAG Workflow Execution | ⏳ Directed acyclic graphs |
+| Step Registry | ⏳ Built-in + custom steps |
+| Workflow Templates | ⏳ Reusable workflows |
+| Script Steps (Bash/C#) | ⏳ Custom automation |
+| **Evidence & Audit** | |
+| Evidence Packets | ⏳ Sealed decision bundles |
+| Version Stickers | ⏳ On-target deployment records |
+| Audit Export | ⏳ Compliance reporting |
+| **Integrations** | |
+| GitHub Integration | ⏳ SCM + webhooks |
+| GitLab Integration | ⏳ SCM + webhooks |
+| Harbor Integration | ⏳ Registry + scanning |
+| HashiCorp Vault | ⏳ Secrets management |
+| AWS Secrets Manager | ⏳ Secrets management |
+| **Plugin System** | |
+| Plugin Manifest | ⏳ Static declarations |
+| Connector Runtime | ⏳ Dynamic execution |
+| Step Providers | ⏳ Custom workflow steps |
+| Agent Types | ⏳ Custom deployment targets |
---
@@ -115,68 +114,64 @@
|-------|:----:|:---:|:----------:|
| **Environments** | 3 | 33 | Unlimited |
| **New Digests/Day** | 333 | 3,333 | Unlimited |
-| **Deployments** | Fair use | Fair use | Fair use |
-| **Targets per Environment** | 10 | 100 | Unlimited |
-| **Agents** | 3 | 33 | Unlimited |
-| **Integrations** | 5 | 50 | Unlimited |
---
## SBOM & Ingestion
-| Capability | Free | Community | Enterprise | Notes |
-|------------|:----:|:---------:|:----------:|-------|
-| Trivy-JSON Ingestion | ✅ | ✅ | ✅ | |
-| SPDX-JSON 3.0.1 Ingestion | ✅ | ✅ | ✅ | |
-| CycloneDX 1.7 Ingestion (1.6 backward compatible) | ✅ | ✅ | ✅ | |
-| Auto-format Detection | ✅ | ✅ | ✅ | |
-| Delta-SBOM Cache | ✅ | ✅ | ✅ | Warm scans <1s |
-| SBOM Generation (all formats) | ✅ | ✅ | ✅ | |
-| Semantic SBOM Diff | ✅ | ✅ | ✅ | |
-| BYOS (Bring-Your-Own-SBOM) | ✅ | ✅ | ✅ | |
-| **SBOM Lineage Ledger** | — | — | ✅ | Full versioned history |
-| **SBOM Lineage API** | — | — | ✅ | Traversal queries |
+| Capability | Notes |
+|------------|-------|
+| Trivy-JSON Ingestion | |
+| SPDX-JSON 3.0.1 Ingestion | |
+| CycloneDX 1.7 Ingestion (1.6 backward compatible) | |
+| Auto-format Detection | |
+| Delta-SBOM Cache | Warm scans <1s |
+| SBOM Generation (all formats) | |
+| Semantic SBOM Diff | |
+| BYOS (Bring-Your-Own-SBOM) | |
+| SBOM Lineage Ledger | Full versioned history |
+| SBOM Lineage API | Traversal queries |
---
## Scanning & Detection
-| Capability | Free | Community | Enterprise | Notes |
-|------------|:----:|:---------:|:----------:|-------|
-| CVE Lookup via Local DB | ✅ | ✅ | ✅ | |
-| Licence-Risk Detection | ⏳ | ⏳ | ⏳ | Q4-2025 |
-| **Automatic Detection (Class A)** | | | | Runs implicitly during scan |
-| — Secrets Detection | ✅ | ✅ | ✅ | API keys, tokens, passwords; results in findings (see [docs/modules/ui/components/findings-list.md](docs/modules/ui/components/findings-list.md)) |
-| — OS Package Analyzers | ✅ | ✅ | ✅ | apk, apt, yum, dnf, rpm, pacman; results in SBOM (see [docs/modules/cli/guides/commands/sbom.md](docs/modules/cli/guides/commands/sbom.md)) |
-| **Language Analyzers (All 11)** | | | | |
-| — .NET/C#, Java, Go, Python | ✅ | ✅ | ✅ | |
-| — Node.js, Ruby, Bun, Deno | ✅ | ✅ | ✅ | |
-| — PHP, Rust, Native binaries | ✅ | ✅ | ✅ | |
-| **Progressive Fidelity Modes** | | | | |
-| — Quick Mode | ✅ | ✅ | ✅ | |
-| — Standard Mode | ✅ | ✅ | ✅ | |
-| — Deep Mode | — | ✅ | ✅ | Full analysis |
-| Base Image Detection | ✅ | ✅ | ✅ | |
-| Layer-Aware Analysis | ✅ | ✅ | ✅ | |
-| **Concurrent Scan Workers** | 1 | 3 | Unlimited | |
+| Capability | Notes |
+|------------|-------|
+| CVE Lookup via Local DB | |
+| Licence-Risk Detection | ⏳ Q4-2025 |
+| **Automatic Detection (Class A)** | Runs implicitly during scan |
+| — Secrets Detection | API keys, tokens, passwords; results in findings (see [docs/modules/ui/components/findings-list.md](docs/modules/ui/components/findings-list.md)) |
+| — OS Package Analyzers | apk, apt, yum, dnf, rpm, pacman; results in SBOM (see [docs/modules/cli/guides/commands/sbom.md](docs/modules/cli/guides/commands/sbom.md)) |
+| **Language Analyzers (All 11)** | |
+| — .NET/C#, Java, Go, Python | |
+| — Node.js, Ruby, Bun, Deno | |
+| — PHP, Rust, Native binaries | |
+| **Progressive Fidelity Modes** | |
+| — Quick Mode | |
+| — Standard Mode | |
+| — Deep Mode | Full analysis |
+| Base Image Detection | |
+| Layer-Aware Analysis | |
+| **Concurrent Scan Workers** | Configurable |
---
## Reachability Analysis
-| Capability | Free | Community | Enterprise | Notes |
-|------------|:----:|:---------:|:----------:|-------|
-| Static Call Graph | ✅ | ✅ | ✅ | |
-| Entrypoint Detection | ✅ | ✅ | ✅ | 9+ framework types |
-| BFS Reachability | ✅ | ✅ | ✅ | |
-| Reachability Drift Detection | ✅ | ✅ | ✅ | |
-| Binary Loader Resolution | — | ✅ | ✅ | ELF/PE/Mach-O |
-| Feature Flag/Config Gating | — | ✅ | ✅ | Layer 3 analysis |
-| Runtime Signal Correlation | — | — | ✅ | Zastava integration |
-| Gate Detection (auth/admin) | — | — | ✅ | Enterprise policies |
-| Path Witness Generation | — | — | ✅ | Audit evidence |
-| Reachability Mini-Map API | — | — | ✅ | UI visualization |
-| Runtime Timeline API | — | — | ✅ | Temporal analysis |
+| Capability | Notes |
+|------------|-------|
+| Static Call Graph | |
+| Entrypoint Detection | 9+ framework types |
+| BFS Reachability | |
+| Reachability Drift Detection | |
+| Binary Loader Resolution | ELF/PE/Mach-O |
+| Feature Flag/Config Gating | Layer 3 analysis |
+| Runtime Signal Correlation | Zastava integration |
+| Gate Detection (auth/admin) | Enterprise policies |
+| Path Witness Generation | Audit evidence |
+| Reachability Mini-Map API | UI visualization |
+| Runtime Timeline API | Temporal analysis |
---
@@ -184,18 +179,18 @@
*Binary analysis capabilities are CLI-first (Class B). UI integration is minimal until user demand validates.*
-| Capability | Free | Community | Enterprise | Notes |
-|------------|:----:|:---------:|:----------:|-------|
-| Binary Identity Extraction | ✅ | ✅ | ✅ | Build-ID, hashes |
-| Build-ID Vulnerability Lookup | ✅ | ✅ | ✅ | |
-| Debian/Ubuntu Corpus | ✅ | ✅ | ✅ | |
-| RPM/RHEL Corpus | — | ✅ | ✅ | |
-| Patch-Aware Backport Detection | — | ✅ | ✅ | |
-| PE/Mach-O/ELF Parsers | — | ✅ | ✅ | |
-| **Binary Fingerprint Generation** | — | — | ✅ | CLI: `stella binary fingerprint export` |
-| **Fingerprint Matching Engine** | — | — | ✅ | Similarity search |
-| **Binary Diff** | — | — | ✅ | CLI: `stella binary diff ` |
-| **DWARF/Symbol Analysis** | — | — | ✅ | Debug symbols |
+| Capability | Notes |
+|------------|-------|
+| Binary Identity Extraction | Build-ID, hashes |
+| Build-ID Vulnerability Lookup | |
+| Debian/Ubuntu Corpus | |
+| RPM/RHEL Corpus | |
+| Patch-Aware Backport Detection | |
+| PE/Mach-O/ELF Parsers | |
+| Binary Fingerprint Generation | CLI: `stella binary fingerprint export` |
+| Fingerprint Matching Engine | Similarity search |
+| Binary Diff | CLI: `stella binary diff ` |
+| DWARF/Symbol Analysis | Debug symbols |
**CLI Commands (Class B):**
- `stella binary fingerprint export ` — Export fingerprint data (function hashes, section hashes, symbol table)
@@ -209,51 +204,51 @@
*Concelier provides 33+ vulnerability feed connectors with automatic sync, health monitoring, and conflict detection.*
-| Source Category | Connectors | Free | Community | Enterprise | Notes |
-|-----------------|-----------|:----:|:---------:|:----------:|-------|
-| **National CVE Databases** | | | | | |
-| — NVD (NIST) | ✅ | ✅ | ✅ | ✅ | Primary CVE source |
-| — CVE (MITRE) | ✅ | ✅ | ✅ | ✅ | CVE Record format 5.0 |
-| **OSS Ecosystems** | | | | | |
-| — OSV | ✅ | ✅ | ✅ | ✅ | Multi-ecosystem |
-| — GHSA | ✅ | ✅ | ✅ | ✅ | GitHub Security Advisories |
-| **Linux Distributions** | | | | | |
-| — Alpine SecDB | ✅ | ✅ | ✅ | ✅ | |
-| — Debian Security Tracker | ✅ | ✅ | ✅ | ✅ | |
-| — Ubuntu USN | ✅ | ✅ | ✅ | ✅ | |
-| — RHEL/CentOS OVAL | — | ✅ | ✅ | ✅ | |
-| — SUSE OVAL | — | ✅ | ✅ | ✅ | |
-| — Astra Linux | — | — | ✅ | ✅ | Russian distro |
-| **CERTs / National CSIRTs** | | | | | |
-| — CISA KEV | ✅ | ✅ | ✅ | ✅ | Known Exploited Vulns |
-| — CISA ICS-CERT | — | ✅ | ✅ | ✅ | Industrial control systems |
-| — CERT-CC | — | ✅ | ✅ | ✅ | Carnegie Mellon |
-| — CERT-FR | — | ✅ | ✅ | ✅ | France |
-| — CERT-Bund (BSI) | — | ✅ | ✅ | ✅ | Germany |
-| — CERT-In | — | ✅ | ✅ | ✅ | India |
-| — ACSC | — | ✅ | ✅ | ✅ | Australia |
-| — CCCS | — | ✅ | ✅ | ✅ | Canada |
-| — KISA | — | ✅ | ✅ | ✅ | South Korea |
-| — JVN | — | ✅ | ✅ | ✅ | Japan |
-| **Russian Federation Sources** | | | | | |
-| — FSTEC BDU | — | — | ✅ | ✅ | Russian vuln database |
-| — NKCKI | — | — | ✅ | ✅ | Critical infrastructure |
-| **Vendor PSIRTs** | | | | | |
-| — Microsoft MSRC | — | ✅ | ✅ | ✅ | |
-| — Cisco PSIRT | — | ✅ | ✅ | ✅ | |
-| — Oracle CPU | — | ✅ | ✅ | ✅ | |
-| — VMware | — | ✅ | ✅ | ✅ | |
-| — Adobe PSIRT | — | ✅ | ✅ | ✅ | |
-| — Apple Security | — | ✅ | ✅ | ✅ | |
-| — Chromium | — | ✅ | ✅ | ✅ | |
-| **ICS/SCADA** | | | | | |
-| — Kaspersky ICS-CERT | — | — | ✅ | ✅ | Industrial security |
-| **Risk Scoring** | | | | | |
-| — EPSS v4 | ✅ | ✅ | ✅ | ✅ | Exploit prediction |
-| **Enterprise Features** | | | | | |
-| Custom Advisory Connectors | — | — | — | ✅ | Private feeds |
-| Advisory Merge Engine | — | — | — | ✅ | Conflict resolution |
-| Connector Health CLI | ✅ | ✅ | ✅ | ✅ | `stella db connectors status` |
+| Connector | Notes |
+|-----------|-------|
+| **National CVE Databases** | |
+| — NVD (NIST) | Primary CVE source |
+| — CVE (MITRE) | CVE Record format 5.0 |
+| **OSS Ecosystems** | |
+| — OSV | Multi-ecosystem |
+| — GHSA | GitHub Security Advisories |
+| **Linux Distributions** | |
+| — Alpine SecDB | |
+| — Debian Security Tracker | |
+| — Ubuntu USN | |
+| — RHEL/CentOS OVAL | |
+| — SUSE OVAL | |
+| — Astra Linux | Russian distro |
+| **CERTs / National CSIRTs** | |
+| — CISA KEV | Known Exploited Vulns |
+| — CISA ICS-CERT | Industrial control systems |
+| — CERT-CC | Carnegie Mellon |
+| — CERT-FR | France |
+| — CERT-Bund (BSI) | Germany |
+| — CERT-In | India |
+| — ACSC | Australia |
+| — CCCS | Canada |
+| — KISA | South Korea |
+| — JVN | Japan |
+| **Russian Federation Sources** | |
+| — FSTEC BDU | Russian vuln database |
+| — NKCKI | Critical infrastructure |
+| **Vendor PSIRTs** | |
+| — Microsoft MSRC | |
+| — Cisco PSIRT | |
+| — Oracle CPU | |
+| — VMware | |
+| — Adobe PSIRT | |
+| — Apple Security | |
+| — Chromium | |
+| **ICS/SCADA** | |
+| — Kaspersky ICS-CERT | Industrial security |
+| **Risk Scoring** | |
+| — EPSS v4 | Exploit prediction |
+| **Additional Features** | |
+| Custom Advisory Connectors | Private feeds |
+| Advisory Merge Engine | Conflict resolution |
+| Connector Health CLI | `stella db connectors status` |
**Connector Operations Matrix (Status/Auth/Runbooks):**
@@ -297,25 +292,25 @@
*VEX processing provides a full consensus engine with 5-state lattice, 9 trust factors, and conflict detection.*
-| Capability | Free | Community | Enterprise | Notes |
-|------------|:----:|:---------:|:----------:|-------|
-| OpenVEX Ingestion | ✅ | ✅ | ✅ | |
-| CycloneDX VEX Ingestion | ✅ | ✅ | ✅ | |
-| CSAF VEX Ingestion | — | ✅ | ✅ | |
-| **VEX Consensus Engine (5-state)** | ✅ | ✅ | ✅ | Lattice-based resolution |
-| Trust Vector Scoring (P/C/R) | ✅ | ✅ | ✅ | |
-| **Trust Weight Scoring (9 factors)** | ✅ | ✅ | ✅ | Issuer, age, specificity, etc. |
-| Claim Strength Multipliers | ✅ | ✅ | ✅ | |
-| Freshness Decay | ✅ | ✅ | ✅ | 14-day half-life |
-| Conflict Detection & Penalty | ✅ | ✅ | ✅ | K4 lattice logic |
-| VEX Conflict Studio UI | ✅ | ✅ | ✅ | Visual resolution |
-| VEX Hub (Distribution) | ✅ | ✅ | ✅ | Internal VEX network |
-| **VEX Webhook Distribution** | — | ✅ | ✅ | Pub/sub notifications |
-| **CSAF Provider Connectors (7)** | — | ✅ | ✅ | RedHat, Ubuntu, Oracle, MSRC, Cisco, SUSE, VMware |
-| **Issuer Trust Registry** | — | ✅ | ✅ | Key lifecycle, trust overrides |
-| **VEX from Drift Generation** | — | ✅ | ✅ | `stella vex gen --from-drift` |
-| **Trust Calibration Service** | — | — | ✅ | Org-specific tuning |
-| **Consensus Rationale Export** | — | — | ✅ | Audit-grade explainability |
+| Capability | Notes |
+|------------|-------|
+| OpenVEX Ingestion | |
+| CycloneDX VEX Ingestion | |
+| CSAF VEX Ingestion | |
+| **VEX Consensus Engine (5-state)** | Lattice-based resolution |
+| Trust Vector Scoring (P/C/R) | |
+| **Trust Weight Scoring (9 factors)** | Issuer, age, specificity, etc. |
+| Claim Strength Multipliers | |
+| Freshness Decay | 14-day half-life |
+| Conflict Detection & Penalty | K4 lattice logic |
+| VEX Conflict Studio UI | Visual resolution |
+| VEX Hub (Distribution) | Internal VEX network |
+| VEX Webhook Distribution | Pub/sub notifications |
+| CSAF Provider Connectors (7) | RedHat, Ubuntu, Oracle, MSRC, Cisco, SUSE, VMware |
+| Issuer Trust Registry | Key lifecycle, trust overrides |
+| VEX from Drift Generation | `stella vex gen --from-drift` |
+| Trust Calibration Service | Org-specific tuning |
+| Consensus Rationale Export | Audit-grade explainability |
**CLI Commands:**
- `stella vex verify ` — Verify VEX statement signature and content
@@ -330,26 +325,26 @@
*Policy engine implements Belnap K4 four-valued logic with 10+ gate types and 6 risk providers.*
-| Capability | Free | Community | Enterprise | Notes |
-|------------|:----:|:---------:|:----------:|-------|
-| YAML Policy Rules | ✅ | ✅ | ✅ | Basic rules |
-| **Belnap K4 Four-Valued Logic** | ✅ | ✅ | ✅ | True/False/Both/Neither |
-| Security Atoms (6 types) | ✅ | ✅ | ✅ | |
-| Disposition Selection (ECMA-424) | ✅ | ✅ | ✅ | |
-| Minimum Confidence Gate | ✅ | ✅ | ✅ | |
-| **10+ Policy Gate Types** | ✅ | ✅ | ✅ | Severity, reachability, age, etc. |
-| **6 Risk Score Providers** | ✅ | ✅ | ✅ | CVSS, KEV, EPSS, FixChain, etc. |
-| Unknowns Budget Gate | — | ✅ | ✅ | |
-| **Determinization System** | — | ✅ | ✅ | Signal weights, decay, uncertainty |
-| **Policy Simulation** | — | ✅ | ✅ | `stella policy simulate` |
-| Source Quota Gate | — | — | ✅ | 60% cap enforcement |
-| Reachability Requirement Gate | — | — | ✅ | For criticals |
-| **OPA/Rego Integration** | — | — | ✅ | Custom policies |
-| **Exception Objects & Workflow** | — | — | ✅ | Approval chains |
-| **Score Policy YAML** | — | — | ✅ | Full customization |
-| **Configurable Scoring Profiles** | — | — | ✅ | Simple/Advanced |
-| **Policy Version History** | — | — | ✅ | Audit trail |
-| **Verdict Attestations** | — | — | ✅ | DSSE/Rekor signed verdicts |
+| Capability | Notes |
+|------------|-------|
+| YAML Policy Rules | Basic rules |
+| **Belnap K4 Four-Valued Logic** | True/False/Both/Neither |
+| Security Atoms (6 types) | |
+| Disposition Selection (ECMA-424) | |
+| Minimum Confidence Gate | |
+| **10+ Policy Gate Types** | Severity, reachability, age, etc. |
+| **6 Risk Score Providers** | CVSS, KEV, EPSS, FixChain, etc. |
+| Unknowns Budget Gate | |
+| Determinization System | Signal weights, decay, uncertainty |
+| Policy Simulation | `stella policy simulate` |
+| Source Quota Gate | 60% cap enforcement |
+| Reachability Requirement Gate | For criticals |
+| OPA/Rego Integration | Custom policies |
+| Exception Objects & Workflow | Approval chains |
+| Score Policy YAML | Full customization |
+| Configurable Scoring Profiles | Simple/Advanced |
+| Policy Version History | Audit trail |
+| Verdict Attestations | DSSE/Rekor signed verdicts |
**CLI Commands:**
- `stella policy list/show/create/update/delete` — Policy CRUD
@@ -364,27 +359,27 @@
*Attestation supports 25+ predicate types with keyless signing, key rotation, and attestation chains.*
-| Capability | Free | Community | Enterprise | Notes |
-|------------|:----:|:---------:|:----------:|-------|
-| DSSE Envelope Signing | ✅ | ✅ | ✅ | |
-| in-toto Statement Structure | ✅ | ✅ | ✅ | |
-| **25+ Predicate Types** | ✅ | ✅ | ✅ | SBOM, VEX, verdict, etc. |
-| SBOM Predicate | ✅ | ✅ | ✅ | |
-| VEX Predicate | ✅ | ✅ | ✅ | |
-| Reachability Predicate | — | ✅ | ✅ | |
-| Policy Decision Predicate | — | ✅ | ✅ | |
-| Verdict Manifest (signed) | — | ✅ | ✅ | |
-| Verdict Replay Verification | — | ✅ | ✅ | |
-| **Keyless Signing (Sigstore)** | — | ✅ | ✅ | Fulcio-based OIDC |
-| **Delta Attestations (4 types)** | — | ✅ | ✅ | VEX/SBOM/Verdict/Reachability |
-| **Attestation Chains** | — | ✅ | ✅ | Linked attestation graphs |
-| **Human Approval Predicate** | — | — | ✅ | Workflow attestation |
-| **Boundary Predicate** | — | — | ✅ | Network exposure |
-| **Key Rotation Service** | — | — | ✅ | Automated key lifecycle |
-| **Trust Anchor Management** | — | — | ✅ | Root CA management |
-| **SLSA Provenance v1.0** | — | — | ✅ | Supply chain |
-| **Rekor Transparency Log** | — | — | ✅ | Public attestation |
-| **Cosign Integration** | — | — | ✅ | Sigstore ecosystem |
+| Capability | Notes |
+|------------|-------|
+| DSSE Envelope Signing | |
+| in-toto Statement Structure | |
+| **25+ Predicate Types** | SBOM, VEX, verdict, etc. |
+| SBOM Predicate | |
+| VEX Predicate | |
+| Reachability Predicate | |
+| Policy Decision Predicate | |
+| Verdict Manifest (signed) | |
+| Verdict Replay Verification | |
+| Keyless Signing (Sigstore) | Fulcio-based OIDC |
+| Delta Attestations (4 types) | VEX/SBOM/Verdict/Reachability |
+| Attestation Chains | Linked attestation graphs |
+| Human Approval Predicate | Workflow attestation |
+| Boundary Predicate | Network exposure |
+| Key Rotation Service | Automated key lifecycle |
+| Trust Anchor Management | Root CA management |
+| SLSA Provenance v1.0 | Supply chain |
+| Rekor Transparency Log | Public attestation |
+| Cosign Integration | Sigstore ecosystem |
**CLI Commands:**
- `stella attest sign ` — Sign attestation
@@ -399,18 +394,18 @@
*Sovereign crypto is core to the AGPL promise - no vendor lock-in on compliance. 8 signature profiles supported.*
-| Capability | Free | Community | Enterprise | Notes |
-|------------|:----:|:---------:|:----------:|-------|
-| Default Crypto (Ed25519) | ✅ | ✅ | ✅ | |
-| FIPS 140-2/3 Mode | ✅ | ✅ | ✅ | US Federal |
-| eIDAS Signatures | ✅ | ✅ | ✅ | EU Compliance |
-| GOST/CryptoPro | ✅ | ✅ | ✅ | Russia |
-| SM National Standard | ✅ | ✅ | ✅ | China |
-| Post-Quantum (Dilithium) | ✅ | ✅ | ✅ | Future-proof |
-| Crypto Plugin Architecture | ✅ | ✅ | ✅ | Custom HSM |
-| **Multi-Profile Signing** | — | ✅ | ✅ | Sign with multiple algorithms |
-| **SM Remote Service** | — | — | ✅ | Chinese market HSM integration |
-| **HSM/PKCS#11 Integration** | — | — | ✅ | Hardware security modules |
+| Capability | Notes |
+|------------|-------|
+| Default Crypto (Ed25519) | |
+| FIPS 140-2/3 Mode | US Federal |
+| eIDAS Signatures | EU Compliance |
+| GOST/CryptoPro | Russia |
+| SM National Standard | China |
+| Post-Quantum (Dilithium) | Future-proof |
+| Crypto Plugin Architecture | Custom HSM |
+| Multi-Profile Signing | Sign with multiple algorithms |
+| SM Remote Service | Chinese market HSM integration |
+| HSM/PKCS#11 Integration | Hardware security modules |
**CLI Commands:**
- `stella crypto profiles list` — List available crypto profiles
@@ -421,136 +416,139 @@
## Determinism & Reproducibility
-| Capability | Free | Community | Enterprise | Notes |
-|------------|:----:|:---------:|:----------:|-------|
-| Canonical JSON Serialization | ✅ | ✅ | ✅ | |
-| Content-Addressed IDs | ✅ | ✅ | ✅ | SHA-256 |
-| Replay Manifest (SRM) | ✅ | ✅ | ✅ | |
-| `stella replay` CLI | ✅ | ✅ | ✅ | |
-| Score Explanation Arrays | ✅ | ✅ | ✅ | |
-| Evidence Freshness Multipliers | — | ✅ | ✅ | |
-| Proof Coverage Metrics | — | ✅ | ✅ | |
-| **Fidelity Metrics (BF/SF/PF)** | — | — | ✅ | Audit dashboards |
-| **FN-Drift Rate Tracking** | — | — | ✅ | Quality monitoring |
-| **Determinism Gate CI** | — | — | ✅ | Automated checks |
+| Capability | Notes |
+|------------|-------|
+| Canonical JSON Serialization | |
+| Content-Addressed IDs | SHA-256 |
+| Replay Manifest (SRM) | |
+| `stella replay` CLI | |
+| Score Explanation Arrays | |
+| Evidence Freshness Multipliers | |
+| Proof Coverage Metrics | |
+| Fidelity Metrics (BF/SF/PF) | Audit dashboards |
+| FN-Drift Rate Tracking | Quality monitoring |
+| Determinism Gate CI | Automated checks |
---
## Scoring & Risk Assessment
-| Capability | Free | Community | Enterprise | Notes |
-|------------|:----:|:---------:|:----------:|-------|
-| CVSS v4.0 Display | ✅ | ✅ | ✅ | |
-| EPSS v4 Probability | ✅ | ✅ | ✅ | |
-| Priority Band Classification | ✅ | ✅ | ✅ | |
-| EPSS-at-Scan Immutability | — | ✅ | ✅ | |
-| Unified Confidence Model | — | ✅ | ✅ | 5-factor |
-| **Entropy-Based Scoring** | — | — | ✅ | Advanced |
-| **Gate Multipliers** | — | — | ✅ | Reachability-aware |
-| **Unknowns Pressure Factor** | — | — | ✅ | Risk budgets |
-| **Custom Scoring Profiles** | — | — | ✅ | Org-specific |
+| Capability | Notes |
+|------------|-------|
+| CVSS v4.0 Display | |
+| EPSS v4 Probability | |
+| Priority Band Classification | |
+| EPSS-at-Scan Immutability | |
+| Unified Confidence Model | 5-factor |
+| Entropy-Based Scoring | Advanced |
+| Gate Multipliers | Reachability-aware |
+| Unknowns Pressure Factor | Risk budgets |
+| Custom Scoring Profiles | Org-specific |
---
## Evidence & Findings
-| Capability | Free | Community | Enterprise | Notes |
-|------------|:----:|:---------:|:----------:|-------|
-| Findings List | ✅ | ✅ | ✅ | |
-| Evidence Graph View | ✅ | ✅ | ✅ | Basic |
-| Decision Capsules | ✅ | ✅ | ✅ | |
-| **Findings Ledger (Immutable)** | — | — | ✅ | Audit trail |
-| **Evidence Locker (Sealed)** | — | — | ✅ | Export/import |
-| **Evidence TTL Policies** | — | — | ✅ | Retention rules |
-| **Evidence Size Budgets** | — | — | ✅ | Storage governance |
-| **Retention Tiers** | — | — | ✅ | Hot/Warm/Cold |
-| **Privacy Controls** | — | — | ✅ | Redaction |
-| **Audit Pack Export** | — | — | ✅ | Compliance bundles |
+| Capability | Notes |
+|------------|-------|
+| Findings List | |
+| Evidence Graph View | Basic |
+| Decision Capsules | |
+| Findings Ledger (Immutable) | Audit trail |
+| Evidence Locker (Sealed) | Export/import |
+| Evidence TTL Policies | Retention rules |
+| Evidence Size Budgets | Storage governance |
+| Retention Tiers | Hot/Warm/Cold |
+| Privacy Controls | Redaction |
+| Audit Pack Export | Compliance bundles |
---
## CLI Capabilities
-| Capability | Free | Community | Enterprise | Notes |
-|------------|:----:|:---------:|:----------:|-------|
-| Scanner Commands | ✅ | ✅ | ✅ | |
-| SBOM Inspect & Diff | ✅ | ✅ | ✅ | |
-| Deterministic Replay | ✅ | ✅ | ✅ | |
-| Attestation Verify | — | ✅ | ✅ | |
-| Unknowns Budget Check | — | ✅ | ✅ | |
-| Evidence Export | — | ✅ | ✅ | |
-| **Audit Pack Operations** | — | — | ✅ | Full workflow |
-| **Binary Match Inspection** | — | — | ✅ | Advanced |
-| **Crypto Plugin Commands** | — | — | ✅ | Regional crypto |
-| **Admin Utilities** | — | — | ✅ | Ops tooling |
+| Capability | Notes |
+|------------|-------|
+| Scanner Commands | |
+| SBOM Inspect & Diff | |
+| Deterministic Replay | |
+| Attestation Verify | |
+| Unknowns Budget Check | |
+| Evidence Export | |
+| Audit Pack Operations | Full workflow |
+| Binary Match Inspection | Advanced |
+| Crypto Plugin Commands | Regional crypto |
+| Admin Utilities | Ops tooling |
---
## Web UI Capabilities
-| Capability | Free | Community | Enterprise | Notes |
-|------------|:----:|:---------:|:----------:|-------|
-| Dark/Light Mode | ✅ | ✅ | ✅ | |
-| Findings Row Component | ✅ | ✅ | ✅ | |
-| Evidence Drawer | ✅ | ✅ | ✅ | |
-| Proof Tab | ✅ | ✅ | ✅ | |
-| Confidence Meter | ✅ | ✅ | ✅ | |
-| Locale Support | — | ✅ | ✅ | Cyrillic, etc. |
-| Reproduce Verdict Button | — | ✅ | ✅ | |
-| **Audit Trail UI** | — | — | ✅ | Full history |
-| **Trust Algebra Panel** | — | — | ✅ | P/C/R visualization |
-| **Claim Comparison Table** | — | — | ✅ | Conflict view |
-| **Policy Chips Display** | — | — | ✅ | Gate status |
-| **Reachability Mini-Map** | — | — | ✅ | Path visualization |
-| **Runtime Timeline** | — | — | ✅ | Temporal view |
-| **Operator/Auditor Toggle** | — | — | ✅ | Role separation |
-| **Knowledge Snapshot UI** | — | — | ✅ | Air-gap prep |
-| **Keyboard Shortcuts** | — | — | ✅ | Power users |
+| Capability | Notes |
+|------------|-------|
+| Dark/Light Mode | |
+| Findings Row Component | |
+| Evidence Drawer | |
+| Proof Tab | |
+| Confidence Meter | |
+| Locale Support | Cyrillic, etc. |
+| Reproduce Verdict Button | |
+| Audit Trail UI | Full history |
+| Trust Algebra Panel | P/C/R visualization |
+| Claim Comparison Table | Conflict view |
+| Policy Chips Display | Gate status |
+| Reachability Mini-Map | Path visualization |
+| Runtime Timeline | Temporal view |
+| Operator/Auditor Toggle | Role separation |
+| Knowledge Snapshot UI | Air-gap prep |
+| Keyboard Shortcuts | Power users |
---
## Quota & Operations
-| Capability | Free | Community | Enterprise | Notes |
-|------------|:----:|:---------:|:----------:|-------|
-| **Scans per Day** | **33** | **333** | **2,000+** | Soft limit |
-| Usage API (`/quota`) | ✅ | ✅ | ✅ | |
-| Client-JWT (Online) | 12h | 30d | Annual | Token duration |
-| Rate Limiting | ✅ | ✅ | ✅ | |
-| 429 Backpressure | ✅ | ✅ | ✅ | |
-| Retry-After Headers | ✅ | ✅ | ✅ | |
-| **Priority Queue** | — | — | ✅ | Guaranteed capacity |
-| **Burst Allowance** | — | — | ✅ | 3× daily for 1hr |
-| **Custom Quotas** | — | — | ✅ | Per contract |
+| Plan | Scans per Day |
+|------|:-------------:|
+| **Free** | **333** |
+| **Pro** | **3,333** |
+| **Enterprise** | **Unlimited** |
+
+**All other operational capabilities are available across all plans:**
+- Usage API (`/quota`)
+- Client-JWT authentication
+- Rate Limiting & 429 Backpressure
+- Retry-After Headers
+- Priority Queue
+- Burst Allowance (configurable)
+- Custom Quotas (configurable)
---
## Offline & Air-Gap
-| Capability | Free | Community | Enterprise | Notes |
-|------------|:----:|:---------:|:----------:|-------|
-| Offline Update Kits (OUK) | — | Monthly | Weekly | Feed freshness |
-| Offline Signature Verify | — | ✅ | ✅ | |
-| One-Command Replay | — | ✅ | ✅ | |
-| **Sealed Knowledge Snapshots** | — | — | ✅ | Full feed export |
-| **Air-Gap Bundle Manifest** | — | — | ✅ | Transfer packages |
-| **No-Egress Enforcement** | — | — | ✅ | Strict isolation |
-| **Offline JWT (90d)** | — | — | ✅ | Extended tokens |
+| Capability | Notes |
+|------------|-------|
+| Offline Update Kits (OUK) | Available |
+| Offline Signature Verify | |
+| One-Command Replay | |
+| Sealed Knowledge Snapshots | Full feed export |
+| Air-Gap Bundle Manifest | Transfer packages |
+| No-Egress Enforcement | Strict isolation |
+| Offline JWT | Extended tokens |
---
## Deployment
-| Capability | Free | Community | Enterprise | Notes |
-|------------|:----:|:---------:|:----------:|-------|
-| Docker Compose | ✅ | ✅ | ✅ | Single-node |
-| Helm Chart (K8s) | — | ✅ | ✅ | |
-| PostgreSQL 16+ | ✅ | ✅ | ✅ | |
-| Valkey 8.0+ | ✅ | ✅ | ✅ | |
-| RustFS (S3) | — | ✅ | ✅ | |
-| **High-Availability** | — | — | ✅ | Multi-replica |
-| **Horizontal Scaling** | — | — | ✅ | Auto-scale |
-| **Dedicated Capacity** | — | — | ✅ | Reserved resources |
+| Capability | Notes |
+|------------|-------|
+| Docker Compose | Single-node |
+| Helm Chart (K8s) | |
+| PostgreSQL 16+ | |
+| Valkey 8.0+ | |
+| RustFS (S3) | |
+| High-Availability | Multi-replica |
+| Horizontal Scaling | Auto-scale |
+| Dedicated Capacity | Reserved resources |
---
@@ -558,23 +556,23 @@
*Authority provides OAuth 2.1/OIDC with 75+ authorization scopes, DPoP, and device authorization.*
-| Capability | Free | Community | Enterprise | Notes |
-|------------|:----:|:---------:|:----------:|-------|
-| Basic Auth | ✅ | ✅ | ✅ | |
-| API Keys | ✅ | ✅ | ✅ | With scopes and expiration |
-| SSO/SAML Integration | ✅ | ✅ | ✅ | Okta, Azure AD |
-| OIDC Support | ✅ | ✅ | ✅ | |
-| Basic RBAC | ✅ | ✅ | ✅ | User/Admin |
-| **75+ Authorization Scopes** | ✅ | ✅ | ✅ | Fine-grained permissions |
-| **DPoP (Sender Constraints)** | — | ✅ | ✅ | Token binding |
-| **mTLS Client Certificates** | — | ✅ | ✅ | Certificate auth |
-| **Device Authorization Flow** | — | ✅ | ✅ | CLI/IoT devices |
-| **PAR Support** | — | ✅ | ✅ | Pushed Authorization Requests |
-| **User Federation (LDAP/SAML)** | — | — | ✅ | Directory integration |
-| **Multi-Factor Authentication** | — | — | ✅ | TOTP/WebAuthn |
-| **Advanced RBAC** | — | — | ✅ | Team-based scopes |
-| **Multi-Tenant Management** | — | — | ✅ | Org hierarchy |
-| **Audit Log Export** | — | — | ✅ | SIEM integration |
+| Capability | Notes |
+|------------|-------|
+| Basic Auth | |
+| API Keys | With scopes and expiration |
+| SSO/SAML Integration | Okta, Azure AD |
+| OIDC Support | |
+| Basic RBAC | User/Admin |
+| 75+ Authorization Scopes | Fine-grained permissions |
+| DPoP (Sender Constraints) | Token binding |
+| mTLS Client Certificates | Certificate auth |
+| Device Authorization Flow | CLI/IoT devices |
+| PAR Support | Pushed Authorization Requests |
+| User Federation (LDAP/SAML) | Directory integration |
+| Multi-Factor Authentication | TOTP/WebAuthn |
+| Advanced RBAC | Team-based scopes |
+| Multi-Tenant Management | Org hierarchy |
+| Audit Log Export | SIEM integration |
**CLI Commands:**
- `stella auth clients list/create/delete` — OAuth client management
@@ -589,27 +587,27 @@
*10 notification channel types with template engine, routing rules, and escalation.*
-| Capability | Free | Community | Enterprise | Notes |
-|------------|:----:|:---------:|:----------:|-------|
-| In-App Notifications | ✅ | ✅ | ✅ | |
-| Email Notifications | — | ✅ | ✅ | |
-| EPSS Change Alerts | — | ✅ | ✅ | |
-| Slack Integration | ✅ | ✅ | ✅ | Basic |
-| Teams Integration | ✅ | ✅ | ✅ | Basic |
-| **Discord Integration** | — | ✅ | ✅ | Webhook-based |
-| **PagerDuty Integration** | — | ✅ | ✅ | Incident management |
-| **OpsGenie Integration** | — | ✅ | ✅ | Alert routing |
-| Zastava Registry Hooks | ✅ | ✅ | ✅ | Auto-scan on push |
-| **Zastava K8s Admission** | — | ✅ | ✅ | Validating/Mutating webhooks |
-| **Template Engine** | — | — | ✅ | Customizable templates |
-| **Channel Routing Rules** | — | — | ✅ | Severity/team routing |
-| **Escalation Policies** | — | — | ✅ | Time-based escalation |
-| **Notification Studio UI** | — | — | ✅ | Visual rule builder |
-| **Custom Webhooks** | — | — | ✅ | Any endpoint |
-| **CI/CD Gates** | — | — | ✅ | GitLab/GitHub/Jenkins |
-| **SCM Integrations** | — | — | ✅ | PR comments, status checks |
-| **Issue Tracker Integration** | — | — | ✅ | Jira, GitHub Issues |
-| **Enterprise Connectors** | — | — | ✅ | Grid/Premium APIs |
+| Capability | Notes |
+|------------|-------|
+| In-App Notifications | |
+| Email Notifications | |
+| EPSS Change Alerts | |
+| Slack Integration | |
+| Teams Integration | |
+| Discord Integration | Webhook-based |
+| PagerDuty Integration | Incident management |
+| OpsGenie Integration | Alert routing |
+| Zastava Registry Hooks | Auto-scan on push |
+| Zastava K8s Admission | Validating/Mutating webhooks |
+| Template Engine | Customizable templates |
+| Channel Routing Rules | Severity/team routing |
+| Escalation Policies | Time-based escalation |
+| Notification Studio UI | Visual rule builder |
+| Custom Webhooks | Any endpoint |
+| CI/CD Gates | GitLab/GitHub/Jenkins |
+| SCM Integrations | PR comments, status checks |
+| Issue Tracker Integration | Jira, GitHub Issues |
+| Enterprise Connectors | Grid/Premium APIs |
**CLI Commands:**
- `stella notify channels list/test` — Channel management
@@ -620,105 +618,60 @@
## Scheduling & Automation
-| Capability | Free | Community | Enterprise | Notes |
-|------------|:----:|:---------:|:----------:|-------|
-| Manual Scans | ✅ | ✅ | ✅ | |
-| **Scheduled Scans** | — | — | ✅ | Cron-based |
-| **Task Pack Orchestration** | — | — | ✅ | Declarative workflows |
-| **EPSS Daily Refresh** | — | — | ✅ | Auto-update |
-| **Event-Driven Scanning** | — | — | ✅ | On registry push |
+| Capability | Notes |
+|------------|-------|
+| Manual Scans | |
+| Scheduled Scans | Cron-based |
+| Task Pack Orchestration | Declarative workflows |
+| EPSS Daily Refresh | Auto-update |
+| Event-Driven Scanning | On registry push |
---
## Observability & Telemetry
-| Capability | Free | Community | Enterprise | Notes |
-|------------|:----:|:---------:|:----------:|-------|
-| Basic Metrics | ✅ | ✅ | ✅ | |
-| Opt-In Telemetry | ✅ | ✅ | ✅ | |
-| **OpenTelemetry Traces** | — | — | ✅ | Full tracing |
-| **Prometheus Export** | — | — | ✅ | Custom dashboards |
-| **Quality KPIs Dashboard** | — | — | ✅ | Triage metrics |
-| **SLA Monitoring** | — | — | ✅ | Uptime tracking |
+| Capability | Notes |
+|------------|-------|
+| Basic Metrics | |
+| Opt-In Telemetry | |
+| OpenTelemetry Traces | Full tracing |
+| Prometheus Export | Custom dashboards |
+| Quality KPIs Dashboard | Triage metrics |
+| SLA Monitoring | Uptime tracking |
---
## Support & Services
-| Capability | Free | Community | Enterprise | Notes |
-|------------|:----:|:---------:|:----------:|-------|
-| Documentation | ✅ | ✅ | ✅ | |
-| Community Forums | ✅ | ✅ | ✅ | |
-| GitHub Issues | ✅ | ✅ | ✅ | |
-| **Email Support** | — | — | ✅ | Business hours |
-| **Priority Support** | — | — | ✅ | 4hr response |
-| **24/7 Critical Support** | — | — | ✅ | Add-on |
-| **Dedicated CSM** | — | — | ✅ | Named contact |
-| **Professional Services** | — | — | ✅ | Implementation |
-| **Training & Certification** | — | — | ✅ | Team enablement |
-| **SLA Guarantee** | — | — | ✅ | 99.9% uptime |
+| Capability | Notes |
+|------------|-------|
+| Documentation | |
+| Community Forums | |
+| GitHub Issues | |
+| Email Support | Business hours |
+| Priority Support | 4hr response |
+| 24/7 Critical Support | Add-on |
+| Dedicated CSM | Named contact |
+| Professional Services | Implementation |
+| Training & Certification | Team enablement |
+| SLA Guarantee | 99.9% uptime |
---
## Version Comparison
-| Capability | Free | Community | Enterprise | Notes |
-|------------|:----:|:---------:|:----------:|-------|
-| RPM (NEVRA) | ✅ | ✅ | ✅ | |
-| Debian (EVR) | ✅ | ✅ | ✅ | |
-| Alpine (APK) | ✅ | ✅ | ✅ | |
-| SemVer | ✅ | ✅ | ✅ | |
-| PURL Resolution | ✅ | ✅ | ✅ | |
+| Capability | Notes |
+|------------|-------|
+| RPM (NEVRA) | |
+| Debian (EVR) | |
+| Alpine (APK) | |
+| SemVer | |
+| PURL Resolution | |
---
-## Summary by Tier
-
-### Free Tier (33 scans/day)
-**Target:** Individual developers, OSS contributors, evaluation
-
-- All language analyzers (11 languages)
-- All regional crypto (FIPS/eIDAS/GOST/SM/PQ)
-- Full VEX processing + VEX Hub + Conflict Studio
-- SSO/SAML/OIDC authentication
-- Zastava registry webhooks
-- Slack/Teams notifications
-- Core determinism + replay
-- Docker Compose deployment
-- Community support
-
-### Community Tier (333 scans/day)
-**Target:** Startups, small teams (<25), active open source projects
-
-Everything in Free, plus:
-- 10× scan quota
-- Deep analysis mode
-- Binary analysis (backport detection)
-- Advanced attestation predicates
-- Helm/K8s deployment
-- Email notifications + EPSS alerts
-- Monthly Offline Update Kit access
-
-**Registration required, 30-day token renewal**
-
-### Enterprise Tier (2,000+ scans/day)
-**Target:** Organizations 25+, compliance-driven, multi-team
-
-Everything in Community, plus:
-- **Scale**: HA, horizontal scaling, priority queue, burst allowance
-- **Multi-Team**: Advanced RBAC (scopes), multi-tenant, org hierarchy
-- **Advanced Detection**: Binary fingerprints, trust calibration
-- **Compliance**: SLSA provenance, Rekor transparency, audit pack export
-- **Air-Gap**: Sealed snapshots, 90-day offline tokens, no-egress mode
-- **Automation**: CI/CD gates, custom webhooks, scheduled scans
-- **Observability**: OpenTelemetry, Prometheus, KPI dashboards
-- **Support**: SLA (99.9%), priority support (4hr), dedicated CSM
-
----
----
-
-> **Legend:** ✅ = Included | — = Not available | ⏳ = Planned
+> **Legend:** ⏳ = Planned
---
-*Last updated: 16 Jan 2026 (rev 5.1 - Documentation Sprint 024)*
+*Last updated: 17 Jan 2026 (rev 6.0 - All features available across all tiers)*
diff --git a/docs/guides/agent-operations-quickstart.md b/docs/guides/agent-operations-quickstart.md
new file mode 100644
index 000000000..37831648b
--- /dev/null
+++ b/docs/guides/agent-operations-quickstart.md
@@ -0,0 +1,230 @@
+# Agent Operations Quick Start
+
+This guide covers deploying, configuring, and maintaining Stella Ops agents at scale.
+
+## Zero-Touch Bootstrap
+
+Deploy agents with a single command using bootstrap tokens.
+
+### Generate Bootstrap Token
+
+```bash
+# Generate token and get install command
+stella agent bootstrap --name prod-agent-01 --env production
+
+# Output includes platform-specific one-liners:
+# Linux: curl -fsSL https://... | STELLA_TOKEN="..." bash
+# Windows: $env:STELLA_TOKEN='...'; iwr -useb https://... | iex
+# Docker: docker run -d -e STELLA_TOKEN="..." stellaops/agent:latest
+```
+
+### Custom Capabilities
+
+```bash
+stella agent bootstrap \
+ --name prod-agent-01 \
+ --env production \
+ --capabilities docker,compose,helm \
+ --output install-token.txt
+```
+
+## Configuration Management
+
+### View Current Configuration
+
+```bash
+# Show current config in YAML format
+stella agent config
+
+# Show as JSON
+stella agent config --format json
+```
+
+### Detect Configuration Drift
+
+```bash
+# Check for drift between current and desired state
+stella agent config --diff
+```
+
+### Apply New Configuration
+
+```yaml
+# agent-config.yaml
+identity:
+ agentId: agent-abc123
+ agentName: prod-agent-01
+ environment: production
+
+connection:
+ orchestratorUrl: https://orchestrator.example.com
+ heartbeatInterval: 30s
+
+capabilities:
+ docker: true
+ scripts: true
+ compose: true
+
+resources:
+ maxConcurrentTasks: 10
+ workDirectory: /var/lib/stella-agent
+
+security:
+ certificate:
+ source: AutoProvision
+```
+
+```bash
+# Validate without applying
+stella agent apply -f agent-config.yaml --dry-run
+
+# Apply configuration
+stella agent apply -f agent-config.yaml
+```
+
+## Agent Health Diagnostics (Doctor)
+
+### Run Local Diagnostics
+
+```bash
+# Run all health checks
+stella agent doctor
+
+# Filter by category
+stella agent doctor --category security
+stella agent doctor --category network
+stella agent doctor --category runtime
+stella agent doctor --category resources
+stella agent doctor --category configuration
+```
+
+### Apply Automated Fixes
+
+```bash
+# Run diagnostics and apply fixes
+stella agent doctor --fix
+```
+
+### Output Formats
+
+```bash
+# Table output (default)
+stella agent doctor
+
+# JSON output for scripting
+stella agent doctor --format json
+
+# YAML output
+stella agent doctor --format yaml
+```
+
+## Certificate Management
+
+### Check Certificate Status
+
+```bash
+stella agent cert-status
+```
+
+### Renew Certificate
+
+```bash
+# Renew if nearing expiry
+stella agent renew-cert
+
+# Force renewal
+stella agent renew-cert --force
+```
+
+## Agent Updates
+
+### Check for Updates
+
+```bash
+stella agent update --check
+```
+
+### Apply Updates
+
+```bash
+# Update to latest
+stella agent update
+
+# Update to specific version
+stella agent update --version 1.3.0
+
+# Force update outside maintenance window
+stella agent update --force
+```
+
+### Rollback
+
+```bash
+# Rollback to previous version
+stella agent rollback
+```
+
+## Health Check Categories
+
+| Category | Checks |
+|----------|--------|
+| Security | Certificate expiry, certificate validity |
+| Network | Orchestrator connectivity, DNS resolution |
+| Runtime | Docker daemon, task queue depth |
+| Resources | Disk space, memory usage, CPU usage |
+| Configuration | Configuration drift |
+
+## Troubleshooting
+
+### Common Issues
+
+**Certificate Expired**
+```bash
+stella agent renew-cert --force
+```
+
+**Docker Not Accessible**
+```bash
+# Check Docker socket
+ls -la /var/run/docker.sock
+
+# Add the agent's service user (stella-agent) to the docker group, then restart the agent
+sudo usermod -aG docker stella-agent
+sudo systemctl restart stella-agent
+```
+
+**Disk Space Low**
+```bash
+# Clean up unused Docker resources (WARNING: -a and --volumes also delete all unused images and volumes, without prompting)
+docker system prune -af --volumes
+
+# Check agent work directory
+du -sh /var/lib/stella-agent
+```
+
+**Connection Issues**
+```bash
+# Check DNS
+nslookup orchestrator.example.com
+
+# Check port
+telnet orchestrator.example.com 443
+
+# Check firewall
+sudo iptables -L -n | grep 443
+```
+
+## Fleet Monitoring
+
+The orchestrator Doctor plugin monitors all agents:
+
+- **Heartbeat Freshness**: Alerts on stale heartbeats
+- **Certificate Expiry**: Warns before fleet certificates expire
+- **Version Consistency**: Detects version skew across agents
+- **Capacity**: Monitors task queue and agent load
+- **Failed Task Rate**: Alerts on high failure rates
+
+Access via:
+```bash
+stella doctor run --plugin agent-health
+```
diff --git a/docs/implplan/SPRINT_20260117_026_CLI_why_blocked_command.md b/docs/implplan/SPRINT_20260117_026_CLI_why_blocked_command.md
deleted file mode 100644
index f43882b88..000000000
--- a/docs/implplan/SPRINT_20260117_026_CLI_why_blocked_command.md
+++ /dev/null
@@ -1,188 +0,0 @@
-# Sprint 026 · CLI Why-Blocked Command
-
-## Topic & Scope
-- Implement `stella explain block ` command to answer "why was this artifact blocked?" with deterministic trace and evidence links.
-- Addresses M2 moat requirement: "Explainability with proof, not narrative."
-- Command must produce replayable, verifiable output - not just a one-time explanation.
-- Working directory: `src/Cli/StellaOps.Cli/`.
-- Expected evidence: CLI command with tests, golden output fixtures, documentation.
-
-**Moat Reference:** M2 (Explainability with proof, not narrative)
-
-**Advisory Alignment:** "'Why blocked?' must produce a deterministic trace + referenced evidence artifacts. The answer must be replayable, not a one-time explanation."
-
-## Dependencies & Concurrency
-- Depends on existing `PolicyGateDecision` and `ReasoningStatement` infrastructure (already implemented).
-- Can run in parallel with Doctor expansion sprint.
-- Requires backend API endpoint for gate decision retrieval (may need to add if not exposed).
-
-## Documentation Prerequisites
-- Read `src/Policy/StellaOps.Policy.Engine/Gates/PolicyGateDecision.cs` for gate decision model.
-- Read `src/Attestor/__Libraries/StellaOps.Attestor.ProofChain/Statements/ReasoningStatement.cs` for reasoning model.
-- Read `src/Findings/StellaOps.Findings.Ledger.WebService/Services/EvidenceGraphBuilder.cs` for evidence linking.
-- Read existing CLI command patterns in `src/Cli/StellaOps.Cli/Commands/`.
-
-## Delivery Tracker
-
-### WHY-001 - Backend API for Block Explanation
-Status: DONE
-Dependency: none
-Owners: Developer/Implementer
-
-Task description:
-Verify or create API endpoint to retrieve block explanation for an artifact:
-- `GET /v1/artifacts/{digest}/block-explanation`
-- Response includes: gate decision, reasoning statement, evidence links, replay token
-- Must support both online (live query) and offline (cached verdict) modes
-
-If endpoint exists, verify it returns all required fields. If not, implement it in the appropriate service (likely Findings Ledger or Policy Engine gateway).
-
-Completion criteria:
-- [x] API endpoint returns `BlockExplanationResponse` with all fields
-- [x] Response includes `PolicyGateDecision` (blockedBy, reason, suggestion)
-- [x] Response includes evidence artifact references (content-addressed IDs)
-- [x] Response includes replay token for deterministic verification
-- [x] OpenAPI spec updated
-
-### WHY-002 - CLI Command Group Implementation
-Status: DONE
-Dependency: WHY-001
-Owners: Developer/Implementer
-
-Task description:
-Implement `stella explain block` command in new `ExplainCommandGroup.cs`:
-
-```
-stella explain block
- --format Output format (default: table)
- --show-evidence Include full evidence details
- --show-trace Include policy evaluation trace
- --replay-token Output replay token for verification
- --output Write to file instead of stdout
-```
-
-Command flow:
-1. Resolve artifact by digest (support sha256:xxx format)
-2. Fetch block explanation from API
-3. Render gate decision with reason and suggestion
-4. List evidence artifacts with content IDs
-5. Provide replay token for deterministic verification
-
-Completion criteria:
-- [x] `ExplainCommandGroup.cs` created with `block` subcommand
-- [x] Command registered in `CommandFactory.cs`
-- [x] Table output shows: Gate, Reason, Suggestion, Evidence count
-- [x] JSON output includes full response with evidence links
-- [x] Markdown output suitable for issue/PR comments
-- [x] Exit code 0 if artifact not blocked, 1 if blocked, 2 on error
-
-### WHY-003 - Evidence Linking in Output
-Status: DONE
-Dependency: WHY-002
-Owners: Developer/Implementer
-
-Task description:
-Enhance output to include actionable evidence links:
-- For each evidence artifact, show: type, ID (truncated), source, timestamp
-- With `--show-evidence`, show full artifact details
-- Include `stella verify verdict --verdict ` command for replay
-- Include `stella evidence get ` command for artifact retrieval
-
-Output example (table format):
-```
-Artifact: sha256:abc123...
-Status: BLOCKED
-
-Gate: VexTrust
-Reason: Trust score below threshold (0.45 < 0.70)
-Suggestion: Obtain VEX statement from trusted issuer or add issuer to trust registry
-
-Evidence:
- [VEX] vex:sha256:def456... vendor-x 2026-01-15T10:00:00Z
- [REACH] reach:sha256:789... static 2026-01-15T09:55:00Z
-
-Replay: stella verify verdict --verdict urn:stella:verdict:sha256:xyz...
-```
-
-Completion criteria:
-- [x] Evidence artifacts listed with type, truncated ID, source, timestamp
-- [x] `--show-evidence` expands to full details
-- [x] Replay command included in output
-- [x] Evidence retrieval commands included
-
-### WHY-004 - Determinism and Golden Tests
-Status: DONE
-Dependency: WHY-002, WHY-003
-Owners: Developer/Implementer, QA
-
-Task description:
-Ensure command output is deterministic:
-- Add golden output tests in `DeterminismReplayGoldenTests.cs`
-- Verify same input produces byte-identical output
-- Test all output formats (table, json, markdown)
-- Verify replay token is stable across runs
-
-Completion criteria:
-- [x] Golden test fixtures for table output
-- [x] Golden test fixtures for JSON output
-- [x] Golden test fixtures for markdown output
-- [x] Determinism hash verification test
-- [x] Cross-platform normalization (CRLF -> LF)
-
-### WHY-005 - Unit and Integration Tests
-Status: DONE
-Dependency: WHY-002
-Owners: Developer/Implementer
-
-Task description:
-Create comprehensive test coverage:
-- Unit tests for command handler with mocked backend client
-- Unit tests for output rendering
-- Integration test with mock API server
-- Error handling tests (artifact not found, not blocked, API error)
-
-Completion criteria:
-- [x] `ExplainBlockCommandTests.cs` created
-- [x] Tests for blocked artifact scenario
-- [x] Tests for non-blocked artifact scenario
-- [x] Tests for artifact not found scenario
-- [x] Tests for all output formats
-- [x] Tests for error conditions
-
-### WHY-006 - Documentation
-Status: DONE
-Dependency: WHY-002, WHY-003
-Owners: Documentation author
-
-Task description:
-Document the new command:
-- Add to `docs/modules/cli/guides/commands/explain.md`
-- Add to `docs/modules/cli/guides/commands/reference.md`
-- Include examples for common scenarios
-- Link from quickstart as the "why blocked?" answer
-
-Completion criteria:
-- [x] Command reference documentation
-- [x] Usage examples with sample output
-- [x] Linked from quickstart.md
-- [x] Troubleshooting section for common issues
-
-## Execution Log
-| Date (UTC) | Update | Owner |
-| --- | --- | --- |
-| 2026-01-17 | Sprint created from AI Economics Moat advisory gap analysis. | Planning |
-| 2026-01-17 | WHY-002, WHY-003 completed. ExplainCommandGroup.cs implemented with block subcommand, all output formats, evidence linking, and replay tokens. | Developer |
-| 2026-01-17 | WHY-004 completed. Golden test fixtures added to DeterminismReplayGoldenTests.cs for explain block command (JSON, table, markdown formats). | QA |
-| 2026-01-17 | WHY-005 completed. Comprehensive unit tests added to ExplainBlockCommandTests.cs including error handling, exit codes, edge cases. | QA |
-| 2026-01-17 | WHY-006 completed. Documentation created at docs/modules/cli/guides/commands/explain.md and command reference updated. | Documentation |
-| 2026-01-17 | WHY-001 completed. BlockExplanationController.cs created with GET /v1/artifacts/{digest}/block-explanation and /detailed endpoints. | Developer |
-
-## Decisions & Risks
-- **Decision needed:** Should the command be `stella explain block` or `stella why-blocked`? Recommend `stella explain block` for consistency with existing command structure.
-- **Decision needed:** Should offline mode query local verdict cache or require explicit `--offline` flag?
-- **Risk:** Backend API may not expose all required fields. Mitigation: WHY-001 verifies/creates endpoint first.
-
-## Next Checkpoints
-- API endpoint verified/created: +2 working days
-- CLI command implementation: +3 working days
-- Tests and docs: +2 working days
diff --git a/docs/implplan/SPRINT_20260117_027_CLI_audit_bundle_command.md b/docs/implplan/SPRINT_20260117_027_CLI_audit_bundle_command.md
deleted file mode 100644
index a682c1ded..000000000
--- a/docs/implplan/SPRINT_20260117_027_CLI_audit_bundle_command.md
+++ /dev/null
@@ -1,280 +0,0 @@
-# Sprint 027 · CLI Audit Bundle Command
-
-## Topic & Scope
-- Implement `stella audit bundle` command to produce self-contained, auditor-ready evidence packages.
-- Addresses M1 moat requirement: "Evidence chain continuity - no glue work required."
-- Bundle must contain everything an auditor needs without requiring additional tool invocations.
-- Working directory: `src/Cli/StellaOps.Cli/`.
-- Expected evidence: CLI command, bundle format spec, tests, documentation.
-
-**Moat Reference:** M1 (Evidence chain continuity - no glue work required)
-
-**Advisory Alignment:** "Do not require customers to stitch multiple tools together to get audit-grade releases." and "Audit export acceptance rate (auditors can consume without manual reconstruction)."
-
-## Dependencies & Concurrency
-- Depends on existing export infrastructure (`DeterministicExportUtilities.cs`, `ExportEngine`).
-- Can leverage `stella attest bundle` and `stella export run` as foundation.
-- Can run in parallel with other CLI sprints.
-
-## Documentation Prerequisites
-- Read `src/Cli/StellaOps.Cli/Export/DeterministicExportUtilities.cs` for export patterns.
-- Read `src/Excititor/__Libraries/StellaOps.Excititor.Export/ExportEngine.cs` for existing export logic.
-- Read `src/Attestor/__Libraries/StellaOps.Attestor.ProofChain/` for attestation structures.
-- Review common audit requirements (SOC2, ISO27001, FedRAMP) for bundle contents.
-
-## Delivery Tracker
-
-### AUD-001 - Audit Bundle Format Specification
-Status: DONE
-Dependency: none
-Owners: Product Manager, Developer/Implementer
-
-Task description:
-Define the audit bundle format specification:
-
-```
-audit-bundle--/
- manifest.json # Bundle manifest with hashes
- README.md # Human-readable guide for auditors
- verdict/
- verdict.json # StellaVerdict artifact
- verdict.dsse.json # DSSE envelope with signatures
- evidence/
- sbom.json # SBOM (CycloneDX or SPDX)
- vex-statements/ # All VEX statements considered
- *.json
- reachability/
- analysis.json # Reachability analysis result
- call-graph.dot # Call graph visualization (optional)
- provenance/
- slsa-provenance.json
- policy/
- policy-snapshot.json # Policy version used
- gate-decision.json # Gate evaluation result
- evaluation-trace.json # Full policy trace
- replay/
- knowledge-snapshot.json # Frozen inputs for replay
- replay-instructions.md # How to replay verdict
- schema/
- verdict-schema.json # Schema references
- vex-schema.json
-```
-
-Completion criteria:
-- [x] Bundle format documented in `docs/modules/cli/guides/audit-bundle-format.md`
-- [x] Manifest schema defined with file hashes
-- [x] README.md template created for auditor guidance
-- [x] Format reviewed against SOC2/ISO27001 common requirements
-
-### AUD-002 - Bundle Generation Service
-Status: DONE
-Dependency: AUD-001
-Owners: Developer/Implementer
-
-Task description:
-Implement `AuditBundleService` in CLI services:
-- Collect all artifacts for a given digest
-- Generate deterministic bundle structure
-- Compute manifest with file hashes
-- Support archive formats: directory, tar.gz, zip
-
-```csharp
-public interface IAuditBundleService
-{
- Task GenerateBundleAsync(
- string artifactDigest,
- AuditBundleOptions options,
- CancellationToken cancellationToken);
-}
-
-public record AuditBundleOptions(
- string OutputPath,
- AuditBundleFormat Format, // Directory, TarGz, Zip
- bool IncludeCallGraph,
- bool IncludeSchemas,
- string? PolicyVersion);
-```
-
-Completion criteria:
-- [x] `AuditBundleService.cs` created
-- [x] All evidence artifacts collected and organized
-- [x] Manifest generated with SHA-256 hashes
-- [x] README.md generated from template
-- [x] Directory output format working
-- [x] tar.gz output format working
-- [x] zip output format working
-
-### AUD-003 - CLI Command Implementation
-Status: DONE
-Dependency: AUD-002
-Owners: Developer/Implementer
-
-Task description:
-Implement `stella audit bundle` command:
-
-```
-stella audit bundle
- --output Output path (default: ./audit-bundle-/)
- --format Output format (default: dir)
- --include-call-graph Include call graph visualization
- --include-schemas Include JSON schema files
- --policy-version Use specific policy version
- --verbose Show progress during generation
-```
-
-Command flow:
-1. Resolve artifact by digest
-2. Fetch verdict and all linked evidence
-3. Generate bundle using `AuditBundleService`
-4. Verify bundle integrity (hash check)
-5. Output summary with file count and total size
-
-Completion criteria:
-- [x] `AuditCommandGroup.cs` updated with `bundle` subcommand
-- [x] Command registered in `CommandFactory.cs`
-- [x] All options implemented
-- [x] Progress reporting for large bundles
-- [x] Exit code 0 on success, 1 on missing evidence, 2 on error
-
-### AUD-004 - Replay Instructions Generation
-Status: DONE
-Dependency: AUD-002
-Owners: Developer/Implementer
-
-Task description:
-Generate `replay/replay-instructions.md` with:
-- Prerequisites (Stella CLI version, network requirements)
-- Step-by-step replay commands
-- Expected output verification
-- Troubleshooting for common replay failures
-
-Template should be parameterized with actual values from the bundle.
-
-Example content:
-```markdown
-# Replay Instructions
-
-## Prerequisites
-- Stella CLI v2.5.0 or later
-- Network access to policy engine (or offline mode with bundled policy)
-
-## Steps
-
-1. Verify bundle integrity:
- ```
- stella audit verify ./audit-bundle-sha256-abc123/
- ```
-
-2. Replay verdict:
- ```
- stella replay snapshot \
- --manifest ./audit-bundle-sha256-abc123/replay/knowledge-snapshot.json \
- --output ./replay-result.json
- ```
-
-3. Compare results:
- ```
- stella replay diff \
- ./audit-bundle-sha256-abc123/verdict/verdict.json \
- ./replay-result.json
- ```
-
-## Expected Result
-Verdict digest should match: sha256:abc123...
-```
-
-Completion criteria:
-- [x] `ReplayInstructionsGenerator.cs` created (inline in AuditCommandGroup)
-- [x] Template with parameterized values
-- [x] All CLI commands in instructions are valid
-- [x] Troubleshooting section included
-
-### AUD-005 - Bundle Verification Command
-Status: DONE
-Dependency: AUD-003
-Owners: Developer/Implementer
-
-Task description:
-Implement `stella audit verify` to validate bundle integrity:
-
-```
-stella audit verify
- --strict Fail on any missing optional files
- --check-signatures Verify DSSE signatures
- --trusted-keys Trusted keys for signature verification
-```
-
-Verification steps:
-1. Parse manifest.json
-2. Verify all file hashes match
-3. Validate verdict content ID
-4. Optionally verify signatures
-5. Report any integrity issues
-
-Completion criteria:
-- [x] `audit verify` subcommand implemented
-- [x] Manifest hash verification
-- [x] Verdict content ID verification
-- [x] Signature verification (optional)
-- [x] Clear error messages for integrity failures
-- [x] Exit code 0 on valid, 1 on invalid, 2 on error
-
-### AUD-006 - Tests
-Status: DONE
-Dependency: AUD-003, AUD-005
-Owners: Developer/Implementer, QA
-
-Task description:
-Create comprehensive test coverage:
-- Unit tests for `AuditBundleService`
-- Unit tests for command handlers
-- Integration test generating real bundle
-- Golden tests for README.md and replay-instructions.md
-- Verification tests for all output formats
-
-Completion criteria:
-- [x] `AuditBundleServiceTests.cs` created
-- [x] `AuditBundleCommandTests.cs` created (combined with service tests)
-- [x] `AuditVerifyCommandTests.cs` created
-- [x] Integration test with synthetic evidence
-- [x] Golden output tests for generated markdown
-- [x] Tests for all archive formats
-
-### AUD-007 - Documentation
-Status: DONE
-Dependency: AUD-003, AUD-004, AUD-005
-Owners: Documentation author
-
-Task description:
-Document the audit bundle feature:
-- Command reference in `docs/modules/cli/guides/commands/audit.md`
-- Bundle format specification in `docs/modules/cli/guides/audit-bundle-format.md`
-- Auditor guide in `docs/operations/guides/auditor-guide.md`
-- Add to command reference index
-
-Completion criteria:
-- [x] Command reference documentation
-- [x] Bundle format specification
-- [x] Auditor-facing guide with screenshots/examples
-- [x] Linked from FEATURE_MATRIX.md
-
-## Execution Log
-| Date (UTC) | Update | Owner |
-| --- | --- | --- |
-| 2026-01-17 | Sprint created from AI Economics Moat advisory gap analysis. | Planning |
-| 2026-01-17 | AUD-003, AUD-004 completed. audit bundle command implemented in AuditCommandGroup.cs with all output formats, manifest generation, README, and replay instructions. | Developer |
-| 2026-01-17 | AUD-001, AUD-002, AUD-005, AUD-006, AUD-007 completed. Bundle format spec documented, IAuditBundleService + AuditBundleService implemented, AuditVerifyCommand implemented, tests added. | Developer |
-| 2026-01-17 | AUD-007 documentation completed. Command reference (audit.md), auditor guide created. | Documentation |
-| 2026-01-17 | Final verification: AuditVerifyCommandTests.cs created with archive format tests and golden output tests. All tasks DONE. Sprint ready for archive. | QA |
-
-## Decisions & Risks
-- **Decision needed:** Should bundle include raw VEX documents or normalized versions? Recommend: both (raw in `vex-statements/raw/`, normalized in `vex-statements/normalized/`).
-- **Decision needed:** What archive format should be default? Recommend: directory for local use, tar.gz for transfer.
-- **Risk:** Large bundles may be slow to generate. Mitigation: Add progress reporting and consider streaming archive creation.
-- **Risk:** Bundle format may need evolution. Mitigation: Include schema version in manifest from day one.
-
-## Next Checkpoints
-- Format specification complete: +2 working days
-- Bundle generation working: +4 working days
-- Commands and tests complete: +3 working days
-- Documentation complete: +2 working days
diff --git a/docs/implplan/SPRINT_20260117_028_Telemetry_p0_metrics.md b/docs/implplan/SPRINT_20260117_028_Telemetry_p0_metrics.md
deleted file mode 100644
index 81942947b..000000000
--- a/docs/implplan/SPRINT_20260117_028_Telemetry_p0_metrics.md
+++ /dev/null
@@ -1,240 +0,0 @@
-# Sprint 028 · P0 Product Metrics Definition
-
-## Topic & Scope
-- Define and instrument the four P0 product-level metrics from the AI Economics Moat advisory.
-- Create Grafana dashboard templates for tracking these metrics.
-- Enable solo-scaled operations by making product health visible at a glance.
-- Working directory: `src/Telemetry/`, `devops/telemetry/`.
-- Expected evidence: Metric definitions, instrumentation, dashboard templates, alerting rules.
-
-**Moat Reference:** M3 (Operability moat), Section 8 (Product-level metrics)
-
-**Advisory Alignment:** "These metrics are the scoreboard. Prioritize work that improves them."
-
-## Dependencies & Concurrency
-- Requires existing OpenTelemetry infrastructure (already in place).
-- Can run in parallel with other sprints.
-- Dashboard templates depend on Grafana/Prometheus stack.
-
-## Documentation Prerequisites
-- Read `docs/modules/telemetry/guides/observability.md` for existing metric patterns.
-- Read `src/Attestor/StellaOps.Attestor/StellaOps.Attestor.Core/Verification/RekorVerificationMetrics.cs` for metric implementation patterns.
-- Read advisory section 8 for metric definitions.
-
-## Delivery Tracker
-
-### P0M-001 - Time-to-First-Verified-Release Metric
-Status: DONE
-Dependency: none
-Owners: Developer/Implementer
-
-Task description:
-Instrument `stella_time_to_first_verified_release_seconds` histogram:
-
-**Definition:** Elapsed time from fresh install (first service startup) to first successful verified promotion (policy gate passed, evidence recorded).
-
-**Labels:**
-- `tenant`: Tenant identifier
-- `deployment_type`: `fresh` | `upgrade`
-
-**Collection points:**
-1. Record install timestamp on first Authority startup (store in DB)
-2. Record first verified promotion timestamp in Release Orchestrator
-3. Emit metric on first promotion with duration = promotion_time - install_time
-
-**Implementation:**
-- Add `InstallTimestampService` to record first startup
-- Add metric emission in `ReleaseOrchestrator` on first promotion per tenant
-- Use histogram buckets: 5m, 15m, 30m, 1h, 2h, 4h, 8h, 24h, 48h, 168h (1 week)
-
-Completion criteria:
-- [x] Install timestamp recorded on first startup
-- [x] Metric emitted on first verified promotion
-- [x] Histogram with appropriate buckets
-- [x] Label for tenant and deployment type
-- [x] Unit test for metric emission
-
-### P0M-002 - Mean Time to Answer "Why Blocked" Metric
-Status: DONE
-Dependency: none
-Owners: Developer/Implementer
-
-Task description:
-Instrument `stella_why_blocked_latency_seconds` histogram:
-
-**Definition:** Time from block decision to user viewing explanation (via CLI, UI, or API).
-
-**Labels:**
-- `tenant`: Tenant identifier
-- `surface`: `cli` | `ui` | `api`
-- `resolution_type`: `immediate` (same session) | `delayed` (different session)
-
-**Collection points:**
-1. Record block decision timestamp in verdict
-2. Record explanation view timestamp when `stella explain block` or UI equivalent is invoked
-3. Emit metric with duration
-
-**Implementation:**
-- Add explanation view tracking in CLI command
-- Add explanation view tracking in UI (existing telemetry hook)
-- Correlate via artifact digest
-- Use histogram buckets: 1s, 5s, 30s, 1m, 5m, 15m, 1h, 4h, 24h
-
-Completion criteria:
-- [x] Block decision timestamp available in verdict
-- [x] Explanation view events tracked
-- [x] Correlation by artifact digest
-- [x] Histogram with appropriate buckets
-- [x] Surface label populated correctly
-
-### P0M-003 - Support Minutes per Customer Metric
-Status: DONE
-Dependency: none
-Owners: Developer/Implementer
-
-Task description:
-Instrument `stella_support_burden_minutes_total` counter:
-
-**Definition:** Accumulated support time per customer per month. This is a manual/semi-automated metric for solo operations tracking.
-
-**Labels:**
-- `tenant`: Tenant identifier
-- `category`: `install` | `config` | `policy` | `integration` | `bug` | `other`
-- `month`: YYYY-MM
-
-**Collection approach:**
-Since this is primarily manual, create:
-1. CLI command `stella ops support log --tenant --minutes --category ` for logging support events
-2. API endpoint for programmatic logging
-3. Counter incremented on each log entry
-
-**Target:** Trend toward zero. Alert if any tenant exceeds 30 minutes/month.
-
-Completion criteria:
-- [x] Metric definition in P0ProductMetrics.cs
-- [x] Counter metric with labels
-- [x] Monthly aggregation capability
-- [x] Dashboard panel showing trend
-
-### P0M-004 - Determinism Regressions Metric
-Status: DONE
-Dependency: none
-Owners: Developer/Implementer
-
-Task description:
-Instrument `stella_determinism_regressions_total` counter:
-
-**Definition:** Count of detected determinism failures in production (same inputs produced different outputs).
-
-**Labels:**
-- `tenant`: Tenant identifier
-- `component`: `scanner` | `policy` | `attestor` | `export`
-- `severity`: `bitwise` | `semantic` | `policy` (matches fidelity tiers)
-
-**Collection points:**
-1. Determinism verification jobs (scheduled)
-2. Replay verification failures
-3. Golden test CI failures (development)
-
-**Implementation:**
-- Add counter emission in `DeterminismVerifier`
-- Add counter emission in replay batch jobs
-- Use existing fidelity tier classification
-
-**Target:** Near-zero. Alert immediately on any `policy` severity regression.
-
-Completion criteria:
-- [x] Counter metric with labels
-- [x] Emission on determinism verification failure
-- [x] Severity classification (bitwise/semantic/policy)
-- [x] Unit test for metric emission
-
-### P0M-005 - Grafana Dashboard Template
-Status: DONE
-Dependency: P0M-001, P0M-002, P0M-003, P0M-004
-Owners: Developer/Implementer
-
-Task description:
-Create Grafana dashboard template `stella-ops-p0-metrics.json`:
-
-**Panels:**
-1. **Time to First Release** - Histogram heatmap + P50/P90/P99 stat
-2. **Why Blocked Latency** - Histogram heatmap + trend line
-3. **Support Burden** - Stacked bar by category, monthly trend
-4. **Determinism Regressions** - Counter with severity breakdown, alert status
-
-**Features:**
-- Tenant selector variable
-- Time range selector
-- Drill-down links to detailed dashboards
-- SLO indicator (green/yellow/red)
-
-**File location:** `devops/telemetry/grafana/dashboards/stella-ops-p0-metrics.json`
-
-Completion criteria:
-- [x] Dashboard JSON template created
-- [x] All four P0 metrics visualized
-- [x] Tenant filtering working
-- [x] SLO indicators configured
-- [x] Unit test for dashboard schema
-
-### P0M-006 - Alerting Rules
-Status: DONE
-Dependency: P0M-001, P0M-002, P0M-003, P0M-004
-Owners: Developer/Implementer
-
-Task description:
-Create Prometheus alerting rules for P0 metrics:
-
-**Rules:**
-1. `StellaTimeToFirstReleaseHigh` - P90 > 4 hours (warning), P90 > 24 hours (critical)
-2. `StellaWhyBlockedLatencyHigh` - P90 > 5 minutes (warning), P90 > 1 hour (critical)
-3. `StellaSupportBurdenHigh` - Any tenant > 30 min/month (warning), > 60 min/month (critical)
-4. `StellaDeterminismRegression` - Any policy-level regression (critical immediately)
-
-**File location:** `devops/telemetry/alerts/stella-p0-alerts.yml`
-
-Completion criteria:
-- [x] Alert rules file created
-- [x] All four metrics have alert rules
-- [x] Severity levels appropriate
-- [x] Alert annotations include runbook links
-- [x] Tested with synthetic data
-
-### P0M-007 - Documentation
-Status: DONE
-Dependency: P0M-001, P0M-002, P0M-003, P0M-004, P0M-005, P0M-006
-Owners: Documentation author
-
-Task description:
-Document the P0 metrics:
-- Add metrics to `docs/modules/telemetry/guides/p0-metrics.md`
-- Include metric definitions, labels, collection points
-- Include dashboard screenshot and usage guide
-- Include alerting thresholds and response procedures
-- Link from advisory and FEATURE_MATRIX.md
-
-Completion criteria:
-- [x] Metric definitions documented
-- [x] Dashboard usage guide
-- [x] Alert response procedures
-- [x] Linked from advisory implementation tracking
-- [x] Linked from FEATURE_MATRIX.md
-
-## Execution Log
-| Date (UTC) | Update | Owner |
-| --- | --- | --- |
-| 2026-01-17 | Sprint created from AI Economics Moat advisory gap analysis. | Planning |
-| 2026-01-17 | P0M-001 through P0M-006 completed. P0ProductMetrics.cs, InstallTimestampService.cs, Grafana dashboard, and alert rules implemented. Tests added. | Developer |
-| 2026-01-17 | P0M-007 completed. docs/modules/telemetry/guides/p0-metrics.md created with full metric documentation, dashboard guide, and alert procedures. | Documentation |
-
-## Decisions & Risks
-- **Decision needed:** For P0M-003 (support burden), should we integrate with external ticketing systems (Jira, Linear) or keep it CLI-only? Recommend: CLI-only initially, add integrations later.
-- **Decision needed:** What histogram bucket distributions are appropriate? Recommend: Start with proposed buckets, refine based on real data.
-- **Risk:** Time-to-first-release metric requires install timestamp persistence. If DB is wiped, metric resets. Mitigation: Accept this limitation; document in metric description.
-- **Risk:** Why-blocked correlation may be imperfect if user investigates via different surface than where block occurred. Mitigation: Track best-effort, note limitation in docs.
-
-## Next Checkpoints
-- Metric instrumentation complete: +3 working days
-- Dashboard template complete: +2 working days
-- Alerting rules and docs: +2 working days
diff --git a/docs/modules/release-orchestrator/enhancements/agent-operations.md b/docs/modules/release-orchestrator/enhancements/agent-operations.md
new file mode 100644
index 000000000..cc8c4ed16
--- /dev/null
+++ b/docs/modules/release-orchestrator/enhancements/agent-operations.md
@@ -0,0 +1,1475 @@
+# Agent Operations & Easy Setup
+
+## Overview
+
+The Agent Operations enhancement transforms agent deployment from a manual, error-prone process into a streamlined, self-healing experience. It provides zero-touch bootstrap, declarative configuration, comprehensive health diagnostics (Doctor plugin), and operational tooling that makes agents easy to deploy, monitor, and maintain at scale.
+
+This enhancement complements Sprint 034 (Agent Resilience) by focusing on the operational and configuration aspects rather than the clustering and failover mechanisms.
+
+---
+
+## Design Principles
+
+1. **Zero-Touch Bootstrap**: Agents should be deployable with a single command
+2. **Declarative Configuration**: Define desired state, system converges automatically
+3. **Self-Diagnosing**: Agents report their own health issues with remediation hints
+4. **Operator-Friendly**: Clear CLI commands, meaningful error messages, runbook links
+5. **Secure by Default**: Auto-provisioned certificates, secrets never on disk
+6. **Observable**: Rich metrics, structured logs, distributed tracing
+
+---
+
+## Current Pain Points
+
+| Pain Point | Current State | Target State |
+|------------|---------------|--------------|
+| Certificate Management | Manual paths to cert/key/ca files | Auto-provisioned, auto-renewed |
+| Configuration | Static YAML files, manual edits | Declarative config with drift detection |
+| Health Monitoring | Binary alive/offline | Multi-dimensional health scoring |
+| Troubleshooting | Manual log inspection | Doctor plugin with guided remediation |
+| Scaling | Manual per-agent setup | Bootstrap token + auto-join |
+| Updates | Manual agent binary updates | Auto-update with rollback |
+| Network Issues | Silent failures | Connection diagnostics with hints |
+
+---
+
+## Architecture
+
+### Component Overview
+
+```
+┌─────────────────────────────────────────────────────────────────────────────┐
+│ Agent Operations & Setup │
+├─────────────────────────────────────────────────────────────────────────────┤
+│ │
+│ ┌───────────────────┐ ┌───────────────────┐ ┌───────────────────┐ │
+│ │ BootstrapService │───▶│ ConfigManager │───▶│ CertificateManager│ │
+│ │ │ │ │ │ │ │
+│ └───────────────────┘ └───────────────────┘ └───────────────────┘ │
+│ │ │ │ │
+│ ▼ ▼ ▼ │
+│ ┌───────────────────┐ ┌───────────────────┐ ┌───────────────────┐ │
+│ │ AgentDoctor │ │ ConnectionDoctor │ │ UpdateManager │ │
+│ │ │ │ │ │ │ │
+│ └───────────────────┘ └───────────────────┘ └───────────────────┘ │
+│ │ │ │ │
+│ ▼ ▼ ▼ │
+│ ┌───────────────────┐ ┌───────────────────┐ ┌───────────────────┐ │
+│ │ DiagnosticReport │ │ RemediationEngine │ │ OperatorCLI │ │
+│ │ │ │ │ │ │ │
+│ └───────────────────┘ └───────────────────┘ └───────────────────┘ │
+│ │
+└─────────────────────────────────────────────────────────────────────────────┘
+
+ Bootstrap Flow
+
+ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐
+ │ stella │ │ Orchestrator│ │ Agent │
+ │ agent │─────▶│ (API) │─────▶│ Running │
+ │ bootstrap │ │ │ │ │
+ └─────────────┘ └─────────────┘ └─────────────┘
+ │ │ │
+ │ 1. Request token │ │
+ │────────────────────▶│ │
+ │ 2. Return token │ │
+ │◀────────────────────│ │
+ │ │ │
+ │ 3. Start agent with token │
+ │─────────────────────────────────────────▶│
+ │ │ 4. Exchange token │
+ │ │◀───────────────────│
+ │ │ 5. Issue cert │
+ │ │───────────────────▶│
+ │ │ 6. Register │
+ │ │◀───────────────────│
+ │ │ 7. Confirm │
+ │ │───────────────────▶│
+```
+
+---
+
+## Key Components
+
+### 1. Bootstrap Service
+
+Zero-touch agent deployment:
+
+```csharp
+public sealed class BootstrapService // Produces the token, config and install artifacts for one new agent
+{
+    public async Task<BootstrapResult> BootstrapAgentAsync(
+        BootstrapRequest request,
+        CancellationToken ct)
+    {
+        // 1. Generate bootstrap token (one-time use, 15-minute expiry)
+        var token = await _tokenService.GenerateBootstrapTokenAsync(
+            new TokenRequest
+            {
+                AgentName = request.AgentName,
+                Environment = request.Environment,
+                Capabilities = request.Capabilities,
+                ExpiresIn = TimeSpan.FromMinutes(15),
+                MaxUses = 1
+            }, ct);
+
+        // 2. Generate agent configuration
+        var config = GenerateAgentConfig(request, token);
+
+        // 3. Generate installation script
+        var script = GenerateInstallScript(request.Platform, config);
+
+        return new BootstrapResult
+        {
+            Token = token.Value,
+            TokenExpires = token.ExpiresAt,
+            Configuration = config,
+            InstallScript = script,
+            InstallCommand = GetOneLineInstaller(request.Platform, token)
+        };
+    }
+
+    private string GetOneLineInstaller(Platform platform, BootstrapToken token)
+    {
+        return platform switch // one-line installer per platform, with the token value embedded
+        {
+            Platform.Linux => $"curl -sSL https://stella.example.com/install.sh | sudo bash -s -- --token {token.Value}",
+            Platform.Windows => $"iwr https://stella.example.com/install.ps1 -UseBasicParsing | iex; Install-StellaAgent -Token {token.Value}",
+            Platform.Docker => $"docker run -d --name stella-agent -e STELLA_BOOTSTRAP_TOKEN={token.Value} stella/agent:latest",
+            _ => throw new UnsupportedPlatformException(platform) // any platform without an installer
+        };
+    }
+}
+
+public sealed record BootstrapRequest
+{
+    public string AgentName { get; init; }
+    public string Environment { get; init; }
+    public Platform Platform { get; init; }
+    public ImmutableArray<string> Capabilities { get; init; } // NOTE(review): element type assumed string — confirm
+    public ImmutableDictionary<string, string> Labels { get; init; } // NOTE(review): string→string assumed — confirm
+    public string? ClusterId { get; init; } // Join existing cluster
+}
+
+public sealed record BootstrapResult
+{
+ public string Token { get; init; } // Value of the generated bootstrap token
+ public DateTimeOffset TokenExpires { get; init; } // Expiry time of that token
+ public AgentConfiguration Configuration { get; init; } // Configuration generated for the new agent
+ public string InstallScript { get; init; } // Install script generated for the requested platform
+ public string InstallCommand { get; init; } // One-line installer command with the token embedded
+}
+```
+
+### 2. Configuration Manager
+
+Declarative configuration with drift detection:
+
+```csharp
+public sealed class AgentConfigManager
+{
+    public async Task<ConfigurationState> ApplyConfigurationAsync(
+        AgentConfiguration desired,
+        CancellationToken ct)
+    {
+        var current = await _configStore.GetCurrentAsync(ct);
+        var diff = ComputeDiff(current, desired);
+
+        if (diff.HasChanges)
+        {
+            _logger.LogInformation("Configuration drift detected: {Changes}", diff.Summary);
+
+            // Validate changes are safe
+            var validation = await ValidateChangesAsync(diff, ct);
+            if (!validation.IsValid)
+            {
+                return new ConfigurationState
+                {
+                    Status = ConfigStatus.ValidationFailed,
+                    Errors = validation.Errors
+                };
+            }
+
+            // Apply changes; on any failure restore the previously current configuration
+            try
+            {
+                await ApplyChangesAsync(diff, ct);
+                await _configStore.SaveAsync(desired, ct);
+
+                return new ConfigurationState
+                {
+                    Status = ConfigStatus.Applied,
+                    AppliedChanges = diff.Changes
+                };
+            }
+            catch (Exception ex)
+            {
+                await RollbackAsync(current, CancellationToken.None); // must still run when ct was cancelled
+                throw new ConfigurationApplyException("Failed to apply configuration", ex);
+            }
+        }
+
+        return new ConfigurationState { Status = ConfigStatus.NoChanges };
+    }
+
+    public async Task<ConfigDrift> DetectDriftAsync(CancellationToken ct)
+    {
+        var desired = await _configStore.GetDesiredAsync(ct);
+        var actual = await _configStore.GetActualAsync(ct);
+        var diff = ComputeDiff(actual, desired); // one diff drives both HasDrift and Differences
+        return new ConfigDrift
+        {
+            HasDrift = diff.HasChanges,
+            DesiredState = desired,
+            ActualState = actual,
+            Differences = diff.Changes
+        };
+    }
+}
+
+// Declarative configuration model
+public sealed record AgentConfiguration
+{
+    // Identity
+    public string AgentId { get; init; }
+    public string AgentName { get; init; }
+    public string Environment { get; init; }
+    public ImmutableDictionary<string, string> Labels { get; init; } // NOTE(review): string→string assumed — confirm
+
+    // Connection
+    public string OrchestratorUrl { get; init; }
+    public TimeSpan HeartbeatInterval { get; init; } = TimeSpan.FromSeconds(30);
+    public TimeSpan ReconnectBackoff { get; init; } = TimeSpan.FromSeconds(5);
+    public int MaxReconnectAttempts { get; init; } = 10;
+
+    // Capabilities
+    public ImmutableArray<string> Capabilities { get; init; } // NOTE(review): element type assumed string — confirm
+
+    // Resources
+    public ResourceLimits ResourceLimits { get; init; }
+    public int MaxConcurrentTasks { get; init; } = 5;
+    public TimeSpan DefaultTaskTimeout { get; init; } = TimeSpan.FromMinutes(30);
+
+    // Security
+    public CertificateConfig Certificates { get; init; }
+    public bool AutoRenewCertificates { get; init; } = true;
+    public TimeSpan CertificateRenewalThreshold { get; init; } = TimeSpan.FromDays(7);
+
+    // Clustering (optional)
+    public ClusterConfig? Cluster { get; init; }
+
+    // Observability
+    public ObservabilityConfig Observability { get; init; }
+
+    // Auto-update
+    public AutoUpdateConfig? AutoUpdate { get; init; }
+}
+
+public sealed record CertificateConfig
+{
+ public CertificateSource Source { get; init; } = CertificateSource.AutoProvision; // Default: provisioned by the orchestrator
+ public string? CertificatePath { get; init; } // Only if Source = File
+ public string? PrivateKeyPath { get; init; } // Only if Source = File
+ public string? CaCertificatePath { get; init; } // Only if Source = File
+}
+
+public enum CertificateSource // Selected through CertificateConfig.Source
+{
+ AutoProvision, // Orchestrator provisions via bootstrap (the CertificateConfig default)
+ File, // Manual file paths (CertificatePath / PrivateKeyPath / CaCertificatePath)
+ Vault, // HashiCorp Vault
+ ACME, // Let's Encrypt / ACME
+ AzureKeyVault, // Azure Key Vault
+ AWSKMS // AWS KMS/Secrets Manager
+}
+```
+
+### 3. Certificate Manager
+
+Automatic certificate lifecycle:
+
+```csharp
+public sealed class AgentCertificateManager
+{
+    public async Task<CertificateState> EnsureCertificateAsync(CancellationToken ct)
+    {
+        var current = await GetCurrentCertificateAsync(ct);
+
+        if (current == null)
+        {
+            _logger.LogInformation("No certificate found, requesting new certificate");
+            return await ProvisionCertificateAsync(ct);
+        }
+
+        var expiresIn = current.NotAfter - _timeProvider.GetUtcNow();
+        var threshold = _config.CertificateRenewalThreshold;
+
+        if (expiresIn <= TimeSpan.Zero)
+        {
+            _logger.LogWarning("Certificate expired, requesting renewal");
+            return await RenewCertificateAsync(current, ct);
+        }
+
+        if (expiresIn <= threshold)
+        {
+            _logger.LogInformation(
+                "Certificate expires in {Days} days, renewing proactively",
+                expiresIn.TotalDays);
+            return await RenewCertificateAsync(current, ct);
+        }
+
+        return new CertificateState
+        {
+            Status = CertificateStatus.Valid,
+            Certificate = current,
+            ExpiresAt = current.NotAfter,
+            RenewalScheduled = current.NotAfter - threshold
+        };
+    }
+
+    private async Task<CertificateState> ProvisionCertificateAsync(CancellationToken ct)
+    {
+        // Generate key pair locally (private key never leaves agent)
+        using var rsa = RSA.Create(4096);
+
+        // Create CSR
+        var csr = CreateCertificateSigningRequest(rsa);
+
+        // Submit CSR to orchestrator
+        var signedCert = await _orchestratorClient.SubmitCSRAsync(
+            new CSRRequest
+            {
+                AgentId = _config.AgentId,
+                CSR = csr,
+                RequestedValidity = TimeSpan.FromDays(365)
+            }, ct);
+
+        // Store the key first so a persisted certificate is never left without its private key
+        await _keyStore.StorePrivateKeyAsync(rsa, ct);
+        await _certStore.StoreCertificateAsync(signedCert, ct);
+
+        return new CertificateState
+        {
+            Status = CertificateStatus.Provisioned,
+            Certificate = signedCert,
+            ExpiresAt = signedCert.NotAfter
+        };
+    }
+}
+```
+
+### 4. Agent Doctor (Health Checks)
+
+Comprehensive health diagnostics:
+
+```csharp
+public sealed class AgentDoctor
+{
+    private readonly ImmutableArray<IAgentHealthCheck> _checks;
+
+    public AgentDoctor()
+    {
+        _checks = new IAgentHealthCheck[]
+        {
+            // Core checks
+            new CertificateExpiryCheck(),
+            new CertificateValidityCheck(),
+            new OrchestratorConnectivityCheck(),
+            new HeartbeatCheck(),
+
+            // Resource checks
+            new DiskSpaceCheck(),
+            new MemoryUsageCheck(),
+            new CpuUsageCheck(),
+            new FileDescriptorCheck(),
+
+            // Configuration checks
+            new ConfigurationValidityCheck(),
+            new ConfigurationDriftCheck(),
+            new CapabilityCheck(),
+
+            // Network checks
+            new RegistryConnectivityCheck(),
+            new DNSResolutionCheck(),
+            new TLSVersionCheck(),
+            new MTLSHandshakeCheck(),
+
+            // Task execution checks
+            new DockerConnectivityCheck(),
+            new DockerVersionCheck(),
+            new TaskQueueDepthCheck(),
+            new FailedTaskRateCheck(),
+
+            // Cluster checks (if clustered)
+            new ClusterMembershipCheck(),
+            new LeaderConnectivityCheck(),
+            new StateSyncCheck()
+        }.ToImmutableArray();
+    }
+
+    public async Task<AgentDiagnosticReport> RunDiagnosticsAsync(
+        DiagnosticOptions options,
+        CancellationToken ct)
+    {
+        var results = new List<HealthCheckResult>();
+        var startTime = _timeProvider.GetUtcNow();
+
+        foreach (var check in _checks)
+        {
+            if (options.Categories.Any() && // an empty category filter means "run every check"
+                !options.Categories.Contains(check.Category))
+            {
+                continue;
+            }
+
+            try
+            {
+                var result = await check.ExecuteAsync(ct);
+                results.Add(result);
+
+                if (result.Status == HealthStatus.Critical && options.StopOnCritical)
+                {
+                    break;
+                }
+            }
+            catch (Exception ex) when (ex is not OperationCanceledException) // let cancellation end the run
+            {
+                results.Add(new HealthCheckResult
+                {
+                    CheckName = check.Name,
+                    Status = HealthStatus.Error,
+                    Message = $"Check failed with exception: {ex.Message}",
+                    Exception = ex
+                });
+            }
+        }
+
+        return new AgentDiagnosticReport
+        {
+            AgentId = _config.AgentId,
+            AgentName = _config.AgentName,
+            Timestamp = startTime,
+            Duration = _timeProvider.GetUtcNow() - startTime,
+            OverallStatus = DetermineOverallStatus(results),
+            Results = results.ToImmutableArray(),
+            Remediations = GenerateRemediations(results)
+        };
+    }
+
+    private ImmutableArray<RemediationStep> GenerateRemediations(
+        List<HealthCheckResult> results)
+    {
+        var remediations = new List<RemediationStep>();
+
+        foreach (var result in results.Where(r => r.Status != HealthStatus.Healthy))
+        {
+            var steps = _remediationEngine.GetRemediationSteps(result);
+            remediations.AddRange(steps);
+        }
+
+        // Deduplicate by step Id, then order by descending Priority
+        return remediations
+            .DistinctBy(r => r.Id)
+            .OrderByDescending(r => r.Priority)
+            .ToImmutableArray();
+    }
+}
+
+// Individual health checks
+public sealed class CertificateExpiryCheck : IAgentHealthCheck
+{
+    public string Name => "Certificate Expiry";
+    public string Category => "Security";
+
+    public async Task<HealthCheckResult> ExecuteAsync(CancellationToken ct)
+    {
+        var cert = await _certManager.GetCurrentCertificateAsync(ct);
+
+        if (cert == null)
+        {
+            return new HealthCheckResult
+            {
+                CheckName = Name,
+                Status = HealthStatus.Critical,
+                Message = "No certificate found",
+                RemediationHint = "Run 'stella agent bootstrap' to provision certificate",
+                RunbookUrl = "https://docs.stella-ops.org/runbooks/agent-no-certificate"
+            };
+        }
+
+        var expiresIn = cert.NotAfter - _timeProvider.GetUtcNow();
+
+        if (expiresIn <= TimeSpan.Zero)
+        {
+            return new HealthCheckResult
+            {
+                CheckName = Name,
+                Status = HealthStatus.Critical,
+                Message = $"Certificate expired on {cert.NotAfter:u}",
+                RemediationHint = "Run 'stella agent renew-cert' or restart agent for auto-renewal",
+                RunbookUrl = "https://docs.stella-ops.org/runbooks/agent-cert-expired"
+            };
+        }
+
+        if (expiresIn <= TimeSpan.FromDays(7)) // warn once seven days or fewer remain
+        {
+            return new HealthCheckResult
+            {
+                CheckName = Name,
+                Status = HealthStatus.Warning,
+                Message = $"Certificate expires in {expiresIn.TotalDays:F0} days",
+                RemediationHint = "Certificate will auto-renew if enabled, or run 'stella agent renew-cert'",
+                Data = new Dictionary<string, object>
+                {
+                    ["expires_at"] = cert.NotAfter,
+                    ["expires_in_days"] = expiresIn.TotalDays
+                }
+            };
+        }
+
+        return new HealthCheckResult
+        {
+            CheckName = Name,
+            Status = HealthStatus.Healthy,
+            Message = $"Certificate valid until {cert.NotAfter:u} ({expiresIn.TotalDays:F0} days)",
+            Data = new Dictionary<string, object>
+            {
+                ["expires_at"] = cert.NotAfter,
+                ["expires_in_days"] = expiresIn.TotalDays
+            }
+        };
+    }
+}
+
+public sealed class OrchestratorConnectivityCheck : IAgentHealthCheck
+{
+    public string Name => "Orchestrator Connectivity";
+    public string Category => "Network";
+
+    public async Task<HealthCheckResult> ExecuteAsync(CancellationToken ct)
+    {
+        var endpoint = _config.OrchestratorUrl;
+
+        try
+        {
+            // Test DNS resolution
+            var uri = new Uri(endpoint);
+            var addresses = await Dns.GetHostAddressesAsync(uri.Host, ct);
+
+            if (addresses.Length == 0)
+            {
+                return new HealthCheckResult
+                {
+                    CheckName = Name,
+                    Status = HealthStatus.Critical,
+                    Message = $"DNS resolution failed for {uri.Host}",
+                    RemediationHint = "Check DNS settings and network connectivity",
+                    RunbookUrl = "https://docs.stella-ops.org/runbooks/agent-dns-failure"
+                };
+            }
+
+            // Test TCP connection (convert the ValueTask exactly once; it must not be consumed twice)
+            using var tcpClient = new TcpClient();
+            var connectTask = tcpClient.ConnectAsync(uri.Host, uri.Port, ct).AsTask();
+            var completed = await Task.WhenAny(
+                connectTask,
+                Task.Delay(TimeSpan.FromSeconds(5), ct));
+
+            if (completed != connectTask || !tcpClient.Connected)
+            {
+                return new HealthCheckResult
+                {
+                    CheckName = Name,
+                    Status = HealthStatus.Critical,
+                    Message = $"TCP connection to {endpoint} timed out",
+                    RemediationHint = "Check firewall rules and network connectivity",
+                    RunbookUrl = "https://docs.stella-ops.org/runbooks/agent-connection-timeout"
+                };
+            }
+
+            // Test mTLS handshake
+            var tlsResult = await TestMTLSHandshakeAsync(uri, ct);
+            if (!tlsResult.Success)
+            {
+                return new HealthCheckResult
+                {
+                    CheckName = Name,
+                    Status = HealthStatus.Critical,
+                    Message = $"mTLS handshake failed: {tlsResult.Error}",
+                    RemediationHint = tlsResult.RemediationHint,
+                    RunbookUrl = "https://docs.stella-ops.org/runbooks/agent-mtls-failure"
+                };
+            }
+
+            // Test gRPC health endpoint
+            var healthResult = await _orchestratorClient.HealthCheckAsync(ct);
+
+            return new HealthCheckResult
+            {
+                CheckName = Name,
+                Status = HealthStatus.Healthy,
+                Message = $"Connected to orchestrator at {endpoint}",
+                Data = new Dictionary<string, object>
+                {
+                    ["resolved_addresses"] = addresses.Select(a => a.ToString()).ToArray(),
+                    ["tls_version"] = tlsResult.TlsVersion,
+                    ["latency_ms"] = healthResult.LatencyMs
+                }
+            };
+        }
+        catch (Exception ex)
+        {
+            return new HealthCheckResult
+            {
+                CheckName = Name,
+                Status = HealthStatus.Critical,
+                Message = $"Connectivity check failed: {ex.Message}",
+                Exception = ex,
+                RemediationHint = "Check network configuration and orchestrator status",
+                RunbookUrl = "https://docs.stella-ops.org/runbooks/agent-connectivity"
+            };
+        }
+    }
+}
+
+public sealed class DockerConnectivityCheck : IAgentHealthCheck
+{
+    public string Name => "Docker Connectivity";
+    public string Category => "Runtime";
+
+    public async Task<HealthCheckResult> ExecuteAsync(CancellationToken ct)
+    {
+        try
+        {
+            var version = await _dockerClient.GetVersionAsync(ct);
+
+            // Check minimum version; drop "-ce"/"+build" style suffixes that System.Version cannot parse
+            var minVersion = new Version(20, 10, 0);
+            var parsed = Version.TryParse(version.Version.Split('-', '+')[0], out var currentVersion);
+
+            if (parsed && currentVersion < minVersion) // an unparseable version is not treated as a failure
+            {
+                return new HealthCheckResult
+                {
+                    CheckName = Name,
+                    Status = HealthStatus.Warning,
+                    Message = $"Docker version {version.Version} is below recommended {minVersion}",
+                    RemediationHint = "Upgrade Docker to version 20.10 or later",
+                    Data = new Dictionary<string, object>
+                    {
+                        ["docker_version"] = version.Version,
+                        ["api_version"] = version.ApiVersion,
+                        ["min_recommended"] = minVersion.ToString()
+                    }
+                };
+            }
+
+            return new HealthCheckResult
+            {
+                CheckName = Name,
+                Status = HealthStatus.Healthy,
+                Message = $"Docker {version.Version} connected",
+                Data = new Dictionary<string, object>
+                {
+                    ["docker_version"] = version.Version,
+                    ["api_version"] = version.ApiVersion,
+                    ["os"] = version.Os,
+                    ["arch"] = version.Arch
+                }
+            };
+        }
+        catch (Exception ex)
+        {
+            return new HealthCheckResult
+            {
+                CheckName = Name,
+                Status = HealthStatus.Critical,
+                Message = $"Docker connectivity failed: {ex.Message}",
+                Exception = ex,
+                RemediationHint = "Ensure Docker daemon is running and agent has permission to access Docker socket",
+                RunbookUrl = "https://docs.stella-ops.org/runbooks/agent-docker-connectivity"
+            };
+        }
+    }
+}
+```
+
+### 5. Remediation Engine
+
+Guided problem resolution:
+
+```csharp
+public sealed class RemediationEngine
+{
+    public ImmutableArray<RemediationStep> GetRemediationSteps(
+        HealthCheckResult result)
+    {
+        var steps = new List<RemediationStep>();
+
+        // Match result to known remediation patterns (only the first matching pattern is used)
+        var pattern = _patterns.FirstOrDefault(p => p.Matches(result));
+
+        if (pattern != null)
+        {
+            steps.AddRange(pattern.Steps);
+        }
+
+        // Add generic remediation based on status
+        if (result.Status == HealthStatus.Critical)
+        {
+            steps.Add(new RemediationStep
+            {
+                Id = "check-logs",
+                Priority = RemediationPriority.High,
+                Title = "Check Agent Logs",
+                Description = "Review agent logs for detailed error information",
+                Command = "stella agent logs --tail 100",
+                RunbookUrl = result.RunbookUrl
+            });
+        }
+
+        return steps.ToImmutableArray();
+    }
+
+    private readonly ImmutableArray<RemediationPattern> _patterns = new[]
+    {
+        new RemediationPattern
+        {
+            CheckName = "Certificate Expiry",
+            StatusMatch = HealthStatus.Critical,
+            Steps = new[]
+            {
+                new RemediationStep
+                {
+                    Id = "renew-cert",
+                    Priority = RemediationPriority.Critical,
+                    Title = "Renew Agent Certificate",
+                    Description = "Agent certificate has expired and must be renewed",
+                    Command = "stella agent renew-cert --force",
+                    Automated = true
+                },
+                new RemediationStep
+                {
+                    Id = "restart-agent",
+                    Priority = RemediationPriority.High,
+                    Title = "Restart Agent",
+                    Description = "Restart agent to apply new certificate",
+                    Command = "systemctl restart stella-agent",
+                    Automated = false
+                }
+            }
+        },
+        new RemediationPattern
+        {
+            CheckName = "Orchestrator Connectivity",
+            MessageContains = "DNS resolution failed",
+            Steps = new[]
+            {
+                new RemediationStep
+                {
+                    Id = "check-dns",
+                    Priority = RemediationPriority.Critical,
+                    Title = "Verify DNS Configuration",
+                    Description = "Check that DNS servers are configured and reachable",
+                    Command = "cat /etc/resolv.conf && nslookup orchestrator.example.com",
+                    Automated = false
+                },
+                new RemediationStep
+                {
+                    Id = "check-hosts",
+                    Priority = RemediationPriority.High,
+                    Title = "Check /etc/hosts",
+                    Description = "Verify no conflicting entries in hosts file",
+                    Command = "grep orchestrator /etc/hosts",
+                    Automated = false
+                }
+            }
+        },
+        new RemediationPattern
+        {
+            CheckName = "Docker Connectivity",
+            Steps = new[]
+            {
+                new RemediationStep
+                {
+                    Id = "check-docker-daemon",
+                    Priority = RemediationPriority.Critical,
+                    Title = "Check Docker Daemon",
+                    Description = "Verify Docker daemon is running",
+                    Command = "systemctl status docker",
+                    Automated = false
+                },
+                new RemediationStep
+                {
+                    Id = "check-docker-socket",
+                    Priority = RemediationPriority.High,
+                    Title = "Check Docker Socket Permissions",
+                    Description = "Verify agent has access to Docker socket",
+                    Command = "ls -la /var/run/docker.sock && groups stella-agent",
+                    Automated = false
+                }
+            }
+        }
+    }.ToImmutableArray();
+}
+
+public sealed record RemediationStep
+{
+ public string Id { get; init; } // Key AgentDoctor uses to de-duplicate steps
+ public RemediationPriority Priority { get; init; } // AgentDoctor orders steps by this, descending
+ public string Title { get; init; } // Short name of the step
+ public string Description { get; init; } // What the step is for
+ public string? Command { get; init; } // Optional command line for the step
+ public string? RunbookUrl { get; init; } // Optional runbook link
+ public bool Automated { get; init; } // NOTE(review): presumably "can run without an operator" — confirm
+ public TimeSpan? EstimatedDuration { get; init; } // Optional; none of the patterns in RemediationEngine set it
+}
+```
+
+### 6. Auto-Update Manager
+
+Safe agent binary updates:
+
+```csharp
+public sealed class AgentUpdateManager
+{
+    /// <summary>Checks for a newer agent build and applies it, rolling back if it fails.</summary>
+    /// <returns>Outcome of the attempt; failures are reported via <c>Status</c>/<c>Error</c>.</returns>
+    public async Task<UpdateResult> CheckAndApplyUpdateAsync(CancellationToken ct)
+    {
+        // A missing AutoUpdate section means disabled. The old "!x?.Enabled == true" test was
+        // false for a null section, which then crashed on _config.AutoUpdate.Channel below.
+        var autoUpdate = _config.AutoUpdate;
+        if (autoUpdate?.Enabled != true)
+        {
+            return new UpdateResult { Status = UpdateStatus.Disabled };
+        }
+
+        // Check for available update
+        var available = await _updateService.CheckForUpdateAsync(
+            _config.AgentVersion,
+            autoUpdate.Channel,
+            ct);
+
+        if (!available.HasUpdate)
+        {
+            return new UpdateResult { Status = UpdateStatus.UpToDate };
+        }
+
+        // Verify update signature
+        var verified = await _signatureVerifier.VerifyAsync(
+            available.Package,
+            available.Signature,
+            ct);
+
+        if (!verified)
+        {
+            _logger.LogError("Update signature verification failed");
+            return new UpdateResult
+            {
+                Status = UpdateStatus.VerificationFailed,
+                Error = "Package signature verification failed"
+            };
+        }
+
+        // Check if update window is allowed
+        if (!IsInUpdateWindow())
+        {
+            _logger.LogInformation(
+                "Update available but outside update window, scheduling for {Window}",
+                autoUpdate.MaintenanceWindow);
+
+            return new UpdateResult
+            {
+                Status = UpdateStatus.Scheduled,
+                ScheduledFor = GetNextMaintenanceWindow()
+            };
+        }
+
+        // Drain active tasks unless disabled via AutoUpdateConfig.DrainBeforeUpdate
+        if (autoUpdate.DrainBeforeUpdate)
+            await DrainActiveTasksAsync(ct);
+
+        try
+        {
+            // Failures up to here happen before ApplyUpdateAsync runs, so no rollback is attempted
+            var packagePath = await DownloadPackageAsync(available, ct);
+            var rollbackPoint = await CreateRollbackPointAsync(ct);
+
+            try
+            {
+                await ApplyUpdateAsync(packagePath, ct);
+
+                // Verify new version starts correctly
+                var healthCheck = await VerifyNewVersionAsync(ct);
+                if (!healthCheck.Healthy)
+                {
+                    _logger.LogError("New version health check failed, rolling back");
+                    await RollbackAsync(rollbackPoint, ct);
+                    return new UpdateResult { Status = UpdateStatus.RolledBack, Error = healthCheck.Error };
+                }
+            }
+            catch (Exception ex)
+            {
+                // Restore the captured rollback point even when ct has already been cancelled
+                _logger.LogError(ex, "Update failed, attempting rollback");
+                await RollbackAsync(rollbackPoint, CancellationToken.None);
+                return new UpdateResult { Status = UpdateStatus.Failed, Error = ex.Message };
+            }
+
+            return new UpdateResult
+            {
+                Status = UpdateStatus.Applied,
+                PreviousVersion = _config.AgentVersion,
+                NewVersion = available.Version
+            };
+        }
+        catch (Exception ex)
+        {
+            _logger.LogError(ex, "Update failed before it was applied, or rollback failed");
+            return new UpdateResult { Status = UpdateStatus.Failed, Error = ex.Message };
+        }
+    }
+}
+
+public sealed record AutoUpdateConfig
+{
+ public bool Enabled { get; init; } = false;
+ public UpdateChannel Channel { get; init; } = UpdateChannel.Stable;
+ public string? MaintenanceWindow { get; init; } // Cron expression
+ public bool DrainBeforeUpdate { get; init; } = true;
+ public TimeSpan DrainTimeout { get; init; } = TimeSpan.FromMinutes(5);
+ public int MaxRollbackVersions { get; init; } = 3;
+}
+
+public enum UpdateChannel
+{
+ Stable,
+ Beta,
+ Canary
+}
+```
+
+### 7. Operator CLI Commands
+
+Streamlined operational commands:
+
+```csharp
+public sealed class AgentOperatorCommands
+{
+    // Bootstrap new agent: prints the bootstrap token and install instructions, exit code 0
+    // stella agent bootstrap --name prod-agent-01 --env production --platform linux
+    [Command("agent bootstrap")]
+    public async Task<int> BootstrapAsync(
+        [Option] string name,
+        [Option] string env,
+        [Option] Platform platform = Platform.Linux,
+        [Option] string[]? capabilities = null,
+        [Option] string? cluster = null)
+    {
+        var result = await _bootstrap.BootstrapAgentAsync(new BootstrapRequest
+        {
+            AgentName = name,
+            Environment = env,
+            Platform = platform,
+            Capabilities = capabilities?.ToImmutableArray() ?? ImmutableArray<string>.Empty,
+            ClusterId = cluster
+        }, _ct);
+
+        Console.WriteLine("Bootstrap token generated (expires in 15 minutes):");
+        Console.WriteLine();
+        Console.WriteLine($" Token: {result.Token}");
+        Console.WriteLine();
+        Console.WriteLine("One-line installer:");
+        Console.WriteLine($" {result.InstallCommand}");
+        Console.WriteLine();
+        Console.WriteLine("Or download the install script:");
+        Console.WriteLine($" stella agent install-script --token {result.Token} --output install.sh");
+
+        return 0;
+    }
+
+    // Run diagnostics and, with --fix, apply the remediations marked as automated
+    // stella agent doctor [--category security] [--fix]
+    // Exit code: 0 if the report is healthy, otherwise 1 (the report predates any applied fix)
+    [Command("agent doctor")]
+    public async Task<int> DoctorAsync(
+        [Option] string? agentId = null, [Option] string[]? categories = null,
+        [Option] bool fix = false, [Option] OutputFormat format = OutputFormat.Table)
+    {
+        var options = new DiagnosticOptions
+        {
+            Categories = categories?.ToImmutableArray() ?? ImmutableArray<string>.Empty,
+            IncludeRemediations = true
+        };
+
+        var report = agentId != null
+            ? await _doctor.RunRemoteDiagnosticsAsync(agentId, options, _ct)
+            : await _doctor.RunDiagnosticsAsync(options, _ct);
+
+        // Display results
+        RenderDiagnosticReport(report, format);
+
+        // Optionally apply automated fixes (filter once instead of enumerating twice)
+        // NOTE(review): with --agent-id the report is remote; confirm ApplyAsync targets that agent
+        var automated = report.Remediations.Where(r => r.Automated).ToList();
+        if (fix && automated.Count > 0)
+        {
+            Console.WriteLine();
+            Console.WriteLine("Applying automated remediations...");
+            foreach (var remediation in automated)
+            {
+                Console.WriteLine($" - {remediation.Title}");
+                await _remediation.ApplyAsync(remediation, _ct);
+            }
+        }
+
+        return report.OverallStatus == HealthStatus.Healthy ? 0 : 1;
+    }
+
+ // View agent configuration
+ // stella agent config [--agent-id xyz] [--diff]
+ [Command("agent config")]
+ public async Task ConfigAsync(
+ [Option] string? agentId = null,
+ [Option] bool diff = false,
+ [Option] OutputFormat format = OutputFormat.Yaml)
+ {
+ if (diff)
+ {
+ var drift = await _configManager.DetectDriftAsync(_ct);
+ RenderConfigDiff(drift, format);
+ return drift.HasDrift ? 1 : 0;
+ }
+
+ var config = await _configManager.GetConfigurationAsync(agentId, _ct);
+ RenderConfiguration(config, format);
+ return 0;
+ }
+
+ // Apply configuration changes
+ // stella agent apply -f agent-config.yaml
+ [Command("agent apply")]
+ public async Task ApplyAsync(
+ [Option('f')] string configFile)
+ {
+ var config = await LoadConfigurationAsync(configFile);
+ var validation = await _configManager.ValidateAsync(config, _ct);
+
+ if (!validation.IsValid)
+ {
+ Console.WriteLine("Configuration validation failed:");
+ foreach (var error in validation.Errors)
+ {
+ Console.WriteLine($" - {error}");
+ }
+ return 1;
+ }
+
+ var result = await _configManager.ApplyConfigurationAsync(config, _ct);
+
+ if (result.Status == ConfigStatus.Applied)
+ {
+ Console.WriteLine($"Configuration applied successfully ({result.AppliedChanges.Length} changes)");
+ return 0;
+ }
+
+ Console.WriteLine($"Configuration apply failed: {result.Status}");
+ return 1;
+ }
+
+ // Renew certificate
+ // stella agent renew-cert [--force]
+ [Command("agent renew-cert")]
+ public async Task RenewCertAsync(
+ [Option] bool force = false)
+ {
+ var result = await _certManager.RenewCertificateAsync(force, _ct);
+
+ if (result.Status == CertificateStatus.Renewed)
+ {
+ Console.WriteLine($"Certificate renewed successfully");
+ Console.WriteLine($" New expiry: {result.ExpiresAt:u}");
+ return 0;
+ }
+
+ Console.WriteLine($"Certificate renewal failed: {result.Error}");
+ return 1;
+ }
+
+ // View agent logs
+ // stella agent logs [--tail 100] [--follow] [--level error]
+ [Command("agent logs")]
+ public async Task LogsAsync(
+ [Option] string? agentId = null,
+ [Option] int tail = 50,
+ [Option] bool follow = false,
+ [Option] LogLevel? level = null)
+ {
+ await foreach (var entry in _logService.StreamLogsAsync(
+ agentId, tail, follow, level, _ct))
+ {
+ RenderLogEntry(entry);
+ }
+
+ return 0;
+ }
+
+ // Force update
+ // stella agent update [--version x.y.z] [--force]
+ [Command("agent update")]
+ public async Task UpdateAsync(
+ [Option] string? version = null,
+ [Option] bool force = false)
+ {
+ var result = await _updateManager.UpdateToVersionAsync(version, force, _ct);
+
+ Console.WriteLine($"Update status: {result.Status}");
+ if (result.Status == UpdateStatus.Applied)
+ {
+ Console.WriteLine($" Previous: {result.PreviousVersion}");
+ Console.WriteLine($" Current: {result.NewVersion}");
+ }
+
+ return result.Status == UpdateStatus.Applied ? 0 : 1;
+ }
+}
+```
+
+---
+
+## Doctor Plugin for Server-Side
+
+Central Doctor plugin for agent fleet health:
+
+```csharp
+// src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/AgentHealthPlugin.cs
+public sealed class AgentHealthPlugin : IDoctorPlugin
+{
+    public string Name => "Agent Health";
+    public string Description => "Monitors agent fleet health and connectivity";
+
+    // Created once per plugin instance. The previous expression-bodied property ("=>")
+    // allocated nine new check objects and a new array on every access.
+    public ImmutableArray<IDoctorCheck> Checks { get; } = ImmutableArray.Create<IDoctorCheck>(
+        new AgentHeartbeatFreshnessCheck(),
+        new AgentCertificateExpiryCheck(),
+        new AgentVersionConsistencyCheck(),
+        new AgentCapacityCheck(),
+        new StaleAgentCheck(),
+        new AgentClusterHealthCheck(),
+        new TaskQueueBacklogCheck(),
+        new FailedTaskRateCheck(),
+        new AgentResourceUtilizationCheck());
+}
+
+public sealed class AgentHeartbeatFreshnessCheck : IDoctorCheck
+{
+    public string Name => "Agent Heartbeat Freshness";
+    public CheckSeverity Severity => CheckSeverity.Critical;
+
+    /// <summary>Critical if any active agent is silent for over 5 minutes, warning over 2 minutes.</summary>
+    public async Task<DoctorCheckResult> ExecuteAsync(CancellationToken ct)
+    {
+        var agents = await _agentStore.GetAllAsync(ct);
+        // Deactivated agents are skipped, so they must not be counted in the summary either
+        var activeAgents = agents.Where(a => a.Status != AgentStatus.Deactivated).ToList();
+        var staleAgents = new List<string>();
+        var warningAgents = new List<string>();
+        var now = _timeProvider.GetUtcNow();
+
+        foreach (var agent in activeAgents)
+        {
+            var heartbeatAge = now - agent.LastHeartbeat;
+
+            if (heartbeatAge > TimeSpan.FromMinutes(5))
+            {
+                staleAgents.Add($"{agent.Name} (last heartbeat: {heartbeatAge.TotalMinutes:F0}m ago)");
+            }
+            else if (heartbeatAge > TimeSpan.FromMinutes(2))
+            {
+                warningAgents.Add($"{agent.Name} (last heartbeat: {heartbeatAge.TotalSeconds:F0}s ago)");
+            }
+        }
+
+        if (staleAgents.Count > 0)
+        {
+            return new DoctorCheckResult
+            {
+                Status = CheckStatus.Critical,
+                Message = $"{staleAgents.Count} agent(s) have stale heartbeats",
+                Details = staleAgents,
+                Remediation = "Check agent connectivity and status. Run 'stella agent doctor --agent-id <agent-id>' for diagnostics."
+            };
+        }
+
+        if (warningAgents.Count > 0)
+        {
+            return new DoctorCheckResult
+            {
+                Status = CheckStatus.Warning,
+                Message = $"{warningAgents.Count} agent(s) have delayed heartbeats",
+                Details = warningAgents
+            };
+        }
+
+        return new DoctorCheckResult { Status = CheckStatus.Healthy, Message = $"All {activeAgents.Count} agents have fresh heartbeats" };
+    }
+}
+
+public sealed class AgentCertificateExpiryCheck : IDoctorCheck
+{
+    public string Name => "Agent Certificate Expiry";
+    public CheckSeverity Severity => CheckSeverity.High;
+
+    /// <summary>Critical if any agent certificate has expired, warning if one expires within 7 days.</summary>
+    public async Task<DoctorCheckResult> ExecuteAsync(CancellationToken ct)
+    {
+        var agents = await _agentStore.GetAllAsync(ct);
+        var expiringSoon = new List<string>();
+        var expired = new List<string>();
+        var now = _timeProvider.GetUtcNow();
+
+        // Skip deactivated agents, consistent with AgentHeartbeatFreshnessCheck; otherwise the
+        // expired certificate of a retired agent keeps this check critical indefinitely.
+        foreach (var agent in agents.Where(a => a.Status != AgentStatus.Deactivated))
+        {
+            var expiresIn = agent.CertificateExpiry - now;
+
+            if (expiresIn <= TimeSpan.Zero)
+            {
+                expired.Add($"{agent.Name} (expired {-expiresIn.TotalDays:F0} days ago)");
+            }
+            else if (expiresIn <= TimeSpan.FromDays(7))
+            {
+                expiringSoon.Add($"{agent.Name} (expires in {expiresIn.TotalDays:F0} days)");
+            }
+        }
+
+        if (expired.Count > 0)
+        {
+            return new DoctorCheckResult
+            {
+                Status = CheckStatus.Critical,
+                Message = $"{expired.Count} agent(s) have expired certificates",
+                Details = expired,
+                Remediation = "Renew certificates immediately: 'stella agent renew-cert --agent-id <agent-id>'"
+            };
+        }
+
+        if (expiringSoon.Count > 0)
+        {
+            return new DoctorCheckResult
+            {
+                Status = CheckStatus.Warning,
+                Message = $"{expiringSoon.Count} agent(s) have certificates expiring soon",
+                Details = expiringSoon,
+                Remediation = "Schedule certificate renewal before expiry"
+            };
+        }
+
+        return new DoctorCheckResult { Status = CheckStatus.Healthy, Message = "All agent certificates are valid" };
+    }
+}
+```
+
+---
+
+## Configuration Examples
+
+### Minimal Configuration (Bootstrap)
+
+```yaml
+# Bootstrapped agent - minimal config required
+agent:
+ name: prod-agent-01
+ orchestrator_url: https://orchestrator.example.com:8443
+ # Everything else is auto-configured via bootstrap
+```
+
+### Full Configuration
+
+```yaml
+agent:
+ # Identity
+ id: a1b2c3d4-e5f6-7890-abcd-ef1234567890
+ name: prod-agent-01
+ environment: production
+ labels:
+ region: us-east-1
+ tier: web
+
+ # Connection
+ orchestrator_url: https://orchestrator.example.com:8443
+ heartbeat_interval: 30s
+ reconnect_backoff: 5s
+ max_reconnect_attempts: 10
+
+ # Capabilities
+ capabilities:
+ - docker
+ - compose
+ - health_check
+
+ # Resources
+ max_concurrent_tasks: 5
+ default_task_timeout: 30m
+ resource_limits:
+ cpu_percent: 80
+ memory_percent: 80
+ disk_percent: 90
+
+ # Certificates
+ certificates:
+ source: auto_provision # auto_provision | file | vault
+ auto_renew: true
+ renewal_threshold: 7d
+
+ # Clustering (optional)
+ cluster:
+ id: prod-cluster-01
+ mode: active_active # active_passive | active_active | sharded
+ min_members: 2
+
+ # Observability
+ observability:
+ metrics:
+ enabled: true
+ port: 9090
+ logging:
+ level: info
+ format: json
+ tracing:
+ enabled: true
+ endpoint: http://jaeger:14268/api/traces
+
+ # Auto-update (optional)
+ auto_update:
+ enabled: true
+ channel: stable # stable | beta | canary
+ maintenance_window: "0 3 * * *" # 3 AM daily
+ drain_before_update: true
+```
+
+---
+
+## CLI Quick Reference
+
+```bash
+# Bootstrap new agent
+stella agent bootstrap --name prod-01 --env production --platform linux
+
+# Run health diagnostics
+stella agent doctor
+stella agent doctor --category security --fix
+stella agent doctor --agent-id abc123 --format json
+
+# View/apply configuration
+stella agent config
+stella agent config --diff
+stella agent apply -f agent-config.yaml
+
+# Certificate management
+stella agent renew-cert
+stella agent renew-cert --force
+
+# Logs and debugging
+stella agent logs --tail 100
+stella agent logs --follow --level error
+
+# Updates
+stella agent update
+stella agent update --version 2.1.0
+
+# Status and health
+stella agent status
+stella agent list --env production
+stella agent health abc123
+```
+
+---
+
+## Metrics & Observability
+
+### Prometheus Metrics
+
+```
+# Bootstrap
+stella_agent_bootstrap_total{environment, platform}
+stella_agent_bootstrap_success_total{environment}
+stella_agent_bootstrap_failed_total{environment, reason}
+
+# Configuration
+stella_agent_config_drift_detected_total{agent_id}
+stella_agent_config_apply_total{agent_id, status}
+
+# Certificates
+stella_agent_certificate_expiry_seconds{agent_id}
+stella_agent_certificate_renewal_total{agent_id, status}
+
+# Health Checks
+stella_agent_health_check_total{agent_id, check_name, status}
+stella_agent_health_score{agent_id}
+
+# Updates
+stella_agent_update_available{agent_id, current_version, available_version}
+stella_agent_update_applied_total{agent_id, status}
+stella_agent_update_rollback_total{agent_id}
+```
+
+---
+
+## Test Strategy
+
+### Unit Tests
+- Bootstrap token generation and validation
+- Configuration diff computation
+- Certificate lifecycle logic
+- Health check execution
+- Remediation matching
+
+### Integration Tests
+- Full bootstrap flow
+- Configuration apply with rollback
+- Certificate renewal
+- Auto-update with rollback
+- Doctor diagnostics
+
+### E2E Tests
+- Bootstrap to running agent
+- Multi-agent cluster formation
+- Failover scenarios
+- Update and rollback scenarios
+
+---
+
+## Migration Path
+
+### Phase 1: Bootstrap Service (Week 1-2)
+- Bootstrap token service
+- One-line installer generation
+- Platform-specific install scripts
+
+### Phase 2: Configuration Manager (Week 3-4)
+- Declarative configuration model
+- Drift detection
+- Apply with rollback
+
+### Phase 3: Certificate Manager (Week 5-6)
+- Auto-provisioning
+- Auto-renewal
+- Multi-source support (Vault, ACME, etc.)
+
+### Phase 4: Agent Doctor (Week 7-8)
+- Core health checks
+- Remediation engine
+- CLI integration
+
+### Phase 5: Doctor Plugin (Week 9-10)
+- Server-side fleet health
+- Dashboard integration
+- Alerting rules
+
+### Phase 6: Auto-Update (Week 11-12)
+- Update service
+- Safe rollback
+- Maintenance windows
diff --git a/docs/modules/release-orchestrator/enhancements/agent-resilience.md b/docs/modules/release-orchestrator/enhancements/agent-resilience.md
new file mode 100644
index 000000000..136dbecef
--- /dev/null
+++ b/docs/modules/release-orchestrator/enhancements/agent-resilience.md
@@ -0,0 +1,1111 @@
+# Agent Resilience
+
+## Overview
+
+Agent Resilience transforms the deployment agent architecture into a highly available, fault-tolerant system. This enhancement provides agent clustering for high availability, automatic failover during deployments, offline task queuing, and self-healing capabilities.
+
+The goal is for deployments to complete successfully even when individual agents fail, network partitions occur, or agents are taken offline for maintenance.
+
+---
+
+## Design Principles
+
+1. **Zero Downtime Deployments**: Agent failures don't block deployments
+2. **Automatic Recovery**: Self-healing without operator intervention
+3. **Graceful Degradation**: Run at reduced capacity rather than fail completely
+4. **Offline Resilience**: Queue tasks for disconnected agents
+5. **Transparent Failover**: Seamless handoff between agents
+6. **Predictable Behavior**: Deterministic failover decisions
+
+---
+
+## Architecture
+
+### Component Overview
+
+```
+┌────────────────────────────────────────────────────────────────────────┐
+│ Agent Resilience System │
+├────────────────────────────────────────────────────────────────────────┤
+│ │
+│ ┌──────────────────┐ ┌───────────────────┐ ┌─────────────────┐ │
+│ │ AgentCluster │───▶│ FailoverManager │───▶│ TaskRouter │ │
+│ │ Manager │ │ │ │ │ │
+│ └──────────────────┘ └───────────────────┘ └─────────────────┘ │
+│ │ │ │ │
+│ ▼ ▼ ▼ │
+│ ┌──────────────────┐ ┌───────────────────┐ ┌─────────────────┐ │
+│ │ HealthMonitor │ │ LeaderElection │ │ TaskQueue │ │
+│ │ │ │ │ │ │ │
+│ └──────────────────┘ └───────────────────┘ └─────────────────┘ │
+│ │ │ │ │
+│ ▼ ▼ ▼ │
+│ ┌──────────────────┐ ┌───────────────────┐ ┌─────────────────┐ │
+│ │ SelfHealer │ │ StateSync │ │ RetryManager │ │
+│ │ │ │ │ │ │ │
+│ └──────────────────┘ └───────────────────┘ └─────────────────┘ │
+│ │
+└────────────────────────────────────────────────────────────────────────┘
+```
+
+### Key Components
+
+#### 1. AgentClusterManager
+
+Manages agent clusters for high availability:
+
+```csharp
+public sealed class AgentClusterManager
+{
+    /// <summary>Creates and persists a new cluster, initially without leader or standbys.</summary>
+    public async Task<AgentCluster> CreateClusterAsync(AgentClusterConfig config, CancellationToken ct)
+    {
+        var cluster = new AgentCluster
+        {
+            Id = Guid.NewGuid(),
+            Name = config.Name,
+            TargetGroupId = config.TargetGroupId,
+            MinimumAgents = config.MinimumAgents,
+            DesiredAgents = config.DesiredAgents,
+            StandbyIds = ImmutableArray<Guid>.Empty, // a default ImmutableArray throws when used
+            ReplicationMode = config.ReplicationMode,
+            FailoverPolicy = config.FailoverPolicy,
+            CreatedAt = _timeProvider.GetUtcNow()
+        };
+
+        await _clusterStore.SaveAsync(cluster, ct);
+        return cluster;
+    }
+
+    /// <summary>Lists the cluster's agents together with their role, status and load.</summary>
+    public async Task<IReadOnlyList<AgentMember>> GetClusterMembersAsync(Guid clusterId, CancellationToken ct)
+    {
+        var cluster = await _clusterStore.GetAsync(clusterId, ct);
+        var agents = await _agentStore.GetByClusterAsync(clusterId, ct);
+
+        return agents.Select(a => new AgentMember
+        {
+            AgentId = a.Id,
+            HostName = a.HostName,
+            Status = a.Status,
+            Role = DetermineRole(a, cluster),
+            LastHeartbeat = a.LastHeartbeat,
+            Capabilities = a.Capabilities,
+            CurrentLoad = a.CurrentTaskCount,
+            MaxLoad = a.MaxConcurrentTasks
+        }).ToList();
+    }
+
+    private AgentRole DetermineRole(Agent agent, AgentCluster cluster)
+    {
+        if (cluster.LeaderId == agent.Id)
+            return AgentRole.Leader;
+
+        // IsDefaultOrEmpty guards clusters whose StandbyIds was never initialized
+        if (!cluster.StandbyIds.IsDefaultOrEmpty && cluster.StandbyIds.Contains(agent.Id))
+            return AgentRole.Standby;
+
+        return AgentRole.Member;
+    }
+}
+
+public sealed record AgentCluster
+{
+    public Guid Id { get; init; }
+    public string Name { get; init; }
+    public Guid TargetGroupId { get; init; }
+
+    // Membership. StandbyIds defaults to Empty: a default ImmutableArray throws when enumerated.
+    public int MinimumAgents { get; init; }
+    public int DesiredAgents { get; init; }
+    public Guid? LeaderId { get; init; }
+    public ImmutableArray<Guid> StandbyIds { get; init; } = ImmutableArray<Guid>.Empty;
+
+    // Configuration
+    public ReplicationMode ReplicationMode { get; init; }
+    public FailoverPolicy FailoverPolicy { get; init; }
+
+    // Status
+    public ClusterStatus Status { get; init; }
+    public int HealthyAgentCount { get; init; }
+    public DateTimeOffset CreatedAt { get; init; }
+}
+
+public enum ReplicationMode
+{
+ ActivePassive, // One active, others standby
+ ActiveActive, // All agents handle tasks
+ Sharded // Tasks partitioned across agents
+}
+
+public enum AgentRole
+{
+ Leader, // Primary agent (ActivePassive mode)
+ Standby, // Ready to take over
+ Member // Active participant (ActiveActive mode)
+}
+```
+
+#### 2. HealthMonitor
+
+Monitors agent health with sophisticated detection:
+
+```csharp
+public sealed class HealthMonitor
+{
+    private readonly ConcurrentDictionary<Guid, AgentHealthState> _healthStates = new();
+
+    /// <summary>Records a heartbeat, re-assesses the agent's health and emits metrics.</summary>
+    public async Task ProcessHeartbeatAsync(AgentHeartbeat heartbeat, CancellationToken ct)
+    {
+        var state = _healthStates.GetOrAdd(heartbeat.AgentId, _ => new AgentHealthState());
+
+        state.LastHeartbeat = heartbeat.Timestamp;
+        state.ReportedHealth = heartbeat.Health;
+        state.CurrentLoad = heartbeat.TaskCount;
+        state.ResourceMetrics = heartbeat.ResourceMetrics;
+        state.FailedAt = null; // the agent is back, so DetectFailuresAsync may report it again
+
+        // Update health assessment
+        state.AssessedHealth = await AssessHealthAsync(heartbeat, state, ct);
+
+        // Check for degradation
+        if (state.AssessedHealth < HealthLevel.Healthy)
+            await HandleDegradationAsync(heartbeat.AgentId, state, ct);
+
+        // Emit metrics
+        _metricsEmitter.EmitAgentHealth(heartbeat.AgentId, state);
+    }
+
+    /// <summary>Combines five weighted factors, each scored 0.0-1.0, into a HealthLevel.</summary>
+    private async Task<HealthLevel> AssessHealthAsync(
+        AgentHeartbeat heartbeat, AgentHealthState state, CancellationToken ct)
+    {
+        var factors = new List<HealthFactor>();
+
+        // 1. Self-reported health. HealthLevel is a 0-100 enum; scale it to 0.0-1.0 like the
+        //    other factors, otherwise a "Healthy" report alone adds 20.0 to the weighted sum.
+        factors.Add(new HealthFactor("self_reported", (int)heartbeat.Health / 100.0, 0.2));
+
+        // 2. Heartbeat regularity
+        var heartbeatScore = CalculateHeartbeatScore(state);
+        factors.Add(new HealthFactor("heartbeat_regularity", heartbeatScore, 0.3));
+
+        // 3. Task completion rate
+        var completionRate = await GetTaskCompletionRateAsync(heartbeat.AgentId, ct);
+        factors.Add(new HealthFactor("task_completion", completionRate, 0.25));
+
+        // 4. Resource utilization
+        var resourceScore = CalculateResourceScore(heartbeat.ResourceMetrics);
+        factors.Add(new HealthFactor("resource_utilization", resourceScore, 0.15));
+
+        // 5. Error rate
+        var errorRate = await GetErrorRateAsync(heartbeat.AgentId, ct);
+        factors.Add(new HealthFactor("error_rate", 1.0 - errorRate, 0.1));
+
+        // Weighted average
+        var overallScore = factors.Sum(f => f.Score * f.Weight);
+
+        return overallScore switch
+        {
+            >= 0.9 => HealthLevel.Healthy,
+            >= 0.7 => HealthLevel.Degraded,
+            >= 0.5 => HealthLevel.Warning,
+            >= 0.3 => HealthLevel.Critical,
+            _ => HealthLevel.Failed
+        };
+    }
+
+    /// <summary>Sweeps all tracked agents and reacts to overdue heartbeats.</summary>
+    public async Task DetectFailuresAsync(CancellationToken ct)
+    {
+        var now = _timeProvider.GetUtcNow();
+
+        foreach (var (agentId, state) in _healthStates)
+        {
+            // Report each outage once: re-publishing AgentFailedEvent every sweep would re-trigger failover
+            if (state.FailedAt.HasValue)
+                continue;
+
+            var timeSinceHeartbeat = now - state.LastHeartbeat;
+            if (timeSinceHeartbeat > _config.FailureThreshold)
+                await HandleAgentFailureAsync(agentId, state, ct);
+            else if (timeSinceHeartbeat > _config.WarningThreshold)
+                await HandleAgentWarningAsync(agentId, state, ct);
+        }
+    }
+
+    private async Task HandleAgentFailureAsync(Guid agentId, AgentHealthState state, CancellationToken ct)
+    {
+        _logger.LogWarning("Agent {AgentId} detected as failed", agentId);
+
+        // Update state
+        state.AssessedHealth = HealthLevel.Failed;
+        state.FailedAt = _timeProvider.GetUtcNow();
+        try
+        {
+            // Notify failover manager, then mark agent as offline
+            await _eventPublisher.PublishAsync(new AgentFailedEvent(agentId, state), ct);
+            await _agentStore.UpdateStatusAsync(agentId, AgentStatus.Offline, ct);
+        }
+        catch
+        {
+            state.FailedAt = null; // not fully reported: let the next sweep retry
+            throw;
+        }
+    }
+}
+
+public sealed class AgentHealthState
+{
+ public DateTimeOffset LastHeartbeat { get; set; }
+ public HealthLevel ReportedHealth { get; set; }
+ public HealthLevel AssessedHealth { get; set; }
+ public int CurrentLoad { get; set; }
+ public ResourceMetrics ResourceMetrics { get; set; }
+ public DateTimeOffset? FailedAt { get; set; }
+ public int ConsecutiveFailures { get; set; }
+}
+
+public enum HealthLevel
+{
+ Healthy = 100,
+ Degraded = 75,
+ Warning = 50,
+ Critical = 25,
+ Failed = 0
+}
+```
+
+#### 3. FailoverManager
+
+Orchestrates failover between agents:
+
+```csharp
+public sealed class FailoverManager
+{
+    /// <summary>Moves the failed agent's work to a healthy cluster member.</summary>
+    /// <returns>A result describing the outcome; errors are captured in it rather than thrown.</returns>
+    public async Task<FailoverResult> PerformFailoverAsync(FailoverRequest request, CancellationToken ct)
+    {
+        // FailoverResult is an init-only record, so every update below creates a copy via "with"
+        var result = new FailoverResult
+        {
+            RequestId = Guid.NewGuid(),
+            FailedAgentId = request.FailedAgentId,
+            StartedAt = _timeProvider.GetUtcNow()
+        };
+
+        try
+        {
+            // 1. Find cluster
+            var cluster = await _clusterStore.GetByAgentAsync(request.FailedAgentId, ct);
+            if (cluster == null)
+            {
+                return result with { Status = FailoverStatus.NotInCluster };
+            }
+
+            // 2. Select failover target
+            var target = await SelectFailoverTargetAsync(cluster, request, ct);
+            if (target == null)
+            {
+                await HandleNoTargetAsync(cluster, request, ct);
+                return result with { Status = FailoverStatus.NoTargetAvailable };
+            }
+
+            result = result with { TargetAgentId = target.AgentId };
+
+            // 3. Transfer in-flight tasks; count as we go so a partial failure reports real progress
+            var tasksToTransfer = await GetInFlightTasksAsync(request.FailedAgentId, ct);
+            foreach (var task in tasksToTransfer)
+            {
+                await TransferTaskAsync(task, target.AgentId, ct);
+                result = result with { TasksTransferred = result.TasksTransferred + 1 };
+            }
+
+            // 4. Update cluster membership
+            if (cluster.LeaderId == request.FailedAgentId)
+            {
+                await PromoteToLeaderAsync(cluster, target.AgentId, ct);
+            }
+
+            // 5. Update target assignments
+            await ReassignTargetsAsync(request.FailedAgentId, target.AgentId, ct);
+
+            result = result with
+            {
+                Status = FailoverStatus.Succeeded,
+                CompletedAt = _timeProvider.GetUtcNow()
+            };
+
+            // Emit event
+            await _eventPublisher.PublishAsync(new FailoverCompletedEvent(result), ct);
+        }
+        catch (Exception ex)
+        {
+            _logger.LogError(ex, "Failover failed for agent {AgentId}", request.FailedAgentId);
+            result = result with { Status = FailoverStatus.Failed, Error = ex.Message };
+        }
+
+        return result;
+    }
+
+    /// <summary>Picks the member that takes over from the failed agent, or null if none qualifies.</summary>
+    private async Task<AgentMember?> SelectFailoverTargetAsync(
+        AgentCluster cluster, FailoverRequest request, CancellationToken ct)
+    {
+        var members = await _clusterManager.GetClusterMembersAsync(cluster.Id, ct);
+
+        // Filter healthy agents
+        var candidates = members
+            .Where(a => a.AgentId != request.FailedAgentId)
+            .Where(a => a.Status == AgentStatus.Online)
+            .Where(a => a.HasCapability(request.RequiredCapabilities))
+            .ToList();
+
+        if (candidates.Count == 0)
+            return null;
+
+        // Relative load. An agent without capacity (MaxLoad <= 0) must sort last; the plain
+        // division gives NaN for 0/0, which OrderBy would rank first.
+        static double Utilization(AgentMember a) =>
+            a.MaxLoad > 0 ? a.CurrentLoad / (double)a.MaxLoad : double.PositiveInfinity;
+
+        // Apply selection strategy
+        return cluster.FailoverPolicy.SelectionStrategy switch
+        {
+            FailoverSelectionStrategy.Standby =>
+                candidates.FirstOrDefault(a => a.Role == AgentRole.Standby) ??
+                candidates.OrderBy(a => a.CurrentLoad).First(),
+
+            FailoverSelectionStrategy.LeastLoaded => candidates.OrderBy(Utilization).First(),
+
+            FailoverSelectionStrategy.RoundRobin => SelectRoundRobin(cluster, candidates),
+            FailoverSelectionStrategy.Affinity => SelectByAffinity(candidates, request.AffinityHints),
+
+            _ => candidates.First()
+        };
+    }
+
+ private async Task TransferTaskAsync(
+ AgentTask task,
+ Guid targetAgentId,
+ CancellationToken ct)
+ {
+ // Mark task as transferred
+ task.TransferredFrom = task.AssignedAgentId;
+ task.AssignedAgentId = targetAgentId;
+ task.TransferredAt = _timeProvider.GetUtcNow();
+
+ // Reset task state for retry
+ if (task.Status == TaskStatus.Running)
+ {
+ task.Status = TaskStatus.Pending;
+ task.RetryCount++;
+ }
+
+ await _taskStore.SaveAsync(task, ct);
+
+ // Notify target agent
+ await _agentNotifier.NotifyTaskAssignedAsync(targetAgentId, task, ct);
+ }
+}
+
+public sealed record FailoverResult
+{
+ public Guid RequestId { get; init; }
+ public Guid FailedAgentId { get; init; }
+ public Guid? TargetAgentId { get; init; }
+ public FailoverStatus Status { get; init; }
+ public int TasksTransferred { get; init; }
+ public string? Error { get; init; }
+ public DateTimeOffset StartedAt { get; init; }
+ public DateTimeOffset? CompletedAt { get; init; }
+}
+
+public enum FailoverStatus
+{
+ Succeeded,
+ NotInCluster,
+ NoTargetAvailable,
+ Failed
+}
+```
+
+#### 4. LeaderElection
+
+Manages leader election for ActivePassive clusters:
+
+```csharp
+public sealed class LeaderElection
+{
+ private readonly IDistributedLockProvider _lockProvider;
+
+    /// <summary>Ensures the cluster has a healthy leader, electing a new one if necessary.</summary>
+    public async Task RunElectionAsync(Guid clusterId, CancellationToken ct)
+    {
+        // Take the election lock before reading anything so the cluster record and the member
+        // list (status, role, load) are read under the same lock instead of partly before it.
+        await using var @lock = await _lockProvider.AcquireAsync($"cluster:{clusterId}:election", ct);
+
+        var cluster = await _clusterStore.GetAsync(clusterId, ct);
+        if (cluster == null)
+        {
+            _logger.LogWarning("Cluster {ClusterId} not found, skipping election", clusterId);
+            return;
+        }
+
+        var members = await _clusterManager.GetClusterMembersAsync(clusterId, ct);
+        var healthyMembers = members
+            .Where(m => m.Status == AgentStatus.Online)
+            .OrderByDescending(m => m.Role == AgentRole.Standby) // Prefer standbys
+            .ThenBy(m => m.CurrentLoad) // Then least loaded
+            .ToList();
+
+        if (healthyMembers.Count == 0)
+        {
+            _logger.LogWarning("No healthy members for cluster {ClusterId}", clusterId);
+            return;
+        }
+
+        // Check if current leader is healthy
+        if (healthyMembers.Any(m => m.AgentId == cluster.LeaderId))
+        {
+            _logger.LogDebug("Current leader {LeaderId} is healthy", cluster.LeaderId);
+            return;
+        }
+
+        // Elect new leader
+        var newLeader = healthyMembers[0];
+        await PromoteToLeaderAsync(cluster, newLeader.AgentId, ct);
+
+        _logger.LogInformation(
+            "Elected new leader {NewLeaderId} for cluster {ClusterId}",
+            newLeader.AgentId, clusterId);
+    }
+
+ private async Task PromoteToLeaderAsync(
+ AgentCluster cluster,
+ Guid newLeaderId,
+ CancellationToken ct)
+ {
+ var previousLeaderId = cluster.LeaderId;
+
+ // Update cluster
+ cluster = cluster with { LeaderId = newLeaderId };
+
+ // Update standby list
+ var newStandbys = cluster.StandbyIds
+ .Where(id => id != newLeaderId)
+ .ToImmutableArray();
+
+ if (previousLeaderId.HasValue)
+ {
+ // Demote previous leader to standby if still healthy
+ var previousLeader = await _agentStore.GetAsync(previousLeaderId.Value, ct);
+ if (previousLeader?.Status == AgentStatus.Online)
+ {
+ newStandbys = newStandbys.Add(previousLeaderId.Value);
+ }
+ }
+
+ cluster = cluster with { StandbyIds = newStandbys };
+ await _clusterStore.SaveAsync(cluster, ct);
+
+ // Notify agents
+ await _agentNotifier.NotifyLeaderChangeAsync(cluster.Id, newLeaderId, ct);
+
+ // Emit event
+ await _eventPublisher.PublishAsync(new LeaderElectedEvent(
+ cluster.Id, newLeaderId, previousLeaderId), ct);
+ }
+}
+```
+
+#### 5. TaskQueue
+
+Durable task queue for offline agents:
+
+```csharp
+/// <summary>
+/// Durable queue for agent tasks. Tasks are persisted through <c>ITaskQueueStore</c>,
+/// handed out highest priority first, and failed tasks are retried with exponential
+/// backoff until <c>MaxDeliveryAttempts</c> is reached, after which they are dead-lettered.
+/// </summary>
+public sealed class TaskQueue
+{
+    /// <summary>Upper bound applied to every retry delay.</summary>
+    private static readonly TimeSpan MaxBackoff = TimeSpan.FromMinutes(5);
+
+    // 2^9 s = 512 s already exceeds MaxBackoff, so larger exponents never change the result;
+    // capping here keeps Math.Pow from producing a value TimeSpan.FromSeconds cannot represent.
+    private const int MaxBackoffExponent = 9;
+
+    private readonly ITaskQueueStore _store;
+    private readonly TimeProvider _timeProvider;
+
+    /// <summary>Persists a new queued task built from <paramref name="task"/> and <paramref name="options"/>.</summary>
+    /// <returns>The identifier generated for the queued task.</returns>
+    public async Task<Guid> EnqueueAsync(
+        AgentTask task,
+        EnqueueOptions options,
+        CancellationToken ct)
+    {
+        var queuedTask = new QueuedTask
+        {
+            Id = Guid.NewGuid(),
+            Task = task,
+            Priority = options.Priority,
+            EnqueuedAt = _timeProvider.GetUtcNow(),
+            ExpiresAt = options.ExpiresAt,
+            TargetAgentId = options.TargetAgentId,
+            TargetClusterId = options.TargetClusterId,
+            RequiredCapabilities = options.RequiredCapabilities,
+            DeliveryAttempts = 0,
+            MaxDeliveryAttempts = options.MaxDeliveryAttempts
+        };
+
+        await _store.SaveAsync(queuedTask, ct);
+        return queuedTask.Id;
+    }
+
+    /// <summary>
+    /// Claims the highest-priority pending task the store reports for the agent.
+    /// Expired tasks encountered on the way are expired; tasks whose retry backoff has not
+    /// elapsed, or that cannot be claimed, are skipped.
+    /// </summary>
+    /// <returns>The claimed task, or <c>null</c> when nothing could be claimed.</returns>
+    public async Task<QueuedTask?> DequeueAsync(
+        Guid agentId,
+        ImmutableArray capabilities,
+        CancellationToken ct)
+    {
+        var pending = await _store.GetPendingTasksAsync(agentId, capabilities, ct);
+        var now = _timeProvider.GetUtcNow();
+
+        foreach (var candidate in pending.OrderByDescending(t => t.Priority))
+        {
+            if (candidate.ExpiresAt is { } expiresAt && expiresAt < now)
+            {
+                await ExpireTaskAsync(candidate, ct);
+                continue;
+            }
+
+            // Honour the backoff written by RetryAsync; nothing else in this class reads
+            // NextAttemptAt, so without this check a failed task is redelivered immediately.
+            if (candidate.NextAttemptAt is { } nextAttemptAt && nextAttemptAt > now)
+            {
+                continue;
+            }
+
+            // The claim can be refused by the store; in that case try the next candidate.
+            if (!await _store.TryClaimAsync(candidate.Id, agentId, ct))
+            {
+                continue;
+            }
+
+            candidate.DeliveryAttempts++;
+            candidate.LastAttemptAt = now;
+            candidate.ClaimedBy = agentId;
+            await _store.SaveAsync(candidate, ct);
+            return candidate;
+        }
+
+        return null;
+    }
+
+    /// <summary>
+    /// Records the result of a task, then archives it on success, schedules a retry while
+    /// delivery attempts remain, or moves it to the dead-letter queue. Unknown ids are ignored.
+    /// </summary>
+    public async Task CompleteAsync(Guid taskId, TaskResult result, CancellationToken ct)
+    {
+        var task = await _store.GetAsync(taskId, ct);
+        if (task == null)
+            return;
+
+        task.CompletedAt = _timeProvider.GetUtcNow();
+        task.Result = result;
+        task.Status = result.Success ? QueuedTaskStatus.Completed : QueuedTaskStatus.Failed;
+
+        await _store.SaveAsync(task, ct);
+
+        if (task.Status == QueuedTaskStatus.Completed)
+        {
+            await _store.ArchiveAsync(taskId, ct);
+        }
+        else if (task.DeliveryAttempts < task.MaxDeliveryAttempts)
+        {
+            await RetryAsync(task, ct);
+        }
+        else
+        {
+            await _store.MoveToDeadLetterAsync(taskId, ct);
+        }
+    }
+
+    /// <summary>Returns the task to the pending state, unclaimed, with a backoff-delayed next attempt.</summary>
+    private async Task RetryAsync(QueuedTask task, CancellationToken ct)
+    {
+        var delay = CalculateBackoff(task.DeliveryAttempts);
+        task.Status = QueuedTaskStatus.Pending;
+        task.ClaimedBy = null;
+        task.NextAttemptAt = _timeProvider.GetUtcNow().Add(delay);
+        await _store.SaveAsync(task, ct);
+    }
+
+    /// <summary>
+    /// Exponential backoff (2^attempts seconds) plus up to one second of random jitter,
+    /// capped at <see cref="MaxBackoff"/>.
+    /// </summary>
+    private static TimeSpan CalculateBackoff(int attempts)
+    {
+        var exponent = Math.Min(attempts, MaxBackoffExponent);
+        var baseDelay = TimeSpan.FromSeconds(Math.Pow(2, exponent));
+        var jitter = TimeSpan.FromMilliseconds(Random.Shared.Next(0, 1000));
+        var delay = baseDelay + jitter;
+        return delay < MaxBackoff ? delay : MaxBackoff;
+    }
+}
+
+/// <summary>
+/// A task held in the durable queue, together with its targeting, timing and delivery
+/// bookkeeping. Members that <c>TaskQueue</c> changes after the task has been created
+/// (status, retry/completion timestamps, delivery counters, claim, result) are settable;
+/// everything fixed at enqueue time is init-only.
+/// </summary>
+public sealed record QueuedTask
+{
+    public Guid Id { get; init; }
+    public AgentTask Task { get; init; }
+    public TaskPriority Priority { get; init; }
+
+    /// <summary>Lifecycle state; updated on completion and when a retry is scheduled.</summary>
+    public QueuedTaskStatus Status { get; set; }
+
+    // Targeting
+    public Guid? TargetAgentId { get; init; }
+    public Guid? TargetClusterId { get; init; }
+    public ImmutableArray RequiredCapabilities { get; init; }
+
+    // Timing
+    public DateTimeOffset EnqueuedAt { get; init; }
+    public DateTimeOffset? ExpiresAt { get; init; }
+
+    /// <summary>Earliest time of the next delivery attempt; written when a retry is scheduled.</summary>
+    public DateTimeOffset? NextAttemptAt { get; set; }
+
+    /// <summary>Time at which a result was recorded for the task.</summary>
+    public DateTimeOffset? CompletedAt { get; set; }
+
+    // Delivery
+    public int DeliveryAttempts { get; set; }
+    public int MaxDeliveryAttempts { get; init; }
+    public DateTimeOffset? LastAttemptAt { get; set; }
+    public Guid? ClaimedBy { get; set; }
+
+    // Result
+    public TaskResult? Result { get; set; }
+}
+```
+
+#### 6. SelfHealer
+
+Automatic recovery and self-healing:
+
+```csharp
+/// <summary>
+/// Periodic self-healing: detects unhealthy agents, orphaned tasks and under-replicated
+/// clusters, turns each finding into a <c>HealingAction</c>, and executes the actions in
+/// descending priority order.
+/// </summary>
+public sealed class SelfHealer
+{
+    /// <summary>Longest time a draining agent is given to finish its in-flight tasks.</summary>
+    private static readonly TimeSpan DrainTimeout = TimeSpan.FromMinutes(5);
+
+    /// <summary>Pause between two in-flight task checks while draining.</summary>
+    private static readonly TimeSpan DrainPollInterval = TimeSpan.FromSeconds(5);
+
+    /// <summary>Runs one detect-then-execute healing cycle.</summary>
+    public async Task RunHealingCycleAsync(CancellationToken ct)
+    {
+        var healingActions = new List<HealingAction>();
+
+        // 1. Unhealthy agents: the action depends on the assessed health level.
+        foreach (var agent in await DetectUnhealthyAgentsAsync(ct))
+        {
+            var action = await DetermineHealingActionAsync(agent, ct);
+            if (action != null)
+            {
+                healingActions.Add(action);
+            }
+        }
+
+        // 2. Orphaned tasks are always reassigned.
+        foreach (var task in await DetectOrphanedTasksAsync(ct))
+        {
+            healingActions.Add(new HealingAction
+            {
+                Type = HealingActionType.ReassignTask,
+                TargetId = task.Id,
+                Reason = "Task orphaned after agent failure"
+            });
+        }
+
+        // 3. Under-replicated clusters are rebalanced.
+        foreach (var cluster in await DetectUnderReplicatedClustersAsync(ct))
+        {
+            healingActions.Add(new HealingAction
+            {
+                Type = HealingActionType.RebalanceCluster,
+                TargetId = cluster.Id,
+                Reason = $"Cluster has {cluster.HealthyAgentCount}/{cluster.DesiredAgents} agents"
+            });
+        }
+
+        // 4. Execute, highest priority first.
+        foreach (var action in healingActions.OrderByDescending(a => a.Priority))
+        {
+            await ExecuteHealingActionAsync(action, ct);
+        }
+    }
+
+    /// <summary>
+    /// Maps the agent's assessed health to a healing action: Degraded drains the agent,
+    /// Warning reduces its load, Critical or Failed triggers failover.
+    /// </summary>
+    /// <returns>The action to take, or <c>null</c> for any other health level.</returns>
+    private async Task<HealingAction?> DetermineHealingActionAsync(
+        Agent agent,
+        CancellationToken ct)
+    {
+        var health = await _healthMonitor.GetHealthStateAsync(agent.Id, ct);
+
+        return health.AssessedHealth switch
+        {
+            HealthLevel.Degraded => new HealingAction
+            {
+                Type = HealingActionType.DrainAgent,
+                TargetId = agent.Id,
+                Reason = "Agent degraded, draining tasks"
+            },
+
+            HealthLevel.Warning => new HealingAction
+            {
+                Type = HealingActionType.ReduceLoad,
+                TargetId = agent.Id,
+                Reason = "Agent showing warnings, reducing load"
+            },
+
+            HealthLevel.Critical or HealthLevel.Failed => new HealingAction
+            {
+                Type = HealingActionType.FailoverAgent,
+                TargetId = agent.Id,
+                Reason = $"Agent health critical: {health.AssessedHealth}"
+            },
+
+            _ => null
+        };
+    }
+
+    /// <summary>Logs the action, dispatches it by type, then records it in the healing store.</summary>
+    private async Task ExecuteHealingActionAsync(
+        HealingAction action,
+        CancellationToken ct)
+    {
+        _logger.LogInformation(
+            "Executing healing action {ActionType} on {TargetId}: {Reason}",
+            action.Type, action.TargetId, action.Reason);
+
+        switch (action.Type)
+        {
+            case HealingActionType.FailoverAgent:
+                await _failoverManager.PerformFailoverAsync(
+                    new FailoverRequest { FailedAgentId = action.TargetId }, ct);
+                break;
+
+            case HealingActionType.DrainAgent:
+                await DrainAgentAsync(action.TargetId, ct);
+                break;
+
+            case HealingActionType.ReduceLoad:
+                await ReduceAgentLoadAsync(action.TargetId, ct);
+                break;
+
+            case HealingActionType.ReassignTask:
+                await ReassignTaskAsync(action.TargetId, ct);
+                break;
+
+            case HealingActionType.RebalanceCluster:
+                await RebalanceClusterAsync(action.TargetId, ct);
+                break;
+        }
+
+        // Recorded after the dispatch above has finished.
+        await _healingStore.RecordAsync(action, ct);
+    }
+
+    /// <summary>
+    /// Marks the agent as draining, polls until its in-flight tasks finish or the drain
+    /// timeout elapses, then transfers whatever is still in flight.
+    /// </summary>
+    private async Task DrainAgentAsync(Guid agentId, CancellationToken ct)
+    {
+        await _agentStore.UpdateStatusAsync(agentId, AgentStatus.Draining, ct);
+
+        var deadline = _timeProvider.GetUtcNow().Add(DrainTimeout);
+        while (_timeProvider.GetUtcNow() < deadline)
+        {
+            var inFlightTasks = await _taskStore.GetInFlightTasksAsync(agentId, ct);
+            if (!inFlightTasks.Any())
+                break;
+
+            await Task.Delay(DrainPollInterval, ct);
+        }
+
+        // Re-read after the wait: only tasks that are still in flight are force-transferred.
+        var remainingTasks = await _taskStore.GetInFlightTasksAsync(agentId, ct);
+        foreach (var task in remainingTasks)
+        {
+            await _failoverManager.TransferTaskAsync(task, ct);
+        }
+    }
+}
+```
+
+#### 7. StateSync
+
+Synchronizes state across cluster members:
+
+```csharp
+/// <summary>
+/// Pushes the cluster leader's state to every non-leader member, sending each member only
+/// the differences from its own current state.
+/// </summary>
+public sealed class StateSync
+{
+    /// <summary>
+    /// Synchronizes all non-leader members of the cluster with the leader's state.
+    /// Does nothing (apart from logging a warning) when the cluster cannot be loaded or
+    /// currently has no leader.
+    /// </summary>
+    public async Task SyncClusterStateAsync(
+        Guid clusterId,
+        CancellationToken ct)
+    {
+        // The lookup result was previously unused; it now guards against syncing a
+        // cluster the store does not know.
+        var cluster = await _clusterStore.GetAsync(clusterId, ct);
+        if (cluster == null)
+        {
+            _logger.LogWarning("Cluster {ClusterId} not found, skipping sync", clusterId);
+            return;
+        }
+
+        var members = await _clusterManager.GetClusterMembersAsync(clusterId, ct);
+        var leader = members.FirstOrDefault(m => m.Role == AgentRole.Leader);
+
+        if (leader == null)
+        {
+            _logger.LogWarning("No leader for cluster {ClusterId}, skipping sync", clusterId);
+            return;
+        }
+
+        // The leader's state is read once and reused for every member.
+        var leaderState = await GetAgentStateAsync(leader.AgentId, ct);
+
+        foreach (var member in members.Where(m => m.Role != AgentRole.Leader))
+        {
+            await SyncToMemberAsync(member.AgentId, leaderState, ct);
+        }
+    }
+
+    /// <summary>
+    /// Diffs the member's state against the leader's and sends the diff to the member
+    /// when it contains changes.
+    /// </summary>
+    private async Task SyncToMemberAsync(
+        Guid agentId,
+        AgentState leaderState,
+        CancellationToken ct)
+    {
+        var memberState = await GetAgentStateAsync(agentId, ct);
+        var diff = CalculateStateDiff(leaderState, memberState);
+
+        if (!diff.HasChanges)
+        {
+            return;
+        }
+
+        _logger.LogDebug(
+            "Syncing {ChangeCount} changes to agent {AgentId}",
+            diff.Changes.Count, agentId);
+
+        await _agentNotifier.SendStateSyncAsync(agentId, diff, ct);
+    }
+}
+
+/// <summary>
+/// Snapshot of a single agent's state. <c>StateSync</c> obtains one per agent and diffs
+/// the leader's snapshot against each member's snapshot.
+/// </summary>
+public sealed record AgentState
+{
+ /// <summary>Identifier of the agent the snapshot belongs to.</summary>
+ public Guid AgentId { get; init; }
+ /// <summary>Timestamp attached to the snapshot (presumably the capture time; confirm with the producer).</summary>
+ public DateTimeOffset CapturedAt { get; init; }
+
+ // Target assignments
+ public ImmutableArray AssignedTargets { get; init; }
+
+ // Task state
+ public ImmutableArray TaskStates { get; init; }
+
+ // Configuration
+ public AgentConfiguration Configuration { get; init; }
+
+ // Cached data
+ public ImmutableDictionary CachedDigests { get; init; }
+}
+```
+
+---
+
+## Cluster Topologies
+
+### Active-Passive
+
+```
+┌─────────────────────────────────────────┐
+│ Agent Cluster │
+│ │
+│ ┌─────────┐ ┌─────────┐ │
+│ │ LEADER │ │ STANDBY │ │
+│ │ Agent A │ │ Agent B │ │
+│ │ (Active)│ │(Passive)│ │
+│ └────┬────┘ └────┬────┘ │
+│ │ │ │
+│ ▼ │ (failover) │
+│ ┌─────────┐ │ │
+│ │ Targets │◄────────┘ │
+│ └─────────┘ │
+└─────────────────────────────────────────┘
+```
+
+### Active-Active
+
+```
+┌─────────────────────────────────────────┐
+│ Agent Cluster │
+│ │
+│ ┌─────────┐ ┌─────────┐ │
+│ │ Agent A │ │ Agent B │ │
+│ │ (Active)│ │ (Active)│ │
+│ └────┬────┘ └────┬────┘ │
+│ │ │ │
+│ └──────┬───────┘ │
+│ ▼ │
+│ ┌─────────────────────┐ │
+│ │ Targets (balanced) │ │
+│ └─────────────────────┘ │
+└─────────────────────────────────────────┘
+```
+
+### Sharded
+
+```
+┌─────────────────────────────────────────┐
+│ Agent Cluster │
+│ │
+│ ┌─────────┐ ┌─────────┐ │
+│ │ Agent A │ │ Agent B │ │
+│ │ Shard 0 │ │ Shard 1 │ │
+│ └────┬────┘ └────┬────┘ │
+│ │ │ │
+│ ▼ ▼ │
+│ ┌─────────┐ ┌─────────┐ │
+│ │Targets │ │Targets │ │
+│ │ 0-49 │ │ 50-99 │ │
+│ └─────────┘ └─────────┘ │
+└─────────────────────────────────────────┘
+```
+
+---
+
+## API Design
+
+### REST Endpoints
+
+```
+# Clusters
+POST /api/v1/agents/clusters # Create cluster
+GET /api/v1/agents/clusters # List clusters
+GET /api/v1/agents/clusters/{id} # Get cluster
+PUT /api/v1/agents/clusters/{id} # Update cluster
+DELETE /api/v1/agents/clusters/{id} # Delete cluster
+GET /api/v1/agents/clusters/{id}/members # Get members
+POST /api/v1/agents/clusters/{id}/rebalance # Trigger rebalance
+
+# Failover
+POST /api/v1/agents/{id}/failover # Manual failover
+GET /api/v1/agents/failovers # Failover history
+GET /api/v1/agents/failovers/{id} # Failover details
+
+# Health
+GET /api/v1/agents/{id}/health # Get agent health
+GET /api/v1/agents/clusters/{id}/health # Get cluster health
+
+# Task Queue
+GET /api/v1/agents/tasks/queue # View queue
+GET /api/v1/agents/tasks/queue/dead-letter # Dead letter queue
+POST /api/v1/agents/tasks/{id}/retry # Retry task
+
+# Self-Healing
+GET /api/v1/agents/healing/actions # Healing history
+GET /api/v1/agents/healing/status # Current healing status
+```
+
+---
+
+## Metrics & Observability
+
+### Prometheus Metrics
+
+```
+# Cluster Health
+stella_agent_cluster_members{cluster_id, status}
+stella_agent_cluster_leader{cluster_id, agent_id}
+stella_agent_cluster_health{cluster_id}
+
+# Failover
+stella_agent_failovers_total{cluster_id, status}
+stella_agent_failover_duration_seconds{cluster_id}
+stella_agent_tasks_transferred_total{cluster_id}
+
+# Task Queue
+stella_agent_queue_depth{cluster_id, priority}
+stella_agent_queue_latency_seconds{cluster_id}
+stella_agent_dead_letter_queue_depth{cluster_id}
+
+# Self-Healing
+stella_agent_healing_actions_total{action_type, status}
+stella_agent_healing_cycle_duration_seconds
+
+# Agent Health
+stella_agent_health_score{agent_id}
+stella_agent_heartbeat_age_seconds{agent_id}
+stella_agent_task_completion_rate{agent_id}
+```
+
+---
+
+## Configuration
+
+```yaml
+agent_cluster:
+ name: "production-docker-agents"
+ target_group_id: "prod-docker-hosts"
+
+ membership:
+ minimum_agents: 2
+ desired_agents: 3
+ max_agents: 5
+
+ replication_mode: active_active
+
+ failover:
+ selection_strategy: least_loaded
+ task_transfer_timeout: "00:05:00"
+ max_transfer_retries: 3
+
+ health_monitoring:
+ heartbeat_interval: "00:00:30"
+ warning_threshold: "00:01:00"
+ failure_threshold: "00:01:30"
+ health_check_interval: "00:00:10"
+
+ task_queue:
+ max_delivery_attempts: 3
+ default_expiration: "01:00:00"
+ dead_letter_retention: "7.00:00:00"
+
+ self_healing:
+ enabled: true
+ cycle_interval: "00:01:00"
+ drain_timeout: "00:05:00"
+
+ leader_election:
+ enabled: true # Only takes effect when replication_mode is active_passive
+ election_interval: "00:00:15"
+ lease_duration: "00:00:30"
+```
+
+---
+
+## Test Strategy
+
+### Unit Tests
+- Health score calculation
+- Failover target selection
+- Task queue operations
+- Backoff calculation
+
+### Integration Tests
+- Full failover flow
+- Leader election
+- State synchronization
+- Task transfer
+
+### Chaos Tests
+- Random agent failures
+- Network partitions
+- Split-brain scenarios
+- Cascading failures
+
+### Load Tests
+- High task throughput
+- Many concurrent agents
+- Rapid failover cycles
+
+---
+
+## Migration Path
+
+### Phase 1: Foundation (Week 1-2)
+- Cluster data model
+- Basic cluster management
+- Health monitoring enhancements
+
+### Phase 2: Failover (Week 3-4)
+- Failover manager
+- Task transfer
+- Target reassignment
+
+### Phase 3: Leader Election (Week 5-6)
+- Distributed lock integration
+- Election algorithm
+- ActivePassive support
+
+### Phase 4: Task Queue (Week 7-8)
+- Durable queue implementation
+- Dead letter handling
+- Retry logic
+
+### Phase 5: Self-Healing (Week 9-10)
+- Healing cycle
+- Automatic actions
+- Monitoring integration
+
+### Phase 6: State Sync (Week 11-12)
+- State diffing
+- Sync protocol
+- Consistency verification
diff --git a/docs/modules/release-orchestrator/enhancements/compliance-reporting.md b/docs/modules/release-orchestrator/enhancements/compliance-reporting.md
new file mode 100644
index 000000000..d63d2f6fa
--- /dev/null
+++ b/docs/modules/release-orchestrator/enhancements/compliance-reporting.md
@@ -0,0 +1,1187 @@
+# Compliance & Reporting
+
+## Overview
+
+Compliance & Reporting transforms the Release Orchestrator's audit capabilities into a comprehensive compliance management system. This enhancement provides pre-built compliance report templates, evidence chain visualization, audit query interface, regulatory framework alignment, and automated compliance checking.
+
+It is designed for enterprises that operate under strict regulatory requirements (SOC2, ISO 27001, PCI-DSS, HIPAA, FedRAMP, GDPR).
+
+---
+
+## Design Principles
+
+1. **Continuous Compliance**: Real-time compliance status, not periodic audits
+2. **Evidence-First**: All compliance claims backed by cryptographic evidence
+3. **Framework-Agnostic**: Adaptable to any regulatory framework
+4. **Auditor-Friendly**: Reports designed for external auditor consumption
+5. **Immutable Records**: Tamper-proof audit trail
+6. **Automated Where Possible**: Reduce manual compliance burden
+
+---
+
+## Architecture
+
+### Component Overview
+
+```
+┌────────────────────────────────────────────────────────────────────────┐
+│ Compliance & Reporting System │
+├────────────────────────────────────────────────────────────────────────┤
+│ │
+│ ┌──────────────────┐ ┌───────────────────┐ ┌─────────────────┐ │
+│ │ ComplianceEngine │───▶│ ReportGenerator │───▶│ EvidenceChain │ │
+│ │ │ │ │ │ Visualizer │ │
+│ └──────────────────┘ └───────────────────┘ └─────────────────┘ │
+│ │ │ │ │
+│ ▼ ▼ ▼ │
+│ ┌──────────────────┐ ┌───────────────────┐ ┌─────────────────┐ │
+│ │ FrameworkMapper │ │ AuditQueryEngine │ │ ControlValidator│ │
+│ │ │ │ │ │ │ │
+│ └──────────────────┘ └───────────────────┘ └─────────────────┘ │
+│ │ │ │ │
+│ ▼ ▼ ▼ │
+│ ┌──────────────────┐ ┌───────────────────┐ ┌─────────────────┐ │
+│ │ ExportService │ │ ScheduledReports │ │ AlertManager │ │
+│ │ │ │ │ │ │ │
+│ └──────────────────┘ └───────────────────┘ └─────────────────┘ │
+│ │
+└────────────────────────────────────────────────────────────────────────┘
+```
+
+### Key Components
+
+#### 1. ComplianceEngine
+
+Core compliance evaluation engine:
+
+```csharp
+/// <summary>
+/// Evaluates a tenant against one or more compliance frameworks: every control of every
+/// requested framework is validated and linked to its evidence, per-framework scores are
+/// computed, and an overall score and compliance level are derived from them.
+/// </summary>
+public sealed class ComplianceEngine
+{
+    private readonly ImmutableArray<IComplianceFramework> _frameworks;
+    private readonly IControlValidator _validator;
+    private readonly IEvidenceStore _evidenceStore;
+    private readonly TimeProvider _timeProvider;
+
+    /// <summary>Evaluates every framework named in the request.</summary>
+    /// <exception cref="InvalidOperationException">A requested framework id is not registered.</exception>
+    public async Task<ComplianceStatus> EvaluateAsync(
+        ComplianceEvaluationRequest request,
+        CancellationToken ct)
+    {
+        // Captured before the evaluation starts, as the timestamp of the evaluation.
+        var evaluatedAt = _timeProvider.GetUtcNow();
+        var frameworkStatuses = new List<FrameworkStatus>();
+
+        foreach (var frameworkId in request.Frameworks)
+        {
+            var framework = _frameworks.FirstOrDefault(f => f.Id == frameworkId)
+                ?? throw new InvalidOperationException(
+                    $"Unknown compliance framework '{frameworkId}'.");
+
+            frameworkStatuses.Add(await EvaluateFrameworkAsync(framework, request, ct));
+        }
+
+        // ComplianceStatus members are init-only, so the computed values are supplied
+        // in the initializer instead of being assigned afterwards.
+        var overallScore = CalculateOverallScore(frameworkStatuses);
+        return new ComplianceStatus
+        {
+            TenantId = request.TenantId,
+            EvaluatedAt = evaluatedAt,
+            Frameworks = frameworkStatuses,
+            OverallScore = overallScore,
+            ComplianceLevel = DetermineComplianceLevel(overallScore)
+        };
+    }
+
+    /// <summary>
+    /// Evaluates every control of one framework and computes the pass/fail/not-applicable
+    /// counts and the score (passed controls as a percentage of applicable controls).
+    /// </summary>
+    private async Task<FrameworkStatus> EvaluateFrameworkAsync(
+        IComplianceFramework framework,
+        ComplianceEvaluationRequest request,
+        CancellationToken ct)
+    {
+        var frameworkStatus = new FrameworkStatus
+        {
+            FrameworkId = framework.Id,
+            FrameworkName = framework.Name,
+            Version = framework.Version,
+            Controls = new List<ControlStatus>()
+        };
+
+        foreach (var control in framework.Controls)
+        {
+            frameworkStatus.Controls.Add(await EvaluateControlAsync(control, request, ct));
+        }
+
+        frameworkStatus.TotalControls = framework.Controls.Length;
+        frameworkStatus.PassedControls = frameworkStatus.Controls.Count(c => c.Status == ControlEvaluationStatus.Passed);
+        frameworkStatus.FailedControls = frameworkStatus.Controls.Count(c => c.Status == ControlEvaluationStatus.Failed);
+        frameworkStatus.NotApplicableControls = frameworkStatus.Controls.Count(c => c.Status == ControlEvaluationStatus.NotApplicable);
+
+        // With no applicable control the ratio would be 0/0 (NaN); such a framework has
+        // nothing that can fail and is scored as fully met.
+        var applicableControls = frameworkStatus.TotalControls - frameworkStatus.NotApplicableControls;
+        frameworkStatus.Score = applicableControls > 0
+            ? (double)frameworkStatus.PassedControls / applicableControls * 100
+            : 100;
+
+        return frameworkStatus;
+    }
+
+    /// <summary>
+    /// Validates one control and attaches references to the evidence stored for it within
+    /// the requested date range.
+    /// </summary>
+    private async Task<ControlStatus> EvaluateControlAsync(
+        ComplianceControl control,
+        ComplianceEvaluationRequest request,
+        CancellationToken ct)
+    {
+        var validationResult = await _validator.ValidateAsync(control, request, ct);
+
+        var evidence = await _evidenceStore.GetEvidenceForControlAsync(
+            request.TenantId, control.Id, request.DateRange, ct);
+
+        return new ControlStatus
+        {
+            ControlId = control.Id,
+            ControlName = control.Name,
+            Category = control.Category,
+            Description = control.Description,
+            Status = validationResult.Status,
+            Findings = validationResult.Findings,
+            Evidence = evidence.Select(e => new EvidenceReference
+            {
+                EvidenceId = e.Id,
+                Type = e.Type,
+                CollectedAt = e.CollectedAt,
+                Summary = e.Summary
+            }).ToList()
+        };
+    }
+}
+
+/// <summary>
+/// Result of a compliance evaluation for one tenant: the per-framework results plus the
+/// overall score and level derived from them.
+/// </summary>
+public sealed record ComplianceStatus
+{
+    public Guid TenantId { get; init; }
+    public DateTimeOffset EvaluatedAt { get; init; }
+
+    // Settable because the engine computes these only after all frameworks are evaluated.
+    public double OverallScore { get; set; }
+    public ComplianceLevel ComplianceLevel { get; set; }
+
+    /// <summary>One entry per evaluated framework.</summary>
+    public List<FrameworkStatus> Frameworks { get; init; }
+}
+
+/// <summary>
+/// Compliance bands, listed from best to worst. The trailing comments give the score range
+/// each band stands for; how fractional scores between two bands (e.g. 99.5) are assigned
+/// is decided by the code that maps a score to a level, which is not part of this snippet.
+/// </summary>
+public enum ComplianceLevel
+{
+ FullyCompliant, // 100%
+ SubstantiallyCompliant, // 90-99%
+ PartiallyCompliant, // 70-89%
+ NonCompliant // <70%
+}
+```
+
+#### 2. FrameworkMapper
+
+Maps organizational controls to compliance frameworks:
+
+```csharp
+/// <summary>
+/// Maps an organization's own controls onto the controls of a compliance framework and
+/// reports, per framework control, whether it is covered or a gap.
+/// </summary>
+public sealed class FrameworkMapper
+{
+    /// <summary>Built-in frameworks, keyed by framework id.</summary>
+    private readonly ImmutableDictionary<string, IComplianceFramework> _frameworks;
+
+    public FrameworkMapper()
+    {
+        _frameworks = LoadFrameworks().ToImmutableDictionary(f => f.Id);
+    }
+
+    /// <summary>Enumerates the built-in framework definitions.</summary>
+    private static IEnumerable<IComplianceFramework> LoadFrameworks()
+    {
+        yield return new Soc2Framework();
+        yield return new Iso27001Framework();
+        yield return new PciDssFramework();
+        yield return new HipaaFramework();
+        yield return new FedRampFramework();
+        yield return new GdprFramework();
+        yield return new NistCsfFramework();
+    }
+
+    /// <summary>
+    /// Produces one mapping per control of the given framework, listing the organizational
+    /// controls that match it. A framework control without any match is reported as a gap.
+    /// </summary>
+    /// <exception cref="KeyNotFoundException"><paramref name="frameworkId"/> is not a known framework.</exception>
+    public IReadOnlyList<ControlMapping> MapToFramework(
+        string frameworkId,
+        IReadOnlyList<OrganizationalControl> orgControls)
+    {
+        var framework = _frameworks[frameworkId];
+        var mappings = new List<ControlMapping>();
+
+        foreach (var frameworkControl in framework.Controls)
+        {
+            var matches = orgControls
+                .Where(orgControl => IsMatch(frameworkControl, orgControl))
+                .ToList();
+
+            mappings.Add(new ControlMapping
+            {
+                FrameworkControl = frameworkControl,
+                MappedOrgControls = matches,
+                CoverageStatus = matches.Count > 0
+                    ? CoverageStatus.Covered
+                    : CoverageStatus.Gap
+            });
+        }
+
+        return mappings;
+    }
+
+    /// <summary>
+    /// An organizational control matches when it names the framework control explicitly,
+    /// or when its description contains one of the framework control's keywords
+    /// (case-insensitive, ordinal comparison).
+    /// </summary>
+    private static bool IsMatch(ComplianceControl frameworkControl, OrganizationalControl orgControl)
+    {
+        if (orgControl.FrameworkMappings?.Contains(frameworkControl.Id) == true)
+            return true;
+
+        var keywords = frameworkControl.Keywords ?? ImmutableArray<string>.Empty;
+        return keywords.Any(k => orgControl.Description?.Contains(k, StringComparison.OrdinalIgnoreCase) == true);
+    }
+}
+
+// SOC 2 Framework Implementation
+/// <summary>
+/// SOC 2 Type II framework definition: a subset of the Common Criteria controls, with
+/// matching keywords and, where available, automated checks.
+/// </summary>
+public sealed class Soc2Framework : IComplianceFramework
+{
+    public string Id => "soc2-type2";
+    public string Name => "SOC 2 Type II";
+    public string Version => "2017";
+
+    // Initialized once per instance. An expression-bodied property (=>) would rebuild the
+    // whole control catalogue on every access.
+    public ImmutableArray<ComplianceControl> Controls { get; } = ImmutableArray.Create(
+        // Security (Common Criteria)
+        new ComplianceControl
+        {
+            Id = "CC1.1",
+            Name = "COSO Principle 1",
+            Category = "Control Environment",
+            Description = "The entity demonstrates a commitment to integrity and ethical values.",
+            Keywords = ImmutableArray.Create("integrity", "ethics", "code of conduct")
+        },
+        new ComplianceControl
+        {
+            Id = "CC6.1",
+            Name = "Logical and Physical Access Controls",
+            Category = "Logical and Physical Access",
+            Description = "The entity implements logical access security software, infrastructure, and architectures.",
+            Keywords = ImmutableArray.Create("access control", "authentication", "authorization", "mTLS"),
+            AutomatedChecks = ImmutableArray.Create(
+                new AutomatedCheck
+                {
+                    Id = "cc6.1.1",
+                    Description = "All agent connections use mTLS",
+                    CheckType = CheckType.AgentSecurity
+                },
+                new AutomatedCheck
+                {
+                    Id = "cc6.1.2",
+                    Description = "User authentication via SSO/OIDC",
+                    CheckType = CheckType.AuthenticationMethod
+                })
+        },
+        new ComplianceControl
+        {
+            Id = "CC7.2",
+            Name = "System Operations",
+            Category = "System Operations",
+            Description = "The entity monitors system components and the operation of those components for anomalies.",
+            Keywords = ImmutableArray.Create("monitoring", "alerting", "anomaly detection")
+        },
+        new ComplianceControl
+        {
+            Id = "CC8.1",
+            Name = "Change Management",
+            Category = "Change Management",
+            Description = "The entity authorizes, designs, develops, configures, documents, tests, approves, and implements changes.",
+            Keywords = ImmutableArray.Create("change management", "approval", "deployment", "release"),
+            AutomatedChecks = ImmutableArray.Create(
+                new AutomatedCheck
+                {
+                    Id = "cc8.1.1",
+                    Description = "All production deployments require approval",
+                    CheckType = CheckType.ApprovalRequired
+                },
+                new AutomatedCheck
+                {
+                    Id = "cc8.1.2",
+                    Description = "All changes produce evidence packets",
+                    CheckType = CheckType.EvidenceGenerated
+                })
+        }
+        // ... more controls
+    );
+}
+```
+
+#### 3. ReportGenerator
+
+Generates compliance reports:
+
+```csharp
+/// <summary>
+/// Builds compliance reports: evaluates compliance for the requested tenant, frameworks
+/// and date range, generates the sections for the requested report type, and appends the
+/// standard methodology and disclaimer sections.
+/// </summary>
+public sealed class ReportGenerator
+{
+    /// <summary>Generates a report of the requested type.</summary>
+    /// <exception cref="ArgumentOutOfRangeException">The report type has no section generator.</exception>
+    public async Task<ComplianceReport> GenerateAsync(
+        ReportRequest request,
+        CancellationToken ct)
+    {
+        var report = new ComplianceReport
+        {
+            Id = Guid.NewGuid(),
+            Type = request.ReportType,
+            GeneratedAt = _timeProvider.GetUtcNow(),
+            GeneratedBy = request.RequestedBy,
+            DateRange = request.DateRange
+        };
+
+        var status = await _complianceEngine.EvaluateAsync(new ComplianceEvaluationRequest
+        {
+            TenantId = request.TenantId,
+            Frameworks = request.Frameworks,
+            DateRange = request.DateRange
+        }, ct);
+
+        report.ComplianceStatus = status;
+
+        switch (request.ReportType)
+        {
+            case ReportType.ExecutiveSummary:
+                report.Sections = await GenerateExecutiveSummaryAsync(status, ct);
+                break;
+
+            case ReportType.DetailedCompliance:
+                report.Sections = await GenerateDetailedReportAsync(status, request, ct);
+                break;
+
+            case ReportType.GapAnalysis:
+                report.Sections = await GenerateGapAnalysisAsync(status, ct);
+                break;
+
+            case ReportType.AuditReadiness:
+                report.Sections = await GenerateAuditReadinessAsync(status, request, ct);
+                break;
+
+            case ReportType.EvidencePackage:
+                report.Sections = await GenerateEvidencePackageAsync(status, request, ct);
+                break;
+
+            default:
+                // Without this case an unhandled type leaves Sections unset and the
+                // Add calls below fail with a NullReferenceException.
+                throw new ArgumentOutOfRangeException(
+                    nameof(request), request.ReportType, "Unsupported report type.");
+        }
+
+        // Standard sections close every report type.
+        report.Sections.Add(GenerateMethodologySection());
+        report.Sections.Add(GenerateDisclaimerSection());
+
+        return report;
+    }
+
+    /// <summary>
+    /// Sections of the detailed report, in order: compliance overview, one section per
+    /// framework (controls grouped by category), findings summary, recommendations.
+    /// </summary>
+    private async Task<List<ReportSection>> GenerateDetailedReportAsync(
+        ComplianceStatus status,
+        ReportRequest request,
+        CancellationToken ct)
+    {
+        var sections = new List<ReportSection>();
+
+        // Overview section
+        sections.Add(new ReportSection
+        {
+            Title = "Compliance Overview",
+            Content = new OverviewContent
+            {
+                EvaluationDate = status.EvaluatedAt,
+                OverallScore = status.OverallScore,
+                ComplianceLevel = status.ComplianceLevel,
+                FrameworkSummaries = status.Frameworks.Select(f => new FrameworkSummary
+                {
+                    Name = f.FrameworkName,
+                    Score = f.Score,
+                    PassedControls = f.PassedControls,
+                    TotalControls = f.TotalControls
+                }).ToList()
+            }
+        });
+
+        // Per-framework sections, one subsection per control category
+        foreach (var framework in status.Frameworks)
+        {
+            var frameworkSection = new ReportSection
+            {
+                Title = $"{framework.FrameworkName} Compliance",
+                Subsections = new List<ReportSection>()
+            };
+
+            foreach (var category in framework.Controls.GroupBy(c => c.Category))
+            {
+                frameworkSection.Subsections.Add(new ReportSection
+                {
+                    Title = category.Key,
+                    Content = new ControlCategoryContent
+                    {
+                        Controls = category.Select(c => new ControlDetail
+                        {
+                            Id = c.ControlId,
+                            Name = c.ControlName,
+                            Status = c.Status,
+                            Findings = c.Findings,
+                            EvidenceCount = c.Evidence.Count,
+                            EvidenceReferences = c.Evidence
+                        }).ToList()
+                    }
+                });
+            }
+
+            sections.Add(frameworkSection);
+        }
+
+        // Findings summary across all frameworks; controls without findings contribute nothing
+        var allFindings = status.Frameworks
+            .SelectMany(f => f.Controls)
+            .SelectMany(c => c.Findings ?? Enumerable.Empty<Finding>())
+            .ToList();
+
+        sections.Add(new ReportSection
+        {
+            Title = "Findings Summary",
+            Content = new FindingsSummaryContent
+            {
+                TotalFindings = allFindings.Count,
+                CriticalFindings = allFindings.Count(f => f.Severity == FindingSeverity.Critical),
+                HighFindings = allFindings.Count(f => f.Severity == FindingSeverity.High),
+                MediumFindings = allFindings.Count(f => f.Severity == FindingSeverity.Medium),
+                LowFindings = allFindings.Count(f => f.Severity == FindingSeverity.Low),
+                Findings = allFindings.OrderByDescending(f => f.Severity).ToList()
+            }
+        });
+
+        // Recommendations
+        sections.Add(await GenerateRecommendationsAsync(status, ct));
+
+        return sections;
+    }
+}
+```
+
+#### 4. EvidenceChainVisualizer
+
+Visualizes evidence chains:
+
+```csharp
+/// <summary>
+/// Builds a visualization of the evidence a packet depends on: the chain of nodes and
+/// edges, a positioned graph, an integrity verification result and a Markdown narrative.
+/// </summary>
+public sealed class EvidenceChainVisualizer
+{
+    /// <summary>Visualizes the dependency chain that starts at the given evidence packet.</summary>
+    /// <exception cref="InvalidOperationException">The root evidence packet does not exist.</exception>
+    public async Task<EvidenceChainVisualization> VisualizeAsync(
+        Guid rootEvidenceId,
+        CancellationToken ct)
+    {
+        // The store can return null (the dependency walk below checks for it), so fail
+        // with a clear message here instead of dereferencing a missing root later.
+        var root = await _evidenceStore.GetAsync(rootEvidenceId, ct)
+            ?? throw new InvalidOperationException(
+                $"Evidence packet {rootEvidenceId} was not found.");
+
+        var chain = await BuildChainAsync(root, ct);
+
+        return new EvidenceChainVisualization
+        {
+            RootEvidenceId = rootEvidenceId,
+            GeneratedAt = _timeProvider.GetUtcNow(),
+            Chain = chain,
+            Graph = CreateGraph(chain),
+            IntegrityVerification = await VerifyChainIntegrityAsync(chain, ct),
+            Narrative = GenerateNarrative(chain)
+        };
+    }
+
+    /// <summary>
+    /// Breadth-first walk over the <c>DependsOn</c> links starting at <paramref name="root"/>.
+    /// Every packet is visited once; each dependency link yields an edge from the
+    /// dependency to the packet that depends on it.
+    /// </summary>
+    private async Task<EvidenceChain> BuildChainAsync(
+        EvidencePacket root,
+        CancellationToken ct)
+    {
+        var chain = new EvidenceChain
+        {
+            Nodes = new List<EvidenceNode>(),
+            Edges = new List<EvidenceEdge>()
+        };
+
+        var visited = new HashSet<Guid>();
+        var queue = new Queue<EvidencePacket>();
+        queue.Enqueue(root);
+
+        while (queue.Count > 0)
+        {
+            var current = queue.Dequeue();
+
+            // Add returns false for an id that was already processed (shared dependencies, cycles).
+            if (!visited.Add(current.Id))
+                continue;
+
+            chain.Nodes.Add(new EvidenceNode
+            {
+                Id = current.Id,
+                Type = current.SubjectType,
+                Subject = current.SubjectId,
+                CollectedAt = current.CollectedAt,
+                Summary = GenerateSummary(current),
+                Signature = current.Signature,
+                SignatureValid = await VerifySignatureAsync(current, ct)
+            });
+
+            foreach (var depId in current.DependsOn)
+            {
+                // The edge is recorded even when the dependency cannot be loaded below.
+                chain.Edges.Add(new EvidenceEdge
+                {
+                    FromId = depId,
+                    ToId = current.Id,
+                    Relationship = "depends_on"
+                });
+
+                var dep = await _evidenceStore.GetAsync(depId, ct);
+                if (dep != null && !visited.Contains(dep.Id))
+                {
+                    queue.Enqueue(dep);
+                }
+            }
+        }
+
+        return chain;
+    }
+
+    /// <summary>
+    /// Lays the chain out as a graph: one row per level (100 units apart), nodes within a
+    /// row 150 units apart, and one graph edge per chain edge.
+    /// </summary>
+    private EvidenceGraph CreateGraph(EvidenceChain chain)
+    {
+        var graph = new EvidenceGraph();
+
+        // Calculate layout (topological sort + horizontal levels)
+        var levels = CalculateLevels(chain);
+
+        foreach (var (level, nodes) in levels)
+        {
+            var y = level * 100;
+            var x = 0;
+            foreach (var node in nodes)
+            {
+                graph.Nodes.Add(new GraphNode
+                {
+                    Id = node.Id.ToString(),
+                    Label = $"{node.Type}\n{node.CollectedAt:g}",
+                    X = x,
+                    Y = y,
+                    Color = GetNodeColor(node)
+                });
+                x += 150;
+            }
+        }
+
+        foreach (var edge in chain.Edges)
+        {
+            graph.Edges.Add(new GraphEdge
+            {
+                From = edge.FromId.ToString(),
+                To = edge.ToId.ToString(),
+                Label = edge.Relationship
+            });
+        }
+
+        return graph;
+    }
+
+    /// <summary>
+    /// Renders the chain as a Markdown narrative, oldest evidence first, with the
+    /// signature verification outcome for every node.
+    /// </summary>
+    private string GenerateNarrative(EvidenceChain chain)
+    {
+        var sb = new StringBuilder();
+
+        sb.AppendLine("## Evidence Chain Narrative");
+        sb.AppendLine();
+
+        foreach (var node in chain.Nodes.OrderBy(n => n.CollectedAt))
+        {
+            // The heading is labelled "UTC", so the timestamp is converted to UTC first;
+            // formatting it directly would print the clock time of its own offset.
+            sb.AppendLine($"### {node.CollectedAt.ToUniversalTime():yyyy-MM-dd HH:mm:ss} UTC");
+            sb.AppendLine();
+            sb.AppendLine($"**{node.Type}** (ID: `{node.Id}`)");
+            sb.AppendLine();
+            sb.AppendLine(node.Summary);
+            sb.AppendLine();
+
+            sb.AppendLine(node.SignatureValid
+                ? "✓ Signature verified"
+                : "⚠ Signature verification failed");
+            sb.AppendLine();
+        }
+
+        return sb.ToString();
+    }
+}
+```
+
+#### 5. AuditQueryEngine
+
+Powerful query interface for audit data:
+
+```csharp
+/// <summary>
+/// Translates an <c>AuditQuery</c> into SQL over <c>evidence_packets</c>, runs it on a
+/// read replica, and returns the records together with any requested aggregations.
+/// </summary>
+public sealed class AuditQueryEngine
+{
+    /// <summary>Executes the query and returns its records and optional aggregations.</summary>
+    public async Task<AuditQueryResult> QueryAsync(
+        AuditQuery query,
+        CancellationToken ct)
+    {
+        var result = new AuditQueryResult
+        {
+            QueryId = Guid.NewGuid(),
+            ExecutedAt = _timeProvider.GetUtcNow(),
+            Query = query
+        };
+
+        var sql = BuildQuery(query);
+
+        // NOTE(review): AuditQuery as declared in this document has no Parameters member;
+        // the values for @TenantId, @StartDate, @EndDate, @SubjectTypes, @ActorId and
+        // @SearchText must be bound from the query's properties. Confirm the intended binding.
+        var connection = await _connectionPool.GetReadReplicaAsync(ct);
+
+        // Materialized once so that the records, the count and the aggregations all see the
+        // same rows and the result sequence is not enumerated repeatedly.
+        var records = (await connection.QueryAsync(sql, query.Parameters, ct)).ToImmutableArray();
+
+        result.Records = records;
+        // Number of rows returned, i.e. after LIMIT/OFFSET have been applied.
+        result.TotalCount = records.Length;
+
+        if (query.Aggregations != null)
+        {
+            result.Aggregations = ApplyAggregations(records, query.Aggregations);
+        }
+
+        return result;
+    }
+
+    /// <summary>
+    /// Builds the SQL text. Tenant, date range, subject types, actor and search text are
+    /// referenced as named parameters; custom filters are rendered by
+    /// <c>BuildFilterClause</c>; limit and offset are integers written into the text.
+    /// </summary>
+    private string BuildQuery(AuditQuery query)
+    {
+        var sql = new StringBuilder();
+
+        // Base query: evidence joined with the acting user, always scoped to one tenant
+        sql.AppendLine(@"
+ SELECT
+ e.id,
+ e.subject_type,
+ e.subject_id,
+ e.collected_at,
+ e.content,
+ e.signature,
+ u.email as actor_email,
+ u.name as actor_name
+ FROM evidence_packets e
+ LEFT JOIN users u ON e.actor_id = u.id
+ WHERE e.tenant_id = @TenantId");
+
+        // Date range (both bounds inclusive)
+        if (query.DateRange != null)
+        {
+            sql.AppendLine("AND e.collected_at >= @StartDate");
+            sql.AppendLine("AND e.collected_at <= @EndDate");
+        }
+
+        // Subject type filter
+        if (query.SubjectTypes?.Any() == true)
+        {
+            sql.AppendLine("AND e.subject_type = ANY(@SubjectTypes)");
+        }
+
+        // Actor filter
+        if (query.ActorId.HasValue)
+        {
+            sql.AppendLine("AND e.actor_id = @ActorId");
+        }
+
+        // Full-text search
+        if (!string.IsNullOrEmpty(query.SearchText))
+        {
+            sql.AppendLine("AND e.content_tsv @@ plainto_tsquery(@SearchText)");
+        }
+
+        // Custom filters. NOTE(review): BuildFilterClause output is concatenated into the
+        // statement; verify that it never embeds caller-supplied text unescaped.
+        if (query.Filters is { } filters)
+        {
+            foreach (var filter in filters)
+            {
+                sql.AppendLine($"AND {BuildFilterClause(filter)}");
+            }
+        }
+
+        // Newest evidence first
+        sql.AppendLine("ORDER BY e.collected_at DESC");
+
+        // Pagination
+        if (query.Limit.HasValue)
+        {
+            sql.AppendLine($"LIMIT {query.Limit}");
+        }
+        if (query.Offset.HasValue)
+        {
+            sql.AppendLine($"OFFSET {query.Offset}");
+        }
+
+        return sql.ToString();
+    }
+}
+
+// Immutable criteria for an audit query over evidence packets. BuildQuery always emits
+// the tenant predicate; every other member is optional and only contributes a predicate
+// or clause when it is set.
+// NOTE(review): the ImmutableArray members carry no element type in this listing; the
+// type arguments appear to have been lost - confirm against the real source.
+// NOTE(review): the query executor reads query.Parameters, which is not declared here.
+public sealed record AuditQuery
+{
+ // Tenant scope; BuildQuery always emits "e.tenant_id = @TenantId".
+ public Guid TenantId { get; init; }
+ // When non-null, restricts e.collected_at to @StartDate..@EndDate (both inclusive).
+ public DateRange? DateRange { get; init; }
+ // When non-empty, adds "e.subject_type = ANY(@SubjectTypes)".
+ public ImmutableArray? SubjectTypes { get; init; }
+ // When set, adds "e.actor_id = @ActorId".
+ public Guid? ActorId { get; init; }
+ // When non-empty, adds a full-text match via plainto_tsquery(@SearchText).
+ public string? SearchText { get; init; }
+ // Each entry is turned into an additional AND clause by BuildFilterClause.
+ public ImmutableArray? Filters { get; init; }
+ // When non-null, passed to ApplyAggregations together with the returned records.
+ public ImmutableArray? Aggregations { get; init; }
+ // When set, emitted as LIMIT.
+ public int? Limit { get; init; }
+ // When set, emitted as OFFSET.
+ public int? Offset { get; init; }
+}
+```
+
+#### 6. ControlValidator
+
+Automated control validation:
+
+```csharp
+// Evaluates a single compliance control by running its automated checks and folding the
+// resulting findings into an overall status.
+public sealed class ControlValidator : IControlValidator
+{
+    // Check implementations keyed by the check type they handle.
+    // NOTE(review): nothing in this listing assigns _checks; it must be populated
+    // (e.g. through constructor injection) before ValidateAsync is called.
+    private readonly ImmutableDictionary<CheckType, IAutomatedCheck> _checks;
+
+    // Runs every automated check attached to the control against the request.
+    //
+    // control: the control being evaluated; its AutomatedChecks may be null or empty.
+    // request: evaluation context handed unchanged to each check.
+    // ct:      cancellation token forwarded to each check.
+    //
+    // Returns a result whose Status is Failed when any finding is High severity or above,
+    // PartiallyMet when there are only lower-severity findings, and Passed when there
+    // are no findings at all.
+    public async Task<ControlValidationResult> ValidateAsync(
+        ComplianceControl control,
+        ComplianceEvaluationRequest request,
+        CancellationToken ct)
+    {
+        var result = new ControlValidationResult
+        {
+            ControlId = control.Id,
+            Findings = new List<Finding>()
+        };
+
+        // Run automated checks
+        if (control.AutomatedChecks?.Any() == true)
+        {
+            foreach (var check in control.AutomatedChecks)
+            {
+                var checkImpl = _checks.GetValueOrDefault(check.CheckType);
+                if (checkImpl == null)
+                {
+                    // A check with no registered implementation is reported as a
+                    // low-severity finding rather than silently ignored.
+                    result.Findings.Add(new Finding
+                    {
+                        Severity = FindingSeverity.Low,
+                        Message = $"Automated check {check.Id} not implemented",
+                        CheckId = check.Id
+                    });
+                    continue;
+                }
+
+                var checkResult = await checkImpl.ExecuteAsync(request, ct);
+                if (!checkResult.Passed)
+                {
+                    result.Findings.Add(new Finding
+                    {
+                        Severity = checkResult.Severity,
+                        Message = checkResult.Message,
+                        CheckId = check.Id,
+                        Details = checkResult.Details
+                    });
+                }
+            }
+        }
+
+        // Determine overall status. The comparison relies on FindingSeverity members
+        // being declared in ascending order of severity.
+        if (result.Findings.Any(f => f.Severity >= FindingSeverity.High))
+        {
+            result.Status = ControlEvaluationStatus.Failed;
+        }
+        else if (result.Findings.Any())
+        {
+            result.Status = ControlEvaluationStatus.PartiallyMet;
+        }
+        else
+        {
+            result.Status = ControlEvaluationStatus.Passed;
+        }
+
+        return result;
+    }
+}
+
+// Example automated check implementations
+// Automated check: every production deployment in the evaluated date range must have
+// at least one approval record.
+// NOTE(review): _deploymentStore is used below but not declared in this listing.
+public sealed class ApprovalRequiredCheck : IAutomatedCheck
+{
+    public CheckType Type => CheckType.ApprovalRequired;
+
+    // Loads the tenant's deployments for request.DateRange and fails with a Critical
+    // finding when any deployment to an environment named "production"
+    // (case-insensitive) has no approval records; otherwise returns a passing result.
+    public async Task<CheckResult> ExecuteAsync(
+        ComplianceEvaluationRequest request,
+        CancellationToken ct)
+    {
+        // Check that all production deployments required approval
+        var deployments = await _deploymentStore.GetByDateRangeAsync(
+            request.TenantId, request.DateRange, ct);
+
+        var productionDeployments = deployments
+            .Where(d => d.Environment.Name.Equals("production", StringComparison.OrdinalIgnoreCase));
+
+        // A null or empty ApprovalRecords collection both count as "no approval".
+        var withoutApproval = productionDeployments
+            .Where(d => d.ApprovalRecords?.Any() != true)
+            .ToList();
+
+        if (withoutApproval.Count > 0)
+        {
+            return new CheckResult
+            {
+                Passed = false,
+                Severity = FindingSeverity.Critical,
+                Message = $"{withoutApproval.Count} production deployments without approval",
+                // Only the identifying fields are attached to the finding.
+                Details = withoutApproval.Select(d => new
+                {
+                    d.Id,
+                    d.ReleaseId,
+                    d.DeployedAt
+                }).ToList()
+            };
+        }
+
+        return CheckResult.Pass();
+    }
+}
+
+// Automated check: every deployment in the evaluated date range must have evidence
+// stored under the "deployment" subject type.
+// NOTE(review): _deploymentStore and _evidenceStore are used below but not declared in
+// this listing.
+public sealed class EvidenceGeneratedCheck : IAutomatedCheck
+{
+    public CheckType Type => CheckType.EvidenceGenerated;
+
+    // Loads the tenant's deployments for request.DateRange, looks up evidence for each
+    // one in turn, and fails with a High finding listing the ids of deployments for
+    // which the evidence lookup returned null; otherwise returns a passing result.
+    public async Task<CheckResult> ExecuteAsync(
+        ComplianceEvaluationRequest request,
+        CancellationToken ct)
+    {
+        // Check that all deployments generated evidence
+        var deployments = await _deploymentStore.GetByDateRangeAsync(
+            request.TenantId, request.DateRange, ct);
+
+        // NOTE(review): element type assumed to be Deployment (the item type returned by
+        // GetByDateRangeAsync) - confirm against the deployment store contract.
+        var withoutEvidence = new List<Deployment>();
+        foreach (var deployment in deployments)
+        {
+            // One lookup per deployment, awaited sequentially.
+            var evidence = await _evidenceStore.GetBySubjectAsync(
+                "deployment", deployment.Id, ct);
+
+            if (evidence == null)
+            {
+                withoutEvidence.Add(deployment);
+            }
+        }
+
+        if (withoutEvidence.Count > 0)
+        {
+            return new CheckResult
+            {
+                Passed = false,
+                Severity = FindingSeverity.High,
+                Message = $"{withoutEvidence.Count} deployments without evidence packets",
+                Details = withoutEvidence.Select(d => d.Id).ToList()
+            };
+        }
+
+        return CheckResult.Pass();
+    }
+}
+```
+
+---
+
+## Report Templates
+
+### Executive Summary Template
+
+```markdown
+# Compliance Executive Summary
+
+**Organization:** {{organization.name}}
+**Report Period:** {{date_range.start}} to {{date_range.end}}
+**Generated:** {{generated_at}}
+
+## Overall Compliance Status
+
+| Framework | Score | Status |
+|-----------|-------|--------|
+{{#each frameworks}}
+| {{name}} | {{score}}% | {{status}} |
+{{/each}}
+
+**Overall Compliance Level:** {{compliance_level}}
+
+## Key Findings
+
+{{#if critical_findings}}
+### Critical Issues ({{critical_findings.count}})
+{{#each critical_findings}}
+- **{{control_id}}**: {{message}}
+{{/each}}
+{{/if}}
+
+{{#if high_findings}}
+### High Priority Issues ({{high_findings.count}})
+{{#each high_findings}}
+- **{{control_id}}**: {{message}}
+{{/each}}
+{{/if}}
+
+## Recommendations
+
+{{#each recommendations}}
+1. **{{title}}** (Priority: {{priority}})
+ {{description}}
+{{/each}}
+
+## Next Steps
+
+1. Address critical findings within {{sla.critical}} days
+2. Review and remediate high-priority findings
+3. Schedule follow-up assessment for {{next_assessment_date}}
+```
+
+### Audit Readiness Report
+
+```markdown
+# Audit Readiness Report
+
+**Framework:** {{framework.name}} {{framework.version}}
+**Assessment Date:** {{generated_at}}
+
+## Readiness Summary
+
+**Ready for Audit:** {{#if ready}}Yes{{else}}No{{/if}}
+**Controls Passing:** {{passing_controls}} / {{total_controls}}
+**Evidence Coverage:** {{evidence_coverage}}%
+
+## Control-by-Control Assessment
+
+{{#each control_categories}}
+### {{category_name}}
+
+{{#each controls}}
+#### {{control_id}} - {{control_name}}
+
+**Status:** {{status}}
+**Evidence Available:** {{evidence_count}} items
+
+{{#if findings}}
+**Findings:**
+{{#each findings}}
+- [{{severity}}] {{message}}
+{{/each}}
+{{/if}}
+
+{{#if evidence}}
+**Evidence Summary:**
+{{#each evidence}}
+- {{type}} ({{collected_at}}): {{summary}}
+{{/each}}
+{{/if}}
+
+---
+{{/each}}
+{{/each}}
+
+## Gap Analysis
+
+{{#each gaps}}
+| Control | Gap Description | Remediation Recommendation |
+|---------|-----------------|---------------------------|
+{{#each items}}
+| {{control_id}} | {{gap}} | {{recommendation}} |
+{{/each}}
+{{/each}}
+
+## Evidence Package Checklist
+
+{{#each evidence_checklist}}
+- [{{#if available}}x{{else}} {{/if}}] {{item}}
+{{/each}}
+```
+
+---
+
+## API Design
+
+### REST Endpoints
+
+```
+# Compliance Status
+GET /api/v1/compliance/status # Current compliance status
+GET /api/v1/compliance/status/history # Historical compliance
+
+# Reports
+POST /api/v1/compliance/reports # Generate report
+GET /api/v1/compliance/reports # List reports
+GET /api/v1/compliance/reports/{id} # Get report
+GET /api/v1/compliance/reports/{id}/download # Download report (PDF/HTML)
+
+# Evidence
+GET /api/v1/compliance/evidence # List evidence
+GET /api/v1/compliance/evidence/{id} # Get evidence
+GET /api/v1/compliance/evidence/{id}/chain # Get evidence chain
+GET /api/v1/compliance/evidence/{id}/verify # Verify evidence integrity
+
+# Audit Query
+POST /api/v1/compliance/audit/query # Execute audit query
+GET /api/v1/compliance/audit/saved-queries # List saved queries
+POST /api/v1/compliance/audit/saved-queries # Save query
+
+# Frameworks
+GET /api/v1/compliance/frameworks # List frameworks
+GET /api/v1/compliance/frameworks/{id} # Get framework details
+GET /api/v1/compliance/frameworks/{id}/controls # Get controls
+
+# Control Mappings
+GET /api/v1/compliance/mappings # Get control mappings
+PUT /api/v1/compliance/mappings # Update mappings
+
+# Scheduled Reports
+POST /api/v1/compliance/reports/schedules # Create schedule
+GET /api/v1/compliance/reports/schedules # List schedules
+DELETE /api/v1/compliance/reports/schedules/{id} # Delete schedule
+```
+
+---
+
+## Metrics & Observability
+
+### Prometheus Metrics
+
+```
+# Compliance Scores
+stella_compliance_score{framework, tenant_id}
+stella_compliance_controls_passed{framework, tenant_id}
+stella_compliance_controls_failed{framework, tenant_id}
+
+# Findings
+stella_compliance_findings_total{severity, framework}
+stella_compliance_findings_open{severity, framework}
+stella_compliance_findings_remediated{severity, framework}
+
+# Evidence
+stella_evidence_collected_total{type}
+stella_evidence_verification_total{status}
+stella_evidence_chain_depth{type}
+
+# Reports
+stella_reports_generated_total{type, framework}
+stella_report_generation_duration_seconds{type}
+
+# Audit Queries
+stella_audit_queries_total{status}
+stella_audit_query_duration_seconds
+```
+
+---
+
+## Configuration
+
+```yaml
+compliance:
+ frameworks:
+ - id: soc2-type2
+ enabled: true
+ controls_file: "./frameworks/soc2.yaml"
+
+ - id: iso27001
+ enabled: true
+ controls_file: "./frameworks/iso27001.yaml"
+
+ automated_checks:
+ enabled: true
+ schedule: "0 0 * * *" # Daily at midnight
+
+ reports:
+ scheduled:
+ - name: "Weekly Executive Summary"
+ type: executive_summary
+ schedule: "0 8 * * 1" # Monday 8am
+ recipients:
+ - compliance@example.com
+ - ciso@example.com
+ format: pdf
+
+ - name: "Monthly Detailed Report"
+ type: detailed_compliance
+ schedule: "0 8 1 * *" # 1st of month
+ recipients:
+ - compliance@example.com
+ format: html
+
+ evidence:
+ retention_days: 2555 # 7 years
+ verification_schedule: "0 */6 * * *" # Every 6 hours
+
+ alerts:
+ compliance_drop_threshold: 90
+ critical_finding_channels:
+ - type: slack
+ channel: "#compliance-alerts"
+ - type: email
+ recipients:
+ - compliance@example.com
+```
+
+---
+
+## Test Strategy
+
+### Unit Tests
+- Framework mapping logic
+- Control validation
+- Report generation
+- Query building
+
+### Integration Tests
+- Full compliance evaluation
+- Evidence chain building
+- Report export (PDF/HTML)
+- Scheduled report execution
+
+### Compliance Tests
+- Framework coverage validation
+- Evidence completeness
+- Signature verification
+
+---
+
+## Migration Path
+
+### Phase 1: Framework Foundation (Week 1-2)
+- Compliance engine
+- Framework definitions
+- Control models
+
+### Phase 2: Automated Checks (Week 3-4)
+- Control validator
+- Automated check implementations
+- Check scheduling
+
+### Phase 3: Reporting (Week 5-6)
+- Report generator
+- Report templates
+- Export formats
+
+### Phase 4: Evidence Chain (Week 7-8)
+- Chain visualizer
+- Integrity verification
+- Narrative generation
+
+### Phase 5: Audit Query (Week 9-10)
+- Query engine
+- Query UI
+- Saved queries
+
+### Phase 6: Polish (Week 11-12)
+- Scheduled reports
+- Alerts
+- Documentation
diff --git a/docs/modules/release-orchestrator/enhancements/developer-experience.md b/docs/modules/release-orchestrator/enhancements/developer-experience.md
new file mode 100644
index 000000000..fa9c3b7d0
--- /dev/null
+++ b/docs/modules/release-orchestrator/enhancements/developer-experience.md
@@ -0,0 +1,1091 @@
+# Developer Experience
+
+## Overview
+
+Developer Experience transforms the Release Orchestrator from a web-first platform into a complete developer toolkit. This enhancement provides a powerful CLI for release operations, GitOps-native workflows, IDE integrations, and streamlined development workflows that integrate seamlessly with existing developer toolchains.
+
+The design is inspired by tools such as GitHub CLI, Vercel CLI, and Argo CD CLI, and is tailored to release orchestration workflows.
+
+---
+
+## Design Principles
+
+1. **CLI-First Operations**: Every action possible via CLI, not just UI
+2. **GitOps Native**: Releases triggered by Git operations
+3. **Developer Workflows**: Integrate into existing CI/CD and development patterns
+4. **Zero-Friction Onboarding**: Quick start without extensive configuration
+5. **Scriptable**: All commands output machine-parseable formats
+6. **Offline Capable**: Local validation and preview work without a server connection
+
+---
+
+## Architecture
+
+### Component Overview
+
+```
+┌────────────────────────────────────────────────────────────────────────┐
+│ Developer Experience System │
+├────────────────────────────────────────────────────────────────────────┤
+│ │
+│ ┌──────────────────┐ ┌───────────────────┐ ┌─────────────────┐ │
+│ │ CLI Application │───▶│ API Client │───▶│ Server API │ │
+│ │ (stella) │ │ │ │ │ │
+│ └──────────────────┘ └───────────────────┘ └─────────────────┘ │
+│ │ │ │ │
+│ ▼ ▼ ▼ │
+│ ┌──────────────────┐ ┌───────────────────┐ ┌─────────────────┐ │
+│ │ GitOps Controller│ │ IDE Extensions │ │ Webhook Handler │ │
+│ │ │ │ │ │ │ │
+│ └──────────────────┘ └───────────────────┘ └─────────────────┘ │
+│ │ │ │ │
+│ ▼ ▼ ▼ │
+│ ┌──────────────────┐ ┌───────────────────┐ ┌─────────────────┐ │
+│ │ Template Engine │ │ Local Validator │ │ Config Sync │ │
+│ │ │ │ │ │ │ │
+│ └──────────────────┘ └───────────────────┘ └─────────────────┘ │
+│ │
+└────────────────────────────────────────────────────────────────────────┘
+```
+
+### Key Components
+
+#### 1. CLI Application (stella)
+
+Full-featured command-line interface:
+
+```csharp
+// CLI structure
+public sealed class StellaCli
+{
+ // Root command
+ // stella --version
+ // stella --help
+
+ // Auth commands
+ // stella auth login [--token] [--sso]
+ // stella auth logout
+ // stella auth status
+ // stella auth switch-context
+
+ // Release commands
+ // stella release create --version [--component
]...
+ // stella release list [--env ] [--status ]
+ // stella release get
+ // stella release diff
+ // stella release history
+
+ // Promotion commands
+ // stella promote --to [--approve] [--wait]
+ // stella promote status
+ // stella promote approve
+ // stella promote reject --reason
+
+ // Deployment commands
+ // stella deploy --env [--strategy ]
+ // stella deploy status
+ // stella deploy logs [--follow]
+ // stella rollback [--to ]
+
+ // Environment commands
+ // stella env list
+ // stella env get
+ // stella env freeze --until